In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE

In [2]:
df = pd.read_csv("supervised_employee_feedback.csv", encoding='latin1')
df = df[df['Sentiment'].isin(['Positive', 'Neutral', 'Negative'])]
X = df['cleaned_summary'].fillna('')
y = df['Sentiment'].fillna('')
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1, 2))
X_vect = vectorizer.fit_transform(X)
smote = SMOTE(sampling_strategy='not majority', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_vect, y)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)


In [3]:
param_grid = {
    'C': [0.1, 0.5, 1.0, 5.0],
    'solver': ['liblinear', 'saga'],
    'penalty': ['l1', 'l2']
}

grid = GridSearchCV(
    LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    param_grid,
    cv=3,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=1
)
grid.fit(X_train, y_train)
model = grid.best_estimator_

y_pred = model.predict(X_test)
print("Best Parameters:", grid.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1 Score:", f1_score(y_test, y_pred, average='macro'))
print("Classification Report:\n", classification_report(y_test, y_pred))

with open("sentiment_model.pkl", "wb") as f:
    pickle.dump(model, f)
with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer,f)

Fitting 3 folds for each of 16 candidates, totalling 48 fits




Best Parameters: {'C': 5.0, 'penalty': 'l1', 'solver': 'saga'}
Accuracy: 0.9654787854209966
Precision: 0.9659654236176093
Recall: 0.9654785616706602
F1 Score: 0.9655916946828657
Classification Report:
               precision    recall  f1-score   support

    Negative       0.98      0.96      0.97      6575
     Neutral       0.94      0.97      0.95      6576
    Positive       0.99      0.97      0.98      6576

    accuracy                           0.97     19727
   macro avg       0.97      0.97      0.97     19727
weighted avg       0.97      0.97      0.97     19727

