In [1]:
import pandas as pd

In [3]:
data = pd.read_csv('processed_data.csv')

In [7]:
X = data['join_processed_review'].astype(str)
y = data['overall'].astype(int)

In [9]:
from sklearn.model_selection import train_test_split 

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000))
])


In [25]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

param_dist = {
    'tfidf__max_features': [1000, 2000, 3000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__C': uniform(0.1, 10)
}

In [27]:
search = RandomizedSearchCV(model, param_distributions=param_dist,
                            n_iter=10, cv=3, verbose=2, n_jobs=-1, random_state=42)

search.fit(train_X, train_y)
print(search.best_params_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits




{'clf__C': 1.6601864044243653, 'tfidf__max_features': 3000, 'tfidf__ngram_range': (1, 1)}


In [29]:
from sklearn.metrics import classification_report, accuracy_score

pred_y = search.best_estimator_.predict(test_X)

print("Accuracy:", accuracy_score(test_y, pred_y))
print("\nClassification Report:\n", classification_report(test_y, pred_y))


Accuracy: 0.8118006103763988

Classification Report:
               precision    recall  f1-score   support

           1       0.74      0.46      0.57        54
           2       0.00      0.00      0.00        14
           3       0.00      0.00      0.00        25
           4       0.25      0.03      0.05       110
           5       0.82      0.99      0.90       780

    accuracy                           0.81       983
   macro avg       0.36      0.30      0.30       983
weighted avg       0.72      0.81      0.75       983



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
