In [9]:
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

In [10]:
data = pd.read_csv('processed_data.csv')

In [11]:
X = data['join_processed_review'].astype(str)
y = data['overall'].astype(int)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('etc', ExtraTreesClassifier())])

In [19]:
params = [{
    'etc__n_estimators': [100, 500, 1000],        
    'etc__max_depth': [10, 30, None],             
    'etc__min_samples_split': [2, 10, 50],
    'etc__min_samples_leaf': [1, 5, 20],
    'etc__max_leaf_nodes': [None, 100, 500, 800, 1000],
    'bow__max_features': [1000, 2000]               
}]

In [21]:
search = RandomizedSearchCV(pipeline, 
                            param_distributions=params,
                            n_iter=10,
                            cv=3,
                            verbose=2,
                            n_jobs=-1,
                            random_state=42)

search.fit(X_train, y_train)
print("Best Parameters:", search.best_params_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'etc__n_estimators': 1000, 'etc__min_samples_split': 2, 'etc__min_samples_leaf': 1, 'etc__max_leaf_nodes': 500, 'etc__max_depth': 30, 'bow__max_features': 2000}


In [23]:
model = search.best_estimator_

model.fit(X_train, y_train)

In [25]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7995930824008138

Classification Report:
               precision    recall  f1-score   support

           1       0.73      0.15      0.25        54
           2       0.00      0.00      0.00        14
           3       0.00      0.00      0.00        25
           4       0.00      0.00      0.00       110
           5       0.80      1.00      0.89       780

    accuracy                           0.80       983
   macro avg       0.31      0.23      0.23       983
weighted avg       0.68      0.80      0.72       983



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
