# Using Logistic Regression for Review Classification

## Loading and Pre-Processing The Data

In [1]:
import pandas as pd

In [2]:
df = pd.read_json('../checkpoint-1/processed_data.json', orient='records', lines=True)

df.head()

Unnamed: 0,category,rating,label,text_,text_processed,text_tokenized,text_embeddings
0,Home and Kitchen,5,CG,"Love this! Well made, sturdy, and very comfor...",love well made sturdy comfortable love itvery ...,"[love, well, made, sturdy, comfortable, love, ...","[-0.06552676860000001, 0.0019102807000000001, ..."
1,Home and Kitchen,5,CG,"love it, a great upgrade from the original. I...",love great upgrade original ive mine couple years,"[love, great, upgrade, original, ive, mine, co...","[-0.11993740500000001, 0.0896221548, 0.0629631..."
2,Home and Kitchen,5,CG,This pillow saved my back. I love the look and...,pillow saved back love look feel pillow,"[pillow, saved, back, love, look, feel, pillow]","[-0.0531752855, 0.0186075028, 0.0380044468, 0...."
3,Home and Kitchen,1,CG,"Missing information on how to use it, but it i...",missing information use great product price,"[missing, information, use, great, product, pr...","[0.007157366300000001, 0.0636947528, 0.0084662..."
4,Home and Kitchen,5,CG,Very nice set. Good quality. We have had the s...,nice set good quality set two months,"[nice, set, good, quality, set, two, months]","[-0.0369832478, 0.011753053400000001, 0.041596..."


In [3]:
df = df.dropna()

In [4]:
X = df['text_embeddings'].tolist()
y = df['label'].tolist()

len(X), len(y)

(40408, 40408)

In [5]:
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()

y_encoded = lb.fit_transform(y).reshape(-1) # flattening the array

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, train_size=0.8)

In [10]:
print(f"Num Samples in Training Set - {len(X_train)}, {len(y_train)}")
print(f"Num Samples in Testing Set  - {len(X_test)}, {len(y_test)}")

Num Samples in Training Set - 32326, 32326
Num Samples in Testing Set  - 8082, 8082


In [7]:
from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler()

X_train_scaled = mmscaler.fit_transform(X_train)
X_test_scaled = mmscaler.transform(X_test)

## Getting A Baseline Score

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

logreg = LogisticRegression(max_iter=250) # Using the default hyperparameters

logreg.fit(X_train_scaled, y_train)

y_preds_baseline = logreg.predict(X_test_scaled)

print(f"Classification Report for Baseline Model - \n {classification_report(y_test, y_preds_baseline)}")

Classification Report for Baseline Model - 
               precision    recall  f1-score   support

           0       0.58      0.58      0.58      4013
           1       0.59      0.59      0.59      4069

    accuracy                           0.58      8082
   macro avg       0.58      0.58      0.58      8082
weighted avg       0.58      0.58      0.58      8082



## Performing Hyperparameter Tuning

In [13]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],
    'max_iter': [250, 500, 1000]
}

logreg = LogisticRegression()

random_search = RandomizedSearchCV(estimator=logreg, param_distributions=param_grid, cv=5, scoring='accuracy', verbose=3, n_jobs=-1, n_iter=5)

random_search.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits




In [14]:
random_search.best_params_

{'solver': 'saga', 'penalty': 'l2', 'max_iter': 250, 'C': 100}

In [15]:
random_search.best_score_

0.5820086123644196

## Making Predictions with Best Model

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

logreg_best = LogisticRegression(solver='saga', penalty='l2', max_iter=250, C=100)

logreg_best.fit(X_train_scaled, y_train)

y_preds_best = logreg_best.predict(X_test_scaled)

print(f"Best Model Performance - {classification_report(y_test, y_preds_best)}")

Best Model Performance -               precision    recall  f1-score   support

           0       0.59      0.55      0.57      4120
           1       0.57      0.61      0.59      3962

    accuracy                           0.58      8082
   macro avg       0.58      0.58      0.58      8082
weighted avg       0.58      0.58      0.58      8082





## Saving The Model

In [13]:
import joblib

joblib.dump(logreg_best, "logistic-regression.joblib")

['logistic-regression.joblib']