In [17]:
import pandas as pd
import os
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
dataset_path = r'C:\Users\kevin\Desktop\VSCode\Zecser\data\cleaned_dataset\cleaned_Reviews.csv'
df = pd.read_csv(dataset_path)

In [3]:
df.head()

Unnamed: 0,Score,Text,Cleaned_Text
0,1,I used to purchase them at wholefoods market. ...,used purchase wholefoods market bought pack gr...
1,1,Since the ingredient information is missing on...,since ingredient information missing amazoncom...
2,1,"Being a cinnamon candy nut at Valentines, I wa...",cinnamon candy nut valentine pretty excited le...
3,1,This has to be the world's smallest box of cho...,world smallest box chocolate even advertised l...
4,1,This item can be purchased in stores for much ...,item purchased store much le weight listedonly...


In [4]:
df.isnull().sum()  # Check for missing values

Score           0
Text            0
Cleaned_Text    1
dtype: int64

In [5]:
df.dropna(inplace=True)

In [6]:
df.isnull().sum()

Score           0
Text            0
Cleaned_Text    0
dtype: int64

In [7]:
df.duplicated().sum()

0

In [8]:
print(df['Score'].value_counts())

Score
1    20790
2    20790
3    20790
4    20790
5    20789
Name: count, dtype: int64


In [9]:
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
X = vectorizer.fit_transform(df['Cleaned_Text'])
y = df['Score']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
clf = LogisticRegression(class_weight='balanced', max_iter=1000)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.5137085137085137

In [12]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.59      0.65      0.62      4160
           2       0.44      0.41      0.42      4204
           3       0.43      0.40      0.41      4217
           4       0.48      0.44      0.46      4188
           5       0.61      0.69      0.64      4021

    accuracy                           0.51     20790
   macro avg       0.51      0.52      0.51     20790
weighted avg       0.51      0.51      0.51     20790



In [19]:
model_dir = r'C:\Users\kevin\Desktop\VSCode\Zecser\models'
os.makedirs(model_dir, exist_ok=True)
joblib.dump(clf, os.path.join(model_dir, 'logistic_regression_model.pkl'))
joblib.dump(vectorizer, os.path.join(model_dir, 'tfidf_vectorizer.pkl'))
print("Model and vectorizer saved successfully.")

Model and vectorizer saved successfully.
