In [3]:
pip install datasets

Collecting datasetsNote: you may need to restart the kernel to use updated packages.

  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting filelock (from datasets)
  Downloading filelock-3.15.1-py3-none-any.whl.metadata (2.8 kB)
Collecting numpy>=1.17 (from datasets)
  Downloading numpy-1.26.4-cp310-cp310-win_amd64.whl.metadata (61 kB)
     ---------------------------------------- 0.0/61.0 kB ? eta -:--:--
     -------------------- ------------------- 30.7/61.0 kB ? eta -:--:--
     -------------------- ------------------- 30.7/61.0 kB ? eta -:--:--
     -------------------------------------- 61.0/61.0 kB 461.8 kB/s eta 0:00:00
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-win_amd64.whl.metadata (3.1 kB)
Collecting pyarrow-hotfix (from datasets)
  Using cached pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collectin

In [4]:
import datasets
from datasets import load_dataset

# Only surface error-level messages from the `datasets` library.
datasets.logging.set_verbosity_error()

# Load the Amazon Fashion raw-review configuration of the Amazon Reviews 2023
# dataset.
# NOTE(review): trust_remote_code=True lets `datasets` run the loading script
# shipped with the dataset repository - keep it only for sources you trust.
dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_Amazon_Fashion",
    trust_remote_code=True,
)

# Peek at the first record of the "full" split to see the available fields.
first_review = dataset["full"][0]
print(first_review)


  from .autonotebook import tqdm as notebook_tqdm


{'rating': 5.0, 'title': 'Pretty locket', 'text': 'I think this locket is really pretty. The inside back is a solid silver depression and the front is a dome that is not solid (knotted). You could use it to store a small photo, lock of hair, etc but I use it when I need to carry medication with me. Closes securely. High quality & very pretty.', 'images': [], 'asin': 'B00LOPVX74', 'parent_asin': 'B00LOPVX74', 'user_id': 'AGBFYI2DDIKXC5Y4FARTYDTQBMFQ', 'timestamp': 1578528394489, 'helpful_vote': 3, 'verified_purchase': True}


In [5]:
import pandas as pd

# Materialise the "full" split as a pandas DataFrame for easier manipulation.
df = pd.DataFrame(dataset['full'])

# Show the schema, then a preview of the first rows.
# DataFrame.info() writes its report itself and returns None, so it is called
# on its own; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500939 entries, 0 to 2500938
Data columns (total 10 columns):
 #   Column             Dtype  
---  ------             -----  
 0   rating             float64
 1   title              object 
 2   text               object 
 3   images             object 
 4   asin               object 
 5   parent_asin        object 
 6   user_id            object 
 7   timestamp          int64  
 8   helpful_vote       int64  
 9   verified_purchase  bool   
dtypes: bool(1), float64(1), int64(2), object(6)
memory usage: 174.1+ MB
None
   rating                 title  \
0     5.0         Pretty locket   
1     5.0                     A   
2     2.0             Two Stars   
3     1.0       Won’t buy again   
4     5.0  I LOVE these glasses   

                                                text images        asin  \
0  I think this locket is really pretty. The insi...     []  B00LOPVX74   
1                                              Great     []  B

In [6]:
# Discard reviews that lack either the text or the rating, then remove
# repeated (text, user_id) pairs, keeping the first occurrence of each.
# Rebinding `df` to the chained result avoids in-place mutation of the frame.
df = (
    df
    .dropna(subset=['text', 'rating'])
    .drop_duplicates(subset=['text', 'user_id'])
)

# Preprocess text (e.g., lowercasing, removing punctuation)
import re

def preprocess_text(text):
    """Return ``text`` lower-cased with punctuation stripped.

    Every character that is neither a word character (letter, digit or
    underscore) nor whitespace is deleted. Nothing is inserted in its place,
    so "won't" becomes "wont", and existing whitespace is left untouched.
    """
    lowered = text.lower()
    return re.sub(r'[^\w\s]', '', lowered)

# Store the normalised review text in a new column next to the raw text.
raw_text = df['text']
df['cleaned_text'] = raw_text.apply(preprocess_text)

# Preview the model input (cleaned text) alongside its target rating.
preview = df[['cleaned_text', 'rating']].head()
print(preview)

                                        cleaned_text  rating
0  i think this locket is really pretty the insid...     5.0
1                                              great     5.0
2  one of the stones fell out within the first 2 ...     2.0
3  crappy socks money wasted bought to wear with ...     1.0
4  i love these glasses  they fit perfectly over ...     5.0


In [10]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.0-cp310-cp310-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.13.1-cp310-cp310-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.6 kB ? eta -:--:--
     -------------------- ------------------- 30.7/60.6 kB ? eta -:--:--
     ---------------------------------------- 60.6/60.6 kB 1.1 MB/s eta 0:00:00
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.0-cp310-cp310-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/11.0 MB 2.0 MB/s eta 0:00:06
   - -------------------------------------- 0.3/11.0 MB 3.7 MB/s eta 0:00:03
   - --------------------------------------

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectoriser with the vocabulary capped at 5000 features.
vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary and IDF weights from the cleaned reviews and encode
# them as the feature matrix.
# NOTE(review): the vectoriser is fitted on every row before the train/test
# split performed in the next cell, so held-out reviews also shape the
# vocabulary and IDF weights - consider fitting on the training rows only.
cleaned_reviews = df['cleaned_text']
X = vectorizer.fit_transform(cleaned_reviews)

# Target: the rating column as-is. No conversion happens here; the float
# ratings are used directly as class labels.
y = df['rating']

# Sanity check: both shapes must report the same number of rows.
print(X.shape, y.shape, sep='\n')

(2467824, 5000)
(2467824,)


In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the resulting matrix sizes.
print(X_train.shape, X_test.shape)


(1974259, 5000) (493565, 5000)


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Baseline classifier: logistic regression allowed up to 1000 solver
# iterations, fitted on the training split (fit() returns the estimator).
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict ratings for the held-out reviews.
y_pred = model.predict(X_test)

# Report overall accuracy followed by per-class precision/recall/F1.
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {test_accuracy}")
print(classification_report(y_test, y_pred))


Accuracy: 0.702752423692928
              precision    recall  f1-score   support

         1.0       0.60      0.77      0.67     59926
         2.0       0.34      0.10      0.15     34291
         3.0       0.38      0.34      0.36     48692
         4.0       0.47      0.24      0.32     69719
         5.0       0.80      0.94      0.87    280937

    accuracy                           0.70    493565
   macro avg       0.52      0.48      0.48    493565
weighted avg       0.66      0.70      0.67    493565



In [14]:
from sklearn.model_selection import GridSearchCV

# Candidate values for LogisticRegression's C parameter.
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}

# 5-fold cross-validated search over the grid, scored by accuracy.
grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)

# Run the search on the training split only.
grid_search.fit(X_train, y_train)

# Winning setting and its mean cross-validated score.
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

# Score the selected estimator on the held-out test split.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
best_accuracy = accuracy_score(y_test, y_pred_best)
print(f"Accuracy: {best_accuracy}")
print(classification_report(y_test, y_pred_best))


Best parameters: {'C': 1}
Best score: 0.7037080743597055
Accuracy: 0.702752423692928
              precision    recall  f1-score   support

         1.0       0.60      0.77      0.67     59926
         2.0       0.34      0.10      0.15     34291
         3.0       0.38      0.34      0.36     48692
         4.0       0.47      0.24      0.32     69719
         5.0       0.80      0.94      0.87    280937

    accuracy                           0.70    493565
   macro avg       0.52      0.48      0.48    493565
weighted avg       0.66      0.70      0.67    493565



In [15]:
import joblib

# Persist the tuned classifier together with the fitted TF-IDF vectoriser;
# both files are needed to score new text later.
joblib.dump(value=best_model, filename='best_model.pkl')
joblib.dump(value=vectorizer, filename='tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [16]:
# Restore the persisted classifier and TF-IDF vectoriser from disk.
# NOTE(review): joblib.load unpickles its input - only load files you trust.
model = joblib.load('best_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Score a couple of unseen reviews, applying the same text cleaning that was
# used on the training data before vectorising.
new_reviews = ["This product is amazing!", "Not worth the price."]
new_reviews_cleaned = list(map(preprocess_text, new_reviews))
new_reviews_vectorized = vectorizer.transform(new_reviews_cleaned)
predictions = model.predict(new_reviews_vectorized)

print(predictions)


[5. 1.]


In [25]:
# Probe two harder inputs: a short negated phrase and an empty review.
# NOTE(review): an empty string contains no tokens for the vectoriser, so its
# prediction cannot depend on any review content - treat that result with care.
new_reviews = ["not good", ""]
new_reviews_cleaned = list(map(preprocess_text, new_reviews))
new_reviews_vectorized = vectorizer.transform(new_reviews_cleaned)
predictions = model.predict(new_reviews_vectorized)
print(predictions)


[3. 5.]
