In [None]:
# Import necessary libraries
# Pandas- Dataset manipulation
import pandas as pd
import numpy as np

# Regex and string manipulation
import re
import string

# NLTK
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
# Uncomment if not downloaded
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Scikit-Learn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Models
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


# Metrics
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Import data set
# reviews = pd.read_csv("/content/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv", on_bad_lines='skip')
reviews = pd.read_csv("./dataset/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv", encoding='utf-8')

In [4]:
reviews_df = pd.DataFrame(reviews)

In [7]:
reviews_df.head()

Unnamed: 0,id,dateAdded,dateUpdated,name,asins,brand,categories,primaryCategories,imageURLs,keys,...,reviews.didPurchase,reviews.doRecommend,reviews.id,reviews.numHelpful,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.username,sourceURLs
0,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,,3,https://www.amazon.com/product-reviews/B00QWO9...,I order 3 of them and one of the item is bad q...,... 3 of them and one of the item is bad quali...,Byger yang,"https://www.barcodable.com/upc/841710106442,ht..."
1,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,,4,https://www.amazon.com/product-reviews/B00QWO9...,Bulk is always the less expensive way to go fo...,... always the less expensive way to go for pr...,ByMG,"https://www.barcodable.com/upc/841710106442,ht..."
2,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,Well they are not Duracell but for the price i...,... are not Duracell but for the price i am ha...,BySharon Lambert,"https://www.barcodable.com/upc/841710106442,ht..."
3,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,Seem to work as well as name brand batteries a...,... as well as name brand batteries at a much ...,Bymark sexson,"https://www.barcodable.com/upc/841710106442,ht..."
4,AVpgNzjwLJeJML43Kpxn,2015-10-30T08:59:32Z,2019-04-25T09:08:16Z,AmazonBasics AAA Performance Alkaline Batterie...,"B00QWO9P0O,B00LH3DMUO",Amazonbasics,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,https://images-na.ssl-images-amazon.com/images...,"amazonbasics/hl002619,amazonbasicsaaaperforman...",...,,,,,5,https://www.amazon.com/product-reviews/B00QWO9...,These batteries are very long lasting the pric...,... batteries are very long lasting the price ...,Bylinda,"https://www.barcodable.com/upc/841710106442,ht..."


# Exploratory Data Analysis

In [9]:
# Verify size of the dataset
print(f"Number of rows in the dataset: {reviews_df.shape[0]}")
print(f"Number of columns in the dataset: {reviews_df.shape[1]}")

Number of rows in the dataset: 28332
Number of columns in the dataset: 24


In [314]:
# Check dataset types
reviews.dtypes

id                      object
dateAdded               object
dateUpdated             object
name                    object
asins                   object
brand                   object
categories              object
primaryCategories       object
imageURLs               object
keys                    object
manufacturer            object
manufacturerNumber      object
reviews.date            object
reviews.dateSeen        object
reviews.didPurchase     object
reviews.doRecommend     object
reviews.id             float64
reviews.numHelpful     float64
reviews.rating           int64
reviews.sourceURLs      object
reviews.text            object
reviews.title           object
reviews.username        object
sourceURLs              object
dtype: object

In [316]:
# Check for NaN values
reviews_df.isnull().sum()

id                         0
dateAdded                  0
dateUpdated                0
name                       0
asins                      0
brand                      0
categories                 0
primaryCategories          0
imageURLs                  0
keys                       0
manufacturer               0
manufacturerNumber         0
reviews.date               0
reviews.dateSeen           0
reviews.didPurchase    28323
reviews.doRecommend    12246
reviews.id             28291
reviews.numHelpful     12217
reviews.rating             0
reviews.sourceURLs         0
reviews.text               0
reviews.title              0
reviews.username           5
sourceURLs                 0
dtype: int64

### Keep the following columns for model use
- name
- categories
- primary category
- reviews.doRecommend
- reviews.numHelpful
- reviews.rating
- reviews.text

In [7]:
# Keep necessary columns for analysis
reduced_review_df = reviews_df.iloc[:, [6, 7, 18, 20]]
reduced_review_df.head()

Unnamed: 0,categories,primaryCategories,reviews.rating,reviews.text
0,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,3,I order 3 of them and one of the item is bad q...
1,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,4,Bulk is always the less expensive way to go fo...
2,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,5,Well they are not Duracell but for the price i...
3,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,5,Seem to work as well as name brand batteries a...
4,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,5,These batteries are very long lasting the pric...


In [13]:
rows = reduced_review_df.shape[0]
cols = reduced_review_df.shape[1]

print(f"Num of rows for the reduced df: {rows}")
print(f"Num of columns for the reduced df: {cols}")

Num of rows for the reduced df: 28332
Num of columns for the reduced df: 4


In [9]:
# Measure the length of each review in the reviews.text column
each_review_length = reduced_review_df['reviews.text'].apply(lambda rev: len(rev.split()))

In [324]:
reduced_review_df.head()

Unnamed: 0,categories,primaryCategories,reviews.rating,reviews.text
0,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,3,I order 3 of them and one of the item is bad q...
1,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,4,Bulk is always the less expensive way to go fo...
2,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,5,Well they are not Duracell but for the price i...
3,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,5,Seem to work as well as name brand batteries a...
4,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,5,These batteries are very long lasting the pric...


# Traditional ML Model Approach

In [11]:
# Data cleaning and preprocessing
# Remove punctuation
pattern = f"[{re.escape(string.punctuation)}]"

chosen_cols = reduced_review_df['reviews.text']
no_punkt_reviews = chosen_cols.apply(lambda rev: re.sub(pattern, "", rev))

display(no_punkt_reviews)

0        I order 3 of them and one of the item is bad q...
1        Bulk is always the less expensive way to go fo...
2        Well they are not Duracell but for the price i...
3        Seem to work as well as name brand batteries a...
4        These batteries are very long lasting the pric...
                               ...                        
28327    I got 2 of these for my 8 yr old twins My 11 y...
28328    I bought this for my niece for a Christmas gif...
28329    Very nice for light internet browsing keeping ...
28330    This Tablet does absolutely everything I want ...
28331    At ninety dollars the expectionations are low ...
Name: reviews.text, Length: 28332, dtype: object

In [13]:
import re

def clean_special_syllables(text):
    # Eliminar palabras que contienen caracteres especiales no alfanuméricos como ä, ú, etc.
    # cleaned_text = re.sub(r'\b\w*[^a-zA-Z0-9\s]+\w*\b', '', text)
    cleaned_text = re.sub(r'\b\w*[^a-zA-Z\s]+\w*\b', '', text)
    # Remover espacios adicionales
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

# Aplicar la función al dataset
clean_special_chars = no_punkt_reviews.apply(lambda rev: clean_special_syllables(rev))
# clean_special_chars = no_punkt_reviews['reviews.text'].apply(clean_special_syllables)

clean_special_chars

0        I order of them and one of the item is bad qua...
1        Bulk is always the less expensive way to go fo...
2        Well they are not Duracell but for the price i...
3        Seem to work as well as name brand batteries a...
4        These batteries are very long lasting the pric...
                               ...                        
28327    I got of these for my yr old twins My yr old h...
28328    I bought this for my niece for a Christmas gif...
28329    Very nice for light internet browsing keeping ...
28330    This Tablet does absolutely everything I want ...
28331    At ninety dollars the expectionations are low ...
Name: reviews.text, Length: 28332, dtype: object

In [15]:
# Convert entire text into lowercase for consistency
lower_case = clean_special_chars.apply(lambda rev: rev.lower())
lower_case

0        i order of them and one of the item is bad qua...
1        bulk is always the less expensive way to go fo...
2        well they are not duracell but for the price i...
3        seem to work as well as name brand batteries a...
4        these batteries are very long lasting the pric...
                               ...                        
28327    i got of these for my yr old twins my yr old h...
28328    i bought this for my niece for a christmas gif...
28329    very nice for light internet browsing keeping ...
28330    this tablet does absolutely everything i want ...
28331    at ninety dollars the expectionations are low ...
Name: reviews.text, Length: 28332, dtype: object

In [278]:
# Convert entire text into lowercase for consistency
# lower_case = no_punkt_reviews.apply(lambda rev: rev.lower())
# lower_case

In [17]:
# Tokenize words
tokenized_reviews = lower_case.apply(lambda rev: word_tokenize(rev))

# Check output
tokenized_reviews

0        [i, order, of, them, and, one, of, the, item, ...
1        [bulk, is, always, the, less, expensive, way, ...
2        [well, they, are, not, duracell, but, for, the...
3        [seem, to, work, as, well, as, name, brand, ba...
4        [these, batteries, are, very, long, lasting, t...
                               ...                        
28327    [i, got, of, these, for, my, yr, old, twins, m...
28328    [i, bought, this, for, my, niece, for, a, chri...
28329    [very, nice, for, light, internet, browsing, k...
28330    [this, tablet, does, absolutely, everything, i...
28331    [at, ninety, dollars, the, expectionations, ar...
Name: reviews.text, Length: 28332, dtype: object

In [19]:
# Check for stop words
stop_words = set(stopwords.words('english'))

# Remove stops words
no_stopwords_reviews = tokenized_reviews.apply(lambda rev: [word for word in rev if word.lower() not in stop_words])

no_stopwords_reviews

0        [order, one, item, bad, quality, missing, back...
1        [bulk, always, less, expensive, way, go, produ...
2                           [well, duracell, price, happy]
3        [seem, work, well, name, brand, batteries, muc...
4                 [batteries, long, lasting, price, great]
                               ...                        
28327    [got, yr, old, twins, yr, old, one, one, bette...
28328    [bought, niece, christmas, giftshe, years, old...
28329    [nice, light, internet, browsing, keeping, top...
28330    [tablet, absolutely, everything, want, watch, ...
28331    [ninety, dollars, expectionations, low, still,...
Name: reviews.text, Length: 28332, dtype: object

In [336]:
# Lemmatize words
# wordnet_lemma = WordNetLemmatizer()

# lemmatized_reviews = tokenized_reviews.apply(lambda rev: [wordnet_lemma.lemmatize(word, pos='v') for word in rev])

# lemmatized_reviews

0        [i, order, of, them, and, one, of, the, item, ...
1        [bulk, be, always, the, less, expensive, way, ...
2        [well, they, be, not, duracell, but, for, the,...
3        [seem, to, work, as, well, as, name, brand, ba...
4        [these, batteries, be, very, long, last, the, ...
                               ...                        
28327    [i, get, of, these, for, my, yr, old, twin, my...
28328    [i, buy, this, for, my, niece, for, a, christm...
28329    [very, nice, for, light, internet, browse, kee...
28330    [this, tablet, do, absolutely, everything, i, ...
28331    [at, ninety, dollars, the, expectionations, be...
Name: reviews.text, Length: 28332, dtype: object

In [21]:
# Lemmatize words
wordnet_lemma = WordNetLemmatizer()

lemmatized_reviews = no_stopwords_reviews.apply(lambda rev: [wordnet_lemma.lemmatize(word, pos='v') for word in rev])

lemmatized_reviews

0        [order, one, item, bad, quality, miss, backup,...
1        [bulk, always, less, expensive, way, go, produ...
2                           [well, duracell, price, happy]
3        [seem, work, well, name, brand, batteries, muc...
4                    [batteries, long, last, price, great]
                               ...                        
28327    [get, yr, old, twin, yr, old, one, one, better...
28328    [buy, niece, christmas, giftshe, years, old, l...
28329    [nice, light, internet, browse, keep, top, ema...
28330    [tablet, absolutely, everything, want, watch, ...
28331    [ninety, dollars, expectionations, low, still,...
Name: reviews.text, Length: 28332, dtype: object

In [23]:
# Join processed reviews
processed_reviews = lemmatized_reviews.apply(lambda tokens:" ".join(tokens))
processed_reviews

0        order one item bad quality miss backup spring ...
1          bulk always less expensive way go products like
2                                well duracell price happy
3        seem work well name brand batteries much bette...
4                          batteries long last price great
                               ...                        
28327    get yr old twin yr old one one better perfect ...
28328           buy niece christmas giftshe years old love
28329    nice light internet browse keep top email view...
28330    tablet absolutely everything want watch tv sho...
28331    ninety dollars expectionations low still good ...
Name: reviews.text, Length: 28332, dtype: object

In [25]:
# Vectorizartion to convert textual data into numerical vectors
# tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2),  max_features=5000)
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

tfidf_matrix = tfidf_vectorizer.fit_transform(processed_reviews)

vocab = tfidf_vectorizer.get_feature_names_out()

print(vocab)

['aa' 'aaa' 'aaaaa' ... 'zippy' 'zone' 'zoom']


In [27]:
#Create document term matrix representing the frequency of the words
doc_term_matrix_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=vocab)

In [29]:
doc_term_matrix_tfidf

Unnamed: 0,aa,aaa,aaaaa,aaas,aas,abc,abcs,abilities,ability,abke,...,youtube,youve,yr,yrs,yup,zero,zipper,zippy,zone,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28327,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.651199,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28330,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [116]:
# # Drop the original 'review.text' column from the DataFrame
# reduced_review_df.drop('reviews.text', axis=1, inplace=True)

# # Concatenate the TF-IDF DataFrame with the original DataFrame (which no longer contains 'review.text')
# reduced_review_df = pd.concat([reduced_review_df, doc_term_matrix_tfidf], axis=1)

In [None]:
# # Get final data going to be used for models
# # Initialize the encoder
# encoder = OneHotEncoder(sparse_output=False)

# # Select the categorical columns
# cat_cols = reduced_review_df[['name', 'categories', 'primaryCategories']]

# # Fit and transform the categorical columns
# encoded_cols = encoder.fit_transform(cat_cols)

# # Convert the encoded columns into a DataFrame
# encoded_cols_df = pd.DataFrame(encoded_cols, columns=encoder.get_feature_names_out(['name', 'categories', 'primaryCategories']))

# # Concatenate the encoded columns back to the original dataframe
# reduced_review_df = pd.concat([reduced_review_df, encoded_cols_df], axis=1)

# # Drop the original categorical columns if no longer needed
# reduced_review_df.drop(['name', 'categories', 'primaryCategories'], axis=1, inplace=True)

In [118]:
# # Check the result
# reduced_review_df.head()

## Model Selection

In [41]:
# Test the following models to see which is best by using an ML pipeline with GridSearchCV:
models = {
    # Naive-Bayes
    'Naive-Bayes' : MultinomialNB(),

    # # Logistic Regression
    'Logistic Regression' : LogisticRegression(),

    # # Random Forest
    'Random Forest' : RandomForestClassifier(),

    # Support Vector Machine
    # 'Support Vector Machine' : SVC(),
}

In [43]:
# Set up GridSearchCV
param_grid = {
    'Naive-Bayes': {
        'classifier__alpha': [0.5, 1.0, 2.0],
    },
    'Logistic Regression': {
        'classifier__C': [0.1, 1, 10],
        'classifier__solver': ['liblinear', 'saga'],
    },
    'Random Forest': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_split': [2, 5],
    },
    # 'Support Vector Machine': {
    #     'classifier__C': [0.1, 1, 10],
    #     'classifier__kernel': ['linear', 'rbf'],
    # },
}

## Model Training

In [44]:
display(reduced_review_df.head())

Unnamed: 0,categories,primaryCategories,reviews.rating,reviews.text
0,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,3,I order 3 of them and one of the item is bad q...
1,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,4,Bulk is always the less expensive way to go fo...
2,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,5,Well they are not Duracell but for the price i...
3,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,5,Seem to work as well as name brand batteries a...
4,"AA,AAA,Health,Electronics,Health & Household,C...",Health & Beauty,5,These batteries are very long lasting the pric...


In [33]:
def map_sentiment(rating):
    if rating in [1, 2, 3]:
        return 'Negativo'
    elif rating == 4:
        return 'Neutral'
    elif rating == 5:
        return 'Positivo'

In [45]:
# Partition data into train and test

# X = reduced_review_df.drop('reviews.rating', axis=1)
# y = reduced_review_df['reviews.rating']

X = tfidf_matrix
y = reduced_review_df['reviews.rating'].apply(map_sentiment)

# Split data into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(22665, 5000)
(5667, 5000)
(22665,)
(5667,)


In [47]:
# Loop through the models to set up the pipeline and perform GridSearchCV
for model_name, model in models.items():
    # Create a pipeline for each model
    if model_name == 'Naive-Bayes':  # Check if the current model is Naive-Bayes
        pipeline = Pipeline([
            # Remove StandardScaler for Naive-Bayes
            ('classifier', model)
        ])
    else:
        pipeline = Pipeline([
            ('scaler', StandardScaler(with_mean=False)),  # Disable centering for sparse data
            ('classifier', model)
        ])
    # Set up GridSearchCV for the current model
    grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=5, scoring='accuracy')

    # Fit the grid search
    grid_search.fit(X_train, y_train)

    # Best parameters and model
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best score for {model_name}: {grid_search.best_score_}")

    # Predict and evaluate the model
    y_pred = grid_search.predict(X_test)
    print(f"Accuracy on test set for {model_name}: {accuracy_score(y_test, y_pred)}\n")

Best parameters for Naive-Bayes: {'classifier__alpha': 0.5}
Best score for Naive-Bayes: 0.7511140525038607
Accuracy on test set for Naive-Bayes: 0.7586024351508734





Best parameters for Logistic Regression: {'classifier__C': 1, 'classifier__solver': 'saga'}
Best score for Logistic Regression: 0.7673064195896757
Accuracy on test set for Logistic Regression: 0.7840127051349921

Best parameters for Random Forest: {'classifier__max_depth': None, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
Best score for Random Forest: 0.8502095742333996
Accuracy on test set for Random Forest: 0.8731251102876302



In [37]:
# Train and Test Neural Networks
from sklearn.neural_network import MLPClassifier

# Define and train the neural network
nn_model = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=500)
nn_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = nn_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    Negativo       0.72      0.74      0.73       579
     Neutral       0.78      0.62      0.69      1093
    Positivo       0.89      0.94      0.91      3995

    accuracy                           0.86      5667
   macro avg       0.80      0.77      0.78      5667
weighted avg       0.85      0.86      0.85      5667



In [None]:
# Generate classification report and confusion matrix
print("\nClassification Report:")
print(classification_report(all_true_labels, all_predictions, target_names=label_encoder.classes_))

# Confusion matrix
conf_matrix = confusion_matrix(all_true_labels, all_predictions)

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

## Select best model based on accuracy, precision, recall, and F1-score.
The best model for this sentiment analysis task is:

1. Best parameters for Naive-Bayes: {'classifier__alpha': 0.5}
- Best score for Naive-Bayes: 0.7074285714285715
- Accuracy on test set for Naive-Bayes: 0.7046666666666667

2. Best parameters for Logistic Regression: {'classifier__C': 1, 'classifier__solver': 'saga'}
- Best score for Logistic Regression: 0.64
- Accuracy on test set for Logistic Regression: 0.6706666666666666

3. Best parameters for Random Forest: {'classifier__max_depth': None, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}
- Best score for Random Forest: 0.7237142857142856
- Accuracy on test set for Random Forest: 0.7406666666666667

4. Best parameters for Support Vector Machine: {'classifier__C': 1, 'classifier__kernel': 'rbf'}
- Best score for Support Vector Machine: 0.7337142857142858
- Accuracy on test set for Support Vector Machine: 0.7366666666666667


## Model Metrics

In [None]:
# Evaluate model performance on seperate test dataset based on various evaluation metrics
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           1       1.00      0.19      0.32        16
           2       1.00      0.05      0.09        22
           3       1.00      0.09      0.17        65
           4       0.95      0.15      0.26       353
           5       0.73      1.00      0.84      1044

    accuracy                           0.74      1500
   macro avg       0.93      0.30      0.34      1500
weighted avg       0.80      0.74      0.66      1500



In [None]:
#Output preview
"""
Model achieve an accuracy of X% on the test dataset.
Precision, recall, and F1-score for each class are as follows:
Class Positive: Precision=X%, Recall=X%, F1-score=X%
Class Negative: Precision=X%, Recall=X%, F1-score=X%
Class Neutral: Precision=X%, Recall=X%, F1-score=X%
Confusion matrix showing table and graphical representations
"""

# Deep Learning Model- Transformers Approach using Hugging Face

In [None]:
#! pip install transformers

In [142]:
# Hugging Face Transformer Models
#from transformers import BertModel
from transformers import RobertaModel
from transformers import DistilBertModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Torch
import torch

# Pipeline
from transformers import pipeline

# Data Cleaning and Preprocessing

In [213]:
reviews_df = pd.read_csv("./dataset/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv")

In [215]:
reviews_df.head(3)

Unnamed: 0,id,dateAdded,dateUpdated,name,asins,brand,categories,primaryCategories,imageURLs,keys,...,reviews.dateSeen,reviews.doRecommend,reviews.id,reviews.numHelpful,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.username,sourceURLs
0,AVqVGZNvQMlgsOJE6eUY,2017-03-03T16:56:05Z,2018-10-25T16:36:31Z,"Amazon Kindle E-Reader 6"" Wifi (8th Generation...",B00ZV9PXP2,Amazon,"Computers,Electronics Features,Tablets,Electro...",Electronics,https://pisces.bbystatic.com/image2/BestBuy_US...,allnewkindleereaderblack6glarefreetouchscreend...,...,"2018-05-27T00:00:00Z,2017-09-18T00:00:00Z,2017...",False,,0,3,http://reviews.bestbuy.com/3545/5442403/review...,I thought it would be as big as small paper bu...,Too small,llyyue,https://www.newegg.com/Product/Product.aspx%25...
1,AVqVGZNvQMlgsOJE6eUY,2017-03-03T16:56:05Z,2018-10-25T16:36:31Z,"Amazon Kindle E-Reader 6"" Wifi (8th Generation...",B00ZV9PXP2,Amazon,"Computers,Electronics Features,Tablets,Electro...",Electronics,https://pisces.bbystatic.com/image2/BestBuy_US...,allnewkindleereaderblack6glarefreetouchscreend...,...,"2018-05-27T00:00:00Z,2017-07-07T00:00:00Z,2017...",True,,0,5,http://reviews.bestbuy.com/3545/5442403/review...,This kindle is light and easy to use especiall...,Great light reader. Easy to use at the beach,Charmi,https://www.newegg.com/Product/Product.aspx%25...
2,AVqVGZNvQMlgsOJE6eUY,2017-03-03T16:56:05Z,2018-10-25T16:36:31Z,"Amazon Kindle E-Reader 6"" Wifi (8th Generation...",B00ZV9PXP2,Amazon,"Computers,Electronics Features,Tablets,Electro...",Electronics,https://pisces.bbystatic.com/image2/BestBuy_US...,allnewkindleereaderblack6glarefreetouchscreend...,...,2018-05-27T00:00:00Z,True,,0,4,https://reviews.bestbuy.com/3545/5442403/revie...,Didnt know how much i'd use a kindle so went f...,Great for the price,johnnyjojojo,https://www.newegg.com/Product/Product.aspx%25...


In [217]:
# Reduce DF to relevant columns for model training
reduced_review = reviews_df.iloc[:, [ 6, 7,  18, 20]]

In [219]:
reduced_review.head()

Unnamed: 0,categories,primaryCategories,reviews.rating,reviews.text
0,"Computers,Electronics Features,Tablets,Electro...",Electronics,3,I thought it would be as big as small paper bu...
1,"Computers,Electronics Features,Tablets,Electro...",Electronics,5,This kindle is light and easy to use especiall...
2,"Computers,Electronics Features,Tablets,Electro...",Electronics,4,Didnt know how much i'd use a kindle so went f...
3,"Computers,Electronics Features,Tablets,Electro...",Electronics,5,I am 100 happy with my purchase. I caught it o...
4,"Computers,Electronics Features,Tablets,Electro...",Electronics,5,Solid entry level Kindle. Great for kids. Gift...


In [221]:
# Map sentiment to rating into category for model use
def map_sentiment(rating):
    if rating <= 3:
        return 0  # NEGATIVE
    elif rating == 4:
        return 1  # NEUTRAL
    else:
        return 2  # POSITIVE

In [223]:
# Split data for training and testing
X = reduced_review['reviews.text']
y = reduced_review['reviews.rating'].apply(map_sentiment)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Intialize model

In [225]:
import pandas as pd
import re
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Step 1: Load dataset
# df = final_dataset.copy()

df = reduced_review.copy()

# Step 2: Preprocessing
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower().strip()
    return text

df['cleaned_text'] = df['reviews.text'].apply(preprocess_text)

# Map ratings to sentiment classes
rating_to_sentiment = {1: 0, 2: 0, 3: 0, 4: 1, 5: 2}
df['sentiment'] = df['reviews.rating'].map(rating_to_sentiment)

# Step 3: Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['cleaned_text'].tolist(), df['sentiment'].tolist(), test_size=0.2, random_state=42
)

# Step 4: Tokenization using a pre-trained model
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=256, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=256, return_tensors="pt")

# Step 5: Dataset class for PyTorch
class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)

# Step 6: Load pre-trained transformer model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

# Use GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Step 7: Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)

# Step 8: Training loop
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Training loop
epochs = 5
for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch + 1}/{epochs} completed.')

# Step 9: Evaluation
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in val_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, dim=-1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch['labels'].cpu().numpy())

# Classification report
print("Classification Report:")
print(classification_report(all_labels, all_preds, target_names=['Negative', 'Neutral', 'Positive']))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5 completed.
Epoch 2/5 completed.
Epoch 3/5 completed.
Epoch 4/5 completed.
Epoch 5/5 completed.
Classification Report:
              precision    recall  f1-score   support

    Negative       0.90      0.43      0.58        63
     Neutral       0.52      0.53      0.52       234
    Positive       0.84      0.88      0.86       703

    accuracy                           0.77      1000
   macro avg       0.75      0.61      0.65      1000
weighted avg       0.77      0.77      0.76      1000



In [194]:
# Initialize tokenizer
distilbert_tokernizer = AutoTokenizer.from_pretrained('distilbert-base-uncased', torch_dtype=torch.float16, attn_implementation="sdpa")

# Initialize model
distilbert_model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [196]:
# Tokenize review text
train_tokens = distilbert_tokernizer(list(X_train), truncation=True, padding=True, return_tensors='pt') # Truncates long sentences, Pads sentences to make them same length
test_tokens = distilbert_tokernizer(list(X_train), truncation=True, padding=True, return_tensors='pt')

In [198]:
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels, dtype=torch.long)  # Convert labels to tensor here

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]  # Labels are already tensors
        return item

train_dataset = SentimentDataset(train_tokens, y_train.tolist())
test_dataset = SentimentDataset(test_tokens, y_test.tolist())

In [200]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [202]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./distilbert-sentiment",  # Where to save the model
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",  # Save checkpoint every epoch
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,  # Adjust as needed
    learning_rate=2e-5,  # Standard for fine-tuning Transformers
    weight_decay=0.01,  # Regularization
    logging_dir="./logs",  # Log directory
    logging_steps=50,
    load_best_model_at_end=True,  # Save the best model based on eval loss
    metric_for_best_model="accuracy",
    report_to="none"  # Disable WandB logging
)

In [204]:
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.3


In [206]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)  # Convert logits to class labels
    return metric.compute(predictions=predictions, references=labels)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [208]:
from transformers import Trainer

trainer = Trainer(
    model=distilbert_model,
    args=training_args,
    train_dataset=train_loader,
    eval_dataset=test_loader,
    compute_metrics=compute_metrics,
)

In [None]:
# Train model
trainer.train()

In [None]:
predictions_output = trainer.predict(test_dataset)

In [None]:
print(classification_report(train_dataset, p, target_names=['Negative', 'Neutral', 'Positive']))

In [None]:
# Bonus: use GenAI to summarize reviews broken down into review scores (0-5)
# Aditionally, break down by product category
    # If too many categories, only select top K categories

In [None]:
# Visualize with Tableau or Plotly