# **Project NLP | Automated Customers Reviews**

This business case outlines the development of an NLP model to automate the processing of 
customer feedback for a retail company. 

Its goal is to evaluate how traditional ML solutions (Naive Bayes, SVM, Random Forest, etc.) 
compare against a Deep Learning solution (e.g., a Transformer from HuggingFace) when 
analysing a user review in terms of its score (positive, negative or neutral).


In [4]:
# Traditional Models (Naive Bayes, Logistic Regression, Random Forest)
# Please note that the deep learning model is executed with a separate script (code_distilBERT.ipynb),
# which has also been submitted in this project's folder

# Imports the necessary Libraries
import pandas as pd
import re
import string
import nltk
import pickle
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import os

# Makes sure the required NLTK data is downloaded.
# 'punkt_tab' is the tokenizer resource that word_tokenize looks up on newer
# NLTK releases (3.8.2+); 'punkt' is still fetched for older releases.
# nltk.download returns False (without raising) for a package it cannot find,
# so requesting both is safe on either side of that change.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/sylviaperez-
[nltk_data]     montero/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/sylviaperez-
[nltk_data]     montero/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sylviaperez-
[nltk_data]     montero/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Loads the dataset (and initial save checkpoint)
df = pd.read_csv("/Users/sylviaperez-montero/Desktop/Project/Amazon Data.csv", low_memory=False)
# Checkpoints are written inside `with` blocks so that each file handle is
# flushed and closed as soon as the dump finishes, instead of being leaked.
with open("dataset.pkl", "wb") as checkpoint_file:
    pickle.dump(df, checkpoint_file)

# Drops the unnecessary columns and saves the cleaned dataset
columns_to_drop = [
    'asins', 'reviews.date', 'reviews.dateAdded', 'reviews.dateSeen',
    'reviews.id', 'reviews.didPurchase', 'name',
    'reviews.userCity', 'reviews.userProvince', 'reviews.sourceURLs'
]
# Only columns that are actually present are dropped, so a CSV that lacks
# some of them does not raise a KeyError.
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])
with open("cleaned_dataset.pkl", "wb") as checkpoint_file:
    pickle.dump(df, checkpoint_file)

print(df.head())

                     id   brand  \
0  AVqkIhwDv8e3D1O-lebb  Amazon   
1  AVqkIhwDv8e3D1O-lebb  Amazon   
2  AVqkIhwDv8e3D1O-lebb  Amazon   
3  AVqkIhwDv8e3D1O-lebb  Amazon   
4  AVqkIhwDv8e3D1O-lebb  Amazon   

                                          categories  \
0  Electronics,iPad & Tablets,All Tablets,Fire Ta...   
1  Electronics,iPad & Tablets,All Tablets,Fire Ta...   
2  Electronics,iPad & Tablets,All Tablets,Fire Ta...   
3  Electronics,iPad & Tablets,All Tablets,Fire Ta...   
4  Electronics,iPad & Tablets,All Tablets,Fire Ta...   

                                                keys manufacturer  \
0  841667104676,amazon/53004484,amazon/b01ahb9cn2...       Amazon   
1  841667104676,amazon/53004484,amazon/b01ahb9cn2...       Amazon   
2  841667104676,amazon/53004484,amazon/b01ahb9cn2...       Amazon   
3  841667104676,amazon/53004484,amazon/b01ahb9cn2...       Amazon   
4  841667104676,amazon/53004484,amazon/b01ahb9cn2...       Amazon   

  reviews.doRecommend  reviews.numHel

In [8]:
# Defines a text preprocessing function

# Translation table that deletes every ASCII punctuation character; built once
# here rather than on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

# Lazily-filled cache for the NLTK resources. It is filled on first use (not at
# definition time) so that defining the function never touches the NLTK corpora.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return the (stop-word set, lemmatizer) pair, creating it on first use."""
    if not _NLP_RESOURCES:
        _NLP_RESOURCES["stop_words"] = frozenset(stopwords.words("english"))
        _NLP_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _NLP_RESOURCES["stop_words"], _NLP_RESOURCES["lemmatizer"]


def preprocess_text(text):
    """Normalize one review string for bag-of-words vectorization.

    Steps, in order: lowercase, remove digit runs, remove ASCII punctuation,
    collapse whitespace, tokenize, drop English stop words, lemmatize.

    Args:
        text: The raw review text. Missing values (None / NaN) are accepted,
            and non-string values (e.g. a cell parsed as a number) are
            converted with ``str()`` before processing.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when ``text``
        is missing.
    """
    if pd.isnull(text):
        return ""
    if not isinstance(text, str):
        # A cell that pandas parsed as a number would otherwise fail on .lower().
        text = str(text)
    text = text.lower()
    text = re.sub(r"\d+", "", text)
    text = text.translate(_PUNCTUATION_TABLE)
    text = re.sub(r"\s+", " ", text).strip()
    stop_words, lemmatizer = _get_nlp_resources()
    # Stop words are filtered before lemmatization, so the membership test is
    # done on the surface form of each token.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    return " ".join(tokens)

# Applies text preprocessing (and save checkpoint)
# Each cleaned column is only created when its source column exists in the CSV.
if 'reviews.text' in df.columns:
    df["cleaned_reviews_text"] = df["reviews.text"].apply(preprocess_text)
if 'reviews.title' in df.columns:
    df["cleaned_reviews_title"] = df["reviews.title"].apply(preprocess_text)
# The `with` block closes the checkpoint file as soon as the dump completes.
with open("preprocessed_dataset.pkl", "wb") as checkpoint_file:
    pickle.dump(df, checkpoint_file)

print(df.head())

                     id   brand  \
0  AVqkIhwDv8e3D1O-lebb  Amazon   
1  AVqkIhwDv8e3D1O-lebb  Amazon   
2  AVqkIhwDv8e3D1O-lebb  Amazon   
3  AVqkIhwDv8e3D1O-lebb  Amazon   
4  AVqkIhwDv8e3D1O-lebb  Amazon   

                                          categories  \
0  Electronics,iPad & Tablets,All Tablets,Fire Ta...   
1  Electronics,iPad & Tablets,All Tablets,Fire Ta...   
2  Electronics,iPad & Tablets,All Tablets,Fire Ta...   
3  Electronics,iPad & Tablets,All Tablets,Fire Ta...   
4  Electronics,iPad & Tablets,All Tablets,Fire Ta...   

                                                keys manufacturer  \
0  841667104676,amazon/53004484,amazon/b01ahb9cn2...       Amazon   
1  841667104676,amazon/53004484,amazon/b01ahb9cn2...       Amazon   
2  841667104676,amazon/53004484,amazon/b01ahb9cn2...       Amazon   
3  841667104676,amazon/53004484,amazon/b01ahb9cn2...       Amazon   
4  841667104676,amazon/53004484,amazon/b01ahb9cn2...       Amazon   

  reviews.doRecommend  reviews.numHel

In [10]:
# Drops rows with missing target values (and save checkpoint)
# NOTE(review): preprocess_text returns "" (not NaN) for missing review text,
# so the 'cleaned_reviews_text' part of this dropna never removes a row; reviews
# with empty cleaned text stay in the dataset. Confirm whether that is intended.
df = df.dropna(subset=['reviews.rating', 'cleaned_reviews_text'])
with open("final_dataset.pkl", "wb") as checkpoint_file:
    pickle.dump(df, checkpoint_file)

# Splits data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df["cleaned_reviews_text"], df["reviews.rating"], test_size=0.2, random_state=42
)
with open("train_test_split.pkl", "wb") as checkpoint_file:
    pickle.dump((X_train, X_test, y_train, y_test), checkpoint_file)

print(X_train.head())
print(y_train.head())

# Vectorization using TF-IDF and CountVectorizer
# note: the parameters here were iterated several times to see if performance was optimized
# For example, smooth_idf=False and sublinear_tf=True to increase the difference between TF-IDF and Count
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words="english",
    ngram_range=(1, 3),
    sublinear_tf=True,  # Scales term frequencies logarithmically
    smooth_idf=False,  # Disables the add-one smoothing of document frequencies in the IDF term
    min_df=2  # removes terms that appear in only 1 document, thus reducing noise
)

count_vectorizer = CountVectorizer(max_features=5000, stop_words="english", ngram_range=(1, 2))

# Both vectorizers are fitted on the training split only and then applied to
# the test split, so no test vocabulary leaks into the features.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)
with open("vectorized_data.pkl", "wb") as checkpoint_file:
    pickle.dump(
        (
            tfidf_vectorizer,
            count_vectorizer,
            X_train_tfidf,
            X_test_tfidf,
            X_train_count,
            X_test_count,
        ),
        checkpoint_file,
    )



20350    kindle loved screen broke bought replacement i...
10527    fire decent tablet mostly us amazon store limi...
7132     wanted inexpensive tablet would allow surf web...
32227    great response quick navigation device make pl...
24204    got son birthday gift low tech mom im still le...
Name: cleaned_reviews_text, dtype: object
20350    5.0
10527    4.0
7132     4.0
32227    5.0
24204    4.0
Name: reviews.rating, dtype: float64


In [None]:
# Initial Model Training
# Each entry is trained on the feature set named in its key ("TF-IDF" or "Count").
models = {
    "Naive Bayes (TF-IDF)": MultinomialNB(),
    "Naive Bayes (Count)": MultinomialNB(),
    "Logistic Regression (TF-IDF)": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "Logistic Regression (Count)": LogisticRegression(max_iter=1000, class_weight="balanced")
}

results = {}

for name, model in models.items():
    # The model names spell the feature set as "TF-IDF" (with a hyphen), so the
    # check has to look for "tf-idf". Searching for "tfidf" never matches and
    # would silently train and score every model on the Count features.
    if "tf-idf" in name.lower():
        model.fit(X_train_tfidf, y_train)
        y_pred = model.predict(X_test_tfidf)
    else:
        model.fit(X_train_count, y_train)
        y_pred = model.predict(X_test_count)

    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Classification Report": classification_report(y_test, y_pred),
        "Confusion Matrix": confusion_matrix(y_test, y_pred)
    }
    # One pickle per fitted model; the `with` block closes the file right away.
    with open(f"{name.replace(' ', '_').replace('(', '').replace(')', '')}.pkl", "wb") as model_file:
        pickle.dump(model, model_file)
with open("model_results.pkl", "wb") as results_file:
    pickle.dump(results, results_file)

# Loads the optimized results only if the file exists
if os.path.exists("optimized_model_results.pkl"):
    # NOTE: pickle.load can execute arbitrary code contained in the file; only
    # load checkpoints that were written by this notebook.
    with open("optimized_model_results.pkl", "rb") as results_file:
        optimized_results = pickle.load(results_file)
else:
    optimized_results = {}

# Gets the common models for comparison (sorted by name)
common_models = sorted(set(results) & set(optimized_results))

In [None]:
# Further optimization begins here - handling imbalance
# The sampling_strategy dict gives the target number of samples for classes
# 1, 2 and 3 after over-sampling. The counts are hard-coded — presumably tuned
# to this dataset's class distribution; re-check them if the data changes.
smote = SMOTE(sampling_strategy={1: 324, 2: 324, 3: 1197}, random_state=42)
# Each feature matrix keeps its own resampled labels so that a model is always
# fitted on rows and labels produced by the same fit_resample call.
X_train_tfidf_resampled, y_train_tfidf_resampled = smote.fit_resample(X_train_tfidf, y_train)
X_train_count_resampled, y_train_count_resampled = smote.fit_resample(X_train_count, y_train)
# Kept under its original name (the labels of the last resampling call) so the
# checkpoint layout and any later use of y_train_resampled stay unchanged.
y_train_resampled = y_train_count_resampled
with open("balanced_data.pkl", "wb") as checkpoint_file:
    pickle.dump((X_train_tfidf_resampled, X_train_count_resampled, y_train_resampled), checkpoint_file)

# Optimized Model Training
# random_state is fixed on the Random Forests so that repeated runs give the
# same scores, matching the seeded split and SMOTE steps.
models_optimized = {
    "Naive Bayes (TF-IDF)": MultinomialNB(),
    "Naive Bayes (Count)": MultinomialNB(),
    "Logistic Regression (TF-IDF)": LogisticRegression(class_weight="balanced", max_iter=1000),
    "Logistic Regression (Count)": LogisticRegression(class_weight="balanced", max_iter=1000),
    "Random Forest (TF-IDF)": RandomForestClassifier(n_estimators=200, class_weight="balanced", random_state=42),
    "Random Forest (Count)": RandomForestClassifier(n_estimators=200, class_weight="balanced", random_state=42)
}

optimized_results = {}

for name, model in models_optimized.items():
    # The model names spell the feature set as "TF-IDF" (with a hyphen), so the
    # check has to look for "tf-idf". Searching for "tfidf" never matches and
    # would silently train and score every model on the Count features.
    if "tf-idf" in name.lower():
        model.fit(X_train_tfidf_resampled, y_train_tfidf_resampled)
        y_pred = model.predict(X_test_tfidf)
    else:
        model.fit(X_train_count_resampled, y_train_count_resampled)
        y_pred = model.predict(X_test_count)

    # Evaluation is done on the untouched (non-resampled) test split.
    optimized_results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Classification Report": classification_report(y_test, y_pred),
        "Confusion Matrix": confusion_matrix(y_test, y_pred)
    }
    with open(f"optimized_{name.replace(' ', '_').replace('(', '').replace(')', '')}.pkl", "wb") as model_file:
        pickle.dump(model, model_file)
with open("optimized_model_results.pkl", "wb") as results_file:
    pickle.dump(optimized_results, results_file)


In [None]:
# Generates the comparison table
# The list of shared model names is recomputed here from the live result dicts.
# The list built right after the initial training is derived from a previously
# saved results file (or is empty on a first run), so it can be stale by the
# time the optimized models have been trained.
common_models = sorted(set(results) & set(optimized_results))
comparison_df = pd.DataFrame({
    "Model": common_models,
    "Initial Accuracy": [results[m]["Accuracy"] for m in common_models],
    "Optimized Accuracy": [optimized_results[m]["Accuracy"] for m in common_models]
})
print("\nComparison of Initial and Optimized Models:")
print(comparison_df.to_string(index=False))

# Displays the Random Forest model results
# Random Forest only exists in the optimized run, so it gets its own table.
rf_models = [m for m in optimized_results if "Random Forest" in m]
rf_comparison_df = pd.DataFrame({
    "Model": rf_models,
    "Optimized Accuracy": [optimized_results[m]["Accuracy"] for m in rf_models]
})
print("\nRandom Forest Model Performance:")
print(rf_comparison_df.to_string(index=False))

In [None]:
# Checks to see how closely related are TF-IDF and count
import numpy as np

# Compare the sum of feature weights in both vectorized forms.
# The row sums are taken directly on the sparse matrices; converting them to
# dense arrays first would allocate a full (documents x features) array for
# each representation just to add up mostly-zero rows.
tfidf_sum = np.asarray(X_train_tfidf.sum(axis=1)).ravel()
count_sum = np.asarray(X_train_count.sum(axis=1)).ravel()

# Compute correlation (Pearson, between the two per-document totals)
correlation = np.corrcoef(tfidf_sum, count_sum)[0, 1]

print(f"Correlation between TF-IDF and Count features: {correlation}")
print(df.columns.tolist())  # 🔍 Debug: See available columns

In [None]:
# Checks in which ways, if any, they are similar or different
import numpy as np


def _top_terms(vectorizer, matrix, k=10):
    """Return the k vocabulary terms with the largest total weight in matrix.

    Args:
        vectorizer: A fitted vectorizer exposing get_feature_names_out().
        matrix: The document-term matrix produced by that vectorizer; its
            columns are in the same order as the feature names.
        k: Number of terms to return.

    Returns:
        An array of the k highest-weighted terms, heaviest first.
    """
    totals = np.asarray(matrix.sum(axis=0)).ravel()
    # Stable sort on the negated totals: descending weight, ties kept in
    # vocabulary order.
    top_idx = np.argsort(-totals, kind="stable")[:k]
    return vectorizer.get_feature_names_out()[top_idx]


# get_feature_names_out() is ordered alphabetically, so slicing its first ten
# entries does not give the most important terms; rank by total weight instead.
tfidf_top_words = _top_terms(tfidf_vectorizer, X_train_tfidf)
count_top_words = _top_terms(count_vectorizer, X_train_count)

print("TF-IDF Top Words:", tfidf_top_words)
print("CountVectorizer Top Words:", count_top_words)