In [2]:
import pandas as pd
import numpy as np 

import warnings
warnings.filterwarnings("ignore")


### load data

In [3]:

train_df = pd.read_csv('/kaggle/input/amazon-reviews/train.csv',header=None, names=['polarity', 'title', 'review'])
train_df.head()

Unnamed: 0,polarity,title,review
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."


In [4]:
train_df.sample(5)

Unnamed: 0,polarity,title,review
2273200,2,Factory_Reconditioned Breville XXJE95XL Two Speed,Thank you i have been using this product to be...
1041403,2,A review from Spain,Estamos ante otra maravillosa grabación del pi...
2586353,1,Possibly the worst movie I have ever seen!,All I can say about this movie is that I find ...
2710561,1,Unreliable,This must be the most frustrating device it ha...
2285668,1,Not 4 gauge! More like 8 gauge with thick insu...,The cable included in this kit appears to be 8...


### check class imbalance

In [5]:
train_df.polarity.value_counts()

polarity
2    1800000
1    1800000
Name: count, dtype: int64

In [6]:
# map polarity : positive ->0 | negative -> 1
train_df['label'] = train_df.polarity.map({2:0, 1:1})
train_df.sample(3)

Unnamed: 0,polarity,title,review,label
2980654,2,great show,I was excited that the season's were reasonabl...,0
2682254,1,"Oh, Please...","I do enjoy reading various view points, but th...",1
2114258,2,Worked like a charm!,"Before this plan, bedtime was a 2 hour process...",0


No class imbalance

In [7]:
# check missing value in review column 
len(train_df[train_df.review.isnull()])

0

In [8]:
train_df.review.isnull().any()

False

In [9]:
# check missing value in title column 
len(train_df[train_df.title.isnull()])

207

a 207 NaN value in title, this will create a problem if we planning to combine the title nad review in a single row 

In [10]:
# combine review title with review body 
# fillna : replaces any NaN values in the title  an empty string 

train_df['full_review'] = train_df.title.fillna('') + " " + train_df.review 

In [11]:
train_df.head()

Unnamed: 0,polarity,title,review,label,full_review
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...,0,Stuning even for the non-gamer This sound trac...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...,0,The best soundtrack ever to anything. I'm read...
2,2,Amazing!,This soundtrack is my favorite music of all ti...,0,Amazing! This soundtrack is my favorite music ...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...,0,Excellent Soundtrack I truly like this soundtr...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine...",0,"Remember, Pull Your Jaw Off The Floor After He..."


In [12]:
# to check that the new 'full_review' is not affected by the Nan values in title
train_df[train_df.title.isnull()]

Unnamed: 0,polarity,title,review,label,full_review
13265,1,,Couldn't get the device to work with my networ...,1,Couldn't get the device to work with my netwo...
26554,1,,What separates this band from Evanescence (bes...,1,What separates this band from Evanescence (be...
26827,2,,Falkenbach returns with more of the Viking/Fol...,0,Falkenbach returns with more of the Viking/Fo...
36598,2,,I returned this because I received the same on...,0,I returned this because I received the same o...
37347,2,,This book is a great fantasy. I love this amaz...,0,This book is a great fantasy. I love this ama...
...,...,...,...,...,...
3403351,1,,It is not a game. It is only a memory cardIt w...,1,It is not a game. It is only a memory cardIt ...
3455848,1,,"The sleeve is not bad, but the vacuum is worth...",1,"The sleeve is not bad, but the vacuum is wort..."
3493132,2,,Al Spath's diary is a must for all poker playe...,0,Al Spath's diary is a must for all poker play...
3565886,1,,IBD should sell single issues for the iPad to ...,1,IBD should sell single issues for the iPad to...


In [13]:
train_df.title[0]

'Stuning even for the non-gamer'

In [14]:
train_df.review[0]

'This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'

In [15]:
train_df.full_review[0]

'Stuning even for the non-gamer This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'

In [16]:
# check dublicate 

duplicate_rows = train_df.duplicated() 
duplicate_rows.any() 

False

No Dublicates in the data 

In [17]:
len(train_df.full_review[0])

425

In [18]:
len(train_df.full_review[100])

104

In [19]:
train_df.full_review[100]

'textbook Book shipped quickly and was in excellent condition as stated. Easy transaction would buy again'

In [20]:
len(train_df.full_review[1])

508

In [21]:
# check avergae lenght of a full review 
full_review_avg_length = np.mean([ len(train_df.full_review[0]) for i in range(len(train_df.full_review))])
print(full_review_avg_length) 

425.0


In [22]:
# check for very short reviews 

short_reviews = train_df[train_df['full_review'].apply(lambda x: len(x.split()) < 10)]

short_reviews.label.value_counts()

print(f"Number of short reviews: {len(short_reviews)}")

Number of short reviews: 25


In [23]:
short_reviews.head() 

Unnamed: 0,polarity,title,review,label,full_review
372567,2,Great Purchase,Great Purchase CKVKLJ;JNVKZ;KCVJJVK'ZJXC'VKLZX...,0,Great Purchase Great Purchase CKVKLJ;JNVKZ;KCV...
408934,2,oooooooooo....?....i think i like it,yeah...yeah...hmmmmmm. yeah.dance punk?yes...d...,0,oooooooooo....?....i think i like it yeah...ye...
567147,2,Great,TeTerriffic!Terriffic!Terriffic!Terriffic!Terr...,0,Great TeTerriffic!Terriffic!Terriffic!Terriffi...
759194,1,Disappointing,Thisis.the.very.first.book.by.her.that.I.had.a...,1,Disappointing Thisis.the.very.first.book.by.he...
817508,1,dags go bam here,gogogogagagagagagamandiffheuejkuawieuajdsifjhj...,1,dags go bam here gogogogagagagagagamandiffheue...


In [24]:
short_reviews.full_review.iloc[0]

"Great Purchase Great Purchase CKVKLJ;JNVKZ;KCVJJVK'ZJXC'VKLZXVMK'ZLXMCV'ZLKCV'LKZMX'LKVCCVCVCVCVCVCVCVCVC"

There is a lot of garbage and noise that need our attention 

### a random sample of train_df for ease of experiments

In [25]:
# a lighter df to experiment with 
df_train_sample  = train_df.sample(100000, random_state=42)

In [26]:
df_train_sample.label.value_counts() 

label
1    50013
0    49987
Name: count, dtype: int64

In [27]:
len(df_train_sample)

100000

### Remove URL and HTML tags if any

In [28]:
from bs4 import BeautifulSoup
has_html_tags = df_train_sample['full_review'].apply(lambda x: bool(BeautifulSoup(x, "html.parser").find(True)))


In [29]:
df_train_sample[has_html_tags].full_review.iloc[1]

'Machine worked great for a while... ...but a part broke and I went to order a replacement part <which you can only get from the company directly> and it has been over three months. All they say is "sorry, it will ship some day". That is a big help. Worst customer service for a company I have ever dealt with. Great machine unless you need to deal with the company.Hope this helps.'

there is indeed html tags that will require our attention 

In [30]:
# check for urls 
import re
has_url = [bool(re.findall(r'http\S+', df_train_sample.full_review.iloc[i])) for i in range(len(df_train_sample))]
df_train_sample[has_url]

Unnamed: 0,polarity,title,review,label,full_review
1346235,2,Amazing first single!!!,"The first single from this album is called ""Th...",0,Amazing first single!!! The first single from ...
1191339,2,Very helpful.,I ordered this to accompany the Pimsleur Langu...,0,Very helpful. I ordered this to accompany the ...
2934680,2,not another fad diet,"To all the critiques out there, this is not a ...",0,not another fad diet To all the critiques out ...
672714,2,A dream come true,This dictionary is everything you could ever w...,0,A dream come true This dictionary is everythin...
2358540,1,"THIS IS ""NOT"" A MINI CASSETTE RECORDER, BUYER ...",This recorder is NOT a mini recorder as seller...,1,"THIS IS ""NOT"" A MINI CASSETTE RECORDER, BUYER ..."
...,...,...,...,...,...
629718,1,If it sounds like a duck ...,"Yet another nonsensical tome by ""Dr."" Hulda R....",1,If it sounds like a duck ... Yet another nonse...
869392,2,Amazing DVD Set,"This set has fantastic video quality, menus, e...",0,Amazing DVD Set This set has fantastic video q...
3167596,1,Widescreen now available,Both Borders and Barnes and Noble now carry a ...,1,Widescreen now available Both Borders and Barn...
3541401,2,DogRead book of the month,This book was chose to be on the prestigious '...,0,DogRead book of the month This book was chose ...


both URL and HTML tags do not affect the feeling of the customer, hence they are just noise in our case.

### Preprocessing Steps:

1. **Remove URLs**: Strips any URLs from the review, as they don't contribute meaningfully to sentiment analysis.

2. **Remove Gibberish & Excessive Repeated Characters**: Identifies and removes sequences with too many repeated characters (e.g., "ooooo" becomes "o") and repeated words (e.g., "great great" becomes "great").

3. **Remove Punctuation, Numbers, and Long Words**: Filters out punctuation, numbers, and excessively long words (over 20 characters).

4. **Lowercase Text**: Converts all text to lowercase.

5. **Remove Stopwords (except "not", "no", and "nor")**: Removes common stopwords except negations to preserve their importance.

6. **Lemmatization**: Converts words to their base form (e.g., "running" to "run").

7. **Word Embeding*


In [31]:
import re
import spacy
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm import tqdm
import pandas as pd
from joblib import Parallel, delayed



nlp = spacy.load('en_core_web_lg')

# Custom stopwords list to keep "not", "no", "nor"
stopwords_to_keep = {"not", "no", "nor"}
custom_stopwords = nlp.Defaults.stop_words - stopwords_to_keep

# Pre-compile regex patterns for optimization
url_pattern = re.compile(r'http\S+|www\S+|https\S+')
repeated_char_pattern = re.compile(r'(.)\1{3,}')
repeated_word_pattern = re.compile(r'\b(\w+)( \1\b)+')

def remove_gibberish(text):
    # Remove excessive repeated characters 
    text = repeated_char_pattern.sub(r'\1', text)
    # Remove repeated words 
    text = repeated_word_pattern.sub(r'\1', text)
    return text

def preprocess_text(text):
    text = url_pattern.sub('', text)
    text = remove_gibberish(text)
    
    doc = nlp(text)
    
    tokens = [token.lemma_.lower() for token in doc if not (
        token.is_punct or token.is_digit or len(token.text) > 20 or token.text in custom_stopwords)]
    
    clean_text = " ".join(tokens)
    
    # return review vector representation  
    return nlp(clean_text).vector


def combined_preprocessing(text_series):
    # Use parallel processing with joblib to preprocess text in parallel
    results = Parallel(n_jobs=-1, backend="multiprocessing")(
        delayed(preprocess_text)(text) for text in tqdm(text_series, desc="Processing texts")
    )
    return pd.Series(results)


# Function to stack the preprocessed vectors
def stack_preprocessed(preprocessed_series):
    return np.stack(preprocessed_series)


# Custom transformer for stacking
class StackingTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return stack_preprocessed(X)


preprocess_pipe = Pipeline([
    ('preprocess', FunctionTransformer(combined_preprocessing, validate=False)),  # Preprocess the text
    ('stacking', StackingTransformer()),
    ('scaling', MinMaxScaler())
])


## Train/Test Split (Using Subset Of Data)

In [32]:
from sklearn.model_selection import train_test_split

train_df_sample  = train_df.sample(1000, random_state = 42)

X_train,X_test, y_train, y_test = train_test_split(
    train_df_sample.full_review,
    train_df_sample.label,
    random_state=42
)

X_train.shape, X_test.shape


((750,), (250,))

## Test that every thing is working

In [33]:
from sklearn.linear_model import LogisticRegression

log_reg = make_pipeline(preprocess_pipe, LogisticRegression())

log_reg.fit(X_train, y_train)

Processing texts: 100%|██████████| 750/750 [00:09<00:00, 80.15it/s]


In [34]:
from sklearn.metrics import classification_report

print(classification_report(y_test, log_reg.predict(X_test))) 

Processing texts: 100%|██████████| 250/250 [00:03<00:00, 75.47it/s]


              precision    recall  f1-score   support

           0       0.92      0.80      0.85       129
           1       0.81      0.93      0.86       121

    accuracy                           0.86       250
   macro avg       0.87      0.86      0.86       250
weighted avg       0.87      0.86      0.86       250



## Modeling

- Working with only a random subset of the data.
- The preprocessing step is not in the pipeline to save some time. 

In [35]:
from sklearn.model_selection import train_test_split

train_df_sample  = train_df.sample(200000, random_state = 42)

X_train,X_test, y_train, y_test = train_test_split(
    train_df_sample.full_review,
    train_df_sample.label,
    test_size=.2,
    random_state=42
)

X_train.shape, X_test.shape

((160000,), (40000,))

In [36]:
# To test more models, we will do the preprocessing in a seperate step to save us some time 

# Import necessary libraries
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm



X_train_pre = preprocess_pipe.fit_transform(X_train)
X_test_pre = preprocess_pipe.fit_transform(X_test)



Processing texts: 100%|██████████| 160000/160000 [30:11<00:00, 88.34it/s] 
Processing texts: 100%|██████████| 40000/40000 [07:46<00:00, 85.67it/s] 


In [37]:
###### Dictionary of models
models = {
    "Logistic Regression": LogisticRegression(solver='lbfgs', max_iter=1000),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "MLP Neural Network": MLPClassifier(max_iter=1000)
}



def evaluate_model(model_name, model, X_train_pre, y_train):
    scores = cross_val_score(model, X_train_pre, y_train, cv=5, scoring='f1_macro', n_jobs=-1) 
    return model_name, scores.mean()

results = []
for model_name, model in tq dm(models.items(), desc="Evaluating models"):
    model_name, score = evaluate_model(model_name, model, X_train_pre, y_train)
    results.append((model_name, score))


Evaluating models: 100%|██████████| 5/5 [3:04:25<00:00, 2213.13s/it]  


In [40]:
results_df = pd.DataFrame(results, columns=['Model', 'Mean Accuracy'])
results_df


Unnamed: 0,Model,Mean Accuracy
0,Logistic Regression,0.850273
1,SVM,0.869705
2,Random Forest,0.814059
3,XGBoost,0.84048
4,MLP Neural Network,0.860325


In [41]:


best_model_name = results_df.loc[results_df['Mean Accuracy'].idxmax(), 'Model']
best_score = results_df['Mean Accuracy'].max()

best_model = models[best_model_name]

print(f"Best model: {best_model_name} with Mean Accuracy: {best_score}")




Best model: SVM with Mean Accuracy: 0.869704761754836
The best model SVM has been saved as 'best_model_SVM.pkl'.


In [51]:
best_model.fit(X_train_pre, y_train) 

In [52]:
# Save the best model using joblib
joblib.dump(best_model, f'best_model_{best_model_name}.pkl')

print(f"The best model {best_model_name} has been saved as 'best_model_{best_model_name}.pkl'.")

The best model SVM has been saved as 'best_model_SVM.pkl'.


In [53]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib


y_pred = best_model.predict(X_test_pre)

# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy on test set: {accuracy}")
print("Classification Report:")
print(report)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy on test set: 0.86425
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.90      0.87     19912
           1       0.89      0.83      0.86     20088

    accuracy                           0.86     40000
   macro avg       0.87      0.86      0.86     40000
weighted avg       0.87      0.86      0.86     40000

Confusion Matrix:
[[17920  1992]
 [ 3438 16650]]
