## Load Required Libraries

In [68]:
#  Data manipulation & visualization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

#  NLP libraries: spaCy
import spacy
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

sns.set()

#### Downloading the spacy model for text preprocessing

In [69]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m81.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


#### Import Google Drive

In [70]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Load the dataset

In [71]:
df =pd.read_csv('/content/drive/My Drive/fullstack-imbalanced-dataset.csv')
df

Unnamed: 0.1,Unnamed: 0,review_text,Rating,Summary,review_length
0,0,this is a very nice smooth mellow blend the va...,4,Smooth creamy taste,16
1,1,there is a reason that they call this talking ...,4,Super sparkly sparkling water.,88
2,2,dont know how they taste as they were a gift f...,4,Yummy chocolates delivered on time,43
3,3,good lemon ginger tea i like mine slightly swe...,4,lemon and ginger,48
4,4,i have been using international delight french...,4,Delightful!,72
...,...,...,...,...,...
125913,125913,i love getting this kind of stuff delivered so...,5,Nice to have delivered!,67
125914,125914,for those that dont know what this stuff is it...,5,Sprinkle Sandwiches a Dutch Classic- Delicious!,58
125915,125915,the title and description are not the same if ...,1,WRONG PRODUCT,24
125916,125916,pros slightly better nutrient content than oth...,2,"Very salty, bad flavor balance",52


#### Exploratory Data Analysis

In [72]:
#shape of the dataset
df.shape

(125918, 5)

In [73]:
#display 3 full sample reviews per rating
for rating in sorted(df['Rating'].unique()):
    print(f"\n⭐ Rating {rating}")
    reviews = df[df['Rating'] == rating].tail(3)
    for _, row in reviews.iterrows():
        print(f"\n- {row['review_text']}\n")


⭐ Rating 1

- i ended up throwing this stuff out it tasted that bad imagine blending grass in a blender until it is liquid and drinking it that is what this stuff tastes like


- with the increased popularity of argon and coconut oils being used by women of all ethnicities i mistakenly thought this product would be appropriate for my hair i use argon oil now and it really helps to calm frizz and flyaways for my colored and overly dry hairi have thick coarse wavy hair that is color treated i have used hot oils and leave in oils for years without having any problems but this product seems to be intended mostly for ethnic hair no where on the product does it say this but the advertisement has an ethnic model and other products in the line are clearly not intended for caucasian hairsince there are no directions on this product i would imagine i would use it the same way i use my argon oil but that results is limp greasy looking hair even when using the product sparingly and the ingredient

In [74]:
df['word_count'] = df['review_text'].apply(lambda x: len(str(x).split()))

In [75]:
#display the statistics of the word count
min_word_counts = df.groupby('Rating')['word_count'].describe()
print(min_word_counts)

          count       mean        std   min   25%   50%    75%    max
Rating                                                               
1       12592.0  78.221410  62.892217  10.0  36.0  60.0   98.0  496.0
2       20719.0  85.786283  67.254613   6.0  39.0  66.0  109.0  499.0
3       29648.0  91.219172  73.110825   5.0  40.0  69.0  118.0  500.0
4       37775.0  86.504911  71.642348   5.0  37.0  64.0  111.0  499.0
5       25184.0  70.599825  59.891781   5.0  31.0  52.0   87.0  496.0


### Train -Test -Split

In [76]:
df['combined_text'] = df['Summary'].astype(str) + " " + df['review_text'].astype(str)
X = df['combined_text']
y = df['Rating']

In [77]:
#  Split training and testing data
X_train, X_test, y_train, y_test = train_test_split(df['review_text'],df['Rating'], test_size=0.2, random_state=42,shuffle=True,stratify=df['Rating'] )

In [78]:
print("Train Rating Distribution:\n", y_train.value_counts(normalize=True))
print("\nTest Rating Distribution:\n", y_test.value_counts(normalize=True))

Train Rating Distribution:
 Rating
4    0.299998
3    0.235452
5    0.200002
2    0.164542
1    0.100006
Name: proportion, dtype: float64

Test Rating Distribution:
 Rating
4    0.299992
3    0.235467
5    0.200008
2    0.164549
1    0.099984
Name: proportion, dtype: float64


### Stopword Removal and Lemmatization

In [79]:
# Words to keep (do NOT remove)
words_to_keep = {
    'not', 'no', 'never', 'none', 'nor', 'neither', "n't", "n‘t", "n’t",
    'but', 'although', 'though', 'however', 'unless', 'whereas', 'yet', 'still',
    'very', 'too', 'quite', 'rather', 'really', 'somewhat', 'much',
    'i', 'we', 'you', 'my', 'our', 'me', 'us', 'your', 'yours',
    'always','amount','almost','anyone','can','cannot','could','did','do','enough',
    'except','should','may','might','some','most','more','mostly'
}

for w in words_to_keep:
    if w in nlp.Defaults.stop_words:
        nlp.Defaults.stop_words.remove(w)
        nlp.vocab[w].is_stop = False


def spacy_preprocess_pipe(texts):
    processed = []
    for doc in nlp.pipe(texts, batch_size=1000, disable=["ner", "parser"]):  # disable unneeded parts
        tokens = [
            token.lemma_
            for token in doc
            if not token.is_stop and not token.is_space and token.is_alpha
        ]
        processed.append(" ".join(tokens))
    return processed

In [80]:
#print the list of stopwords
print(sorted(nlp.Defaults.stop_words))
# Print total number of unique stopwords in spaCy's English model
print("Number of spaCy stopwords:", len(nlp.Defaults.stop_words))

["'d", "'ll", "'m", "'re", "'s", "'ve", 'a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'alone', 'along', 'already', 'also', 'am', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anyhow', 'anything', 'anyway', 'anywhere', 'are', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'both', 'bottom', 'by', 'ca', 'call', 'does', 'doing', 'done', 'down', 'due', 'during', 'each', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'few', 'fifteen', 'fifty', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'four', 'from', 'front', 'full', 'further', 'get', 'give', 'go', 'had', 'has', 'have', 'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'hundred', 'if', 'in', 'i

In [81]:
X_train = pd.Series(spacy_preprocess_pipe(X_train))
X_test = pd.Series(spacy_preprocess_pipe(X_test))

In [82]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Define tokenizer parameters
vocab_size = 10000
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")

# Fit tokenizer on training data
tokenizer.fit_on_texts(X_train)

# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)


In [83]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = 210

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

In [84]:
import pickle


tokenizer_path = '/content/drive/MyDrive/tokenizer.pkl'
maxlen_path = '/content/drive/MyDrive/max_len.pkl'

# Save tokenizer
with open(tokenizer_path, 'wb') as f:
    pickle.dump(tokenizer, f)

# Save max_len
with open(maxlen_path, 'wb') as f:
    pickle.dump(max_len, f)


In [85]:
#  Fix the labels (make them start from 0)
y_train = y_train - 1
y_test = y_test - 1

In [86]:
import numpy as np

embedding_index = {}
embedding_dim = 300  # FastText uses 300-dim vectors

# Change path accordingly if it's in Drive
with open("/content/drive/MyDrive/wiki.simple.vec", encoding='utf-8') as f:
    next(f)  # skip header line (e.g., "111051 300")
    for line in f:
        values = line.rstrip().split(' ')
        word = values[0]
        try:
            vector = np.asarray(values[1:], dtype='float32')
            if vector.shape[0] == embedding_dim:  # ensure it's valid
                embedding_index[word] = vector
        except ValueError:
            continue  # skip malformed lines


In [87]:
from tensorflow.keras.preprocessing.text import Tokenizer


word_index = tokenizer.word_index
vocab_size = len(word_index) + 1  # +1 for padding token

embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector




In [88]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.utils.class_weight import compute_class_weight


model = Sequential()


class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(y_train),
                                     y=y_train)

# Convert to dictionary format for Keras
class_weight_dict = dict(enumerate(class_weights))


model.add(Embedding(input_dim=vocab_size,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    input_length=max_len,  # use same padding length
                    trainable=True))
model.add(Bidirectional(LSTM(128, return_sequences=False)))
model.add(Dropout(0.5))  # optional but helps reduce overfitting
model.add(Dense(32, activation='relu'))
model.add(Dense(5, activation='softmax'))  # 5 classes for ratings

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Train
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(X_train_pad, y_train, epochs=20, batch_size=64, validation_data=(X_test_pad, y_test),class_weight=class_weight_dict,callbacks=[early_stop])




Epoch 1/20
[1m1574/1574[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 50ms/step - accuracy: 0.3563 - loss: 1.3690 - val_accuracy: 0.4832 - val_loss: 1.1541
Epoch 2/20
[1m1574/1574[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 49ms/step - accuracy: 0.5064 - loss: 1.0832 - val_accuracy: 0.4924 - val_loss: 1.1548
Epoch 3/20
[1m1574/1574[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 50ms/step - accuracy: 0.5536 - loss: 0.9829 - val_accuracy: 0.4911 - val_loss: 1.1603
Epoch 4/20
[1m1574/1574[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 50ms/step - accuracy: 0.5883 - loss: 0.8978 - val_accuracy: 0.5054 - val_loss: 1.1400
Epoch 5/20
[1m1574/1574[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 50ms/step - accuracy: 0.6308 - loss: 0.8078 - val_accuracy: 0.4968 - val_loss: 1.2329
Epoch 6/20
[1m1574/1574[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 50ms/step - accuracy: 0.6710 - loss: 0.7224 - val_accuracy: 0.4943 - val_loss: 1.3189
Epoc

In [89]:
model.save("/content/drive/MyDrive/fullstack_imbalanced_model1.h5")



In [90]:
import numpy as np
from sklearn.metrics import classification_report

# 1. Predict probabilities on test data
y_pred_probs = model.predict(X_test_pad)

# 2. Convert probabilities to predicted classes (0 to 4 for 5 classes)
y_pred = np.argmax(y_pred_probs, axis=1)

# 3. Print classification report (y_test are true labels)
print(classification_report(y_test, y_pred, digits=4))


[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step
              precision    recall  f1-score   support

           0     0.4379    0.6334    0.5179      2518
           1     0.4135    0.4122    0.4128      4144
           2     0.4621    0.4503    0.4561      5930
           3     0.5845    0.4450    0.5053      7555
           4     0.5768    0.6734    0.6214      5037

    accuracy                         0.5054     25184
   macro avg     0.4950    0.5229    0.5027     25184
weighted avg     0.5113    0.5054    0.5030     25184



In [91]:
from sklearn.metrics import classification_report
import numpy as np

# Predict on training data
y_train_pred_probs = model.predict(X_train_pad)
y_train_pred = np.argmax(y_train_pred_probs, axis=1)

# Print classification report for training data
print(classification_report(y_train, y_train_pred))


[1m3148/3148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 8ms/step
              precision    recall  f1-score   support

           0       0.61      0.86      0.71     10074
           1       0.59      0.59      0.59     16575
           2       0.61      0.58      0.59     23718
           3       0.73      0.56      0.64     30220
           4       0.68      0.82      0.74     20147

    accuracy                           0.65    100734
   macro avg       0.64      0.68      0.65    100734
weighted avg       0.66      0.65      0.65    100734



In [92]:
#  raw new text input:
new_text = ['awesome product']

# Step 1: Tokenize the new text (convert words to integer sequences)
sequences = tokenizer.texts_to_sequences(new_text)

# Step 2: Pad sequences to the same max length the model expects
X_new = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

# Step 3: Predict with the deep learning model
y_pred_probs = model.predict(X_new)

# Step 4: Convert probabilities to class labels (e.g., 0 to 4)
y_pred = np.argmax(y_pred_probs, axis=1)

print(y_pred)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[4]
