In [2]:
import os
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

### Embedding
    -- converting each word into numbers so that the computer can understand
    -- Embedding layer takes each word and turns it into a small list of numbers (called vector) that captures the word's meaning
    -- words that are similar or related will have vectors that look alike - this helps the computer understand the connection between words

### LSTM (Long Short-Term Memory)
    -- it is a special kind of memory that lets the computer remember information from a sequence, like the words in a sentence
    -- just like a smart note-taker who remembers what's important in a story even if it was said earlier

### Bidirectional
    -- reads the sequence twice (forward & backward)

### Dense
    -- it is the final decision-maker that combines everything the computer has learned and gives the final answer/prediction
    -- in sentiment analysis, for example: is this review positive or negative?

In [14]:
# Make sure NLTK can locate its data and that the resources requested below are installed.
# The per-user data directory is derived from the environment rather than hard-coded to a
# single Windows account, so the notebook can run on another machine.
# NOTE(review): assumes %APPDATA% resolves to the same Roaming folder the original path
# pointed at on the author's machine - confirm.
nltk_data_dir = os.path.join(os.environ.get('APPDATA', os.path.expanduser('~')), 'nltk_data')

# Guarded append: re-running this cell must not pile up duplicate search-path entries.
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tande\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tande\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize.treebank import TreebankWordTokenizer

In [20]:
# Read the review dataset into a DataFrame.
# NOTE(review): this is an absolute, user-specific location - consider a configurable data directory.
DATASET_PATH = r"C:\Users\tande\OneDrive - Pyramid Foods\sample data.csv"
df = pd.read_csv(DATASET_PATH)

In [21]:
# data cleaning and tokenization with NLTK

tokenizer = TreebankWordTokenizer()

# Built once, up front: stopwords.words() re-reads the word list on every call, and a set
# gives constant-time membership tests instead of a list scan for every single token.
_STOP_WORDS = frozenset(stopwords.words('english'))


def clean_text(text):
    """Lower-case a review, tokenize it, and drop punctuation and stop-words.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example NaN coming from an
        empty CSV cell) is treated as an empty review instead of raising
        AttributeError on ``.lower()``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' when nothing survives).
    """
    if not isinstance(text, str):
        return ''

    # Treebank tokenization of the lower-cased text.
    # NOTE(review): tokens such as "it." survive the punctuation filter below (the
    # notebook output shows "loved it. buy"); presumably the tokenizer only detaches
    # a period at the very end of the text - confirm whether sentence splitting is wanted.
    tokens = tokenizer.tokenize(text.lower())

    # Drop tokens that are punctuation (substring test against string.punctuation,
    # exactly as before) and common English stop-words ('is', 'the', 'and', ...).
    kept = [
        t for t in tokens
        if t not in string.punctuation and t not in _STOP_WORDS
    ]
    return ' '.join(kept)  # put the remaining words back together

In [22]:
# Run clean_text over every entry of the 'REVIEW' column, one review at a time,
# and keep the result alongside the raw text in a new 'clean_review' column.
df['clean_review'] = df['REVIEW'].map(clean_text)

In [24]:
# Turn the cleaned reviews into sequences of integer word ids with the Keras tokenizer.
max_words, max_len = 5000, 20  # vocabulary cap / sequence length used for padding later

cleaned_reviews = df['clean_review']

# NOTE(review): the vocabulary is fitted on every review, before the train/test split
# further down, so test-set words influence the word index - confirm this is acceptable.
tokenizers = Tokenizer(num_words=max_words)
tokenizers.fit_on_texts(cleaned_reviews)
sequences = tokenizers.texts_to_sequences(cleaned_reviews)

In [26]:
# Bring every token sequence to exactly max_len ids: shorter ones get zeros appended
# ('post'), longer ones lose their leading tokens ('pre' is the library default,
# spelled out here so the truncation side is visible).
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_len,
    padding='post',
    truncating='pre',
)

In [45]:
# Encode the target labels as consecutive integers.
# NOTE(review): the split cell below passes df[['REVIEW', 'RATING']] rather than `y`,
# so the encoded labels do not appear to reach the model - confirm whether `y` is needed.
ratings = df['RATING']
le = LabelEncoder().fit(ratings)
y = le.transform(ratings)

In [46]:
# Hold out 20% of the rows for testing. The label side is a two-column frame
# (review text + rating) so the original row index travels with each split.
labelled_reviews = df[['REVIEW', 'RATING']]

X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    labelled_reviews,
    test_size=0.2,
    random_state=42,  # fixed seed -> reproducible split
    shuffle=True,
)

In [47]:
# y_train / y_test come out of train_test_split as two-column frames; reduce each one
# to its 'RATING' column. The isinstance guards make the cell safe to re-run: once the
# names hold Series, indexing them with 'RATING' again would raise a KeyError.
if isinstance(y_train, pd.DataFrame):
    y_train = y_train['RATING']
if isinstance(y_test, pd.DataFrame):
    y_test = y_test['RATING']

# Row labels (positions in df) of the held-out reviews.
reviews_test = y_test.index

In [55]:
# build and compile bidirectional LSTM model

# The network is trained on the raw ratings (y_train is the 'RATING' column), and sparse
# categorical cross-entropy uses each label directly as an output index. The softmax
# therefore needs max(rating) + 1 units; index 0 exists but is never a target.
num_classes = int(df['RATING'].max()) + 1

model = Sequential()
# `input_length` is deprecated in Keras 3 (the version that produced this notebook's
# output); the sequence length is taken from the padded input instead.
model.add(Embedding(input_dim=max_words, output_dim=32))
model.add(Bidirectional(LSTM(32)))  # reads each sequence forwards and backwards
model.add(Dense(num_classes, activation='softmax'))  # one probability per class index
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [49]:
# Fit the network on the training split.
# NOTE(review): no validation data is passed, so the per-epoch accuracy printed
# during training is training accuracy only.
fit_options = dict(epochs=10, batch_size=2, verbose=1)
model.fit(X_train, y_train, **fit_options)

Epoch 1/10
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.2062 - loss: 1.7015
Epoch 2/10
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.4000 - loss: 1.5383
Epoch 3/10
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.7875 - loss: 1.0137
Epoch 4/10
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.9750 - loss: 0.3954
Epoch 5/10
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 1.0000 - loss: 0.1498
Epoch 6/10
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 1.0000 - loss: 0.0704
Epoch 7/10
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 1.0000 - loss: 0.0434
Epoch 8/10
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 1.0000 - loss: 0.0298
Epoch 9/10
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x2976be8dd90>

In [50]:
# Score the trained model on the held-out split; evaluate() returns the loss followed
# by the metrics named in compile() (here: accuracy).
evaluation = model.evaluate(X_test, y_test, verbose=0)
loss, accuracy = evaluation
print(f'Evaluation Loss: {loss:.4f}, Accuracy: {accuracy:.4f}')

Evaluation Loss: 0.0182, Accuracy: 1.0000


In [51]:
# classification_report / confusion_matrix are imported in the notebook's top import cell.

# Predict class probabilities for the held-out reviews ...
y_pred_probs = model.predict(X_test)
# ... and pick, per review, the class index with the highest probability.
y_pred = np.argmax(y_pred_probs, axis=1)

# Per-class precision / recall / F1.
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Rows are actual ratings, columns are predicted ratings.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 358ms/step
Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         7
           2       1.00      1.00      1.00         7
           3       1.00      1.00      1.00         7
           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00        14

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40

Confusion Matrix:
[[ 7  0  0  0  0]
 [ 0  7  0  0  0]
 [ 0  0  7  0  0]
 [ 0  0  0  5  0]
 [ 0  0  0  0 14]]


In [53]:
# Side-by-side view of actual vs. predicted ratings for the held-out reviews.
# (numpy is already imported in the notebook's first cell, so it is not re-imported here.)

# Predict class probabilities and convert them to predicted class labels.
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# y_test keeps the original row labels of df, which is what lets the matching text be
# looked up; y_pred is a plain array in the same row order as X_test / y_test.
test_rows = y_test.index
comparison_df = pd.DataFrame(
    {
        'Review': df.loc[test_rows, 'clean_review'],  # cleaned (not raw) review text of the test rows
        'Actual Rating': y_test,
        'Predicted Rating': y_pred,
    },
    index=test_rows,
)

# Show first 10 rows
print(comparison_df.head(10))

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
                                                Review  Actual Rating  \
95   absolutely fantastic product exceeded expectat...              5   
15                                       loved it. buy              5   
30   absolutely fantastic product exceeded expectat...              5   
158                        mediocre experience overall              2   
128                                  works major flaws              2   
115  absolutely fantastic product exceeded expectat...              5   
69                                horrible want refund              1   
170                        works perfectly great value              5   
174                    completely useless disappointed              1   
45                                       loved it. buy              5   

     Predicted Rating  
95                  5  
15                  5  
30                  5  
158                 2  
128 