In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense
from keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report

In [None]:
# Mount Google Drive at /content/drive so the dataset and embedding files
# referenced by the later cells can be read from it.
# NOTE(review): google.colab is expected to be importable only inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load dataset
# Read the reviews CSV from the mounted Drive folder into a DataFrame.
file_path = '/content/drive/MyDrive/ML/Amazon_Unlocked_Mobile.csv'
data = pd.read_csv(file_path)

In [None]:
# Drop rows with missing values
# dropna() with no arguments removes a row when ANY of its columns is missing,
# and the result is rebound to `data`, so the raw frame is no longer available.
# NOTE(review): only the 'Reviews' and 'Rating' columns are read later in this
# notebook; dropna(subset=['Reviews', 'Rating']) would probably keep more rows —
# confirm whether discarding rows for gaps in other columns is intended.
data = data.dropna()

In [None]:
# Text preprocessing
# Cache for the NLTK stop-word list so the corpus is read once instead of once per word.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    # Loaded lazily rather than at definition time, because the corpus is only
    # downloaded by a later cell (nltk.download('stopwords')).
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def clean_text(text, stop_words=None):
    """Normalise one review string for tokenization.

    Steps, in order: strip HTML-like tags, remove digit runs, lowercase,
    remove every character that is neither a word character nor whitespace,
    then drop stop words and re-join the remaining words with single spaces.

    Parameters
    ----------
    text : str
        Raw review text.
    stop_words : collection of str, optional
        Words to remove. Defaults to NLTK's English stop-word list, which is
        loaded once and reused across calls.

    Returns
    -------
    str
        The cleaned, space-separated text (may be empty).
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    # Membership is tested against a pre-built collection; the original rebuilt
    # the stop-word list for every single word of every review.
    return " ".join(word for word in text.split() if word not in stop_words)

In [None]:
# Download the NLTK stop-word corpus; clean_text reads it through
# stopwords.words('english'), so this must run before clean_text is first called.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Apply clean_text to every review and store the result in a new column,
# leaving the original 'Reviews' column unchanged.
data['CleanedReviews'] = data['Reviews'].apply(clean_text)

In [None]:
# Label encoding
def sentiment(rating):
    """Map a numeric rating to a sentiment label.

    A rating equal to 3 is 'neutral', a rating below 3 is 'negative', and
    every other value is 'positive'.
    """
    if rating == 3:
        return 'neutral'
    return 'negative' if rating < 3 else 'positive'

In [None]:
# Derive the three-class target ('negative' / 'neutral' / 'positive') from the
# 'Rating' column using sentiment().
data['Sentiment'] = data['Rating'].apply(sentiment)

In [None]:
# Split dataset into training and testing sets
# Features are the cleaned review strings; the target is the sentiment label.
X = data['CleanedReviews']
y = data['Sentiment']

# 80/20 split with a fixed seed so the partition is reproducible.
# NOTE(review): the split is not stratified (no stratify=y); with imbalanced
# classes the label proportions may differ slightly between the two sets — confirm
# whether that is acceptable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Tokenization and padding
# The vocabulary is fitted on the training texts only; both splits are then
# converted to integer sequences with that same tokenizer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Every sequence is padded to the length of the longest training sequence.
max_len = max(map(len, X_train_seq))
X_train_padded, X_test_padded = (
    pad_sequences(seqs, maxlen=max_len) for seqs in (X_train_seq, X_test_seq)
)

In [None]:
# Load pre-trained word embeddings (GloVe)
# Each line of the file is "<word> <v1> <v2> ..."; build a word -> vector lookup.
embedding_dict = {}
# The encoding is passed explicitly: without it open() uses the platform's
# locale encoding, which can fail or mis-decode the file (expected to be UTF-8).
with open('/content/drive/MyDrive/ML/glove.6B.100d.txt', 'r', encoding='utf-8') as file:
    for line in file:
        values = line.split()
        if not values:
            # Skip blank lines instead of raising IndexError on values[0].
            continue
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embedding_dict[word] = vector

# +1 because the tokenizer's word indices start at 1; row 0 stays all zeros.
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, 100))

# Iterate over each word in the tokenizer's word index
for word, i in tokenizer.word_index.items():
    # Get the corresponding word vector from the embedding dictionary
    embedding_vector = embedding_dict.get(word)
    # If the word vector exists, add it to the embedding matrix;
    # words without a pre-trained vector keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Bidirectional LSTM classifier: a frozen embedding layer initialised from
# embedding_matrix, one BiLSTM layer, and a 3-way softmax output.
model = Sequential([
    Embedding(
        vocab_size,
        100,
        input_length=max_len,
        weights=[embedding_matrix],
        trainable=False,
    ),
    Bidirectional(LSTM(128)),
    Dense(3, activation='softmax'),
])
# The sparse loss is used because the labels are integer class ids, not one-hot vectors.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Encode labels
# Convert the string sentiment labels to integer class ids. The encoder is
# fitted on the training labels only and then reused for the test labels, so
# both splits share one mapping; label_encoder.classes_ holds the id -> name order.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [None]:
# Train the model
# Stop once val_loss has not improved for 3 consecutive epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss; without it an early-stopped model keeps the weights from the last
# epoch run, i.e. `patience` epochs past the best one.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# validation_split holds out the last 20% of the training arrays for validation.
history = model.fit(X_train_padded, y_train_encoded, epochs=10, batch_size=32, validation_split=0.2, callbacks=[early_stop])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Test and evaluate the model
# predict() yields one probability row per test sample; the predicted class is
# the column with the highest probability.
y_pred = model.predict(X_test_padded)
y_pred_classes = y_pred.argmax(axis=1)

print(
    "Classification Report:\n",
    classification_report(
        y_test_encoded,
        y_pred_classes,
        target_names=label_encoder.classes_,
    ),
)

Classification Report:
               precision    recall  f1-score   support

    negative       0.88      0.90      0.89     15609
     neutral       0.72      0.55      0.63      5228
    positive       0.95      0.97      0.96     46030

    accuracy                           0.92     66867
   macro avg       0.85      0.81      0.82     66867
weighted avg       0.91      0.92      0.91     66867



In [None]:
# Baseline: the same BiLSTM architecture, but the embedding layer gets no
# pre-trained weights and is left trainable, so it is learned from scratch.
vocab_size = len(tokenizer.word_index) + 1
model_without_glove = Sequential([
    Embedding(vocab_size, 100, input_length=max_len),
    Bidirectional(LSTM(128)),
    Dense(3, activation='softmax'),
])
model_without_glove.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train the baseline with the same schedule as the GloVe model.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss; without it an early-stopped model is evaluated with the weights
# from the last epoch run, i.e. `patience` epochs past the best one.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history_without_glove = model_without_glove.fit(X_train_padded, y_train_encoded, epochs=10, batch_size=32, validation_split=0.2, callbacks=[early_stop])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


In [None]:
# Predict with the baseline model and take the most probable class per sample.
y_pred_without_glove = model_without_glove.predict(X_test_padded)
y_pred_classes_without_glove = y_pred_without_glove.argmax(axis=1)



In [None]:
# Per-class precision/recall/F1 for the baseline, on the same test labels and
# with the same class-name order as the GloVe model's report.
print("Classification Report without GloVe embeddings (baseline):\n", classification_report(y_test_encoded, y_pred_classes_without_glove, target_names=label_encoder.classes_))

Classification Report without GloVe embeddings (baseline):
               precision    recall  f1-score   support

    negative       0.90      0.88      0.89     15609
     neutral       0.70      0.60      0.65      5228
    positive       0.95      0.97      0.96     46030

    accuracy                           0.92     66867
   macro avg       0.85      0.82      0.83     66867
weighted avg       0.92      0.92      0.92     66867

