In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense
from keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report

In [None]:
# Colab-only step: mount Google Drive at /content/drive. The CSV and GloVe
# paths read later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the training and test splits from the mounted Drive folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Sentiment Analysis/train.csv',
        '/content/drive/MyDrive/Sentiment Analysis/test.csv',
    )
)

In [None]:
# Fetch NLTK's stopword corpus; clean_text below reads it via stopwords.words('english').
nltk.download('stopwords')
# Cache for the default stopword set so the NLTK corpus is read once,
# not once per word of every row.
_STOPWORD_CACHE = {}


def clean_text(text, stop_words=None):
    """Normalize a raw text snippet before tokenization.

    Steps, in order: strip HTML-like tags, drop digit runs, lowercase,
    remove punctuation (anything that is neither a word character nor
    whitespace), and filter out stopwords.

    Args:
        text: Raw text. Non-string values (e.g. None, or the float NaN that
            pandas uses for empty CSV cells) are treated as empty text.
        stop_words: Optional collection of lowercase words to drop. When
            omitted, NLTK's English stopword list is used.

    Returns:
        The cleaned text, with the surviving words joined by single spaces.
    """
    if stop_words is None:
        if 'english' not in _STOPWORD_CACHE:
            # frozenset gives O(1) membership tests instead of scanning a list.
            _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORD_CACHE['english']

    if not isinstance(text, str):
        # Missing values would otherwise make re.sub raise a TypeError.
        return ""

    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return " ".join(word for word in text.split() if word not in stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Add a normalized copy of every training text next to the raw column.
train_df['CleanedText'] = train_df['text'].map(clean_text)

In [None]:
# Encode the string polarity labels as the integer class ids the model predicts.
polarity_mapping = dict(positive=2, neutral=1, negative=0)
train_df['PolarityLabel'] = train_df['polarity'].map(polarity_mapping)

In [None]:
# Features are the cleaned texts, targets the integer polarity labels.
X, y = train_df['CleanedText'], train_df['PolarityLabel']

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Build the vocabulary from the training texts only, then index both splits with it.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq, X_val_seq = [
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_val)
]

In [None]:
# Pad every sequence to the length of the longest training sequence.
max_len = max(map(len, X_train_seq))
X_train_padded, X_val_padded = [
    pad_sequences(sequences, maxlen=max_len)
    for sequences in (X_train_seq, X_val_seq)
]

In [None]:
# Load the pre-trained GloVe vectors into a {token: float32 vector} lookup.
# Each line of the file is parsed as "<token> <component> <component> ...".
embedding_dict = {}
# GloVe files are UTF-8 and contain non-ASCII tokens; passing the encoding
# explicitly keeps the read independent of the platform's default locale.
with open('/content/drive/MyDrive/ML/glove.6B.100d.txt', 'r', encoding='utf-8') as file:
    for line in file:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], 'float32')
        embedding_dict[word] = vector

# Row i of the matrix holds the vector for the token whose tokenizer index is i.
# The "+ 1" reserves row 0, since tokenizer.word_index is enumerated from 1 here
# and the matrix is indexed directly by those values.
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
    embedding_vector = embedding_dict.get(word)
    # Tokens missing from GloVe keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# BiLSTM classifier on top of frozen, pre-trained GloVe word vectors.
# The final 3-unit softmax emits one probability per polarity class.
model_with_glove = Sequential([
    Embedding(vocab_size, 100, input_length=max_len, weights=[embedding_matrix], trainable=False),
    Bidirectional(LSTM(128)),
    Dense(3, activation='softmax'),
])
model_with_glove.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Stop once validation loss has not improved for 3 consecutive epochs. When the
# stop triggers, restore_best_weights rolls the model back to the best epoch
# instead of keeping the weights from 3 epochs after it, so later predictions
# use the best-validated weights.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model_with_glove.fit(X_train_padded, y_train, epochs=10, batch_size=32, validation_data=(X_val_padded, y_val), callbacks=[early_stop])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


<keras.src.callbacks.History at 0x7a08640c7610>

In [None]:
# Run the test texts through the same cleaning, indexing and padding as the
# training data, reusing the fitted tokenizer and the training max_len.
test_df['CleanedText'] = test_df['text'].map(clean_text)
test_seq = tokenizer.texts_to_sequences(test_df['CleanedText'])
test_padded = pad_sequences(test_seq, maxlen=max_len)

In [None]:
# Class probabilities per test row, reduced to the most likely class id.
predictions_with_glove = model_with_glove.predict(test_padded)
predicted_classes_with_glove = predictions_with_glove.argmax(axis=1)



In [None]:
# Pair each test id with its predicted class id and write the submission file.
submission_with_glove = (
    test_df[['id']]
    .rename(columns={'id': 'Id'})
    .assign(Polarity=predicted_classes_with_glove)
)
submission_with_glove.to_csv(
    '/content/drive/MyDrive/Sentiment Analysis/submission_with_glove.csv',
    index=False,
)

In [None]:
# Draw a fixed-seed random sample of 80 cleaned validation texts to inspect by hand.
error_analysis_data = X_val.sample(n=80, random_state=1)

In [None]:
# Index and pad the sampled texts exactly like the training inputs.
error_analysis_seq = tokenizer.texts_to_sequences(error_analysis_data)
error_analysis_padded = pad_sequences(
    error_analysis_seq,
    maxlen=max_len,
)

In [None]:
# Predicted class ids for the sample, plus the true labels looked up by the
# sample's original row index.
error_analysis_predictions = model_with_glove.predict(error_analysis_padded)
error_analysis_predicted_classes = error_analysis_predictions.argmax(axis=1)
error_analysis_actual_classes = y_val.loc[error_analysis_data.index]



In [None]:
# Print only the sampled texts whose predicted class differs from the true label.
for predicted_class, actual_class, text in zip(
    error_analysis_predicted_classes,
    error_analysis_actual_classes,
    error_analysis_data,
):
    if predicted_class == actual_class:
        continue
    print(f"Text: {text}\nPredicted: {predicted_class}\nActual: {actual_class}\n")

Text: awesome im actually familiar sdl im going look
Predicted: 1
Actual: 2

Text: logger logger new loggergetlogger id worried impending logpocalypse
Predicted: 2
Actual: 0

Text: rendering issue fixed ie need worry
Predicted: 1
Actual: 2

Text: use symlinks instead alias sad ui go terminal type first path base path original file second base path symlink filefolder etc
Predicted: 0
Actual: 1

Text: came across idiom opensource python choked drink rather even code read see result typical idiom python performance hack runs fast onceoff needs code review
Predicted: 1
Actual: 0

Text: shameeeee
Predicted: 1
Actual: 0

Text: know exact contents csv files dont worry escaping characters etc used dottrace jetbrains profiling actually use code project csvreader parts code performance important piece code reason asked
Predicted: 2
Actual: 1

Text: excellent example implement twodimensional arraylist use specific case need reinvent wheel go
Predicted: 2
Actual: 1

Text: false false false im afra

In [None]:
# Same BiLSTM architecture, but the embedding layer is given no pre-trained
# weights and is left with its default settings.
model_without_glove = Sequential([
    Embedding(vocab_size, 100, input_length=max_len),
    Bidirectional(LSTM(128)),
    Dense(3, activation='softmax'),
])
model_without_glove.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train with the same data, batch size, epoch budget and early-stopping
# callback as the GloVe model.
model_without_glove.fit(
    X_train_padded,
    y_train,
    validation_data=(X_val_padded, y_val),
    epochs=10,
    batch_size=32,
    callbacks=[early_stop],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


<keras.src.callbacks.History at 0x7a080b905a50>

In [None]:
# Class probabilities per test row, reduced to the most likely class id.
predictions_without_glove = model_without_glove.predict(test_padded)
predicted_classes_without_glove = predictions_without_glove.argmax(axis=1)



In [None]:
# Pair each test id with its predicted class id and write the submission file.
submission_without_glove = (
    test_df[['id']]
    .rename(columns={'id': 'Id'})
    .assign(Polarity=predicted_classes_without_glove)
)
submission_without_glove.to_csv(
    '/content/drive/MyDrive/Sentiment Analysis/submission_without_glove.csv',
    index=False,
)

In [None]:
# Re-draw the 80-row validation sample with the same seed as the earlier error
# analysis, so the two models can be compared on the same texts.
error_analysis_data = X_val.sample(n=80, random_state=1)

In [None]:
# Index and pad the sampled texts exactly like the training inputs.
error_analysis_seq = tokenizer.texts_to_sequences(error_analysis_data)
error_analysis_padded = pad_sequences(
    error_analysis_seq,
    maxlen=max_len,
)

In [None]:
# Score the sampled validation texts with the model trained WITHOUT GloVe.
# This cell previously called model_with_glove, so it only reproduced the
# GloVe model's errors instead of analysing this model.
error_analysis_predictions = model_without_glove.predict(error_analysis_padded)
error_analysis_predicted_classes = np.argmax(error_analysis_predictions, axis=1)
# True labels for the same rows, looked up by the sample's original index.
error_analysis_actual_classes = y_val.loc[error_analysis_data.index]



In [None]:
# Print only the sampled texts whose predicted class differs from the true label.
for predicted_class, actual_class, text in zip(
    error_analysis_predicted_classes,
    error_analysis_actual_classes,
    error_analysis_data,
):
    if predicted_class == actual_class:
        continue
    print(f"Text: {text}\nPredicted: {predicted_class}\nActual: {actual_class}\n")

Text: awesome im actually familiar sdl im going look
Predicted: 1
Actual: 2

Text: logger logger new loggergetlogger id worried impending logpocalypse
Predicted: 2
Actual: 0

Text: rendering issue fixed ie need worry
Predicted: 1
Actual: 2

Text: use symlinks instead alias sad ui go terminal type first path base path original file second base path symlink filefolder etc
Predicted: 0
Actual: 1

Text: came across idiom opensource python choked drink rather even code read see result typical idiom python performance hack runs fast onceoff needs code review
Predicted: 1
Actual: 0

Text: shameeeee
Predicted: 1
Actual: 0

Text: know exact contents csv files dont worry escaping characters etc used dottrace jetbrains profiling actually use code project csvreader parts code performance important piece code reason asked
Predicted: 2
Actual: 1

Text: excellent example implement twodimensional arraylist use specific case need reinvent wheel go
Predicted: 2
Actual: 1

Text: false false false im afra