# Sentiment Analysis using LSTM

## Import the libraries

In [None]:
%%capture
!pip install gensim

# Restart runtime session (shortkey: ctrl+m . ):
#       Runtime -> Restart session

In [None]:
import numpy as np
import pandas as pd
import nltk
import pickle
import kagglehub
import tensorflow as tf
from gensim.models import KeyedVectors
from nltk import word_tokenize
from nltk.corpus import stopwords
from keras.src.layers import Embedding
from tensorflow.keras.models import Model
from keras.src.legacy.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.utils.np_utils import to_categorical
from tensorflow.keras.layers import Input, Bidirectional, LSTM, Dense, Lambda, Reshape
import sys
import os

In [None]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Import data

In [None]:
!#!/bin/bash
!curl -L -o sentiment-analysis-dataset.zip\
  https://www.kaggle.com/api/v1/datasets/download/abhi8923shriv/sentiment-analysis-dataset

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 54.4M  100 54.4M    0     0  9819k      0  0:00:05  0:00:05 --:--:-- 13.1M


In [None]:
!unzip ./sentiment-analysis-dataset.zip

Archive:  ./sentiment-analysis-dataset.zip
  inflating: test.csv                
  inflating: testdata.manual.2009.06.14.csv  
  inflating: train.csv               
  inflating: training.1600000.processed.noemoticon.csv  


In [None]:
train = pd.read_csv('/content/train.csv', encoding='latin1')
test = pd.read_csv('/content/test.csv', encoding='latin1')

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   textID            27481 non-null  object 
 1   text              27480 non-null  object 
 2   selected_text     27480 non-null  object 
 3   sentiment         27481 non-null  object 
 4   Time of Tweet     27481 non-null  object 
 5   Age of User       27481 non-null  object 
 6   Country           27481 non-null  object 
 7   Population -2020  27481 non-null  int64  
 8   Land Area (Km²)   27481 non-null  float64
 9   Density (P/Km²)   27481 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 2.1+ MB


## Feature Engineering

In [None]:
def remove_stopwords(text):
    tokens = word_tokenize(text.lower())
    filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    return ' '.join(filtered_tokens)

In [None]:
train = train.dropna()

X_train = train[['text']]
y_train = train['sentiment'].map({'positive': 1, 'negative': -1, 'neutral': 0})

test = test.dropna()
X_test = test[['text']]
y_test = test['sentiment'].map({'positive': 1, 'negative': -1, 'neutral': 0})

In [None]:
sum(y_test.isna()), y_test.shape

(0, (3534,))

In [None]:
# Flatten the text data
train_texts = X_train['text'].astype(str).apply(remove_stopwords)
test_texts = X_test['text'].astype(str).apply(remove_stopwords)

y_train = to_categorical(y_train.astype(int).tolist(), num_classes=3)
y_test = to_categorical(y_test.astype(int).tolist(), num_classes=3)

In [None]:
# Tokenize
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
train_seq = tokenizer.texts_to_sequences(train_texts)
test_seq = tokenizer.texts_to_sequences(test_texts)

max_len = max(max(len(seq) for seq in train_seq), 50)

X_train_pad = pad_sequences(train_seq, max_len)
X_test_pad = pad_sequences(test_seq, max_len)

word_idx = tokenizer.word_index

# print first 5 keys
for key in list(word_idx.keys())[:5]:
    print(key, word_idx[key])


day 1
good 2
get 3
like 4
go 5


In [None]:
X_train_pad.shape

(27480, 50)

In [None]:
X_test_pad.shape

(3534, 50)

In [None]:
path = kagglehub.dataset_download("leadbest/googlenewsvectorsnegative300")
path += '/GoogleNews-vectors-negative300.bin'
print("Path to dataset files:", path)

word2vec = KeyedVectors.load_word2vec_format(path, binary=True)


Downloading from https://www.kaggle.com/api/v1/datasets/download/leadbest/googlenewsvectorsnegative300?dataset_version_number=2...


100%|██████████| 3.17G/3.17G [02:35<00:00, 21.9MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/leadbest/googlenewsvectorsnegative300/versions/2/GoogleNews-vectors-negative300.bin


In [None]:
embedding_dim = 300
vocab_size = len(word_idx)+1
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in word_idx.items():
    if word in word2vec:
        embedding_matrix[i] = word2vec[word]


In [None]:
embedding_matrix.shape

(23350, 300)

## Building LSTM Attention Sentiment Classifier

In [None]:
# Define the attention layer
class AttentionLayer(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)
        self.u = None
        self.b = None
        self.W = None

    def build(self, input_shape):
        """Trainable weights for attention mechanism"""
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], input_shape[-1]),
                                 initializer="glorot_uniform", trainable=True)
        self.b = self.add_weight(name="att_bias", shape=(input_shape[-1],),
                                 initializer="zeros", trainable=True)
        self.u = self.add_weight(name="att_u", shape=(input_shape[-1],),
                                 initializer="glorot_uniform", trainable=True)

    def call(self, inputs):
        # Score computation
        v = tf.tanh(tf.tensordot(inputs, self.W, axes=1) + self.b)
        vu = tf.tensordot(v, self.u, axes=1)

        alphas = tf.nn.softmax(vu)

        # weighted sum of input
        output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), axis=1)
        return output, alphas

In [None]:
# Sample Bi-LSTM model with Attention
def create_model(input_shape):
    inputs = Input(shape=input_shape)

    embedding_layer = Embedding(
        input_dim = vocab_size,
        output_dim = embedding_dim,
        input_length = max_len,
        trainable=True)(inputs)

    #Bi LSTM layer
    lstm_out = Bidirectional(LSTM(64, return_sequences=True))(embedding_layer)

    # Add Attention Layer
    attention_out, attention_weights = AttentionLayer()(lstm_out)

    reshaped = Reshape((1, 128))(attention_out)

    # LSTM layer post attention
    lstm_after_attn = LSTM(64, return_sequences=False)(reshaped)

    # Flatten Layer
    dense = Dense(128, activation='relu')(lstm_after_attn)

    # Final Dense Layer
    outputs = Dense(3, activation='softmax')(lstm_after_attn)

    # Define the model
    return Model(inputs, outputs)


In [None]:
# Set input shapes and compile the model
input_shape = (50,)

model = create_model(input_shape)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()



In [None]:
# Train Model
model.fit(X_train_pad, np.array(y_train), epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 16ms/step - accuracy: 0.5422 - loss: 0.9130 - val_accuracy: 0.7249 - val_loss: 0.6790
Epoch 2/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 15ms/step - accuracy: 0.8005 - loss: 0.5058 - val_accuracy: 0.7118 - val_loss: 0.7118
Epoch 3/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 14ms/step - accuracy: 0.8879 - loss: 0.3184 - val_accuracy: 0.7043 - val_loss: 0.8160
Epoch 4/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - accuracy: 0.9305 - loss: 0.2087 - val_accuracy: 0.6830 - val_loss: 1.0028
Epoch 5/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - accuracy: 0.9548 - loss: 0.1357 - val_accuracy: 0.6727 - val_loss: 1.1349
Epoch 6/10
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - accuracy: 0.9682 - loss: 0.0976 - val_accuracy: 0.6585 - val_loss: 1.3991
Epoch 7/10
[1m6

<keras.src.callbacks.history.History at 0x7f1a3f4ea890>

## Test the model on test dataset

In [None]:
# Test Model
loss, accuracy = model.evaluate(X_test_pad, np.array(y_test))


[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6496 - loss: 1.9803


In [None]:
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Test Accuracy: 64.40%


After the model is trained on train_set, we can see that it acheives an accuracy of 59.47% which is pretty good given dataset size, and simplicity of the network. Given more data, it is possible that this network would perform better.

## Use the trained model

In [36]:
def preprocess_text(text: str):
    tokens = word_tokenize(text.lower())
    filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    return ' '.join(filtered_tokens)

In [37]:
def predict_sentiment(text: str):
    cleaned = preprocess_text(text)
    seq = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(seq, maxlen=max_len)
    pred = model.predict(padded)[0]
    label_map = {0: 'neutral', 1: 'positive', 2: 'negative'}
    predicted_class = np.argmax(pred)
    return label_map[predicted_class], float(pred[predicted_class])

In [45]:
sentiment, conf = predict_sentiment("It is an excellent movie. Would love to watch again!")
print(f"Sentiment: {sentiment} (Confidence: {conf})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Sentiment: positive (Confidence: 0.9999539852142334)


In [46]:
sentiment, conf = predict_sentiment("The novel is really lengthy and slow. Storyline is so uninteresting and terrible.")
print(f"Sentiment: {sentiment} (Confidence: {conf})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
Sentiment: negative (Confidence: 0.9986549615859985)


## Save model for future use

In [None]:
model.save("sentiment_model.keras")

In [None]:
with open("preprocessing.pkl", "wb") as f:
    pickle.dump({
        "tokenizer": tokenizer,
        "max_len": max_len,
        "stop_words": stop_words
    }, f)