<a href="https://colab.research.google.com/github/iammuhammad41/Sentiment-Classification/blob/main/sentiment-analysis-lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
from keras.src.layers import Embedding
from keras.src.legacy.preprocessing.text import Tokenizer
from keras.src.utils import pad_sequences
from nltk import word_tokenize
from nltk.corpus import stopwords
from tensorflow.keras.layers import Input, Bidirectional, LSTM, Dense, Lambda
from tensorflow.keras.models import Model
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
import kagglehub
import nltk
from tensorflow.python.keras.utils.np_utils import to_categorical


import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)


In [2]:
# Fetch the NLTK data used by word_tokenize() and stopwords.words().
# Recent NLTK releases load the Punkt tokenizer from the 'punkt_tab' resource,
# while older ones use 'punkt'; requesting both keeps the notebook runnable on
# either without pinning a version.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)


# A set gives constant-time membership checks when filtering tokens.
stop_words = set(stopwords.words('english'))

In [None]:
# Both splits are read with the same latin1 decoding.
csv_options = {'encoding': 'latin1'}
df_train = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/train.csv', **csv_options)
df_test = pd.read_csv('/kaggle/input/sentiment-analysis-dataset/test.csv', **csv_options)

In [None]:
def remove_stopwords(text):
    """Lowercase and tokenize ``text``, keeping only useful tokens.

    A token is kept when it is purely alphanumeric and is not in the
    notebook-level ``stop_words`` set.

    Args:
        text: The raw string to clean.

    Returns:
        The kept tokens joined by single spaces.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalnum():
            continue  # punctuation and mixed-symbol tokens are dropped
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

In [None]:
# Label encoding shared by the train and test splits.
label_map = {'negative': 0, 'neutral': 1, 'positive': 2}

# Rows without text or label cannot be used for training: a missing text would be
# turned into the literal string "nan" by astype(str) below, and a missing label
# cannot be cast to int.
df_train_clean = df_train.dropna(subset=['text', 'sentiment'])
X_train = df_train_clean[['text']]
y_train = df_train_clean['sentiment'].map(label_map)
df_test = df_test.dropna()
X_test = df_test[['text']]
y_test = df_test['sentiment'].map(label_map)  # Only if present

# Fail early with a readable message if a label is not covered by label_map
# (map() yields NaN for it, which astype(int) would reject with an opaque error).
for split_name, labels in (('train', y_train), ('test', y_test)):
    if labels.isna().any():
        raise ValueError(
            f"{split_name} split contains sentiment values outside {sorted(label_map)}"
        )

# Flatten the DataFrame columns
train_texts = X_train['text'].astype(str).apply(remove_stopwords)
test_texts = X_test['text'].astype(str).apply(remove_stopwords)

train_y = y_train.astype(int).tolist()
test_y = y_test.astype(int).tolist()

# One-hot targets for the 3-way softmax / categorical cross-entropy.
y_train_cat = to_categorical(train_y, num_classes=3)
y_test_cat = to_categorical(test_y, num_classes=3)

In [3]:
# Tokenize: the vocabulary is fitted on the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (train_texts, test_texts)
)

# Pad width: the longest training sequence, but never less than 50.
longest_train_seq = max(map(len, train_sequences))
max_len = max(longest_train_seq, 50)

X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_len) for seqs in (train_sequences, test_sequences)
)

word_index = tokenizer.word_index

# Print the first 5 (word, index) pairs
for token, token_id in list(word_index.items())[:5]:
    print(token, token_id)

In [4]:
# Display the dimensions of the padded training matrix.
X_train_pad.shape

In [5]:
# Display the dimensions of the padded test matrix.
X_test_pad.shape

In [6]:
# Build Embedding Matrix from Word2Vec
dataset_dir = kagglehub.dataset_download("leadbest/googlenewsvectorsnegative300")
path = dataset_dir + '/GoogleNews-vectors-negative300.bin'
print("Path to dataset files:", path)

word2vec = KeyedVectors.load_word2vec_format(path, binary=True)

embedding_dim = 300
# One extra row beyond the vocabulary; presumably index 0 is the padding id
# (TODO confirm against the Tokenizer's indexing).
vocab_size = 1 + len(word_index)
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Copy the pretrained vector for every vocabulary word Word2Vec knows;
# rows of unknown words stay all-zero.
for token, row in word_index.items():
    if token not in word2vec:
        continue
    embedding_matrix[row] = word2vec[token]

In [7]:
# Display the dimensions of the embedding matrix (vocab_size rows, embedding_dim columns).
embedding_matrix.shape

In [None]:
class AttentionLayer(tf.keras.layers.Layer):
    """Attention pooling that collapses axis 1 of its input into a weighted sum.

    Every position is scored with ``u . tanh(x W + b)``; the scores are
    softmax-normalised and used to weight the positions before summing.
    Inputs are expected to be ``(batch, time, features)``, as produced by the
    Bi-LSTM with ``return_sequences=True`` in ``create_model``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Placeholders only; the real weights are created in build().
        self.W = None
        self.b = None
        self.u = None

    def build(self, input_shape):
        """Create the projection matrix, bias and scoring vector."""
        feature_dim = input_shape[-1]

        self.W = self.add_weight(
            name="att_weight",
            shape=(feature_dim, feature_dim),
            initializer="glorot_uniform",
            trainable=True,
        )
        self.b = self.add_weight(
            name="att_bias",
            shape=(feature_dim,),
            initializer="zeros",
            trainable=True,
        )
        self.u = self.add_weight(
            name="att_u",
            shape=(feature_dim,),
            initializer="glorot_uniform",
            trainable=True,
        )

        super().build(input_shape)

    def call(self, inputs):
        """Return ``(weighted_sum, attention_weights)`` for ``inputs``."""
        # Project each position, then reduce it to one scalar score.
        projected = tf.tanh(tf.tensordot(inputs, self.W, axes=1) + self.b)
        scores = tf.tensordot(projected, self.u, axes=1)
        # Normalise the scores over the last axis (softmax's default).
        alphas = tf.nn.softmax(scores, axis=-1)

        # Broadcast the weights over the feature axis and sum over axis 1.
        weighted = inputs * tf.expand_dims(alphas, axis=-1)
        context = tf.reduce_sum(weighted, axis=1)
        return context, alphas

# Sample Bi-LSTM model with Attention
def create_model(input_shape):
    """Build the Bi-LSTM + attention sentiment classifier.

    Relies on the notebook-level ``vocab_size``, ``embedding_dim`` and
    ``embedding_matrix``.

    Args:
        input_shape: Shape of one padded token-id sequence without the batch
            axis, e.g. ``(max_len,)``.

    Returns:
        An uncompiled ``Model`` mapping token-id sequences to a 3-class softmax.
    """
    inputs = Input(shape=input_shape)

    # Frozen embedding initialised from the pretrained Word2Vec matrix.
    # Without the initializer the layer would hold random vectors that, being
    # non-trainable, could never become useful. The deprecated `input_length`
    # argument is omitted; the sequence length already comes from `inputs`.
    embedding_layer = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False)(inputs)

    # Bi-LSTM layer: keeps the per-timestep outputs for the attention layer
    lstm_out = Bidirectional(LSTM(64, return_sequences=True))(embedding_layer)

    # Attention layer: pools the timesteps into one vector per sample.
    # The attention weights are returned as well but not used by the model.
    attention_out, attention_weights = AttentionLayer()(lstm_out)

    # Re-add a length-1 time axis so the vector can feed another LSTM
    reshaped = Lambda(lambda x: tf.expand_dims(x, axis=1))(attention_out)  # (batch, 1, features)

    # LSTM layer post attention
    lstm_after_attn = LSTM(64, return_sequences=False)(reshaped)

    # Fully connected hidden layer
    dense = Dense(128, activation='relu')(lstm_after_attn)

    # Final Dense layer: one probability per sentiment class
    outputs = Dense(3, activation='softmax')(dense)

    # Define the model
    return Model(inputs, outputs)

In [8]:
# Set input shape and compile the model
# The input width must match the padded sequence length. max_len is at least 50
# but grows with the longest training sequence, so a hard-coded 50 would make
# fit() fail on a shape mismatch whenever a sequence exceeds 50 tokens.
input_shape = (max_len,)
model = create_model(input_shape)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [9]:
# Train Model: fit() holds out 20% of the training data for validation.
train_targets = np.array(y_train_cat)
model.fit(
    X_train_pad,
    train_targets,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
)

# Test Model: evaluate() returns the loss followed by the compiled metrics.
test_targets = np.array(y_test_cat)
loss, accuracy = model.evaluate(X_test_pad, test_targets)
print(f"Test Accuracy: {accuracy * 100:.2f}%")