<a href="https://colab.research.google.com/github/hdas25/Deep-Learning-Assignments/blob/main/Assignment_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Section 1 — Install Required Libraries and Import Dependencies

In [1]:
# Step 1 – Install only what we need that Colab lacks
!pip install -q kaggle nltk

import nltk, tensorflow as tf, pandas as pd, numpy as np, re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# ALWAYS import Keras from tensorflow, NOT the standalone package
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

STOP_WORDS = set(stopwords.words('english'))
STEMMER     = PorterStemmer()

print("TensorFlow version:", tf.__version__)  # should show 2.18.x


TensorFlow version: 2.18.0


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Section 2 — Upload the Dataset (CSV File) via Colab File Uploader

In [2]:
from google.colab import files
uploaded = files.upload()
filename = next(iter(uploaded))

Saving amazon_reviews.csv to amazon_reviews.csv


Section 3 — Preprocess Dataset: Filter, Label Sentiment, and Prepare Text Column

In [3]:
import pandas as pd

df_raw = pd.read_csv(filename)
print("Columns:", df_raw.columns.tolist())
df_raw = df_raw[['reviewText', 'overall']].dropna().reset_index(drop=True)
df_raw = df_raw[df_raw['overall'] != 3]  # remove neutral ratings

# Create sentiment labels
def label_sentiment(star):
    if star <= 2:
        return 'Negative'
    elif star >= 4:
        return 'Positive'

df_raw['Sentiment'] = df_raw['overall'].apply(label_sentiment)
df_raw = df_raw[['reviewText', 'Sentiment']].rename(columns={'reviewText': 'Text'})
df_raw.head()


Columns: ['Unnamed: 0', 'reviewerName', 'overall', 'reviewText', 'reviewTime', 'day_diff', 'helpful_yes', 'helpful_no', 'total_vote', 'score_pos_neg_diff', 'score_average_rating', 'wilson_lower_bound']


Unnamed: 0,Text,Sentiment
0,No issues.,Positive
1,"Purchased this for my device, it worked as adv...",Positive
2,it works as expected. I should have sprung for...,Positive
3,This think has worked out great.Had a diff. br...,Positive
4,"Bought it with Retail Packaging, arrived legit...",Positive


Section 4 — Text Preprocessing: Clean, Remove Stopwords, Apply Stemming

In [4]:
import re, nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

STOP_WORDS = set(stopwords.words('english'))
STEMMER = PorterStemmer()

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    words = [STEMMER.stem(w) for w in text.split() if w not in STOP_WORDS and w.isalpha()]
    return " ".join(words)

df_raw['cleaned_text'] = df_raw['Text'].apply(preprocess)
df_raw.head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Text,Sentiment,cleaned_text
0,No issues.,Positive,issu
1,"Purchased this for my device, it worked as adv...",Positive,purchas devic work advertis never much phone m...
2,it works as expected. I should have sprung for...,Positive,work expect sprung higher capac think made bit...
3,This think has worked out great.Had a diff. br...,Positive,think work greathad diff bran card went south ...
4,"Bought it with Retail Packaging, arrived legit...",Positive,bought retail packag arriv legit orang envelop...


Section 5 — Label Encoding: Convert Sentiment Labels to Numerical Format

In [5]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df_raw['label'] = le.fit_transform(df_raw['Sentiment'])
print("Label mapping:", dict(zip(le.classes_, le.transform(le.classes_))))


Label mapping: {'Negative': np.int64(0), 'Positive': np.int64(1)}


Section 6 — Tokenization and Padding: Prepare Text Sequences for Model Input

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

VOCAB_SIZE = 10000
MAX_LEN = 150

tok = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tok.fit_on_texts(df_raw['cleaned_text'])

X = pad_sequences(tok.texts_to_sequences(df_raw['cleaned_text']), maxlen=MAX_LEN)
y = df_raw['label'].values


Section 7 — Train-Test Split: Divide Dataset for Training and Evaluation

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)


Section 8 — Model Architecture: Define and Compile a BiLSTM-based Sentiment Classifier

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, GlobalAveragePooling1D, Dense, Dropout

EMB_DIM = 64

model = Sequential([
    Embedding(VOCAB_SIZE, EMB_DIM),  # input_length removed
    Bidirectional(LSTM(64, return_sequences=True)),
    GlobalAveragePooling1D(),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Build model so summary works (input shape = batch, sequence_length)
model.build(input_shape=(None, MAX_LEN))

# ✅ Compile the model (must do before training)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()


Section 9 — Model Training: Train the BiLSTM Sentiment Classifier on Preprocessed Data

In [9]:
history = model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=256,
    validation_data=(X_test, y_test)
)


Epoch 1/5
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 734ms/step - accuracy: 0.8583 - loss: 0.4208 - val_accuracy: 0.9319 - val_loss: 0.2275
Epoch 2/5
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 782ms/step - accuracy: 0.9219 - loss: 0.2586 - val_accuracy: 0.9319 - val_loss: 0.2272
Epoch 3/5
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 686ms/step - accuracy: 0.9265 - loss: 0.2504 - val_accuracy: 0.9319 - val_loss: 0.2215
Epoch 4/5
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 735ms/step - accuracy: 0.9234 - loss: 0.2492 - val_accuracy: 0.9319 - val_loss: 0.2203
Epoch 5/5
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 708ms/step - accuracy: 0.9304 - loss: 0.2354 - val_accuracy: 0.9309 - val_loss: 0.2170


Section 10 — Model Evaluation: Evaluate Accuracy on Test Data

In [10]:
loss, acc = model.evaluate(X_test, y_test)
print(f"Accuracy: {acc*100:.2f}%")

[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 60ms/step - accuracy: 0.9240 - loss: 0.2328
Accuracy: 93.09%
