<a href="https://colab.research.google.com/github/keerthanab2201/Sentiment-Analysis-using-Deep-Learning/blob/main/new_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data Collection

In [1]:
# Mount Google Drive (Colab only); later cells read files from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Read the Amazon review CSV from Drive and preview the first rows
import pandas as pd

csv_path = "/content/drive/MyDrive/datasets/Amazon-Product-Reviews-Sentiment-Analysis-in-Python-Dataset.csv"
df = pd.read_csv(csv_path)
print(df.head())

                                              Review  Sentiment
0  Fast shipping but this product is very cheaply...          1
1  This case takes so long to ship and it's not e...          1
2  Good for not droids. Not good for iPhones. You...          1
3  The cable was not compatible between my macboo...          1
4  The case is nice but did not have a glow light...          1


In [3]:
# Save as a JSON Lines file (records format, one review per line).
# The confirmation message is built from the same path that is written, so the two cannot drift apart.
output_path = "amazon_reviews_data.json"
df.to_json(output_path, orient="records", lines=True)
print(f"✅ Conversion complete: Saved as {output_path}")

✅ Conversion complete: Saved as reviews_data.json


## Data Pre-Processing

* lowercase
* stopword removal (optional; currently disabled in the preprocessing function)
* selective punctuation removal (`!` and `?` are kept)
* one word review removal
* contraction expansion
* tokenization
* part of speech tagging

In [7]:
# installing dependencies
# The TensorFlow / Keras / Transformers stack is removed and re-installed at pinned versions below.
# NOTE(review): the import cell that follows recorded "No module named 'keras.src.engine'";
# presumably the runtime has to be restarted after this cell before importing — confirm.
!pip install contractions textblob gensim beautifulsoup4
!python -m textblob.download_corpora
!pip uninstall -y tensorflow keras transformers
!pip install --force-reinstall \
    tensorflow==2.10.1 \
    transformers==4.36.2 \
    datasets==2.16.1 \
    evaluate==0.4.1 \
    tensorflow-addons==0.20.0

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
Finished.
Found existing installation: tensorflow 2.18.0
Uninstalling tensorflow-2.18.0:
  Successfully uninstalled tensorflow-2.18.0
Found existing installation: keras 3.8.0
Uninstalling keras-3.8.0:
  Successful

In [8]:
import nltk
from nltk.corpus import stopwords

# Fetch every NLTK resource the preprocessing cells rely on in one place.
# Newer NLTK releases look resources up under the "punkt_tab" /
# "averaged_perceptron_tagger_eng" names, older ones under the unsuffixed names,
# so both variants are requested; already-present packages are skipped by NLTK.
for resource in (
    "punkt",
    "punkt_tab",
    "stopwords",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
):
    nltk.download(resource)

# English stopword set (kept available even though stopword filtering is optional downstream)
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [9]:
# Step 2: Import Modules
import re
import pandas as pd
import nltk
import numpy as np
import gensim
from bs4 import BeautifulSoup
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from contractions import fix as expand_contractions
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import Dataset
import evaluate
import tensorflow_addons as tfa

ModuleNotFoundError: No module named 'keras.src.engine'

In [None]:
# Show which GPUs TensorFlow can see, then set the global Keras dtype policy to
# 'mixed_float16' so layers created afterwards use mixed precision.
print("GPU:", tf.config.list_physical_devices('GPU'))
tf.keras.mixed_precision.set_global_policy('mixed_float16')

In [None]:
# Reload the reviews from the JSON Lines export and list the available columns
json_path = "amazon_reviews_data.json"
df = pd.read_json(json_path, lines=True)
print(df.columns)

In [None]:
# Keep only the review text and its sentiment value, drop incomplete rows,
# and rename the two columns to the names used by the rest of the notebook.
text_col = "Review"
label_col = "Sentiment"
df = (
    df.loc[:, [text_col, label_col]]
    .dropna()
    .rename(columns={text_col: "text", label_col: "rating"})
)

In [None]:
# Define preprocessing function
def preprocess_pipeline(text):
    """Normalise one raw review and derive token-level features.

    Steps: cast to ``str``, lowercase, expand contractions, squeeze any character
    repeated three or more times down to two, tokenize, strip every character that
    is not a word character, whitespace, ``!`` or ``?`` from each token, and drop
    tokens that end up empty. Stopwords are intentionally left in place.

    Returns:
        ``None`` when at most one token survives; otherwise a dict with the keys
        ``"clean_text"`` (tokens joined by spaces), ``"tokens"``, ``"pos_tags"``
        (output of ``nltk.pos_tag``) and ``"score"`` (TextBlob polarity).
    """
    normalized = expand_contractions(str(text).lower())
    normalized = re.sub(r"(.)\1{2,}", r"\1\1", normalized)

    tokens = []
    for raw_token in word_tokenize(normalized):
        stripped = re.sub(r"[^\w\s!?]", "", raw_token)
        if stripped.strip() != "":
            tokens.append(stripped)

    # Reviews with a single (or no) usable token are discarded by the caller
    if len(tokens) <= 1:
        return None

    clean_text = " ".join(tokens)
    return {
        "clean_text": clean_text,
        "tokens": tokens,
        "pos_tags": nltk.pos_tag(tokens),
        "score": TextBlob(clean_text).sentiment.polarity,
    }


In [None]:
import nltk
# Fetch the POS-tagger model under its "_eng" name before preprocessing runs,
# since preprocess_pipeline calls nltk.pos_tag.
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
# Run the preprocessing function on every review, then keep only the rows
# for which it returned a result (it returns None for reviews that are too short).
processed = df["text"].apply(preprocess_pipeline)
keep_mask = processed.notnull()
df = df[keep_mask].copy()
df["processed"] = processed[keep_mask].values

In [None]:
# Pull the model inputs out of the per-review result dicts
processed_records = df["processed"]
texts = [record["clean_text"] for record in processed_records]
labels = df["rating"].tolist()
scores = [record["score"] for record in processed_records]
# Result: three parallel lists holding the cleaned text, the labels, and the polarity scores

In [None]:
# Word embeddings, step 1: map each word to an integer index with the Keras Tokenizer
# so the reviews can later be fed to an Embedding layer.
# NOTE(review): oov_token is an empty string; presumably a visible placeholder token was intended — confirm.
MAX_VOCAB = 10000
tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="")
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index

# Step 2: convert the texts to index sequences and pad them at the end
# so every sequence has the same length (200)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences=sequences, maxlen=200, padding='post')

In [None]:
import zipfile
from pathlib import Path

# Unpack the GloVe archive from Drive into ./glove, but only when the 100-d vector
# file read by the next cell is not already on disk. This keeps a full re-run of the
# notebook from re-extracting the whole archive every time.
glove_dir = Path("glove")
if not (glove_dir / "glove.6B.100d.txt").exists():
    with zipfile.ZipFile("/content/drive/MyDrive/glove.6B.zip", "r") as zip_ref:
        zip_ref.extractall(glove_dir)

In [None]:
vocab_size = 15000
embedding_dim = 100  # matching model spec

# Load GloVe (Global Vectors for Word Representation): each line of the file is a word
# followed by its vector components; collect them into a word -> float32 vector lookup.
embedding_index = {}
with open("glove/glove.6B.100d.txt", encoding="utf8") as glove_file:
    for row in glove_file:
        token, *components = row.split()
        embedding_index[token] = np.asarray(components, dtype="float32")

# Build the embedding matrix: row i holds the GloVe vector of the word with tokenizer
# index i; words without a GloVe vector (and indices >= vocab_size) stay all-zero / unused.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for token, row_idx in word_index.items():
    if row_idx >= vocab_size:
        continue
    vector = embedding_index.get(token)
    if vector is not None:
        embedding_matrix[row_idx] = vector

In [None]:
# Remove neutral reviews (rating == 3) and relabel.
# .copy() makes the filtered frame independent of the original, so adding the
# new column below does not raise pandas' SettingWithCopyWarning.
df = df[df['rating'] != 3].copy()
# Binary label: 1 for positive (rating >= 4), 0 for negative
df['label'] = (df['rating'] >= 4).astype(int)

In [None]:
# Rebuild the text and label lists from the filtered frame so they line up row for row
texts = [record["clean_text"] for record in df["processed"]]
labels = df["label"].tolist()

In [None]:
# Prepare tokenizer and sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

In [None]:
# Settings for the tokenizer/padding cell below: the vocabulary is capped at the
# 15,000 most frequent words and every sequence is padded or truncated to 400 tokens.
vocab_size = 15000
input_length = 400

In [None]:
# Fit the Keras tokenizer on the filtered texts, capped at `vocab_size` words.
# NOTE(review): oov_token is an empty string; presumably a visible placeholder token was intended — confirm.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='')
tokenizer.fit_on_texts(texts)

# Integer-encode the texts and pad/truncate each sequence at the end to `input_length`
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=input_length, padding='post')

# Word -> integer index mapping, used to build the embedding matrix
word_index = tokenizer.word_index

In [None]:
# Set up the embedding matrix: start from zeros and copy in the pre-trained GloVe
# vector for every tokenizer word that has one and whose index fits in the matrix.
embedding_matrix = np.zeros((vocab_size, 100))
for token, row_idx in tokenizer.word_index.items():
    if row_idx < vocab_size:
        glove_vec = embedding_index.get(token)
        if glove_vec is not None:
            embedding_matrix[row_idx] = glove_vec

In [None]:
# Stratified split: 70% training / 10% validation / 20% testing
from sklearn.model_selection import train_test_split

# First carve off the 20% test set ...
X_temp, X_test, y_temp, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# ... then take 12.5% of the remaining 80% (= 10% of the full data) as validation,
# which leaves 70% of the full data for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.125,
    random_state=42,
    stratify=y_temp,
)

for split_name, split_data in (
    ("Train size     ", X_train),
    ("Validation size", X_val),
    ("Test size      ", X_test),
):
    print(f"{split_name}: {len(split_data)}")


## TCN Model

In [None]:
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import (
    Input, Embedding, Conv1D, BatchNormalization, Activation,
    Add, Dropout, Dense, Multiply, Permute, Lambda, Flatten, RepeatVector
)
from tcn import TCN
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam

In [None]:
# DistilBERT needs raw text and its own tokenizer. The Keras `Tokenizer` built earlier
# is not callable, and X_train / X_val hold padded integer ids rather than text, so:
#   * `tokenizer` is rebound to the DistilBERT tokenizer (the Keras tokenizer is not
#     used again after the embedding matrix has been built), and
#   * the cleaned texts are split with the same seed / stratification / fractions as
#     the padded arrays, which yields the same row partition.
MAX_LENGTH = 256  # must exist before `.map(tokenize_function)` runs below
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
    """Tokenize a batch of examples for DistilBERT.

    Args:
        examples: a batch mapping with a "text" entry holding the review strings.

    Returns:
        The tokenizer output, truncated and padded to MAX_LENGTH tokens.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length",
        add_special_tokens=True
    )


# Text-level split mirroring the split of the padded sequences
# (train_test_split is imported in the split cell above).
texts_temp, texts_test, labels_temp, labels_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)
texts_train, texts_val, labels_train, labels_val = train_test_split(
    texts_temp, labels_temp, test_size=0.125, random_state=42, stratify=labels_temp
)

# Wrap the splits as Hugging Face datasets and tokenize them in batches
train_dataset = Dataset.from_dict({"text": texts_train, "label": labels_train})
val_dataset = Dataset.from_dict({"text": texts_val, "label": labels_val})

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)

In [None]:
# ===== MODEL OPTIMIZATIONS =====
# Hyperparameters for the DistilBERT fine-tuning cells (95%+ accuracy is the stated target,
# not a result recorded in this notebook)
MAX_LENGTH = 256  # token length every review is truncated / padded to
BATCH_SIZE = 16    # batch size of the train / validation / test tf.data sets
LEARNING_RATE = 1.5e-5  # initial learning rate handed to the optimizer schedule
EPOCHS = 4  # number of epochs passed to model.fit
WARMUP_STEPS = 500  # step count handed to the optimizer's learning-rate schedule in build_enhanced_model

In [None]:
# ===== KEY MODIFICATIONS FOR HIGHER ACCURACY =====
# 1. Custom Model Architecture
def build_enhanced_model(num_train_steps=None):
    """Build and compile DistilBERT with a mean-pooled custom classification head.

    The encoder of ``TFDistilBertForSequenceClassification`` is reused; its token
    outputs are averaged, passed through dropout and a 128-unit GELU layer, and
    classified into two classes with a softmax.

    Args:
        num_train_steps: total number of optimizer steps the learning-rate schedule
            spans. Defaults to ``ceil(len(X_train) / BATCH_SIZE) * EPOCHS``.

    Returns:
        A compiled ``tf.keras.Model`` taking ``[input_ids, attention_mask]`` and
        returning class probabilities of shape ``(batch, 2)``.
    """
    import math
    from transformers import create_optimizer

    # Load base model
    base_model = TFDistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=2
    )

    # Inputs match the columns produced by the tokenizer
    input_ids = tf.keras.Input(shape=(MAX_LENGTH,), dtype=tf.int32, name="input_ids")
    attention_mask = tf.keras.Input(shape=(MAX_LENGTH,), dtype=tf.int32, name="attention_mask")

    # Custom classifier head on top of the encoder's token-level output
    distilbert_output = base_model.distilbert(input_ids=input_ids, attention_mask=attention_mask)[0]
    pooled_output = tf.keras.layers.GlobalAveragePooling1D()(distilbert_output)
    dropout = tf.keras.layers.Dropout(0.2)(pooled_output)
    dense = tf.keras.layers.Dense(128, activation='gelu')(dropout)
    # The notebook enables the global 'mixed_float16' policy; the final softmax is
    # kept in float32 so the probabilities and the loss stay numerically stable.
    output = tf.keras.layers.Dense(2, activation='softmax', dtype='float32')(dense)

    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)

    # AdamW-style optimizer with linear warmup followed by linear decay over the
    # whole run. Decaying to 0 over only WARMUP_STEPS steps would leave the
    # learning rate at zero for the remainder of training.
    if num_train_steps is None:
        num_train_steps = math.ceil(len(X_train) / BATCH_SIZE) * EPOCHS
    # Cap the warmup so the decay phase always has at least one step
    num_warmup_steps = min(WARMUP_STEPS, max(num_train_steps - 1, 0))
    optimizer, _ = create_optimizer(
        init_lr=LEARNING_RATE,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        weight_decay_rate=0.01,
    )

    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
        metrics=['accuracy']
    )
    return model

model = build_enhanced_model()


In [None]:
# 3. Enhanced Training Setup
# Convert the tokenized train / validation sets into batched tf.data datasets.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

# Arguments shared by both conversions; only the shuffle flag differs
shared_dataset_kwargs = dict(
    columns=["input_ids", "attention_mask"],
    label_cols=["label"],
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
)

tf_train_set = tokenized_train.to_tf_dataset(shuffle=True, **shared_dataset_kwargs)
tf_val_set = tokenized_val.to_tf_dataset(shuffle=False, **shared_dataset_kwargs)


In [None]:
# Train the model
# `callbacks` must be defined before it is handed to fit(); early stopping on the
# validation loss keeps the best weights if later epochs start to overfit.
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=2,
        restore_best_weights=True,
    )
]

history = model.fit(
    tf_train_set,
    validation_data=tf_val_set,
    epochs=EPOCHS,
    callbacks=callbacks
)

In [None]:
# ===== EVALUATION =====
# Load the test data once and take both columns from the same frame, so the file is
# not parsed twice and texts / labels are guaranteed to come from the same read.
# NOTE(review): 'your_test_data.csv' looks like a placeholder path — confirm the real file.
test_df = pd.read_csv('your_test_data.csv')
test_texts = test_df['text'].tolist()
test_labels = test_df['label'].tolist()

test_dataset = Dataset.from_dict({"text": test_texts, "label": test_labels})
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# Same conversion as for the train / validation sets, without shuffling
tf_test_set = tokenized_test.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    label_cols=["label"],
    shuffle=False,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator
)

In [None]:
# Final evaluation: evaluate() returns [loss, accuracy] for this model
results = model.evaluate(tf_test_set)
test_accuracy = results[1]
print(f"\nFinal Test Accuracy: {test_accuracy:.2%}")

if not test_accuracy < 0.95:
    print("\n🎉 Success! Achieved 95%+ accuracy")
else:
    # Below target: list follow-up ideas
    print("\n🔧 Additional Optimization Options:")
    for suggestion in (
        "1. Add StratifiedKFold cross-validation",
        "2. Incorporate back-translation augmentation",
        "3. Try ensemble of multiple models",
    ):
        print(suggestion)

In [None]:
# Model definition

# Enhanced TCN Block with Residuals

def TCN_block(x, filters, kernel_size, dilation_rate, l2_reg=1e-4):
    """Residual dilated-convolution block.

    Main path: Conv1D ('same' padding, given dilation, L2 kernel regularisation)
    -> BatchNormalization -> swish -> Dropout(0.2). The input is added back to the
    result; when its channel count differs from `filters` it is first projected
    with a 1x1 convolution.

    Args:
        x: input tensor whose last axis is the channel axis.
        filters: number of convolution filters (output channels).
        kernel_size: width of the convolution kernel.
        dilation_rate: dilation of the convolution.
        l2_reg: L2 regularisation factor for the convolution kernel.

    Returns:
        The sum of the main path and the (possibly projected) input.
    """
    main_path = Conv1D(
        filters,
        kernel_size,
        padding='same',
        dilation_rate=dilation_rate,
        kernel_regularizer=l2(l2_reg),
    )(x)
    for layer in (BatchNormalization(), Activation('swish'), Dropout(0.2)):
        main_path = layer(main_path)

    # Project the skip connection only when the channel counts disagree
    needs_projection = x.shape[-1] != filters
    skip = Conv1D(filters, 1, padding='same')(x) if needs_projection else x

    return Add()([main_path, skip])

# Self-Attention Mechanism
def attention_block(x):
    """Re-weight each timestep of `x` by a learned softmax attention score.

    Expects a 3-D tensor (batch, timesteps, channels). A single tanh unit scores
    every timestep, the scores are softmax-normalised across timesteps, broadcast
    over the channel axis, and multiplied element-wise with `x`.

    Returns:
        A tensor with the same shape as `x`.
    """
    channels = x.shape[-1]
    scores = Dense(1, activation='tanh')(x)
    weights = Activation('softmax')(Flatten()(scores))
    # (batch, timesteps) -> (batch, channels, timesteps) -> (batch, timesteps, channels)
    weights = Permute([2, 1])(RepeatVector(channels)(weights))
    return Multiply()([x, weights])

# Learning Rate Schedule
def lr_scheduler(epoch, lr, base_lr=1e-3, warmup_epochs=10, decay_rate=0.1):
    """Learning-rate schedule: linear warmup, then exponential decay.

    Keras' LearningRateScheduler passes the *current* learning rate as `lr`, so
    scaling `lr` itself during warmup would compound the factors and shrink the
    rate epoch after epoch. The schedule is therefore computed from `base_lr`
    and the epoch index only.

    Args:
        epoch: zero-based epoch index.
        lr: current learning rate (accepted for the callback signature; unused).
        base_lr: peak learning rate reached at the end of warmup.
        warmup_epochs: number of epochs over which the rate ramps up linearly.
        decay_rate: exponent applied per epoch after warmup.

    Returns:
        The learning rate for `epoch` as a Python float.
    """
    import math

    if epoch < warmup_epochs:
        return float(base_lr * (epoch + 1) / warmup_epochs)  # Warmup
    # Exponential decay, counted from the last warmup epoch
    return float(base_lr * math.exp(-decay_rate * (epoch - warmup_epochs + 1)))

# Model architecture
def build_tcn_model(input_length, vocab_size, embedding_dim, embedding_matrix):
    """Build and compile the TCN + attention binary classifier.

    Architecture: trainable Embedding initialised from `embedding_matrix` ->
    four residual TCN blocks (128 filters, kernel 3, dilations 1/2/4/8) ->
    attention re-weighting summed over the time axis -> Dense(256, swish) ->
    Dropout(0.3) -> sigmoid output.

    Args:
        input_length: length of each padded input sequence.
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each embedding vector.
        embedding_matrix: initial embedding weights, shape (vocab_size, embedding_dim).

    Returns:
        A compiled Keras model (binary cross-entropy with label smoothing, Adam,
        accuracy and AUC metrics).
    """
    inputs = Input(shape=(input_length,))

    # Embedding Layer (trainable for fine-tuning)
    x = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=True
    )(inputs)

    # Stacked TCN blocks with growing dilation to widen the receptive field
    for dilation in (1, 2, 4, 8):
        x = TCN_block(x, 128, 3, dilation)

    # Attention, then a weighted sum over the time axis
    x = attention_block(x)
    x = Lambda(lambda t: tf.reduce_sum(t, axis=1))(x)

    # Classifier Head
    x = Dense(256, activation='swish', kernel_regularizer=l2(1e-4))(x)
    x = Dropout(0.3)(x)
    # The notebook enables the global 'mixed_float16' policy; the output layer is kept
    # in float32 so the sigmoid probabilities and the loss stay numerically stable.
    outputs = Dense(1, activation='sigmoid', dtype='float32')(x)

    tcn_model = Model(inputs=inputs, outputs=outputs)

    # Label smoothing + Adam (any learning-rate schedule is supplied via callbacks at fit time)
    loss = tf.keras.losses.BinaryCrossentropy(label_smoothing=0.1)
    optimizer = Adam(learning_rate=1e-3)

    tcn_model.compile(
        optimizer=optimizer,
        loss=loss,
        metrics=['accuracy', tf.keras.metrics.AUC()]
    )

    return tcn_model

In [None]:
# Initialize & Train
tcn_model = build_tcn_model(400, 15000, 100, embedding_matrix)

# Per-class loss weights: class 0 is weighted 1.2 and class 1 is weighted 0.8
class_weights = {0: 1.2, 1: 0.8}

# Learning-rate schedule plus early stopping (patience 5, best weights restored)
tcn_callbacks = [
    LearningRateScheduler(lr_scheduler),
    tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True),
]

tcn_history = tcn_model.fit(
    x=X_train,
    y=np.array(y_train),
    batch_size=64,
    epochs=50,
    callbacks=tcn_callbacks,
    validation_data=(X_val, np.array(y_val)),
    class_weight=class_weights,
)

# Test-Time Augmentation (TTA)
def predict_with_tta(model, X, n_samples=5):
    """Average `n_samples` repeated `model.predict` passes over `X`.

    Each pass is flattened to 1-D before averaging, so the result has one value
    per input row.

    NOTE(review): `model.predict` is called without any training-mode flag, so this
    function does not itself switch dropout on; whether the passes differ depends
    on the model — confirm the intended behaviour.

    Args:
        model: object exposing `predict(X, verbose=0)`.
        X: inputs forwarded unchanged to `model.predict`.
        n_samples: number of prediction passes to average.

    Returns:
        Element-wise mean of the flattened predictions.
    """
    stacked_passes = [model.predict(X, verbose=0).flatten() for _ in range(n_samples)]
    return np.mean(stacked_passes, axis=0)

# Turn the averaged probabilities for the test set into 0/1 predictions at the 0.5 threshold
y_pred_tta = (predict_with_tta(tcn_model, X_test) > 0.5).astype(int)

In [None]:
# Callbacks
# NOTE: this rebinds the name `lr_scheduler` — previously the epoch -> learning-rate
# function — to a ReduceLROnPlateau callback; re-running an earlier cell that wraps
# `lr_scheduler` in LearningRateScheduler after this one would pick up the callback instead.
early_stop = EarlyStopping(
    monitor='val_accuracy',
    patience=4,
    restore_best_weights=True,
)
lr_scheduler = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    min_lr=1e-6,
    verbose=1,
)
checkpoint = ModelCheckpoint(
    filepath="best_hybrid_model.h5",
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1,
)

In [None]:
# Model training
# Fits the existing `tcn_model` instance with early stopping, LR reduction and checkpointing
training_callbacks = [early_stop, lr_scheduler, checkpoint]
history = tcn_model.fit(
    x=X_train,
    y=np.array(y_train),
    batch_size=32,
    epochs=20,
    verbose=1,
    callbacks=training_callbacks,
    validation_data=(X_val, np.array(y_val)),
)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Predict probabilities on the test set and threshold them at 0.5
y_pred_prob = tcn_model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int).flatten()

# Confusion Matrix
# labels=[0, 1] pins a 2x2 matrix even when one class is missing from y_test / y_pred,
# so ravel() always unpacks into exactly four counts.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()
# Guard against division by zero when there are no negative samples
specificity = tn / (tn + fp) if (tn + fp) else 0.0

# Metrics (zero_division=0 reports 0 instead of warning when a denominator is empty)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

# Print results
print("Confusion Matrix (TCN):")
print(cm)
print(f"\nAccuracy     : {accuracy:.4f}")
print(f"Precision    : {precision:.4f}")
print(f"Recall       : {recall:.4f}")
print(f"F1 Score     : {f1:.4f}")
print(f"Specificity  : {specificity:.4f}")

In [None]:
import matplotlib.pyplot as plt

# Plot accuracy and loss vs epoch. Both figures share the same layout, so they are
# drawn by one loop over (history key, display label) instead of two copied blocks.
for metric_key, metric_label in (("accuracy", "Accuracy"), ("loss", "Loss")):
    plt.plot(history.history[metric_key], label=f'Train {metric_label}')
    plt.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_label}')
    plt.title(f'{metric_label} vs Epochs')
    plt.xlabel('Epoch')
    plt.ylabel(metric_label)
    plt.legend()
    plt.grid(True)
    plt.show()
