# Exploration of Dataset

In [1]:
import pandas as pd
import numpy as np

# Load the datasets
train_df = pd.read_csv('./datasets/train.csv')
valid_df = pd.read_csv('./datasets/valid.csv')
test_df = pd.read_csv('./datasets/test.csv')

# Display dataset shapes
print(f"Train Shape: {train_df.shape}")
print(f"Valid Shape: {valid_df.shape}")
print(f"Test Shape: {test_df.shape}")
print()

# Preview the training data
print(train_df.head())
print()

# Check for class balance in the training set
print(train_df['label'].value_counts())
print()

# Check for any missing values
print(train_df.isnull().sum())


Train Shape: (21464, 2)
Valid Shape: (716, 2)
Test Shape: (966, 2)

                                                text  label
0  states slow to shut down weak teacher educatio...      0
1    drone places fresh kill on steps of white house      1
2  report: majority of instances of people gettin...      1
3  sole remaining lung filled with rich, satisfyi...      1
4                       the gop's stockholm syndrome      0

label
0    11248
1    10216
Name: count, dtype: int64

text     0
label    0
dtype: int64


Baseline Model with Bag of Words and Logistic Regression

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# 1. Preprocessing: Convert text to numerical vectors (Bag of Words)
# We limit to the top 5000 most frequent words to keep it simple
vectorizer = CountVectorizer(stop_words='english', max_features=5000)

# Fit on training data, then transform valid and test data
X_train = vectorizer.fit_transform(train_df['text'])
X_valid = vectorizer.transform(valid_df['text'])
X_test = vectorizer.transform(test_df['text'])

y_train = train_df['label']
y_valid = valid_df['label']
y_test = test_df['label']

# 2. Model: Train a simple Logistic Regression model
baseline_model = LogisticRegression(max_iter=1000, random_state=42)
baseline_model.fit(X_train, y_train)

# 3. Evaluation: specific metrics on validation set
valid_preds = baseline_model.predict(X_valid)

print(f"Baseline Validation Accuracy: {accuracy_score(y_valid, valid_preds):.4f}")
print(f"Baseline Validation F1 Score: {f1_score(y_valid, valid_preds):.4f}")

Baseline Validation Accuracy: 0.7598
Baseline Validation F1 Score: 0.7485


Feature Engineering with TF-IDF and N-grams

Idea: weight words to give less importance to common words and more importance to unique words

-> This might signal sarcasm?

N-gram Idea: model seeing pairs of words together might be important for sarcasm b/c gives more context

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. Preprocessing: Use TF-IDF and include Bigrams (1-word and 2-word combinations)
# We increase max_features slightly to accommodate new bigrams
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, ngram_range=(1, 2))

X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['text'])
X_valid_tfidf = tfidf_vectorizer.transform(valid_df['text'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['text'])

# 2. Model: Retrain Logistic Regression on these new features
tfidf_model = LogisticRegression(max_iter=1000, random_state=42)
tfidf_model.fit(X_train_tfidf, y_train)

# 3. Evaluation
valid_preds_tfidf = tfidf_model.predict(X_valid_tfidf)

print(f"TF-IDF + Bigram Accuracy: {accuracy_score(y_valid, valid_preds_tfidf):.4f}")
print(f"TF-IDF + Bigram F1 Score: {f1_score(y_valid, valid_preds_tfidf):.4f}")

TF-IDF + Bigram Accuracy: 0.7709
TF-IDF + Bigram F1 Score: 0.7595


Support Vector Machine
Idea: Good at classification tasks

In [4]:
from sklearn.svm import LinearSVC

# 1. Model: Support Vector Machine (Linear Kernel)
# LinearSVC is faster and often better for text than standard SVC
svm_model = LinearSVC(random_state=42, max_iter=10000)
svm_model.fit(X_train_tfidf, y_train)

# 2. Evaluation
valid_preds_svm = svm_model.predict(X_valid_tfidf)

print(f"SVM Validation Accuracy: {accuracy_score(y_valid, valid_preds_svm):.4f}")
print(f"SVM Validation F1 Score: {f1_score(y_valid, valid_preds_svm):.4f}")

SVM Validation Accuracy: 0.7751
SVM Validation F1 Score: 0.7636


LSTM: process text as a sequence rather than Bag of Words

Idea: capture more structure in the sarcasm string

In [5]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# 1. Preprocessing: Convert text to sequences of integers
# We limit the vocab to 10,000 words and sequence length to 100
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_df['text'])

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(train_df['text']), maxlen=100)
X_valid_seq = pad_sequences(tokenizer.texts_to_sequences(valid_df['text']), maxlen=100)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(test_df['text']), maxlen=100)

# 2. Model: Define a simple LSTM network 
model = Sequential([
    Embedding(input_dim=10000, output_dim=32, input_length=100),
    LSTM(32),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 3. Training
model.fit(X_train_seq, y_train, epochs=5, batch_size=64, validation_data=(X_valid_seq, y_valid))

# 4. Evaluation
lstm_probs = model.predict(X_valid_seq)
lstm_preds = (lstm_probs > 0.5).astype(int)

print(f"LSTM Validation Accuracy: {accuracy_score(y_valid, lstm_preds):.4f}")
print(f"LSTM Validation F1 Score: {f1_score(y_valid, lstm_preds):.4f}")

Epoch 1/5




[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 31ms/step - accuracy: 0.7839 - loss: 0.4366 - val_accuracy: 0.8547 - val_loss: 0.3438
Epoch 2/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 31ms/step - accuracy: 0.8961 - loss: 0.2816 - val_accuracy: 0.8520 - val_loss: 0.3683
Epoch 3/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 31ms/step - accuracy: 0.9357 - loss: 0.1728 - val_accuracy: 0.8478 - val_loss: 0.3973
Epoch 4/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 31ms/step - accuracy: 0.9535 - loss: 0.1293 - val_accuracy: 0.8436 - val_loss: 0.4528
Epoch 5/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 31ms/step - accuracy: 0.9675 - loss: 0.0938 - val_accuracy: 0.8254 - val_loss: 0.5266
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
LSTM Validation Accuracy: 0.8254
LSTM Validation F1 Score: 0.8207


BiLSTM to read strings bidirectionally

Idea: typically better to gain more information.

Optimization: Dropout so that the network is not overly reliant on specific features (prevents overfitting)

In [6]:
from tensorflow.keras.layers import Bidirectional, Dropout

# 1. Model: Define a Bidirectional LSTM with Dropout
# We wrap the LSTM layer in 'Bidirectional'
bilstm_model = Sequential([
    Embedding(input_dim=10000, output_dim=32, input_length=100),
    Bidirectional(LSTM(32)),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

bilstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 2. Training
# We use the same sequence data (X_train_seq) prepared in the previous step
bilstm_model.fit(X_train_seq, y_train, epochs=5, batch_size=64, validation_data=(X_valid_seq, y_valid))

# 3. Evaluation
bilstm_probs = bilstm_model.predict(X_valid_seq)
bilstm_preds = (bilstm_probs > 0.5).astype(int)

print(f"Bi-LSTM Validation Accuracy: {accuracy_score(y_valid, bilstm_preds):.4f}")
print(f"Bi-LSTM Validation F1 Score: {f1_score(y_valid, bilstm_preds):.4f}")

Epoch 1/5




[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 34ms/step - accuracy: 0.7676 - loss: 0.4734 - val_accuracy: 0.8422 - val_loss: 0.3555
Epoch 2/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 36ms/step - accuracy: 0.9009 - loss: 0.2549 - val_accuracy: 0.8464 - val_loss: 0.3856
Epoch 3/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 41ms/step - accuracy: 0.9346 - loss: 0.1796 - val_accuracy: 0.8436 - val_loss: 0.4123
Epoch 4/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 38ms/step - accuracy: 0.9526 - loss: 0.1373 - val_accuracy: 0.8380 - val_loss: 0.4360
Epoch 5/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 38ms/step - accuracy: 0.9674 - loss: 0.0996 - val_accuracy: 0.8310 - val_loss: 0.5597
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step
Bi-LSTM Validation Accuracy: 0.8310
Bi-LSTM Validation F1 Score: 0.8264


Why did the accuracy go down bro :(

Build Ensemble Model

In [7]:
# 1. Gather predictions from our top 3 models
# Note: We flatten the neural network arrays to make them match the SVM's shape
pred_1 = valid_preds_svm
pred_2 = lstm_preds.flatten()
pred_3 = bilstm_preds.flatten()

# 2. Voting Logic: Sum the predictions
# If sum is 2 or 3, it means the majority voted 1 (Sarcasm)
total_votes = pred_1 + pred_2 + pred_3
ensemble_preds = (total_votes >= 2).astype(int)

# 3. Evaluation
print(f"Ensemble Validation Accuracy: {accuracy_score(y_valid, ensemble_preds):.4f}")
print(f"Ensemble Validation F1 Score: {f1_score(y_valid, ensemble_preds):.4f}")

Ensemble Validation Accuracy: 0.8310
Ensemble Validation F1 Score: 0.8254


Evaluation on Test Set

In [8]:
from sklearn.metrics import classification_report, confusion_matrix

# 1. Get predictions for the Test Set from all 3 models
# SVM (uses TF-IDF features)
pred_test_svm = svm_model.predict(X_test_tfidf)

# LSTM (uses Sequence features)
pred_test_lstm = (model.predict(X_test_seq) > 0.5).astype(int).flatten()

# Bi-LSTM (uses Sequence features)
pred_test_bilstm = (bilstm_model.predict(X_test_seq) > 0.5).astype(int).flatten()

# 2. Ensemble Voting (Majority Vote)
# Sum the predictions (0 or 1). If sum is 2 or 3, majority is 1.
test_votes = pred_test_svm + pred_test_lstm + pred_test_bilstm
pred_test_ensemble = (test_votes >= 2).astype(int)

# 3. Report detailed metrics
print("Final Evaluation on Test Set:")
print(classification_report(test_df['label'], pred_test_ensemble, digits=4))

# 4. Confusion Matrix (Row: True, Col: Predicted)
print("\nConfusion Matrix:")
print(confusion_matrix(test_df['label'], pred_test_ensemble))

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
Final Evaluation on Test Set:
              precision    recall  f1-score   support

           0     0.8593    0.8707    0.8650       526
           1     0.8430    0.8295    0.8362       440

    accuracy                         0.8520       966
   macro avg     0.8511    0.8501    0.8506       966
weighted avg     0.8518    0.8520    0.8519       966


Confusion Matrix:
[[458  68]
 [ 75 365]]


Save Weights

# Trying Improvements to the model below.

Text Preprocessing: 

In [9]:
import re
from scipy.sparse import hstack

# 1. Feature Extraction (Run this on RAW text to capture capitalization/punctuation)
def extract_features(df):
    features = pd.DataFrame()
    # Punctuation counts (sarcasm indicators)
    features['exclamation_count'] = df['text'].str.count('!')
    features['question_count'] = df['text'].str.count('\?')
    features['ellipsis_count'] = df['text'].str.count(r'\.\.\.')
    
    # Capitalization (sarcasm often uses ALL CAPS or Mixed Caps)
    features['capital_ratio'] = df['text'].apply(lambda x: sum(1 for c in x if c.isupper()) / (len(x) + 1))
    features['has_all_caps_word'] = df['text'].str.contains(r'\b[A-Z]{2,}\b').astype(int)
    
    # Length metrics
    features['text_length'] = df['text'].str.len()
    features['word_count'] = df['text'].str.split().str.len()
    
    return features

# Extract features now
train_features = extract_features(train_df)
valid_features = extract_features(valid_df)
test_features = extract_features(test_df)

In [10]:
# 2. Text Preprocessing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '', text) # Remove URLs
    text = re.sub(r'\s+', ' ', text).strip()    # Clean whitespace
    return text

# Apply preprocessing (This modifies the data for subsequent steps)
train_df['clean_text'] = train_df['text'].apply(preprocess_text)
valid_df['clean_text'] = valid_df['text'].apply(preprocess_text)
test_df['clean_text'] = test_df['text'].apply(preprocess_text)

TF-IDF on pre-processed data

In [11]:
# 3. New TF-IDF on Cleaned Text
# Re-running vectorizer on the cleaner text
tfidf_clean = TfidfVectorizer(stop_words='english', max_features=10000, ngram_range=(1, 2))
X_train_tfidf_clean = tfidf_clean.fit_transform(train_df['clean_text'])
X_valid_tfidf_clean = tfidf_clean.transform(valid_df['clean_text'])
X_test_tfidf_clean = tfidf_clean.transform(test_df['clean_text'])

# 4. Combine Features (TF-IDF + Manual Features)
X_train_combined = hstack([X_train_tfidf_clean, train_features])
X_valid_combined = hstack([X_valid_tfidf_clean, valid_features])
X_test_combined = hstack([X_test_tfidf_clean, test_features])

# 5. Train Improved SVM
svm_model_improved = LinearSVC(random_state=42, max_iter=10000, C=1.0)
svm_model_improved.fit(X_train_combined, y_train)

# Evaluation
valid_preds_svm_improved = svm_model_improved.predict(X_valid_combined)
print(f"Improved SVM Accuracy: {accuracy_score(y_valid, valid_preds_svm_improved):.4f}")
print(f"Improved SVM F1 Score: {f1_score(y_valid, valid_preds_svm_improved):.4f}")

Improved SVM Accuracy: 0.8017
Improved SVM F1 Score: 0.7948


Stacked BiDirectional LSTM + Early Stopping

In [12]:
from tensorflow.keras.callbacks import EarlyStopping

# 1. Update Sequences: Use the 'clean_text' from the previous step
# We re-fit the tokenizer on the cleaner text
tokenizer_clean = Tokenizer(num_words=10000)
tokenizer_clean.fit_on_texts(train_df['clean_text'])

X_train_seq_clean = pad_sequences(tokenizer_clean.texts_to_sequences(train_df['clean_text']), maxlen=100)
X_valid_seq_clean = pad_sequences(tokenizer_clean.texts_to_sequences(valid_df['clean_text']), maxlen=100)
X_test_seq_clean = pad_sequences(tokenizer_clean.texts_to_sequences(test_df['clean_text']), maxlen=100)

# 2. Improved Model: Stacked Bi-LSTM
# We stack two Bidirectional LSTM layers to learn more complex patterns
bilstm_improved = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=100),
    Bidirectional(LSTM(64, return_sequences=True)), # Return sequences is required to stack another LSTM
    Dropout(0.3),
    Bidirectional(LSTM(32)),
    Dropout(0.5),
    Dense(16, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

bilstm_improved.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 3. Training with Early Stopping
# Stop training if validation loss doesn't improve for 3 epochs
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

bilstm_improved.fit(
    X_train_seq_clean, y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_valid_seq_clean, y_valid),
    callbacks=[early_stop]
)

# 4. Evaluation
probs_bilstm_improved = bilstm_improved.predict(X_valid_seq_clean)
preds_bilstm_improved = (probs_bilstm_improved > 0.5).astype(int)

print(f"Improved Bi-LSTM Accuracy: {accuracy_score(y_valid, preds_bilstm_improved):.4f}")
print(f"Improved Bi-LSTM F1 Score: {f1_score(y_valid, preds_bilstm_improved):.4f}")

Epoch 1/10




[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 103ms/step - accuracy: 0.7724 - loss: 0.4592 - val_accuracy: 0.8659 - val_loss: 0.3505
Epoch 2/10
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 106ms/step - accuracy: 0.9139 - loss: 0.2324 - val_accuracy: 0.8478 - val_loss: 0.3840
Epoch 3/10
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 102ms/step - accuracy: 0.9548 - loss: 0.1321 - val_accuracy: 0.8296 - val_loss: 0.4995
Epoch 4/10
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 103ms/step - accuracy: 0.9745 - loss: 0.0803 - val_accuracy: 0.8254 - val_loss: 0.6515
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 34ms/step
Improved Bi-LSTM Accuracy: 0.8659
Improved Bi-LSTM F1 Score: 0.8655


CNN for text classification to spot N-gram patterns

Idea: detect phrases for sarcasm

In [13]:
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D

# 1. Model: 1D Convolutional Neural Network
# - Conv1D with kernel_size=5 looks at 5-word windows to find sarcastic phrases
# - GlobalMaxPooling1D keeps only the strongest signal found in the text
cnn_model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=100),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(16, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 2. Training
# We use the same 'clean' sequences and early stopping as before
cnn_model.fit(
    X_train_seq_clean, y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_valid_seq_clean, y_valid),
    callbacks=[early_stop]
)

# 3. Evaluation
probs_cnn = cnn_model.predict(X_valid_seq_clean)
preds_cnn = (probs_cnn > 0.5).astype(int)

print(f"CNN Validation Accuracy: {accuracy_score(y_valid, preds_cnn):.4f}")
print(f"CNN Validation F1 Score: {f1_score(y_valid, preds_cnn):.4f}")

Epoch 1/10




[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 20ms/step - accuracy: 0.7929 - loss: 0.4377 - val_accuracy: 0.8575 - val_loss: 0.3469
Epoch 2/10
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 21ms/step - accuracy: 0.9185 - loss: 0.2194 - val_accuracy: 0.8394 - val_loss: 0.3881
Epoch 3/10
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 20ms/step - accuracy: 0.9699 - loss: 0.0983 - val_accuracy: 0.8254 - val_loss: 0.4905
Epoch 4/10
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 22ms/step - accuracy: 0.9892 - loss: 0.0389 - val_accuracy: 0.8366 - val_loss: 0.6680
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
CNN Validation Accuracy: 0.8575
CNN Validation F1 Score: 0.8534


Improved Model Evaluation on Test Set

In [17]:
from sklearn.metrics import classification_report, confusion_matrix

# --- 1. Validation Set Ensemble ---

# Get predictions from the 3 improved models on Validation Data
# SVM (uses Combined Sparse Features)
val_pred_svm = svm_model_improved.predict(X_valid_combined)

# Bi-LSTM (uses Clean Sequences)
val_probs_bilstm = bilstm_improved.predict(X_valid_seq_clean)
val_pred_bilstm = (val_probs_bilstm > 0.5).astype(int).flatten()

# CNN (uses Clean Sequences)
val_probs_cnn = cnn_model.predict(X_valid_seq_clean)
val_pred_cnn = (val_probs_cnn > 0.5).astype(int).flatten()

# Majority Vote Ensemble
val_votes = val_pred_svm + val_pred_bilstm + val_pred_cnn
val_pred_ensemble = (val_votes >= 2).astype(int)

print(f"Final Ensemble Validation Accuracy: {accuracy_score(y_valid, val_pred_ensemble):.4f}")
print(f"Final Ensemble Validation F1 Score: {f1_score(y_valid, val_pred_ensemble):.4f}")

# --- 2. Test Set Evaluation (The Deliverable) ---

# Get predictions on Test Data
test_pred_svm = svm_model_improved.predict(X_test_combined)

test_probs_bilstm = bilstm_improved.predict(X_test_seq_clean)
test_pred_bilstm = (test_probs_bilstm > 0.5).astype(int).flatten()

test_probs_cnn = cnn_model.predict(X_test_seq_clean)
test_pred_cnn = (test_probs_cnn > 0.5).astype(int).flatten()

# Majority Vote Ensemble
test_votes = test_pred_svm + test_pred_bilstm + test_pred_cnn
test_pred_ensemble = (test_votes >= 2).astype(int)

# Report
print("FINAL TEST SET RESULTS:")
print(classification_report(test_df['label'], test_pred_ensemble, digits=4))

print("\nConfusion Matrix:")
print(confusion_matrix(test_df['label'], test_pred_ensemble))

[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Final Ensemble Validation Accuracy: 0.8617
Final Ensemble Validation F1 Score: 0.8580
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
FINAL TEST SET RESULTS:
              precision    recall  f1-score   support

           0     0.8706    0.8954    0.8828       526
           1     0.8706    0.8409    0.8555       440

    accuracy                         0.8706       966
   macro avg     0.8706    0.8682    0.8692       966
weighted avg     0.8706    0.8706    0.8704       966


Confusion Matrix:
[[471  55]
 [ 70 370]]


In [15]:
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# --- 1. Get Predictions for the Test Set ---

# Model A: ORIGINAL SVM (Using TF-IDF features from snippet 1)
# Assuming 'svm_model' is your original model
test_pred_svm = svm_model.predict(X_test_tfidf)

# Model B: BI-LSTM (Using clean sequences from snippet 2)
test_probs_bilstm = bilstm_improved.predict(X_test_seq_clean)
test_pred_bilstm = (test_probs_bilstm > 0.5).astype(int).flatten()

# Model C: CNN (Using clean sequences from snippet 2)
test_probs_cnn = cnn_model.predict(X_test_seq_clean)
test_pred_cnn = (test_probs_cnn > 0.5).astype(int).flatten()

# --- 2. Voting Logic (Hard Voting) ---

# Sum the binary predictions (0 or 1)
# Possible sums: 0, 1 (Majority 0) | 2, 3 (Majority 1)
test_votes = test_pred_svm + test_pred_bilstm + test_pred_cnn
final_ensemble_preds = (test_votes >= 2).astype(int)

# --- 3. Final Evaluation ---

print("=== ENSEMBLE RESULTS (Original SVM + Bi-LSTM + CNN) ===")
print(f"Accuracy: {accuracy_score(test_df['label'], final_ensemble_preds):.4f}")
print(f"F1 Score: {f1_score(test_df['label'], final_ensemble_preds):.4f}")

print("\nDetailed Classification Report:")
print(classification_report(test_df['label'], final_ensemble_preds, digits=4))

print("\nConfusion Matrix:")
print(confusion_matrix(test_df['label'], final_ensemble_preds))

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
=== ENSEMBLE RESULTS (Original SVM + Bi-LSTM + CNN) ===
Accuracy: 0.8685
F1 Score: 0.8532

Detailed Classification Report:
              precision    recall  f1-score   support

           0     0.8688    0.8935    0.8810       526
           1     0.8682    0.8386    0.8532       440

    accuracy                         0.8685       966
   macro avg     0.8685    0.8661    0.8671       966
weighted avg     0.8685    0.8685    0.8683       966


Confusion Matrix:
[[470  56]
 [ 71 369]]


In [16]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# --- 1. Generate Predictions ---

# Model 1: SVM (Original)
pred_svm = svm_model.predict(X_test_tfidf)

# Model 2: LSTM
# Using the standard sequence features
pred_probs_lstm = model.predict(X_test_seq) 
pred_lstm = (pred_probs_lstm > 0.5).astype(int).flatten()

# Model 3: Bi-LSTM
# Using the clean sequence features from your second snippet
pred_probs_bilstm = bilstm_improved.predict(X_test_seq_clean)
pred_bilstm = (pred_probs_bilstm > 0.5).astype(int).flatten()

# --- 2. Majority Vote Logic ---

# We add the three arrays together. 
# A sum of 2 or 3 means at least two models predicted '1'.
combined_votes = pred_svm + pred_lstm + pred_bilstm
ensemble_final_preds = (combined_votes >= 2).astype(int)

# --- 3. Performance Metrics ---

print("=== ENSEMBLE RESULTS: SVM + LSTM + Bi-LSTM ===")
print(f"Accuracy: {accuracy_score(test_df['label'], ensemble_final_preds):.4f}")
print("\nClassification Report:")
print(classification_report(test_df['label'], ensemble_final_preds, digits=4))

print("\nConfusion Matrix:")
print(confusion_matrix(test_df['label'], ensemble_final_preds))

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step
=== ENSEMBLE RESULTS: SVM + LSTM + Bi-LSTM ===
Accuracy: 0.8602

Classification Report:
              precision    recall  f1-score   support

           0     0.8668    0.8783    0.8725       526
           1     0.8522    0.8386    0.8454       440

    accuracy                         0.8602       966
   macro avg     0.8595    0.8585    0.8589       966
weighted avg     0.8601    0.8602    0.8602       966


Confusion Matrix:
[[462  64]
 [ 71 369]]


In [17]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# --- 1. Get Predictions ---

# Model A: SVM (Original TF-IDF version)
# Using features from snippet 1
test_pred_svm = svm_model.predict(X_test_tfidf)

# Model B: LSTM (Sequence-based)
# Using standard sequences from snippet 1
test_probs_lstm = model.predict(X_test_seq)
test_pred_lstm = (test_probs_lstm > 0.5).astype(int).flatten()

# Model C: CNN (Clean Sequence-based)
# Using clean sequences from snippet 2
test_probs_cnn = cnn_model.predict(X_test_seq_clean)
test_pred_cnn = (test_probs_cnn > 0.5).astype(int).flatten()

# --- 2. Ensemble Voting Logic ---

# We add the individual 0/1 predictions
# If 2 or more models say "1", the result is "1"
total_votes = test_pred_svm + test_pred_lstm + test_pred_cnn
hybrid_ensemble_preds = (total_votes >= 2).astype(int)

# --- 3. Results & Evaluation ---

print("=== HYBRID ENSEMBLE: SVM + LSTM + CNN ===")
print(f"Ensemble Accuracy: {accuracy_score(test_df['label'], hybrid_ensemble_preds):.4f}")

print("\nDetailed Performance:")
print(classification_report(test_df['label'], hybrid_ensemble_preds, digits=4))

print("\nConfusion Matrix (Actual vs Predicted):")
print(confusion_matrix(test_df['label'], hybrid_ensemble_preds))

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
=== HYBRID ENSEMBLE: SVM + LSTM + CNN ===
Ensemble Accuracy: 0.8634

Detailed Performance:
              precision    recall  f1-score   support

           0     0.8648    0.8878    0.8762       526
           1     0.8615    0.8341    0.8476       440

    accuracy                         0.8634       966
   macro avg     0.8632    0.8610    0.8619       966
weighted avg     0.8633    0.8634    0.8631       966


Confusion Matrix (Actual vs Predicted):
[[467  59]
 [ 73 367]]


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# --- 1. Get Predictions ---

# Model A: LSTM
# Using original sequence data
probs_lstm = model.predict(X_test_seq)
pred_lstm = (probs_lstm > 0.5).astype(int).flatten()

# Model B: Bi-LSTM
# Using clean sequence data from your second snippet
probs_bilstm = bilstm_improved.predict(X_test_seq_clean)
pred_bilstm = (probs_bilstm > 0.5).astype(int).flatten()

# Model C: CNN
# Using clean sequence data from your second snippet
probs_cnn = cnn_model.predict(X_test_seq_clean)
pred_cnn = (probs_cnn > 0.5).astype(int).flatten()

# --- 2. Ensemble Voting Logic ---

# Hard Voting: Summing the binary results
# If 2 or 3 models predict 1, the ensemble predicts 1
total_votes = pred_lstm + pred_bilstm + pred_cnn
deep_ensemble_preds = (total_votes >= 2).astype(int)

# --- 3. Evaluation ---

print("=== DEEP LEARNING ENSEMBLE: LSTM + Bi-LSTM + CNN ===")
print(f"Final Accuracy: {accuracy_score(test_df['label'], deep_ensemble_preds):.4f}")

print("\nDetailed Metrics:")
print(classification_report(test_df['label'], deep_ensemble_preds, digits=4))

print("\nConfusion Matrix:")
print(confusion_matrix(test_df['label'], deep_ensemble_preds))

[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
=== DEEP LEARNING ENSEMBLE: LSTM + Bi-LSTM + CNN ===
Final Accuracy: 0.8589

Detailed Metrics:
              precision    recall  f1-score   support

           0     0.8529    0.8694    0.8611       360
           1     0.8653    0.8483    0.8567       356

    accuracy                         0.8589       716
   macro avg     0.8591    0.8589    0.8589       716
weighted avg     0.8591    0.8589    0.8589       716


Confusion Matrix:
[[313  47]
 [ 54 302]]
