# Import all necessary libraries

In [None]:
import pandas as pd
import numpy as np

# baseline model libraries
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# first model
from sklearn.svm import LinearSVC

# second model
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# third model
from tensorflow.keras.layers import Bidirectional, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# fourth model
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D

# ensemble model
from sklearn.metrics import classification_report, confusion_matrix

# Exploring the Datasets

In [3]:
# loading datasets
train_df_main = pd.read_csv('./datasets/train.csv')
valid_df_main = pd.read_csv('./datasets/valid.csv')
test_df_main = pd.read_csv('./datasets/test.csv')

# display shapes
print(f"Train Shape: {train_df_main.shape}")
print(f"Valid Shape: {valid_df_main.shape}")
print(f"Test Shape: {test_df_main.shape}")

print("----------")

# preview training data
print(train_df_main.head())

print("----------")

# check for class balance
print(train_df_main['label'].value_counts())

print("----------")

# check for missing values
print(train_df_main.isnull().sum())

Train Shape: (21464, 2)
Valid Shape: (716, 2)
Test Shape: (966, 2)
----------
                                                text  label
0  states slow to shut down weak teacher educatio...      0
1    drone places fresh kill on steps of white house      1
2  report: majority of instances of people gettin...      1
3  sole remaining lung filled with rich, satisfyi...      1
4                       the gop's stockholm syndrome      0
----------
label
0    11248
1    10216
Name: count, dtype: int64
----------
text     0
label    0
dtype: int64


# Preprocessing all datasets

Preprocessing includes splitting apart the sentences into tokens, lowercasing all words, and making sure there is no whitespace within the sentences themselves

In [4]:
def preprocess(text):
    text = text.lower()
    tokens = text.split()
    tokens = [t for t in tokens if t]
    
    return tokens

# unprocessed text and labels
X_train = train_df_main['text']
X_valid = valid_df_main['text']
X_test = test_df_main['text']

y_train = train_df_main['label']
y_valid = valid_df_main['label']
y_test = test_df_main['label']

# processed text
X_train_processed = []
X_valid_processed = []
X_test_processed = []

for t in X_train:
    X_train_processed.append(preprocess(t))

for t in X_valid:
    X_valid_processed.append(preprocess(t))

for t in X_test:
    X_test_processed.append(preprocess(t))
    
# validating that preprocessing worked
print(X_train_processed[0:5])

[['states', 'slow', 'to', 'shut', 'down', 'weak', 'teacher', 'education', 'programs'], ['drone', 'places', 'fresh', 'kill', 'on', 'steps', 'of', 'white', 'house'], ['report:', 'majority', 'of', 'instances', 'of', 'people', 'getting', 'lives', 'back', 'on', 'track', 'occur', 'immediately', 'after', 'visit', 'to', 'buffalo', 'wild', 'wings'], ['sole', 'remaining', 'lung', 'filled', 'with', 'rich,', 'satisfying', 'flavor'], ['the', "gop's", 'stockholm', 'syndrome']]


# Baseline Model

logistic regression + tf-idf

In [5]:
# baseline model
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1, 2))

# the vectorizer uses the main dataframe
X_train_baseline = vectorizer.fit_transform(X_train)
X_valid_baseline = vectorizer.transform(X_valid)
X_test_baseline = vectorizer.transform(X_test)

baseline_model = LogisticRegression(max_iter=10000, random_state=42)
baseline_model.fit(X_train_baseline, y_train)

valid_preds = baseline_model.predict(X_valid_baseline)

print(f"Baseline Validation Accuracy: {accuracy_score(y_valid, valid_preds):.4f}")
print(f"Baseline Validation F1 Score: {f1_score(y_valid, valid_preds):.4f}")

Baseline Validation Accuracy: 0.7751
Baseline Validation F1 Score: 0.7663


In [None]:
# pinpointing best feature value for vectorizer
best_acc = 0
best_max_features = 1000
for i in range(1000, 10000, 100):
    temp_vectorizer = TfidfVectorizer(stop_words='english', max_features=i, ngram_range=(1, 2))
    
    X_train_baseline = temp_vectorizer.fit_transform(X_train)
    X_valid_baseline = temp_vectorizer.transform(X_valid)
    X_test_baseline = temp_vectorizer.transform(X_test)

    baseline_model = LogisticRegression(max_iter=10000, random_state=42)
    baseline_model.fit(X_train_baseline, y_train)

    valid_preds = baseline_model.predict(X_valid_baseline)

    print(f"----- TESTING i = {i} -----")
    print(f"Baseline Validation Accuracy: {accuracy_score(y_valid, valid_preds):.4f}")
    print(f"Baseline Validation F1 Score: {f1_score(y_valid, valid_preds):.4f}")
    
    if accuracy_score(y_valid, valid_preds) > best_acc:
        best_acc = accuracy_score(y_valid, valid_preds)
        best_max_features = i

----- TESTING i = 1000 -----
Baseline Validation Accuracy: 0.7332
Baseline Validation F1 Score: 0.7170
----- TESTING i = 1100 -----
Baseline Validation Accuracy: 0.7249
Baseline Validation F1 Score: 0.7099
----- TESTING i = 1200 -----
Baseline Validation Accuracy: 0.7207
Baseline Validation F1 Score: 0.7041
----- TESTING i = 1300 -----
Baseline Validation Accuracy: 0.7193
Baseline Validation F1 Score: 0.7022
----- TESTING i = 1400 -----
Baseline Validation Accuracy: 0.7263
Baseline Validation F1 Score: 0.7126
----- TESTING i = 1500 -----
Baseline Validation Accuracy: 0.7277
Baseline Validation F1 Score: 0.7128
----- TESTING i = 1600 -----
Baseline Validation Accuracy: 0.7277
Baseline Validation F1 Score: 0.7178
----- TESTING i = 1700 -----
Baseline Validation Accuracy: 0.7332
Baseline Validation F1 Score: 0.7236
----- TESTING i = 1800 -----
Baseline Validation Accuracy: 0.7332
Baseline Validation F1 Score: 0.7236
----- TESTING i = 1900 -----
Baseline Validation Accuracy: 0.7388
Baselin

In [5]:
print(best_acc)
print(best_max_features)

0.7793296089385475
6200


# First Model

SVMs

In [None]:
from sklearn.svm import LinearSVC

best_vectorizer = TfidfVectorizer(stop_words='english', max_features=5200, ngram_range=(1, 2))

X_train_svm = best_vectorizer.fit_transform(X_train)
X_valid_svm = best_vectorizer.transform(X_valid)

# use linearsvc since it is better than the normal svc
# we also use td-idf with svms
svm_model = LinearSVC(random_state=42, max_iter=10000)
svm_model.fit(X_train_svm, y_train)

valid_preds_svm = svm_model.predict(X_valid_svm)

print(f"SVM Validation Accuracy: {accuracy_score(y_valid, valid_preds_svm):.4f}")
print(f"SVM Validation F1 Score: {f1_score(y_valid, valid_preds_svm):.4f}")

SVM Validation Accuracy: 0.7682
SVM Validation F1 Score: 0.7580


In [None]:
best_acc_svm = 0
best_max_features_svm = 1000
for i in range(1000, 10000, 100):
    temp_vectorizer = TfidfVectorizer(stop_words='english', max_features=i, ngram_range=(1, 2))
    
    X_train_svm = temp_vectorizer.fit_transform(X_train)
    X_valid_svm = temp_vectorizer.transform(X_valid)

    # use linearsvc since it is better than the normal svc
    # we also use td-idf with svms
    svm_model = LinearSVC(random_state=42, max_iter=10000)
    svm_model.fit(X_train_svm, y_train)

    valid_preds_svm = svm_model.predict(X_valid_svm)

    print(f"----- TESTING i = {i} -----")
    print(f"SVM Validation Accuracy: {accuracy_score(y_valid, valid_preds_svm):.4f}")
    print(f"SVM Validation F1 Score: {f1_score(y_valid, valid_preds_svm):.4f}")
    
    if accuracy_score(y_valid, valid_preds_svm) > best_acc_svm:
        best_acc_svm = accuracy_score(y_valid, valid_preds_svm)
        best_max_features_svm = i

In [11]:
print(best_acc_svm)
print(best_max_features_svm)

0.7821229050279329
3500


# Second Model

LSTM

In [14]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_df_main['text'])

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(train_df_main['text']), maxlen=100)
X_valid_seq = pad_sequences(tokenizer.texts_to_sequences(valid_df_main['text']), maxlen=100)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(test_df_main['text']), maxlen=100)

# 2. Model: Define a simple LSTM network 
model = Sequential([
    Embedding(input_dim=10000, output_dim=32, input_length=100),
    LSTM(32),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 3. Training
model.fit(X_train_seq, y_train, epochs=5, batch_size=64, validation_data=(X_valid_seq, y_valid))

# 4. Evaluation
lstm_probs = model.predict(X_valid_seq)
lstm_preds = (lstm_probs > 0.5).astype(int)

print(f"LSTM Validation Accuracy: {accuracy_score(y_valid, lstm_preds):.4f}")
print(f"LSTM Validation F1 Score: {f1_score(y_valid, lstm_preds):.4f}")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
LSTM Validation Accuracy: 0.8170
LSTM Validation F1 Score: 0.8137


# Third Model

BiLSTM

In [None]:
# 1. Update Sequences: Use the 'clean_text' from the previous step
# We re-fit the tokenizer on the cleaner text
tokenizer_clean = Tokenizer(num_words=10000)
tokenizer_clean.fit_on_texts(train_df_main['clean_text'])

X_train_seq_clean = pad_sequences(tokenizer_clean.texts_to_sequences(train_df_main['clean_text']), maxlen=100)
X_valid_seq_clean = pad_sequences(tokenizer_clean.texts_to_sequences(valid_df_main['clean_text']), maxlen=100)
X_test_seq_clean = pad_sequences(tokenizer_clean.texts_to_sequences(test_df_main['clean_text']), maxlen=100)

# 2. Improved Model: Stacked Bi-LSTM
# We stack two Bidirectional LSTM layers to learn more complex patterns
bilstm_improved = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=100),
    Bidirectional(LSTM(64, return_sequences=True)), # Return sequences is required to stack another LSTM
    Dropout(0.3),
    Bidirectional(LSTM(32)),
    Dropout(0.5),
    Dense(16, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

bilstm_improved.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 3. Training with Early Stopping
# Stop training if validation loss doesn't improve for 3 epochs
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

bilstm_improved.fit(
    X_train_seq_clean, y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_valid_seq_clean, y_valid),
    callbacks=[early_stop]
)

# 4. Evaluation
probs_bilstm_improved = bilstm_improved.predict(X_valid_seq_clean)
preds_bilstm_improved = (probs_bilstm_improved > 0.5).astype(int)

print(f"Improved Bi-LSTM Accuracy: {accuracy_score(y_valid, preds_bilstm_improved):.4f}")
print(f"Improved Bi-LSTM F1 Score: {f1_score(y_valid, preds_bilstm_improved):.4f}")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Bi-LSTM Validation Accuracy: 0.8282
Bi-LSTM Validation F1 Score: 0.8172


# Fourth Model

CNN

In [None]:
# 1. Model: 1D Convolutional Neural Network
# - Conv1D with kernel_size=5 looks at 5-word windows to find sarcastic phrases
# - GlobalMaxPooling1D keeps only the strongest signal found in the text
cnn_model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=100),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(16, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 2. Training
# We use the same 'clean' sequences and early stopping as before
cnn_model.fit(
    X_train_seq_clean, y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_valid_seq_clean, y_valid),
    callbacks=[early_stop]
)

# 3. Evaluation
probs_cnn = cnn_model.predict(X_valid_seq_clean)
preds_cnn = (probs_cnn > 0.5).astype(int)

print(f"CNN Validation Accuracy: {accuracy_score(y_valid, preds_cnn):.4f}")
print(f"CNN Validation F1 Score: {f1_score(y_valid, preds_cnn):.4f}")

# Ensemble Model

All of the models join together and create a beautiful new model to average out

In [18]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1, 2))

# the vectorizer uses the main dataframe
X_train_ensemble = vectorizer.fit_transform(X_train)
X_valid_ensemble = vectorizer.transform(X_valid)
X_test_ensemble = vectorizer.transform(X_test)

# 1. Get predictions for the Test Set from all 3 models
# SVM (uses TF-IDF features)
pred_test_svm = svm_model.predict(X_test_ensemble)

# LSTM (uses Sequence features)
pred_test_lstm = (model.predict(X_test_seq) > 0.5).astype(int).flatten()

# Bi-LSTM (uses Sequence features)
pred_test_bilstm = (bilstm_improved.predict(X_test_seq) > 0.5).astype(int).flatten()

# CNN
pred_test_cnn = (cnn_model.predict(X_test_seq) > 0.5).astype(int).flatten()

# 2. Ensemble Voting (Majority Vote)
# Sum the predictions (0 or 1). If sum is 2 or 3, majority is 1.
test_votes = pred_test_svm + pred_test_lstm + pred_test_bilstm + pred_test_cnn
pred_test_ensemble = (test_votes >= 2).astype(int)

# 3. Report detailed metrics
print("Final Evaluation on Test Set:")
print(classification_report(test_df_main['label'], pred_test_ensemble, digits=4))

# 4. Confusion Matrix (Row: True, Col: Predicted)
print("\nConfusion Matrix:")
print(confusion_matrix(test_df_main['label'], pred_test_ensemble))

ValueError: X has 5000 features, but LinearSVC is expecting 9900 features as input.