Exploration of Dataset

In [3]:
import pandas as pd
import numpy as np

# Load the datasets
train_df = pd.read_csv('./datasets/train.csv')
valid_df = pd.read_csv('./datasets/valid.csv')
test_df = pd.read_csv('./datasets/test.csv')

# Display dataset shapes
print(f"Train Shape: {train_df.shape}")
print(f"Valid Shape: {valid_df.shape}")
print(f"Test Shape: {test_df.shape}")
print()

# Preview the training data
print(train_df.head())
print()

# Check for class balance in the training set
print(train_df['label'].value_counts())
print()

# Check for any missing values
print(train_df.isnull().sum())

Train Shape: (21464, 2)
Valid Shape: (716, 2)
Test Shape: (966, 2)

                                                text  label
0  states slow to shut down weak teacher educatio...      0
1    drone places fresh kill on steps of white house      1
2  report: majority of instances of people gettin...      1
3  sole remaining lung filled with rich, satisfyi...      1
4                       the gop's stockholm syndrome      0

label
0    11248
1    10216
Name: count, dtype: int64

text     0
label    0
dtype: int64


Baseline Model with Bag of Words and Logistic Regression

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# 1. Preprocessing: Convert text to numerical vectors (Bag of Words)
# We limit to the top 5000 most frequent words to keep it simple
vectorizer = CountVectorizer(stop_words='english', max_features=5000)

# Fit on training data, then transform valid and test data
X_train = vectorizer.fit_transform(train_df['text'])
X_valid = vectorizer.transform(valid_df['text'])
X_test = vectorizer.transform(test_df['text'])

y_train = train_df['label']
y_valid = valid_df['label']
y_test = test_df['label']

# 2. Model: Train a simple Logistic Regression model
baseline_model = LogisticRegression(max_iter=1000, random_state=42)
baseline_model.fit(X_train, y_train)

# 3. Evaluation: specific metrics on validation set
valid_preds = baseline_model.predict(X_valid)

print(f"Baseline Validation Accuracy: {accuracy_score(y_valid, valid_preds):.4f}")
print(f"Baseline Validation F1 Score: {f1_score(y_valid, valid_preds):.4f}")

Baseline Validation Accuracy: 0.7570
Baseline Validation F1 Score: 0.7464


Feature Engineering with TF-IDF and N-grams

Idea: weight words to give less importance to common words and more importance to unique words

-> This might signal sarcasm?

N-gram Idea: model seeing pairs of words together might be important for sarcasm b/c gives more context

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. Preprocessing: Use TF-IDF and include Bigrams (1-word and 2-word combinations)
# We increase max_features slightly to accommodate new bigrams
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, ngram_range=(1, 2))

X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['text'])
X_valid_tfidf = tfidf_vectorizer.transform(valid_df['text'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['text'])

# 2. Model: Retrain Logistic Regression on these new features
tfidf_model = LogisticRegression(max_iter=1000, random_state=42)
tfidf_model.fit(X_train_tfidf, y_train)

# 3. Evaluation
valid_preds_tfidf = tfidf_model.predict(X_valid_tfidf)

print(f"TF-IDF + Bigram Accuracy: {accuracy_score(y_valid, valid_preds_tfidf):.4f}")
print(f"TF-IDF + Bigram F1 Score: {f1_score(y_valid, valid_preds_tfidf):.4f}")

TF-IDF + Bigram Accuracy: 0.7737
TF-IDF + Bigram F1 Score: 0.7632


Support Vector Machine
Idea: Good at classification tasks

In [6]:
from sklearn.svm import LinearSVC

# 1. Model: Support Vector Machine (Linear Kernel)
# LinearSVC is faster and often better for text than standard SVC
svm_model = LinearSVC(random_state=42, max_iter=10000)
svm_model.fit(X_train_tfidf, y_train)

# 2. Evaluation
valid_preds_svm = svm_model.predict(X_valid_tfidf)

print(f"SVM Validation Accuracy: {accuracy_score(y_valid, valid_preds_svm):.4f}")
print(f"SVM Validation F1 Score: {f1_score(y_valid, valid_preds_svm):.4f}")

SVM Validation Accuracy: 0.7835
SVM Validation F1 Score: 0.7750




LSTM: process text as a sequence rather than Bag of Words

Idea: capture more structure in the sarcasm string

In [8]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# 1. Preprocessing: Convert text to sequences of integers
# We limit the vocab to 10,000 words and sequence length to 100
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_df['text'])

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(train_df['text']), maxlen=100)
X_valid_seq = pad_sequences(tokenizer.texts_to_sequences(valid_df['text']), maxlen=100)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(test_df['text']), maxlen=100)

# 2. Model: Define a simple LSTM network 
model = Sequential([
    Embedding(input_dim=10000, output_dim=32, input_length=100),
    LSTM(32),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 3. Training
model.fit(X_train_seq, y_train, epochs=5, batch_size=64, validation_data=(X_valid_seq, y_valid))

# 4. Evaluation
lstm_probs = model.predict(X_valid_seq)
lstm_preds = (lstm_probs > 0.5).astype(int)

print(f"LSTM Validation Accuracy: {accuracy_score(y_valid, lstm_preds):.4f}")
print(f"LSTM Validation F1 Score: {f1_score(y_valid, lstm_preds):.4f}")

Epoch 1/5




[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 18ms/step - accuracy: 0.7632 - loss: 0.4733 - val_accuracy: 0.8743 - val_loss: 0.3368
Epoch 2/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 17ms/step - accuracy: 0.9023 - loss: 0.2423 - val_accuracy: 0.8478 - val_loss: 0.3528
Epoch 3/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.9385 - loss: 0.1676 - val_accuracy: 0.8534 - val_loss: 0.4057
Epoch 4/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.9565 - loss: 0.1206 - val_accuracy: 0.8380 - val_loss: 0.4715
Epoch 5/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.9726 - loss: 0.0808 - val_accuracy: 0.8450 - val_loss: 0.6123
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
LSTM Validation Accuracy: 0.8450
LSTM Validation F1 Score: 0.8384


BiLSTM to read strings bidirectionally

Idea: typically better to gain more information.

Optimization: Dropout so that the network is not overly reliant on specific features (prevents overfitting)

In [9]:
from tensorflow.keras.layers import Bidirectional, Dropout

# 1. Model: Define a Bidirectional LSTM with Dropout
# We wrap the LSTM layer in 'Bidirectional'
bilstm_model = Sequential([
    Embedding(input_dim=10000, output_dim=32, input_length=100),
    Bidirectional(LSTM(32)),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

bilstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 2. Training
# We use the same sequence data (X_train_seq) prepared in the previous step
bilstm_model.fit(X_train_seq, y_train, epochs=5, batch_size=64, validation_data=(X_valid_seq, y_valid))

# 3. Evaluation
bilstm_probs = bilstm_model.predict(X_valid_seq)
bilstm_preds = (bilstm_probs > 0.5).astype(int)

print(f"Bi-LSTM Validation Accuracy: {accuracy_score(y_valid, bilstm_preds):.4f}")
print(f"Bi-LSTM Validation F1 Score: {f1_score(y_valid, bilstm_preds):.4f}")



Epoch 1/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 31ms/step - accuracy: 0.7233 - loss: 0.5235 - val_accuracy: 0.8408 - val_loss: 0.3720
Epoch 2/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 33ms/step - accuracy: 0.8880 - loss: 0.2764 - val_accuracy: 0.8492 - val_loss: 0.3457
Epoch 3/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 34ms/step - accuracy: 0.9311 - loss: 0.1879 - val_accuracy: 0.8450 - val_loss: 0.3854
Epoch 4/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 35ms/step - accuracy: 0.9520 - loss: 0.1365 - val_accuracy: 0.8436 - val_loss: 0.4668
Epoch 5/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 36ms/step - accuracy: 0.9664 - loss: 0.0971 - val_accuracy: 0.8380 - val_loss: 0.5728
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
Bi-LSTM Validation Accuracy: 0.8380
Bi-LSTM Validation F1 Score: 0.8371


Why did the accuracy go down bro :(

Build Ensemble Model

In [10]:
# 1. Gather predictions from our top 3 models
# Note: We flatten the neural network arrays to make them match the SVM's shape
pred_1 = valid_preds_svm
pred_2 = lstm_preds.flatten()
pred_3 = bilstm_preds.flatten()

# 2. Voting Logic: Sum the predictions
# If sum is 2 or 3, it means the majority voted 1 (Sarcasm)
total_votes = pred_1 + pred_2 + pred_3
ensemble_preds = (total_votes >= 2).astype(int)

# 3. Evaluation
print(f"Ensemble Validation Accuracy: {accuracy_score(y_valid, ensemble_preds):.4f}")
print(f"Ensemble Validation F1 Score: {f1_score(y_valid, ensemble_preds):.4f}")

Ensemble Validation Accuracy: 0.8408
Ensemble Validation F1 Score: 0.8376


Evaluation on Test Set

In [11]:
from sklearn.metrics import classification_report, confusion_matrix

# 1. Get predictions for the Test Set from all 3 models
# SVM (uses TF-IDF features)
pred_test_svm = svm_model.predict(X_test_tfidf)

# LSTM (uses Sequence features)
pred_test_lstm = (model.predict(X_test_seq) > 0.5).astype(int).flatten()

# Bi-LSTM (uses Sequence features)
pred_test_bilstm = (bilstm_model.predict(X_test_seq) > 0.5).astype(int).flatten()

# 2. Ensemble Voting (Majority Vote)
# Sum the predictions (0 or 1). If sum is 2 or 3, majority is 1.
test_votes = pred_test_svm + pred_test_lstm + pred_test_bilstm
pred_test_ensemble = (test_votes >= 2).astype(int)

# 3. Report detailed metrics
print("Final Evaluation on Test Set:")
print(classification_report(test_df['label'], pred_test_ensemble, digits=4))

# 4. Confusion Matrix (Row: True, Col: Predicted)
print("\nConfusion Matrix:")
print(confusion_matrix(test_df['label'], pred_test_ensemble))

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Final Evaluation on Test Set:
              precision    recall  f1-score   support

           0     0.8707    0.8707    0.8707       526
           1     0.8455    0.8455    0.8455       440

    accuracy                         0.8592       966
   macro avg     0.8581    0.8581    0.8581       966
weighted avg     0.8592    0.8592    0.8592       966


Confusion Matrix:
[[458  68]
 [ 68 372]]
