# Import all necessary libraries

In [11]:
import pandas as pd
import numpy as np
import time

# baseline model libraries
from sklearn.model_selection import KFold, cross_val_score, cross_validate, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# first model
from sklearn.svm import LinearSVC

# second model
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# third model
from tensorflow.keras.layers import Bidirectional, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# fourth model
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D

# ensemble model
from sklearn.metrics import classification_report, confusion_matrix

# Exploring the Datasets

In [4]:
# loading datasets
train_df_main = pd.read_csv('./datasets/train.csv')
valid_df_main = pd.read_csv('./datasets/valid.csv')
test_df_main = pd.read_csv('./datasets/test.csv')

# display shapes
print(f"Train Shape: {train_df_main.shape}")
print(f"Valid Shape: {valid_df_main.shape}")
print(f"Test Shape: {test_df_main.shape}")

print("----------")

# preview training data
print(train_df_main.head())

print("----------")

# check for class balance
print(train_df_main['label'].value_counts())

print("----------")

# check for missing values
print(train_df_main.isnull().sum())

Train Shape: (21464, 2)
Valid Shape: (716, 2)
Test Shape: (966, 2)
----------
                                                text  label
0  states slow to shut down weak teacher educatio...      0
1    drone places fresh kill on steps of white house      1
2  report: majority of instances of people gettin...      1
3  sole remaining lung filled with rich, satisfyi...      1
4                       the gop's stockholm syndrome      0
----------
label
0    11248
1    10216
Name: count, dtype: int64
----------
text     0
label    0
dtype: int64


# Preprocessing all datasets

Preprocessing includes splitting apart the sentences into tokens, lowercasing all words, and making sure there is no whitespace within the sentences themselves

In [5]:
def preprocess(text):
    text = text.lower()
    tokens = text.split()
    tokens = [t for t in tokens if t]
    
    return tokens

# unprocessed text and labels
X_train = train_df_main['text']
X_valid = valid_df_main['text']
X_test = test_df_main['text']

y_train = train_df_main['label']
y_valid = valid_df_main['label']
y_test = test_df_main['label']

# processed text
X_train_processed = []
X_valid_processed = []
X_test_processed = []

for t in X_train:
    X_train_processed.append(preprocess(t))

for t in X_valid:
    X_valid_processed.append(preprocess(t))

for t in X_test:
    X_test_processed.append(preprocess(t))
    
# validating that preprocessing worked
print(X_train_processed[0:5])

[['states', 'slow', 'to', 'shut', 'down', 'weak', 'teacher', 'education', 'programs'], ['drone', 'places', 'fresh', 'kill', 'on', 'steps', 'of', 'white', 'house'], ['report:', 'majority', 'of', 'instances', 'of', 'people', 'getting', 'lives', 'back', 'on', 'track', 'occur', 'immediately', 'after', 'visit', 'to', 'buffalo', 'wild', 'wings'], ['sole', 'remaining', 'lung', 'filled', 'with', 'rich,', 'satisfying', 'flavor'], ['the', "gop's", 'stockholm', 'syndrome']]


K-Fold Splitting

# Baseline Model

logistic regression + tf-idf

In [5]:
# baseline model
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1, 2))

# the vectorizer uses the main dataframe
X_train_baseline = vectorizer.fit_transform(X_train)
X_valid_baseline = vectorizer.transform(X_valid)
X_test_baseline = vectorizer.transform(X_test)

baseline_model = LogisticRegression(max_iter=10000, random_state=42)
baseline_model.fit(X_train_baseline, y_train)

valid_preds = baseline_model.predict(X_valid_baseline)

print(f"Baseline Validation Accuracy: {accuracy_score(y_valid, valid_preds):.4f}")
print(f"Baseline Validation F1 Score: {f1_score(y_valid, valid_preds):.4f}")

Baseline Validation Accuracy: 0.7751
Baseline Validation F1 Score: 0.7663


In [None]:
# pinpointing best feature value for vectorizer
best_acc = 0
best_max_features = 1000
for i in range(1000, 10000, 100):
    temp_vectorizer = TfidfVectorizer(stop_words='english', max_features=i, ngram_range=(1, 2))
    
    X_train_baseline = temp_vectorizer.fit_transform(X_train)
    X_valid_baseline = temp_vectorizer.transform(X_valid)
    X_test_baseline = temp_vectorizer.transform(X_test)

    baseline_model = LogisticRegression(max_iter=10000, random_state=42)
    baseline_model.fit(X_train_baseline, y_train)

    valid_preds = baseline_model.predict(X_valid_baseline)

    print(f"----- TESTING i = {i} -----")
    print(f"Baseline Validation Accuracy: {accuracy_score(y_valid, valid_preds):.4f}")
    print(f"Baseline Validation F1 Score: {f1_score(y_valid, valid_preds):.4f}")
    
    if accuracy_score(y_valid, valid_preds) > best_acc:
        best_acc = accuracy_score(y_valid, valid_preds)
        best_max_features = i

----- TESTING i = 1000 -----
Baseline Validation Accuracy: 0.7332
Baseline Validation F1 Score: 0.7170
----- TESTING i = 1100 -----
Baseline Validation Accuracy: 0.7249
Baseline Validation F1 Score: 0.7099
----- TESTING i = 1200 -----
Baseline Validation Accuracy: 0.7207
Baseline Validation F1 Score: 0.7041
----- TESTING i = 1300 -----
Baseline Validation Accuracy: 0.7193
Baseline Validation F1 Score: 0.7022
----- TESTING i = 1400 -----
Baseline Validation Accuracy: 0.7263
Baseline Validation F1 Score: 0.7126
----- TESTING i = 1500 -----
Baseline Validation Accuracy: 0.7277
Baseline Validation F1 Score: 0.7128
----- TESTING i = 1600 -----
Baseline Validation Accuracy: 0.7277
Baseline Validation F1 Score: 0.7178
----- TESTING i = 1700 -----
Baseline Validation Accuracy: 0.7332
Baseline Validation F1 Score: 0.7236
----- TESTING i = 1800 -----
Baseline Validation Accuracy: 0.7332
Baseline Validation F1 Score: 0.7236
----- TESTING i = 1900 -----
Baseline Validation Accuracy: 0.7388
Baselin

In [5]:
print(best_acc)
print(best_max_features)

0.7793296089385475
6200


# First Model Exploration

SVMs

In [None]:
from sklearn.svm import LinearSVC

best_vectorizer = TfidfVectorizer(stop_words='english', max_features=5200, ngram_range=(1, 2))

X_train_svm = best_vectorizer.fit_transform(X_train)
X_valid_svm = best_vectorizer.transform(X_valid)

# use linearsvc since it is better than the normal svc
# we also use td-idf with svms
svm_model = LinearSVC(random_state=42, max_iter=10000)
svm_model.fit(X_train_svm, y_train)

valid_preds_svm = svm_model.predict(X_valid_svm)

print(f"SVM Validation Accuracy: {accuracy_score(y_valid, valid_preds_svm):.4f}")
print(f"SVM Validation F1 Score: {f1_score(y_valid, valid_preds_svm):.4f}")

SVM Validation Accuracy: 0.7682
SVM Validation F1 Score: 0.7580


In [None]:
best_acc_svm = 0
best_max_features_svm = 1000
for i in range(1000, 10000, 100):
    temp_vectorizer = TfidfVectorizer(stop_words='english', max_features=i, ngram_range=(1, 2))
    
    X_train_svm = temp_vectorizer.fit_transform(X_train)
    X_valid_svm = temp_vectorizer.transform(X_valid)

    # use linearsvc since it is better than the normal svc
    # we also use td-idf with svms
    svm_model = LinearSVC(random_state=42, max_iter=10000)
    svm_model.fit(X_train_svm, y_train)

    valid_preds_svm = svm_model.predict(X_valid_svm)

    print(f"----- TESTING i = {i} -----")
    print(f"SVM Validation Accuracy: {accuracy_score(y_valid, valid_preds_svm):.4f}")
    print(f"SVM Validation F1 Score: {f1_score(y_valid, valid_preds_svm):.4f}")
    
    if accuracy_score(y_valid, valid_preds_svm) > best_acc_svm:
        best_acc_svm = accuracy_score(y_valid, valid_preds_svm)
        best_max_features_svm = i

In [11]:
print(best_acc_svm)
print(best_max_features_svm)

0.7821229050279329
3500


# Second Model Exploration

LSTM

In [14]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_df_main['text'])

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(train_df_main['text']), maxlen=100)
X_valid_seq = pad_sequences(tokenizer.texts_to_sequences(valid_df_main['text']), maxlen=100)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(test_df_main['text']), maxlen=100)

# 2. Model: Define a simple LSTM network 
model = Sequential([
    Embedding(input_dim=10000, output_dim=32, input_length=100),
    LSTM(32),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 3. Training
model.fit(X_train_seq, y_train, epochs=5, batch_size=64, validation_data=(X_valid_seq, y_valid))

# 4. Evaluation
lstm_probs = model.predict(X_valid_seq)
lstm_preds = (lstm_probs > 0.5).astype(int)

print(f"LSTM Validation Accuracy: {accuracy_score(y_valid, lstm_preds):.4f}")
print(f"LSTM Validation F1 Score: {f1_score(y_valid, lstm_preds):.4f}")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
LSTM Validation Accuracy: 0.8170
LSTM Validation F1 Score: 0.8137


# Third Model Exploration

BiLSTM

In [None]:
# 1. Update Sequences: Use the 'clean_text' from the previous step
# We re-fit the tokenizer on the cleaner text
tokenizer_clean = Tokenizer(num_words=10000)
tokenizer_clean.fit_on_texts(train_df_main['clean_text'])

X_train_seq_clean = pad_sequences(tokenizer_clean.texts_to_sequences(train_df_main['clean_text']), maxlen=100)
X_valid_seq_clean = pad_sequences(tokenizer_clean.texts_to_sequences(valid_df_main['clean_text']), maxlen=100)
X_test_seq_clean = pad_sequences(tokenizer_clean.texts_to_sequences(test_df_main['clean_text']), maxlen=100)

# 2. Improved Model: Stacked Bi-LSTM
# We stack two Bidirectional LSTM layers to learn more complex patterns
bilstm_improved = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=100),
    Bidirectional(LSTM(64, return_sequences=True)), # Return sequences is required to stack another LSTM
    Dropout(0.3),
    Bidirectional(LSTM(32)),
    Dropout(0.5),
    Dense(16, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

bilstm_improved.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 3. Training with Early Stopping
# Stop training if validation loss doesn't improve for 3 epochs
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

bilstm_improved.fit(
    X_train_seq_clean, y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_valid_seq_clean, y_valid),
    callbacks=[early_stop]
)

# 4. Evaluation
probs_bilstm_improved = bilstm_improved.predict(X_valid_seq_clean)
preds_bilstm_improved = (probs_bilstm_improved > 0.5).astype(int)

print(f"Improved Bi-LSTM Accuracy: {accuracy_score(y_valid, preds_bilstm_improved):.4f}")
print(f"Improved Bi-LSTM F1 Score: {f1_score(y_valid, preds_bilstm_improved):.4f}")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Bi-LSTM Validation Accuracy: 0.8282
Bi-LSTM Validation F1 Score: 0.8172


# Fourth Model

CNN

In [None]:
# 1. Model: 1D Convolutional Neural Network
# - Conv1D with kernel_size=5 looks at 5-word windows to find sarcastic phrases
# - GlobalMaxPooling1D keeps only the strongest signal found in the text
cnn_model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=100),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(16, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 2. Training
# We use the same 'clean' sequences and early stopping as before
cnn_model.fit(
    X_train_seq_clean, y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_valid_seq_clean, y_valid),
    callbacks=[early_stop]
)

# 3. Evaluation
probs_cnn = cnn_model.predict(X_valid_seq_clean)
preds_cnn = (probs_cnn > 0.5).astype(int)

print(f"CNN Validation Accuracy: {accuracy_score(y_valid, preds_cnn):.4f}")
print(f"CNN Validation F1 Score: {f1_score(y_valid, preds_cnn):.4f}")

# Ensemble Model

All of the models join together and create a beautiful new model to average out

In [17]:
X_combined = pd.concat([train_df_main['text'], valid_df_main['text']], ignore_index=True)
y_combined = pd.concat([train_df_main['label'], valid_df_main['label']], ignore_index=True)

# Change 1: Reduce folds from 7 to 5 (saves ~40% time)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

all_predictions = {
    'svm': [], 'bilstm': [], 'cnn': []
}
all_labels = []

model_f1_scores = {
    'svm': [], 'bilstm': [], 'cnn': []
}

# Track timing
fold_times = []
# TF-IDF
vectorizer_svm = TfidfVectorizer(stop_words='english', max_features=3500, ngram_range=(1,2))
X_tfidf_all = vectorizer_svm.fit_transform(X_combined)

# Tokenizer for Deep Learning
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_combined)
X_seq_all = pad_sequences(tokenizer.texts_to_sequences(X_combined), maxlen=100)

for fold_num, (train_idx, val_idx) in enumerate(kfold.split(X_combined, y_combined), 1):
    fold_start = time.time()
    print(f"\n{'='*70}")
    print(f"FOLD {fold_num}/5")
    print(f"{'='*70}")
    
    X_train_svm, X_val_svm = X_tfidf_all[train_idx], X_tfidf_all[val_idx]
    X_train_seq, X_val_seq = X_seq_all[train_idx], X_seq_all[val_idx]
    y_train_fold, y_val_fold = y_combined.iloc[train_idx], y_combined.iloc[val_idx]
    
    X_train_fold = X_combined.iloc[train_idx]
    y_train_fold = y_combined.iloc[train_idx]
    X_val_fold = X_combined.iloc[val_idx]
    y_val_fold = y_combined.iloc[val_idx]
    
    all_labels.extend(y_val_fold.values)
    
    # ========================================
    # SVM (Fast - no changes needed)
    # ========================================
    print("\n[1/3] Training SVM...", end=" ")
    svm_start = time.time()
    
    svm = LinearSVC(random_state=42, max_iter=10000, class_weight='balanced')
    svm.fit(X_train_svm, y_train_fold)
    svm_preds = svm.predict(X_val_svm)
    
    all_predictions['svm'].extend(svm_preds)
    svm_f1 = f1_score(y_val_fold, svm_preds)
    model_f1_scores['svm'].append(svm_f1)
    
    print(f"Done! F1: {svm_f1:.4f} ({time.time() - svm_start:.1f}s)")
    
    # ========================================
    # BiLSTM (Optimized)
    # ========================================
    print("[2/3] Training BiLSTM...", end=" ")
    bilstm_start = time.time()
    
    tokenizer = Tokenizer(num_words=10000)
    tokenizer.fit_on_texts(X_train_fold)
    X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train_fold), maxlen=100)
    X_val_seq = pad_sequences(tokenizer.texts_to_sequences(X_val_fold), maxlen=100)
    
    # Change 2: Smaller model = faster training
    bilstm = Sequential([
        Embedding(10000, 64),  # Reduced from 128 to 64
        Bidirectional(LSTM(32, return_sequences=True, dropout=0.3)),  # Reduced from 64 to 32
        Dropout(0.3),
        Bidirectional(LSTM(16, dropout=0.3)),  # Reduced from 32 to 16
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    
    bilstm.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    # Change 3: Early stopping with patience=1 (stops at first no-improvement)
    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=1,  # Stop after 1 epoch with no improvement
        restore_best_weights=True,
        verbose=0
    )
    
    # Change 4: Fewer max epochs (5 → 10, but early stopping will likely stop at 2-3)
    bilstm.fit(
        X_train_seq, y_train_fold,
        validation_data=(X_val_seq, y_val_fold),
        epochs=10,
        batch_size=128,
        callbacks=[early_stop],
        verbose=0
    )
    
    bilstm_preds = (bilstm.predict(X_val_seq, verbose=0) > 0.5).astype(int).flatten()
    
    all_predictions['bilstm'].extend(bilstm_preds)
    bilstm_f1 = f1_score(y_val_fold, bilstm_preds)
    model_f1_scores['bilstm'].append(bilstm_f1)
    
    print(f"Done! F1: {bilstm_f1:.4f} ({time.time() - bilstm_start:.1f}s)")
    
    # ========================================
    # CNN (Optimized - CNNs are naturally faster)
    # ========================================
    print("[3/3] Training CNN...", end=" ")
    cnn_start = time.time()
    
    cnn = Sequential([
        Embedding(10000, 64),  # Reduced from 128
        Conv1D(64, 5, activation='relu'),  # Reduced filters from 128 to 64
        GlobalMaxPooling1D(),
        Dense(32, activation='relu'),  # Reduced from 64 to 32
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    
    cnn.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    # CNNs converge faster, so even fewer epochs
    early_stop_cnn = EarlyStopping(
        monitor='val_loss',
        patience=1,
        restore_best_weights=True,
        verbose=0
    )
    
    cnn.fit(
        X_train_seq, y_train_fold,
        validation_data=(X_val_seq, y_val_fold),
        epochs=8,
        batch_size=64,
        callbacks=[early_stop_cnn],
        verbose=0
    )
    
    cnn_preds = (cnn.predict(X_val_seq, verbose=0) > 0.5).astype(int).flatten()
    
    all_predictions['cnn'].extend(cnn_preds)
    cnn_f1 = f1_score(y_val_fold, cnn_preds)
    model_f1_scores['cnn'].append(cnn_f1)
    
    print(f"Done! F1: {cnn_f1:.4f} ({time.time() - cnn_start:.1f}s)")
    
    # Fold summary
    fold_time = time.time() - fold_start
    fold_times.append(fold_time)
    
    print(f"\nFold {fold_num} completed in {fold_time:.1f}s")
    print(f"  SVM: {svm_f1:.4f} | BiLSTM: {bilstm_f1:.4f} | CNN: {cnn_f1:.4f}")
    
    # Estimate remaining time
    if fold_num < 5:
        avg_time_per_fold = np.mean(fold_times)
        remaining_folds = 5 - fold_num
        estimated_remaining = avg_time_per_fold * remaining_folds
        print(f"\n⏱️  Estimated time remaining: {estimated_remaining/60:.1f} minutes")


# ============================================================
# RESULTS
# ============================================================

print("\n" + "=" * 70)
print("RESULTS")
print("=" * 70)

avg_f1_scores = {model: np.mean(scores) for model, scores in model_f1_scores.items()}

print("\nAverage F1 scores across all folds:")
for model, f1 in sorted(avg_f1_scores.items(), key=lambda x: x[1], reverse=True):
    std = np.std(model_f1_scores[model])
    print(f"  {model.upper():10s}: {f1:.4f} (±{std:.4f})")

# Calculate weights
total_f1 = sum(avg_f1_scores.values())
weights = {model: f1 / total_f1 for model, f1 in avg_f1_scores.items()}

print("\nCalculated weights (based on F1 performance):")
for model, weight in sorted(weights.items(), key=lambda x: x[1], reverse=True):
    print(f"  {model.upper():10s}: {weight:.4f} ({weight*100:.1f}%)")

print(f"\nTotal training time: {sum(fold_times)/60:.1f} minutes")
print(f"Average time per fold: {np.mean(fold_times):.1f} seconds")


FOLD 1/5

[1/3] Training SVM... Done! F1: 0.7559 (0.0s)
[2/3] Training BiLSTM... Done! F1: 0.8428 (50.8s)
[3/3] Training CNN... Done! F1: 0.8428 (50.8s)
[3/3] Training CNN... Done! F1: 0.8416 (12.0s)

Fold 1 completed in 62.9s
  SVM: 0.7559 | BiLSTM: 0.8428 | CNN: 0.8416

⏱️  Estimated time remaining: 4.2 minutes

FOLD 2/5

[1/3] Training SVM... Done! F1: 0.7515 (0.1s)
[2/3] Training BiLSTM... Done! F1: 0.8416 (12.0s)

Fold 1 completed in 62.9s
  SVM: 0.7559 | BiLSTM: 0.8428 | CNN: 0.8416

⏱️  Estimated time remaining: 4.2 minutes

FOLD 2/5

[1/3] Training SVM... Done! F1: 0.7515 (0.1s)
[2/3] Training BiLSTM... Done! F1: 0.8440 (68.5s)
[3/3] Training CNN... Done! F1: 0.8440 (68.5s)
[3/3] Training CNN... Done! F1: 0.8530 (16.9s)

Fold 2 completed in 85.6s
  SVM: 0.7515 | BiLSTM: 0.8440 | CNN: 0.8530

⏱️  Estimated time remaining: 3.7 minutes

FOLD 3/5

[1/3] Training SVM... Done! F1: 0.7706 (0.1s)
[2/3] Training BiLSTM... Done! F1: 0.8530 (16.9s)

Fold 2 completed in 85.6s
  SVM: 0.751

In [16]:
# Evaluate the Ensemble Model using K-Fold Cross Validation

import numpy as np
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Compute ensemble predictions as weighted average of individual model predictions
ensemble_probs = np.zeros(len(all_labels))
for i in range(len(all_labels)):
    ensemble_probs[i] = (weights['svm'] * all_predictions['svm'][i] + 
                         weights['bilstm'] * all_predictions['bilstm'][i] + 
                         weights['cnn'] * all_predictions['cnn'][i])

# Convert probabilities to binary predictions
ensemble_preds = (ensemble_probs > 0.5).astype(int)

# Evaluate the ensemble
ensemble_accuracy = accuracy_score(all_labels, ensemble_preds)
ensemble_f1 = f1_score(all_labels, ensemble_preds)

print("\n" + "=" * 70)
print("ENSEMBLE MODEL EVALUATION (K-Fold Cross Validation)")
print("=" * 70)
print(f"Ensemble Validation Accuracy: {ensemble_accuracy:.4f}")
print(f"Ensemble Validation F1 Score: {ensemble_f1:.4f}")

print("\nClassification Report:")
print(classification_report(all_labels, ensemble_preds, target_names=['Not Sarcastic', 'Sarcastic']))

# Optional: Compare with individual models
print("\nComparison with Individual Models:")
print(f"{'Model':<10} {'Accuracy':<10} {'F1 Score':<10}")
print("-" * 30)
for model in ['svm', 'bilstm', 'cnn']:
    model_preds = np.array(all_predictions[model])
    acc = accuracy_score(all_labels, model_preds)
    f1 = f1_score(all_labels, model_preds)
    print(f"{model.upper():<10} {acc:<10.4f} {f1:<10.4f}")
print(f"{'ENSEMBLE':<10} {ensemble_accuracy:<10.4f} {ensemble_f1:<10.4f}")

NameError: name 'weights' is not defined