# Exploration of Dataset

## Import necessary libraries

In [58]:
# necessary libraries
import pandas as pd
import numpy as np

# baseline model
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# SVM model
from sklearn.svm import LinearSVC

# LSTM model
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# BiLSTM model
from tensorflow.keras.layers import Bidirectional, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# CNN model
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D

# statistical imports
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

## Loading Datasets and Preprocessing

In [59]:
# Load the datasets
train_df = pd.read_csv('./datasets/train.csv')
valid_df = pd.read_csv('./datasets/valid.csv')
test_df = pd.read_csv('./datasets/test.csv')

# Display dataset shapes
print(f"Train Shape: {train_df.shape}")
print(f"Valid Shape: {valid_df.shape}")
print(f"Test Shape: {test_df.shape}")
print()

# Preview the training data
print(train_df.head())
print()

# Check for class balance in the training set
print(train_df['label'].value_counts())
print()

# Check for any missing values
print(train_df.isnull().sum())


Train Shape: (21464, 2)
Valid Shape: (716, 2)
Test Shape: (966, 2)

                                                text  label
0  states slow to shut down weak teacher educatio...      0
1    drone places fresh kill on steps of white house      1
2  report: majority of instances of people gettin...      1
3  sole remaining lung filled with rich, satisfyi...      1
4                       the gop's stockholm syndrome      0

label
0    11248
1    10216
Name: count, dtype: int64

text     0
label    0
dtype: int64


In [60]:
import re
from scipy.sparse import hstack

# extracting features
def extract_features(df):
    features = pd.DataFrame()
    # Punctuation counts 
    features['exclamation_count'] = df['text'].str.count('!')
    features['question_count'] = df['text'].str.count('\?')
    features['ellipsis_count'] = df['text'].str.count(r'\.\.\.')
    
    # Capitalization
    features['capital_ratio'] = df['text'].apply(lambda x: sum(1 for c in x if c.isupper()) / (len(x) + 1))
    features['has_all_caps_word'] = df['text'].str.contains(r'\b[A-Z]{2,}\b').astype(int)
    
    # Length metrics
    features['text_length'] = df['text'].str.len()
    features['word_count'] = df['text'].str.split().str.len()
    
    return features

train_features = extract_features(train_df)
valid_features = extract_features(valid_df)
test_features = extract_features(test_df)

In [61]:
def preprocess_text(text):
    # remove all capitalization and whitespace
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()    # Clean whitespace
    return text

train_df['clean_text'] = train_df['text'].apply(preprocess_text)
valid_df['clean_text'] = valid_df['text'].apply(preprocess_text)
test_df['clean_text'] = test_df['text'].apply(preprocess_text)

In [62]:
tfidf_clean = TfidfVectorizer(stop_words='english', max_features=10000, ngram_range=(1, 2))
X_train_tfidf_clean = tfidf_clean.fit_transform(train_df['clean_text'])
X_valid_tfidf_clean = tfidf_clean.transform(valid_df['clean_text'])
X_test_tfidf_clean = tfidf_clean.transform(test_df['clean_text'])

X_train_combined = hstack([X_train_tfidf_clean, train_features])
X_valid_combined = hstack([X_valid_tfidf_clean, valid_features])
X_test_combined = hstack([X_test_tfidf_clean, test_features])

## Baseline Model (Log Regression w/ Bag of Words)

In [63]:
# we limit to the top 5000 most frequent words to keep it simple
vectorizer = CountVectorizer(stop_words='english', max_features=5000)

# fit on training data, then transform valid and test data
X_train = vectorizer.fit_transform(train_df['text'])
X_valid = vectorizer.transform(valid_df['text'])
X_test = vectorizer.transform(test_df['text'])

y_train = train_df['label']
y_valid = valid_df['label']
y_test = test_df['label']

# log reg model
baseline_model = LogisticRegression(max_iter=1000, random_state=42)
baseline_model.fit(X_train, y_train)

# evaluation
valid_preds = baseline_model.predict(X_valid)

print(f"Baseline Validation Accuracy: {accuracy_score(y_valid, valid_preds):.4f}")
print(f"Baseline Validation F1 Score: {f1_score(y_valid, valid_preds):.4f}")

Baseline Validation Accuracy: 0.7598
Baseline Validation F1 Score: 0.7485


Feature Engineering with TF-IDF and N-grams

Idea: weight words to give less importance to common words and more importance to unique words

-> This might signal sarcasm?

N-gram Idea: model seeing pairs of words together might be important for sarcasm b/c gives more context

In [64]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10000, ngram_range=(1, 2))

X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['text'])
X_valid_tfidf = tfidf_vectorizer.transform(valid_df['text'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['text'])

# use new features to retrain
tfidf_model = LogisticRegression(max_iter=1000, random_state=42)
tfidf_model.fit(X_train_tfidf, y_train)

# evaluation
valid_preds_tfidf = tfidf_model.predict(X_valid_tfidf)

print(f"TF-IDF + Bigram Accuracy: {accuracy_score(y_valid, valid_preds_tfidf):.4f}")
print(f"TF-IDF + Bigram F1 Score: {f1_score(y_valid, valid_preds_tfidf):.4f}")

TF-IDF + Bigram Accuracy: 0.7709
TF-IDF + Bigram F1 Score: 0.7595


## Possible model (SVM)

Idea: Good at classification tasks

In [65]:
# LinearSVC is faster and often better for text than standard SVC
svm_model = LinearSVC(random_state=42, max_iter=10000)
svm_model.fit(X_train_tfidf, y_train)

# evaluation
valid_preds_svm = svm_model.predict(X_valid_tfidf)

print(f"SVM Validation Accuracy: {accuracy_score(y_valid, valid_preds_svm):.4f}")
print(f"SVM Validation F1 Score: {f1_score(y_valid, valid_preds_svm):.4f}")

SVM Validation Accuracy: 0.7751
SVM Validation F1 Score: 0.7636


## Possible model (LSTM)

LSTM: process text as a sequence rather than Bag of Words

Idea: capture more structure in the sarcasm string

In [66]:
# we limit the vocab to 10,000 words and sequence length to 100
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_df['text'])

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(train_df['text']), maxlen=100)
X_valid_seq = pad_sequences(tokenizer.texts_to_sequences(valid_df['text']), maxlen=100)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(test_df['text']), maxlen=100)

# LSTM model
model = Sequential([
    Embedding(input_dim=10000, output_dim=32, input_length=100),
    LSTM(32),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train_seq, y_train, epochs=5, batch_size=64, validation_data=(X_valid_seq, y_valid))

# evaluation
lstm_probs = model.predict(X_valid_seq)
lstm_preds = (lstm_probs > 0.5).astype(int)

print(f"LSTM Validation Accuracy: {accuracy_score(y_valid, lstm_preds):.4f}")
print(f"LSTM Validation F1 Score: {f1_score(y_valid, lstm_preds):.4f}")



Epoch 1/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 34ms/step - accuracy: 0.7343 - loss: 0.5256 - val_accuracy: 0.8464 - val_loss: 0.3561
Epoch 2/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 32ms/step - accuracy: 0.8927 - loss: 0.2621 - val_accuracy: 0.8464 - val_loss: 0.3433
Epoch 3/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 32ms/step - accuracy: 0.9367 - loss: 0.1702 - val_accuracy: 0.8478 - val_loss: 0.3913
Epoch 4/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 31ms/step - accuracy: 0.9613 - loss: 0.1129 - val_accuracy: 0.8422 - val_loss: 0.4913
Epoch 5/5
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 32ms/step - accuracy: 0.9750 - loss: 0.0741 - val_accuracy: 0.8240 - val_loss: 0.5673
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
LSTM Validation Accuracy: 0.8240
LSTM Validation F1 Score: 0.8147


## Possible model (BiLSTM)

BiLSTM to read strings bidirectionally

Idea: typically better to gain more information.

Optimization: Dropout so that the network is not overly reliant on specific features (prevents overfitting)

UPDATE: added stacking and early stopping

In [67]:
# we re-fit the tokenizer on the cleaner text
tokenizer_clean = Tokenizer(num_words=10000)
tokenizer_clean.fit_on_texts(train_df['clean_text'])

X_train_seq_clean = pad_sequences(tokenizer_clean.texts_to_sequences(train_df['clean_text']), maxlen=100)
X_valid_seq_clean = pad_sequences(tokenizer_clean.texts_to_sequences(valid_df['clean_text']), maxlen=100)
X_test_seq_clean = pad_sequences(tokenizer_clean.texts_to_sequences(test_df['clean_text']), maxlen=100)

# stack two Bidirectional LSTM layers to learn more complex patterns
bilstm_model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=100),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.3),
    Bidirectional(LSTM(32)),
    Dropout(0.5),
    Dense(16, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

bilstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# stop training if validation loss doesn't improve for 3 epochs
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

bilstm_model.fit(
    X_train_seq_clean, y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_valid_seq_clean, y_valid),
    callbacks=[early_stop]
)

# evaluation
probs_bilstm_improved = bilstm_model.predict(X_valid_seq_clean)
bilstm_preds = (probs_bilstm_improved > 0.5).astype(int)

print(f"Bi-LSTM Accuracy: {accuracy_score(y_valid, bilstm_preds):.4f}")
print(f"Bi-LSTM F1 Score: {f1_score(y_valid, bilstm_preds):.4f}")

Epoch 1/10




[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 107ms/step - accuracy: 0.7880 - loss: 0.4417 - val_accuracy: 0.8534 - val_loss: 0.3527
Epoch 2/10
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 115ms/step - accuracy: 0.9171 - loss: 0.2333 - val_accuracy: 0.8380 - val_loss: 0.3864
Epoch 3/10
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 116ms/step - accuracy: 0.9535 - loss: 0.1366 - val_accuracy: 0.8296 - val_loss: 0.4778
Epoch 4/10
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 123ms/step - accuracy: 0.9729 - loss: 0.0879 - val_accuracy: 0.8324 - val_loss: 0.6297
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step
Bi-LSTM Accuracy: 0.8534
Bi-LSTM F1 Score: 0.8502


## Possible Model (CNN)

CNN for text classification to spot N-gram patterns

Idea: detect phrases for sarcasm

In [68]:
# CNN model
cnn_model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=100),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(16, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

cnn_model.fit(
    X_train_seq_clean, y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_valid_seq_clean, y_valid),
    callbacks=[early_stop]
)

# evaluation
probs_cnn = cnn_model.predict(X_valid_seq_clean)
preds_cnn = (probs_cnn > 0.5).astype(int)

print(f"CNN Validation Accuracy: {accuracy_score(y_valid, preds_cnn):.4f}")
print(f"CNN Validation F1 Score: {f1_score(y_valid, preds_cnn):.4f}")

Epoch 1/10




[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 29ms/step - accuracy: 0.7943 - loss: 0.4406 - val_accuracy: 0.8575 - val_loss: 0.3460
Epoch 2/10
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 26ms/step - accuracy: 0.9162 - loss: 0.2273 - val_accuracy: 0.8436 - val_loss: 0.3687
Epoch 3/10
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 25ms/step - accuracy: 0.9687 - loss: 0.1061 - val_accuracy: 0.8366 - val_loss: 0.4920
Epoch 4/10
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 26ms/step - accuracy: 0.9881 - loss: 0.0483 - val_accuracy: 0.8436 - val_loss: 0.6216
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
CNN Validation Accuracy: 0.8575
CNN Validation F1 Score: 0.8579


## Inital Ensemble Model

In [69]:
pred_1 = valid_preds_svm
pred_2 = lstm_preds.flatten()
pred_3 = bilstm_preds.flatten()

total_votes = pred_1 + pred_2 + pred_3
ensemble_preds = (total_votes >= 2).astype(int)

print(f"Ensemble Validation Accuracy: {accuracy_score(y_valid, ensemble_preds):.4f}")
print(f"Ensemble Validation F1 Score: {f1_score(y_valid, ensemble_preds):.4f}")

Ensemble Validation Accuracy: 0.8450
Ensemble Validation F1 Score: 0.8380


Evaluation on Test Set

## Testing different Ensemble methods and combinations

In [70]:
# Ensemble method: SVM -> LSTM -> BiLSTM w/ Ensemble Voting
pred_test_svm = svm_model.predict(X_test_tfidf)
pred_test_lstm = (model.predict(X_test_seq) > 0.5).astype(int).flatten()
pred_test_bilstm = (bilstm_model.predict(X_test_seq) > 0.5).astype(int).flatten()

test_votes = pred_test_svm + pred_test_lstm + pred_test_bilstm
pred_test_ensemble = (test_votes >= 2).astype(int)

# evaluation
print("Final Evaluation on Test Set:")
print(classification_report(test_df['label'], pred_test_ensemble, digits=4))

print("\nConfusion Matrix:")
print(confusion_matrix(test_df['label'], pred_test_ensemble))

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step
Final Evaluation on Test Set:
              precision    recall  f1-score   support

           0     0.8609    0.8821    0.8714       526
           1     0.8548    0.8295    0.8420       440

    accuracy                         0.8582       966
   macro avg     0.8578    0.8558    0.8567       966
weighted avg     0.8581    0.8582    0.8580       966


Confusion Matrix:
[[464  62]
 [ 75 365]]


In [71]:
# Ensemble method: SVM -> BiLSTM -> CNN w/ Voting
test_pred_svm = svm_model.predict(X_test_tfidf)

test_probs_bilstm = bilstm_model.predict(X_test_seq_clean)
test_pred_bilstm = (test_probs_bilstm > 0.5).astype(int).flatten()

test_probs_cnn = cnn_model.predict(X_test_seq_clean)
test_pred_cnn = (test_probs_cnn > 0.5).astype(int).flatten()

# voting
test_votes = test_pred_svm + test_pred_bilstm + test_pred_cnn
final_ensemble_preds = (test_votes >= 2).astype(int)

# evaluation
print("SUMMARY:")
print(f" - Accuracy: {accuracy_score(test_df['label'], final_ensemble_preds):.4f}")
print(f" - F1 Score: {f1_score(test_df['label'], final_ensemble_preds):.4f}")

print("\nDetailed Classification Report:")
print(classification_report(test_df['label'], final_ensemble_preds, digits=4))

print("\nConfusion Matrix:")
print(confusion_matrix(test_df['label'], final_ensemble_preds))

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
SUMMARY:
 - Accuracy: 0.8561
 - F1 Score: 0.8404

Detailed Classification Report:
              precision    recall  f1-score   support

           0     0.8617    0.8764    0.8690       526
           1     0.8492    0.8318    0.8404       440

    accuracy                         0.8561       966
   macro avg     0.8554    0.8541    0.8547       966
weighted avg     0.8560    0.8561    0.8560       966


Confusion Matrix:
[[461  65]
 [ 74 366]]


In [72]:
# Ensemble method: SVM -> LSTM -> BiLSTM w/ Voting
pred_svm = svm_model.predict(X_test_tfidf)

pred_probs_lstm = model.predict(X_test_seq) 
pred_lstm = (pred_probs_lstm > 0.5).astype(int).flatten()

pred_probs_bilstm = bilstm_model.predict(X_test_seq_clean)
pred_bilstm = (pred_probs_bilstm > 0.5).astype(int).flatten()

# voting
combined_votes = pred_svm + pred_lstm + pred_bilstm
ensemble_final_preds = (combined_votes >= 2).astype(int)

# evaluation

print("SUMMARY:")
print(f" - Accuracy: {accuracy_score(test_df['label'], ensemble_final_preds):.4f}")
print(f" - F1 Score: {f1_score(test_df['label'], ensemble_final_preds):.4f}")

print("\nClassification Report:")
print(classification_report(test_df['label'], ensemble_final_preds, digits=4))

print("\nConfusion Matrix:")
print(confusion_matrix(test_df['label'], ensemble_final_preds))

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step
SUMMARY:
 - Accuracy: 0.8602
 - F1 Score: 0.8443

Classification Report:
              precision    recall  f1-score   support

           0     0.8627    0.8840    0.8732       526
           1     0.8571    0.8318    0.8443       440

    accuracy                         0.8602       966
   macro avg     0.8599    0.8579    0.8588       966
weighted avg     0.8602    0.8602    0.8601       966


Confusion Matrix:
[[465  61]
 [ 74 366]]


In [73]:
# Ensemble method: SVM -> LSTM -> CNN w/ Voting
test_pred_svm = svm_model.predict(X_test_tfidf)

test_probs_lstm = model.predict(X_test_seq)
test_pred_lstm = (test_probs_lstm > 0.5).astype(int).flatten()

test_probs_cnn = cnn_model.predict(X_test_seq_clean)
test_pred_cnn = (test_probs_cnn > 0.5).astype(int).flatten()

# voting
total_votes = test_pred_svm + test_pred_lstm + test_pred_cnn
hybrid_ensemble_preds = (total_votes >= 2).astype(int)

# evaluation

print("SUMMARY: ")
print(f" - Accuracy: {accuracy_score(test_df['label'], hybrid_ensemble_preds):.4f}")
print(f" - F1 Score: {f1_score(test_df['label'], hybrid_ensemble_preds):.4f}")

print("\nDetailed Performance:")
print(classification_report(test_df['label'], hybrid_ensemble_preds, digits=4))

print("\nConfusion Matrix (Actual vs Predicted):")
print(confusion_matrix(test_df['label'], hybrid_ensemble_preds))

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
SUMMARY: 
 - Accuracy: 0.8571
 - F1 Score: 0.8417

Detailed Performance:
              precision    recall  f1-score   support

           0     0.8633    0.8764    0.8698       526
           1     0.8495    0.8341    0.8417       440

    accuracy                         0.8571       966
   macro avg     0.8564    0.8553    0.8558       966
weighted avg     0.8570    0.8571    0.8570       966


Confusion Matrix (Actual vs Predicted):
[[461  65]
 [ 73 367]]


In [74]:
# Ensemble Method: LSTM -> BiLSTM -> CNN
probs_lstm = model.predict(X_test_seq)
pred_lstm = (probs_lstm > 0.5).astype(int).flatten()

probs_bilstm = bilstm_model.predict(X_test_seq_clean)
pred_bilstm = (probs_bilstm > 0.5).astype(int).flatten()

probs_cnn = cnn_model.predict(X_test_seq_clean)
pred_cnn = (probs_cnn > 0.5).astype(int).flatten()

# voting
total_votes = pred_lstm + pred_bilstm + pred_cnn
deep_ensemble_preds = (total_votes >= 2).astype(int)

# evaluation
print("SUMMARY:")
print(f" - Accuracy: {accuracy_score(test_df['label'], deep_ensemble_preds):.4f}")
print(f" - F1 Score: {f1_score(test_df['label'], deep_ensemble_preds):.4f}")

print("\nDetailed Metrics:")
print(classification_report(test_df['label'], deep_ensemble_preds, digits=4))

print("\nConfusion Matrix:")
print(confusion_matrix(test_df['label'], deep_ensemble_preds))

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
SUMMARY:
 - Accuracy: 0.8613
 - F1 Score: 0.8477

Detailed Metrics:
              precision    recall  f1-score   support

           0     0.8726    0.8726    0.8726       526
           1     0.8477    0.8477    0.8477       440

    accuracy                         0.8613       966
   macro avg     0.8602    0.8602    0.8602       966
weighted avg     0.8613    0.8613    0.8613       966


Confusion Matrix:
[[459  67]
 [ 67 373]]


## LSTM -> BiLSTM -> CNN is the best Ensemble

In [76]:
import pickle

model.save('lstm_model.h5')
bilstm_model.save('bilstm_model.h5')
cnn_model.save('cnn_model.h5')

with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

with open('tokenizer_clean.pkl', 'wb') as f:
    pickle.dump(tokenizer_clean, f)

