In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
# Configure seaborn once. `sns.color_palette('husl')` only *returns* a palette
# without applying it, so the palette is installed through set_theme instead.
sns.set_theme(style='darkgrid', palette='husl')

In [None]:
submission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

id - a unique identifier for each tweet  
text - the text of the tweet  
location - the location the tweet was sent from (may be blank)  
keyword - a particular keyword from the tweet (may be blank)  
target - in train.csv only, this denotes whether a tweet is about a real disaster (1) or not (0)  

In [None]:
train.head()

In [None]:
train.isna().sum()

In [None]:
missing_cols = ['keyword', 'location']
fig, axes = plt.subplots(1, 2, figsize = (10, 6))

# One bar chart of per-column null counts for each dataset, side by side.
for ax, frame, title in zip(axes, (train, test), ('Training Set', 'Test Set')):
    null_counts = frame[missing_cols].isnull().sum()
    sns.barplot(x = null_counts.index, y = null_counts.values, ax = ax)
    ax.tick_params(axis='x', labelsize=15)
    ax.tick_params(axis='y', labelsize=15)
    ax.set_title(title, fontsize=13)

# Only the left panel carries the y-axis label.
axes[0].set_ylabel('Missing Value Count', size=15, labelpad=20)
plt.show();

In [None]:
def fill_na(df):
    """
    Fill the null values in the datasets.

    Missing entries in the 'keyword' and 'location' columns are replaced by
    the literal string 'None'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'keyword' and 'location' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the two columns filled. The input frame is left
        untouched so that re-running the calling cell is idempotent and the
        raw data stays available.
    """
    return df.assign(
        keyword = df['keyword'].fillna('None'),
        location = df['location'].fillna('None'),
    )

In [None]:
p1_train = fill_na(train)
p1_train.isna().sum()

For now, I won't deal with location and keywords.

In [None]:
p1_test = fill_na(test)

In [None]:
# Model on the raw tweet text only. Select the column by name so that both the
# training and the test features are 1-D Series of strings (previously X_test
# was a one-column DataFrame while X_train was a Series).
X_train, y = p1_train['text'], p1_train['target']
X_test = p1_test['text']
X_train.shape, y.shape

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def calculate_results(y_true, y_pred):
    """
    Score binary predictions against the ground-truth labels.

    Returns a dict with the keys 'accuracy', 'f1', 'precision' and 'recall',
    each computed by the correspondingly named scoring function imported in
    this cell.
    """
    scorers = {
        'accuracy': accuracy_score,
        'f1': f1_score,
        'precision': precision_score,
        'recall': recall_score,
    }
    return {metric: scorer(y_true, y_pred) for metric, scorer in scorers.items()}

In [None]:
from sklearn.model_selection import train_test_split
X_train_1, X_val_1, y_train_1, y_val_1 = train_test_split(X_train, y, test_size = 0.2, random_state = 42)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB

# Two bag-of-words variants feeding the same Naive Bayes classifier.
count_vec, tfidf_vec = CountVectorizer(), TfidfVectorizer()

# Build and fit each pipeline in one step (Pipeline.fit returns the pipeline).
pipe_countvec = make_pipeline(count_vec, MultinomialNB()).fit(X_train_1, y_train_1)
pipe_tfidfvec = make_pipeline(tfidf_vec, MultinomialNB()).fit(X_train_1, y_train_1)

# Validation metrics of every model tried, keyed by model name.
models = {
    'countvec_baseline': calculate_results(y_val_1, pipe_countvec.predict(X_val_1)),
    'tfidf_baseline': calculate_results(y_val_1, pipe_tfidfvec.predict(X_val_1)),
}

In [None]:
models

# Baseline models: CountVectorizer and TfIdfVectorizer

In [None]:
pd.DataFrame(models).T

# Feature engineering

Let's try some other techniques, starting with feature engineering.

From other notebooks, we can opt to create:
    
**word_count**: number of words in text  
**unique_word_count**: number of unique words in text  
**stop_word_count**: number of stop words in text  
**url_count**: number of urls in text  
**mean_word_length**: average character count in words  
**char_count**: number of characters in text  
**punctuation_count**: number of punctuations in text  
**hashtag_count**: number of hashtags (#) in text  
**mention_count**: number of mentions (@) in text  

In [None]:
from wordcloud import STOPWORDS

In [None]:
p1_train

In [None]:
def feature_engineer(df):
    """
    Derive simple count-based meta features from the 'text' column.

    Added columns: word_count, unique_word_count, stop_word_count, url_count,
    mean_word_length, char_count, punctuation_count, hashtag_count and
    mention_count.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'text' column; every value is converted with str().

    Returns
    -------
    pandas.DataFrame
        A new frame with the feature columns appended. The input frame is not
        modified, so the calling cell can be re-run safely.
    """
    text = df['text'].astype(str)
    words = text.map(str.split)
    lowered_words = text.map(lambda t: t.lower().split())

    return df.assign(
        word_count = words.map(len),
        unique_word_count = words.map(lambda ws: len(set(ws))),
        stop_word_count = lowered_words.map(lambda ws: sum(w in STOPWORDS for w in ws)),
        # 'https' always contains 'http', so a single substring test is enough.
        url_count = lowered_words.map(lambda ws: sum('http' in w for w in ws)),
        # Guard against text without any word: np.mean([]) would warn and yield NaN.
        mean_word_length = words.map(lambda ws: np.mean([len(w) for w in ws]) if ws else 0.0),
        char_count = text.map(len),
        punctuation_count = text.map(lambda t: sum(c in string.punctuation for c in t)),
        hashtag_count = text.map(lambda t: t.count('#')),
        mention_count = text.map(lambda t: t.count('@')),
    )

In [None]:
p2_train = feature_engineer(p1_train)
p2_test = feature_engineer(p1_test)
# Preview the frame returned by feature_engineer rather than its input, so the
# new columns are shown regardless of whether the function works in place.
p2_train.head()

In [None]:
def clean_text(df):
    """
    Minimal text cleaning: lower-case the tweets and strip ASCII punctuation.

    A fuller `clean_text` (stop-word removal, stemming, ASCII folding) is
    defined further down in this notebook and replaces this one once its cell
    has run.

    Returns a new frame with a 'cleaned_text' column; the input frame is not
    modified. (The previous version had no return statement and yielded None.)
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = df['text'].apply(lambda x: x.lower().translate(strip_punctuation))
    return df.assign(cleaned_text = cleaned)

In [None]:
engineered_features = [
    'word_count', 'unique_word_count', 'stop_word_count', 'url_count', 'mean_word_length',
    'char_count', 'punctuation_count', 'hashtag_count', 'mention_count',
]
# Boolean mask selecting the tweets labelled as real disasters.
DISASTER = p2_train['target'] == 1

# One row per feature: left = split by target, right = train vs. test.
fig, axes = plt.subplots(len(engineered_features), 2, figsize = (20, 10))
for (target_ax, split_ax), feat in zip(axes, engineered_features):
    sns.histplot(p2_train.loc[~DISASTER, feat], label = 'Not Disaster', ax = target_ax, color = 'green')
    sns.histplot(p2_train.loc[DISASTER, feat], label = 'Disaster', ax = target_ax, color = 'red')

    sns.histplot(p2_train[feat], label = 'Training', ax = split_ax)
    sns.histplot(p2_test[feat], label = 'Test', ax = split_ax)

    for ax in (target_ax, split_ax):
        ax.set_xlabel('')
        ax.tick_params(axis = 'x', labelsize = 6)
        ax.tick_params(axis = 'y', labelsize = 6)
        ax.legend()

    target_ax.set_title(f'{feat} Target Distribution in Training Set', fontsize = 8)
    split_ax.set_title(f'{feat} Training & Test Set Distribution', fontsize = 8)

plt.show();

In [None]:
import string
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import unicodedata

import nltk
#nltk.download('wordnet', force = True)

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def clean_text(df):
    """
    Normalise the tweets in the 'text' column for bag-of-words models.

    Each tweet is lower-cased, stripped of ASCII punctuation, filtered against
    STOPWORDS, Porter-stemmed and finally folded to plain ASCII (NFKD
    normalisation, with non-ASCII characters dropped).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'text' column of strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with an added 'cleaned_text' column. The input frame is
        not modified, so the calling cell can be re-run safely.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def _clean(tweet):
        # Single pass over the tokens instead of repeated split/join round trips.
        tokens = tweet.lower().translate(strip_punctuation).split()
        stems = [stemmer.stem(w) for w in tokens if w not in STOPWORDS]
        # Lemmatisation is disabled: the original attempt
        #   lemmatizer.lemmatize(w, pos = 'v')
        # "does not work for some reason" -- possibly because the WordNet corpus
        # download is commented out above; TODO confirm before re-enabling.
        return unicodedata.normalize('NFKD', ' '.join(stems)).encode('ascii', 'ignore').decode('utf-8')

    return df.assign(cleaned_text = df['text'].apply(_clean))

In [None]:
cleaned_p2_train = clean_text(p2_train)
cleaned_p2_test = clean_text(p2_test)

In [None]:
cleaned_p2_train

# Re-run Base Models

In [None]:
p3_train = cleaned_p2_train['cleaned_text']
p3_test = cleaned_p2_test['cleaned_text']

In [None]:
p3_train

In [None]:
X_train_2, X_val_2, y_train_2, y_val_2 = train_test_split(p3_train, y, test_size = 0.2, random_state = 42, shuffle = True)

In [None]:
X_train_2

In [None]:
# Initialize vectorizers and pipelines
count_vec = CountVectorizer()
tfidf_vec = TfidfVectorizer()

pipe_countvec = make_pipeline(count_vec, MultinomialNB())
pipe_tfidfvec = make_pipeline(tfidf_vec, MultinomialNB())

# Fit the models on the cleaned text
pipe_countvec.fit(X_train_2, y_train_2)
pipe_tfidfvec.fit(X_train_2, y_train_2)

# Predict and calculate results.
# The results are added to the existing `models` dict (it is no longer reset
# here) so the baseline rows stay in the comparison table next to these runs.
models['countvec_multiNB'] = calculate_results(y_val_2, pipe_countvec.predict(X_val_2))
models['tfidf_multiNB'] = calculate_results(y_val_2, pipe_tfidfvec.predict(X_val_2))

In [None]:
pd.DataFrame(models).T

It didn't really help: the only noticeable change was a marginal increase in accuracy for countvec_multiNB.

# TensorFlow

We first set up our data pipeline for TensorFlow.

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization

In [None]:
from collections import Counter

# Tally whitespace-separated tokens tweet by tweet. Summing the per-tweet lists
# with `.sum()` concatenates them pairwise, copying an ever-growing list on
# every step, which is quadratic in the corpus size.
word_counts = Counter()
for tokens in p1_train['text'].str.split():
    word_counts.update(tokens)

# Count the unique words
vocab_size = len(word_counts)

print('Vocabulary size:', vocab_size)

In [None]:
round(vocab_size)

In [None]:
seq_len = int(np.percentile(p2_train['word_count'], 95))

In [None]:
max_tokens = 32000

text_vec = TextVectorization(max_tokens = max_tokens,
                            output_sequence_length = seq_len)
text_vec.adapt(X_train_1)

In [None]:
import random
target_sentence = random.choice(p1_train['text'])
print(f"Text:\n{target_sentence}")
print(f"\nLength of text: {len(target_sentence.split())}")
print(f"\nVectorized text:\n{text_vec([target_sentence])}")

In [None]:
# Vocabulary learned by the TextVectorization layer.
text_vocab = text_vec.get_vocabulary()
# (a stray trailing comma after the first print turned the statement into a
# throwaway one-element tuple; removed)
print(f"Number of words in vocabulary: {len(text_vocab)}")
print(f"Most common words in the vocabulary: {text_vocab[:5]}")
print(f"Least common words in the vocabulary: {text_vocab[-5:]}")

In [None]:
token_embed = layers.Embedding(input_dim = len(text_vocab),
                              output_dim = 128,
                              mask_zero = True,
                              name = 'token_embed')

In [None]:
train_data = tf.data.Dataset.from_tensor_slices((X_train_1, y_train_1))
val_data = tf.data.Dataset.from_tensor_slices((X_val_1, y_val_1))

train_data = train_data.batch(32).prefetch(tf.data.AUTOTUNE)
val_data = val_data.batch(32).prefetch(tf.data.AUTOTUNE)

Let's do a simple neural network first.

## Model 1: Conv1D Network

In [None]:
inputs = layers.Input(shape = (1,), dtype = tf.string)
text_vectors = text_vec(inputs)
token_embeddings = token_embed(text_vectors)
x = layers.Conv1D(64, kernel_size = 5, padding = 'same', activation = 'relu')(token_embeddings)
x = layers.GlobalMaxPooling1D()(x)
outputs = layers.Dense(1, activation = 'sigmoid')(x)
model_1 = tf.keras.Model(inputs, outputs)

model_1.compile(loss = 'binary_crossentropy',
               optimizer = tf.keras.optimizers.Adam(),
               metrics = ['accuracy'])

In [None]:
model_1.summary()

In [None]:
model_1_history = model_1.fit(train_data,
                              epochs = 10,
                              validation_data = val_data,
                              verbose = 0
                             )

In [None]:
model_1.evaluate(val_data)

In [None]:
model_1_pred_probs = model_1.predict(val_data)
model_1_pred_probs

In [None]:
model_1_preds = tf.cast(tf.round(model_1_pred_probs), tf.int32)
model_1_preds

In [None]:
pd.DataFrame(model_1_preds.numpy()).value_counts()

In [None]:
model_1_results = calculate_results(y_val_1,
                                   model_1_preds.numpy())
models['model_1'] = model_1_results
pd.DataFrame(models).T

# Model 2: Transfer Learning

In [None]:
import tensorflow_hub as hub
tf_hub_embedding_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        trainable=False,
                                        name="universal_sentence_encoder")

In [None]:
# Model 2: a small dense classification head on top of the TF-Hub sentence
# embedding layer created in the previous cell (loaded with trainable=False).
inputs = layers.Input(shape=[], dtype=tf.string)
pretrained_embedding = tf_hub_embedding_layer(inputs) # tokenize text and create embedding
x = layers.Dense(128, activation = 'relu')(pretrained_embedding)
x = layers.Dropout(0.5)(x)
x = layers.Dense(128, activation = 'relu')(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x) # create the output layer
model_2 = tf.keras.Model(inputs=inputs,
                        outputs=outputs)

# Compile the model
# MSE is the only tracked metric; both callbacks below monitor its validation
# value ('val_mse').
model_2.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["mse"])

# NOTE(review): this path has no leading slash, so it is resolved relative to
# the current working directory -- confirm this is the intended location.
checkpoint_dir = "kaggle/working/"
checkpoint_filename = "model.ckpt"

checkpoint_path = os.path.join(checkpoint_dir, checkpoint_filename)

# Stop after 10 epochs without a decrease in validation MSE.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_mse', mode='min', verbose=1, patience=10)

# Keep only the weights of the epoch with the best validation MSE.
model_ckpt = tf.keras.callbacks.ModelCheckpoint(filepath = checkpoint_path,
                                                monitor = 'val_mse',
                                                save_best_only = True,
                                                save_weights_only = True,
                                                verbose = 0)

In [None]:
checkpoint_path

In [None]:
# `early_stopping` was defined together with the checkpoint callback but never
# passed to fit(), so all 100 epochs always ran. With it, training halts once
# val_mse stops improving; the best weights are still saved by `model_ckpt`.
history_2 = model_2.fit(train_data,
                        epochs = 100,
                        validation_data = val_data,
                        callbacks = [early_stopping, model_ckpt],
                        verbose = 0
                       )

In [None]:
model_2.load_weights(checkpoint_path)

In [None]:
model_2.evaluate(val_data)

In [None]:
model_2_preds_prob = model_2.predict(val_data)
model_2_preds_prob

In [None]:
model_2_preds = tf.cast(tf.round(model_2_preds_prob), tf.int32)
model_2_preds

In [None]:
model_2_results = calculate_results(y_val_1, model_2_preds)
models['model_2'] = model_2_results
pd.DataFrame(models).T

In [None]:
test['text']

In [None]:
y_preds = model_2.predict(test['text'])
y_preds = tf.cast(tf.round(y_preds), tf.int32)
y_preds

In [None]:
# The predictions arrive as an (n, 1) tensor; flatten them to a 1-D integer
# vector so that every row of the submission receives a scalar 0/1 label.
submission['target'] = np.asarray(y_preds).ravel().astype(int)
submission.to_csv('submission.csv', index = False)

In [None]:
import datetime
print(f'As of {datetime.datetime.now()}, submission score on Kaggle is: 0.81734, ranked: 328.')

# Model 3: Ensemble

Instead of using the Universal Sentence Encoder, let's use a BERT-style sentence encoder (`all-MiniLM-L6-v2` from the sentence-transformers library); I am also interested to see how this plays out.  
Besides that, we will implement a few sklearn models and perhaps XGBoost, and we will use a VotingClassifier at the end to get an ensemble model.
For the sklearn models, the experiment results above favoured CountVectorizer; note that the cells below currently vectorise the text with TfidfVectorizer.

In [None]:
pip install -U sentence-transformers

In [None]:
import pandas as pd
import numpy as np

In [None]:
submission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = model.encode(train['text'].tolist())

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf_vec = tfidf.fit_transform(train['text'])
tfidf_vec

In [None]:
from sklearn.model_selection import train_test_split

# Split the raw text first and fit TF-IDF on the training portion only, so the
# vocabulary and IDF weights are not informed by the validation tweets.
# The split uses the same sample count and random_state as the embedding split
# below, so both feature sets share the same train/validation rows.
text_train, text_val, y_train, y_val = train_test_split(train['text'], train['target'], test_size=0.2, random_state=42)
X_train = tfidf.fit_transform(text_train)
X_val = tfidf.transform(text_val)

# Repeat the process for the BERT model
X_train_BERT, X_val_BERT, y_train_BERT, y_val_BERT = train_test_split(embeddings, train['target'], test_size=0.2, random_state=42)

In [None]:
X_train.shape, X_val.shape, X_train_BERT.shape, y_val_BERT.shape

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

rf_clf = RandomForestClassifier(random_state = 42)
rf_clf.fit(X_train, y_train)

y_pred = rf_clf.predict(X_val)
print(f'Accuracy score: {accuracy_score(y_val, y_pred)}')
print(classification_report(y_val, y_pred))

In [None]:
rf_clf_BERT = RandomForestClassifier(random_state = 42)
rf_clf_BERT.fit(X_train_BERT, y_train_BERT)

y_pred = rf_clf_BERT.predict(X_val_BERT)
print(f'Accuracy score: {accuracy_score(y_val_BERT, y_pred)}')
print(classification_report(y_val_BERT, y_pred))

We will use the rf_clf_BERT as our RandomForestClassifier of choice.

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(random_state = 42)
lr_clf.fit(X_train, y_train)

y_pred = lr_clf.predict(X_val)
print(f'Accuracy score: {accuracy_score(y_val, y_pred)}')
print(classification_report(y_val, y_pred))

In [None]:
# Logistic regression on the sentence embeddings.
lr_clf_BERT = LogisticRegression(random_state = 42)
lr_clf_BERT.fit(X_train_BERT, y_train_BERT)

# Score against the labels that belong to the embedding split (y_val_BERT),
# matching the features used for the prediction.
y_pred = lr_clf_BERT.predict(X_val_BERT)
print(f'Accuracy score: {accuracy_score(y_val_BERT, y_pred)}')
print(classification_report(y_val_BERT, y_pred))

Similar to RandomForestClassifier, the dataset using BERT to vectorise and embed came out on top.

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Seed the forest itself: without random_state the fitted trees (and therefore
# the selected "best" parameters) change from run to run even though the
# search's sampling is seeded.
clf = RandomForestClassifier(bootstrap=False, random_state=42)

# Candidate values sampled by the randomized search ('bootstrap' here
# overrides the constructor value above for every candidate).
rf_param_grid = {
    'max_depth': [70, 80, 90, 100, None],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [4, 5, 6],
    'n_estimators': [200, 300, 400, 500],
    'bootstrap': [True, False]

}

grid_search_BERT = RandomizedSearchCV(
    estimator = clf
    , param_distributions = rf_param_grid
    , n_iter = 20
    , cv = 3
    , verbose=1
    , random_state=42
    , n_jobs = -1
)

grid_search_BERT.fit(X_train_BERT, y_train_BERT)

# Create a new classifier based on the best model
print('Highest performing parameters: ', grid_search_BERT.best_params_)
rf_clf_BERT = grid_search_BERT.best_estimator_

In [None]:
# Evaluate the tuned random forest on the held-out embeddings.
y_pred_BERT = rf_clf_BERT.predict(X_val_BERT)

print("Randomised Search RF Accuracy (BERT Encoding):", accuracy_score(y_val_BERT, y_pred_BERT), '\n')
# `y_vala_BERT` was a typo for `y_val_BERT` and raised a NameError.
print(classification_report(y_val_BERT, y_pred_BERT))

In [None]:
from sklearn.model_selection import GridSearchCV

# Full grid that was explored originally. It is kept under its own name for
# reference only; previously it was assigned to `param_grid` and immediately
# overwritten by the narrowed grid below.
full_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l2', None],
    'class_weight': ['balanced', None],
    'max_iter': [1000, 2000]
}

# These are the params which got the best results
param_grid = {
    'C': [0.1]
    , 'class_weight': [None]
    , 'max_iter': [1000]
    , 'penalty': ['l2']
}

# Initialize the classifier
log_reg_BERT = LogisticRegression()

grid_search_BERT = GridSearchCV(
    estimator=log_reg_BERT
    , param_grid=param_grid
    , cv=3
    , verbose=2
    , n_jobs=-1
)
grid_search_BERT.fit(X_train_BERT, y_train_BERT)

# Create a new classifier based on the best model
print('Highest performing parameters from CV Grid Search: ', grid_search_BERT.best_params_)
log_reg_BERT = grid_search_BERT.best_estimator_

In [None]:
# Evaluate the tuned logistic regression on the held-out embeddings.
y_pred = log_reg_BERT.predict(X_val_BERT)

# This model comes from GridSearchCV, not from a randomised search.
print("Grid Search LR Accuracy (BERT Encoding):", accuracy_score(y_val_BERT, y_pred), '\n')
print(classification_report(y_val_BERT, y_pred))

In [None]:
from sklearn.ensemble import VotingClassifier

# Base learners shared by both ensembles.
estimators = [
    ('Random Forest Classifier', rf_clf_BERT),
    ('Logistic Regression Classifier', log_reg_BERT)
]

# Majority-vote ensemble and probability-averaging ensemble.
hard_voting_clf = VotingClassifier(estimators=estimators, voting='hard')
soft_voting_clf = VotingClassifier(estimators=estimators, voting='soft')

# list of classifiers for easy iteration
classifiers = [hard_voting_clf, soft_voting_clf]

# Fit each ensemble on the embeddings and report its validation performance.
for clf in classifiers:
    prefix = "Soft " if clf == soft_voting_clf else "Hard "
    clf_name = prefix + clf.__class__.__name__

    clf.fit(X_train_BERT, y_train_BERT)
    y_pred = clf.predict(X_val_BERT)

    accuracy = accuracy_score(y_val_BERT, y_pred)
    print(f"{clf_name} Accuracy: {accuracy}")

    report = classification_report(y_val_BERT, y_pred)
    print(f"{clf_name} Classification Report: \n{report}")

Using our TF model (model 2) from previous runs.

In [None]:
# from https://www.kaggle.com/code/fraserwtt/nlp-disaster-tweet-classification#Round-2:-Tensorflow-Modelling

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, MaxPooling1D, Conv1D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.metrics import AUC, Precision, Recall
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
import tensorflow_addons as tfa

def f1_score(y_true, y_pred): #taken from old keras source code
    """
    F1 computed with Keras backend ops from labels and predicted scores.

    Products and inputs are clipped to [0, 1] and rounded before counting, and
    K.epsilon() is added to every denominator to avoid division by zero.

    NOTE: this definition reuses the name of the `f1_score` imported from
    sklearn.metrics earlier in the notebook (used by `calculate_results`), so
    running this cell rebinds that name. It is not referenced by the
    `compile` call in this cell.
    """
    def _rounded_total(tensor):
        # Clip to [0, 1], round to {0, 1} and count the ones.
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    eps = K.epsilon()
    hits = _rounded_total(y_true * y_pred)
    actual_positives = _rounded_total(y_true)
    flagged_positives = _rounded_total(y_pred)

    precision = hits / (flagged_positives + eps)
    recall = hits / (actual_positives + eps)
    return 2*(precision*recall)/(precision+recall+eps)

# Get the number of features (columns)
num_features = X_train_BERT.shape[1]

def build_nn():
    """
    Build and compile the Conv1D + dense binary classifier for the embeddings.

    Architecture: the flat feature vector (length `num_features`) is reshaped
    to (num_features, 1), passed through one Conv1D/MaxPooling1D stage,
    flattened, then through five L2-regularised Dense(128) layers (dropout
    after the first four) and a single sigmoid output unit.

    Returns
    -------
    tf.keras.Model
        Compiled with binary cross-entropy, Adam at its default learning rate,
        and accuracy plus F1 as metrics.
    """
    model = Sequential()
    model.add(tf.keras.layers.Reshape((num_features, 1), input_shape=(num_features,)))
    model.add(Conv1D(filters=64, kernel_size=4, strides=1, padding='same', activation='relu'))
    model.add(MaxPooling1D(2))
    model.add(tf.keras.layers.Flatten())

    # Four regularised dense blocks with dropout, then one without dropout.
    for _ in range(4):
        model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.001)))
        model.add(Dropout(0.5))
    model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.001)))
    model.add(Dense(1, activation='sigmoid'))

    # The previous version built an ExponentialDecay schedule and an Adam
    # optimizer from it but then compiled with the string 'adam', so both were
    # dead code. Default Adam is kept (same effective behaviour); the learning
    # rate is adjusted by the ReduceLROnPlateau callback defined in this cell.
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
        optimizer=Adam(),
        # One sigmoid output => a single class column thresholded at 0.5.
        # Without a threshold F1Score marks the per-row maximum as the
        # prediction, which for a single column is always "positive".
        metrics=['accuracy', tfa.metrics.F1Score(num_classes=1, average='micro', threshold=0.5)]
    )
    return model

# Training callbacks, all driven by the validation loss:
# stop after 4 epochs without improvement,
early_stopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)

# keep the best model seen so far in 'model.h5',
model_checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', mode='min', save_best_only=True, verbose=1)

# and cut the learning rate by 10x after 3 stagnant epochs.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, min_lr=0.00001)

# Build the network and train it, holding out 20% of the training rows.
keras_nn = build_nn()
history = keras_nn.fit(
    X_train_BERT,
    y_train_BERT,
    epochs=50,
    batch_size=8,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint, reduce_lr],
)

Preparing the data to tensor_slices.

In [None]:
!pip install scikeras[tensorflow]

In [None]:
from scikeras.wrappers import KerasClassifier

class CustomKerasClassifier(KerasClassifier):
    """
    KerasClassifier that densifies its input and always reports two
    probability columns for a single-output (binary) network.
    """

    @staticmethod
    def _as_dense(x):
        """Return `x` as a dense array (sparse matrices expose `toarray`)."""
        return x.toarray() if hasattr(x, 'toarray') else np.array(x)

    def predict(self, x, **kwargs):
        """Returns the class prediction of the samples"""
        return super().predict(self._as_dense(x), **kwargs)

    def predict_proba(self, x, **kwargs):
        """Returns class probabilities of the samples"""
        proba = np.asarray(super().predict_proba(self._as_dense(x), **kwargs))

        # A flat vector of positive-class scores is treated like one column;
        # indexing shape[1] on it would otherwise raise an IndexError.
        if proba.ndim == 1:
            proba = proba.reshape(-1, 1)

        # Check if it's binary classification
        if proba.shape[1] == 1:
            # Assuming the single output is the probability of the positive class
            return np.hstack([1 - proba, proba])  # shape should be (n_samples, 2)

        return proba

# Wrap the Keras model factory in the custom scikit-learn style estimator so
# it can be used alongside the sklearn models. The wrapper receives the
# `build_nn` function itself, plus the training settings and the callbacks
# defined in the model cell above.
keras_clf = CustomKerasClassifier(
    build_nn,
    epochs=10,
    batch_size=8,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint, reduce_lr]
)
# Explicitly tag the wrapper as a classifier.
# NOTE(review): presumably required for VotingClassifier to accept it -- confirm.
keras_clf._estimator_type = "classifier"

In [None]:
train_ds = tf.data.Dataset.from_tensor_slices((X_train_BERT, y_train_BERT))
val_ds = tf.data.Dataset.from_tensor_slices((X_val_BERT, y_val_BERT))

train_ds = train_ds.batch(32).prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.batch(32).prefetch(tf.data.AUTOTUNE)

In [None]:
# define the individual models
estimators = [
    ('Keras Neural Network Classifier', keras_clf),
    ('Random Forest Classifier', rf_clf_BERT), 
    ('Logistic Regression Classifier', log_reg_BERT)
]

# create the ensemble model
hard_voting_clf = VotingClassifier(
    estimators=estimators, voting='hard', flatten_transform=True
)

hard_voting_clf.fit(X_train_BERT, y_train_BERT)

In [None]:
y_preds = hard_voting_clf.predict(X_val_BERT)

accuracy = accuracy_score(y_val_BERT, y_preds)
print(f"Voting Classifier Accuracy: {accuracy}")

# # print classification report
report = classification_report(y_val_BERT, y_preds)
print(f"Classification Report:\n{report}")

In [None]:
new_test_data_vectorized = model.encode(test['text'].tolist())
submission = pd.DataFrame({'id': test['id'], 'target': hard_voting_clf.predict(new_test_data_vectorized)})
submission.to_csv('submission.csv', index=False)

In [None]:
# `output` is not defined anywhere in the visible notebook and raised a
# NameError; preview the submission that was just written instead.
submission.head()