Import Libraries

SIMPLE NEURAL NETWORK

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the training and test tables from the parquet files in the working directory.
train, test = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('train.parquet', 'test.parquet')
)

# preextracted features
def combine_features(df):
    """Concatenate the pre-extracted feature columns of ``df`` into one matrix.

    Each listed column is converted with ``np.array(column.tolist())`` and the
    resulting arrays are joined side by side (``axis=1``), so every column is
    assumed to hold one equal-length numeric sequence per row -- TODO confirm
    against the feature-extraction step.

    Args:
        df: DataFrame containing the columns ``sender_features``,
            ``subject_features``, ``date_features``, ``body_features``,
            ``word2vec_features``, ``tfidf_features`` and
            ``transformer_features``.

    Returns:
        A 2-D ``numpy`` array with one row per row of ``df`` and the feature
        groups laid out in the order listed in ``feature_columns`` below.

    Raises:
        KeyError: if any of the expected columns is missing from ``df``.
    """
    # The transformer block previously re-read 'word2vec_features', which
    # duplicated the word2vec vectors and silently dropped the transformer
    # embeddings; it now reads its own column.
    feature_columns = (
        'sender_features',
        'subject_features',
        'date_features',
        'body_features',
        'word2vec_features',
        'tfidf_features',
        'transformer_features',
    )
    feature_blocks = [np.array(df[column].tolist()) for column in feature_columns]
    return np.concatenate(feature_blocks, axis=1)


# train data
X = combine_features(train)
y = train['label'].values

# split 80 20
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Feed-forward classifier. The input shape is declared with an explicit Input
# layer: passing `input_shape` to the first Dense layer of a Sequential model
# is deprecated in Keras 3 and emits a UserWarning.
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# early stopping: stop once val_loss has not improved for 3 epochs and keep
# the weights of the best epoch
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_val, y_val),
    verbose=1,
    callbacks=[early_stopping]
)

# prepare test data
X_test = combine_features(test)

# predict test
y_pred = model.predict(X_test)

# Convert predictions to binary labels (0 or 1). The final Dense(1) layer
# yields one column per sample, so flatten to 1-D to match the label vector
# handed to the sklearn metrics.
y_pred_binary = (y_pred > 0.5).astype(int).ravel()

y_true = test['label'].values

# Calculate accuracy, precision, recall, and F1 score
accuracy = accuracy_score(y_true, y_pred_binary)
precision = precision_score(y_true, y_pred_binary)
recall = recall_score(y_true, y_pred_binary)
f1 = f1_score(y_true, y_pred_binary)


print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m3168/3168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 4ms/step - accuracy: 0.9130 - loss: 0.2124 - val_accuracy: 0.9730 - val_loss: 0.0730
Epoch 2/20
[1m3168/3168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 4ms/step - accuracy: 0.9613 - loss: 0.1074 - val_accuracy: 0.9774 - val_loss: 0.0646
Epoch 3/20
[1m3168/3168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - accuracy: 0.9679 - loss: 0.0904 - val_accuracy: 0.9768 - val_loss: 0.0674
Epoch 4/20
[1m3168/3168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 7ms/step - accuracy: 0.9715 - loss: 0.0811 - val_accuracy: 0.9801 - val_loss: 0.0580
Epoch 5/20
[1m3168/3168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 6ms/step - accuracy: 0.9752 - loss: 0.0722 - val_accuracy: 0.9805 - val_loss: 0.0562
Epoch 6/20
[1m3168/3168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 5ms/step - accuracy: 0.9755 - loss: 0.0710 - val_accuracy: 0.9795 - val_loss: 0.0589
Epoch 7/20

LSTM WITH FEEDFORWARD NEURAL NETWORK

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Concatenate, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Number of tokens per text sequence: used as `maxlen` for pad_sequences and
# as the shape of the text Input layer.
MAX_SEQUENCE_LENGTH = 50  
# Size of each learned token vector (`output_dim` of the Embedding layer).
EMBEDDING_DIM = 50

# preextracted features
def combine_features(df):
    """Join the pre-extracted feature columns of ``df`` into a single matrix.

    Every listed column is turned into an array via ``np.array(col.tolist())``
    and the arrays are concatenated along ``axis=1`` in the listed order.

    Args:
        df: DataFrame providing the six feature columns named below.

    Returns:
        One ``numpy`` array with a row per row of ``df``.
    """
    column_order = (
        'sender_features',
        'subject_features',
        'date_features',
        'body_features',
        'tfidf_features',
        'word2vec_features',
    )
    stacked = [np.array(df[name].tolist()) for name in column_order]
    return np.concatenate(stacked, axis=1)

# tokenise text, and do padding so that length is equal
def preprocess_text(df, max_sequence_length, tokenizer=None):
    """Convert ``df['processed_text']`` into padded integer sequences.

    Args:
        df: DataFrame with a ``processed_text`` column.
        max_sequence_length: value passed to ``pad_sequences`` as ``maxlen``.
        tokenizer: an already-fitted tokenizer to reuse; when ``None`` a new
            Keras ``Tokenizer`` is created and fitted on ``df``'s text.

    Returns:
        A ``(padded_sequences, tokenizer)`` tuple, where ``tokenizer`` is the
        one that was passed in or the newly fitted one.
    """
    texts = df['processed_text']
    if tokenizer is None:
        # No vocabulary supplied: build one from this frame's text.
        tokenizer = tf.keras.preprocessing.text.Tokenizer()
        tokenizer.fit_on_texts(texts)
    padded = pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=max_sequence_length,
    )
    return padded, tokenizer

train = pd.read_parquet('train.parquet')
test = pd.read_parquet('test.parquet')

X_train_combined = combine_features(train)
X_test_combined = combine_features(test)

# preprocess text for lstm: the tokenizer is fitted on the training text and
# reused for the test text so both share one vocabulary
X_train_text, tokenizer = preprocess_text(train, MAX_SEQUENCE_LENGTH)
X_test_text, _ = preprocess_text(test, MAX_SEQUENCE_LENGTH, tokenizer)
word_index = tokenizer.word_index

y_train = train['label'].values
y_test = test['label'].values

# split 80 20 (features, labels and padded text are split with the same shuffle)
X_train, X_val, y_train, y_val, body_train, body_val = train_test_split(
    X_train_combined, y_train, X_train_text, test_size=0.2, random_state=42
)

print(body_train.shape)
print(X_train.shape)

# lstm
input_body = Input(shape=(MAX_SEQUENCE_LENGTH,))
# `input_length` is deprecated in Keras 3; the sequence length is already
# fixed by `input_body`, so it is no longer passed to Embedding.
# input_dim is len(word_index) + 1 because index 0 is reserved for padding.
embedding_layer = Embedding(input_dim=len(word_index) + 1, output_dim=EMBEDDING_DIM)(input_body)
lstm_out = LSTM(32)(embedding_layer)

# Define input for other features
input_features = Input(shape=(X_train_combined.shape[1],))
concat = Concatenate()([lstm_out, input_features])

# feed forward with batch normalisation, dropout, relu and l2 regularisation
dense_out = Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(concat)
dense_out = BatchNormalization()(dense_out)
dense_out = Dropout(0.5)(dense_out)
dense_out = Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(dense_out)
dense_out = BatchNormalization()(dense_out)
dense_out = Dropout(0.3)(dense_out)
dense_out = Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(dense_out)
dense_out = BatchNormalization()(dense_out)
dense_out = Dropout(0.3)(dense_out)

output = Dense(1, activation='sigmoid')(dense_out)

model = Model(inputs=[input_body, input_features], outputs=output)

model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# early stopping: stop once val_loss has not improved for 3 epochs and keep
# the weights of the best epoch
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)


history = model.fit(
    [body_train, X_train],
    y_train,
    epochs=10,
    batch_size=64,
    validation_data=([body_val, X_val], y_val),
    callbacks=[early_stopping],
    verbose=1
)


# Report all four metrics from the SAME epoch: the one with the lowest
# val_loss, which is the quantity EarlyStopping monitors. Taking independent
# min()/max() over the history mixed values from different epochs under a
# single "Best Epoch" label.
best_epoch = int(np.argmin(history.history['val_loss']))

print("\nTraining complete.")
print(f"Best Epoch - Training Loss: {history.history['loss'][best_epoch]:.4f}, "
      f"Validation Loss: {history.history['val_loss'][best_epoch]:.4f}, "
      f"Training Accuracy: {history.history['accuracy'][best_epoch]:.4f}, "
      f"Validation Accuracy: {history.history['val_accuracy'][best_epoch]:.4f}")
# predict test set
y_pred = model.predict([X_test_text, X_test_combined])

# Threshold the sigmoid outputs at 0.5. The final Dense(1) layer yields one
# column per sample, so flatten to 1-D to match y_test.
y_pred_binary = (y_pred > 0.5).astype(int).ravel()

accuracy = accuracy_score(y_test, y_pred_binary)
precision = precision_score(y_test, y_pred_binary)
recall = recall_score(y_test, y_pred_binary)
f1 = f1_score(y_test, y_pred_binary)

# Print the evaluation metrics
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')


(101360, 50)
(101360, 1428)




Epoch 1/10




[1m1584/1584[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m496s[0m 310ms/step - accuracy: 0.8994 - loss: 1.0963 - val_accuracy: 0.9794 - val_loss: 0.1207
Epoch 2/10
[1m1584/1584[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m516s[0m 326ms/step - accuracy: 0.9788 - loss: 0.1143 - val_accuracy: 0.9798 - val_loss: 0.0952
Epoch 3/10
[1m1584/1584[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m500s[0m 316ms/step - accuracy: 0.9878 - loss: 0.0748 - val_accuracy: 0.9796 - val_loss: 0.0983
Epoch 4/10
[1m1584/1584[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m503s[0m 317ms/step - accuracy: 0.9874 - loss: 0.0747 - val_accuracy: 0.9781 - val_loss: 0.1208
Epoch 5/10
[1m1584/1584[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m495s[0m 312ms/step - accuracy: 0.9911 - loss: 0.0569 - val_accuracy: 0.9738 - val_loss: 0.1344

Training complete.
Best Epoch - Training Loss: 0.0605, Validation Loss: 0.0952, Training Accuracy: 0.9905, Validation Accuracy: 0.9798
[1m990/990[0m [32m━━━━━━━━━━━━━━

LSTM and CNN Hybrid

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Conv1D, MaxPooling1D, Flatten, Concatenate, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Number of tokens per text sequence: used as `maxlen` for pad_sequences and
# as the shape of the text Input layer.
MAX_SEQUENCE_LENGTH = 50  
# Size of each learned token vector (`output_dim` of the Embedding layer).
EMBEDDING_DIM = 50

# preextracted features
def combine_features(df):
    """Build one feature matrix from the pre-extracted feature columns.

    Each column named below is converted with ``np.array(col.tolist())``; the
    resulting arrays are concatenated side by side (``axis=1``) in that order.

    Args:
        df: DataFrame holding the six feature columns named below.

    Returns:
        A single ``numpy`` array with one row per row of ``df``.
    """
    parts = []
    for column in (
        'sender_features',
        'subject_features',
        'date_features',
        'body_features',
        'tfidf_features',
        'word2vec_features',
    ):
        parts.append(np.array(df[column].tolist()))
    return np.concatenate(parts, axis=1)

# tokenise and padding 
def preprocess_text(df, max_sequence_length, tokenizer=None):
    """Tokenise ``df['processed_text']`` and pad the result to a fixed length.

    Args:
        df: DataFrame with a ``processed_text`` column.
        max_sequence_length: forwarded to ``pad_sequences`` as ``maxlen``.
        tokenizer: fitted tokenizer to reuse; if ``None``, a fresh Keras
            ``Tokenizer`` is fitted on the text in ``df``.

    Returns:
        Tuple of ``(padded_sequences, tokenizer)``; the tokenizer is either
        the supplied one or the one fitted here.
    """
    corpus = df['processed_text']
    needs_fit = tokenizer is None
    if needs_fit:
        # Build the vocabulary from this frame when none was handed in.
        tokenizer = tf.keras.preprocessing.text.Tokenizer()
        tokenizer.fit_on_texts(corpus)
    sequences = tokenizer.texts_to_sequences(corpus)
    return pad_sequences(sequences, maxlen=max_sequence_length), tokenizer


train = pd.read_parquet('train.parquet')
test = pd.read_parquet('test.parquet')

X_train_combined = combine_features(train)
X_test_combined = combine_features(test)

# for cnn input: the tokenizer is fitted on the training text and reused for
# the test text so both share one vocabulary
X_train_text, tokenizer = preprocess_text(train, MAX_SEQUENCE_LENGTH)
X_test_text, _ = preprocess_text(test, MAX_SEQUENCE_LENGTH, tokenizer)
word_index = tokenizer.word_index

y_train = train['label'].values
y_test = test['label'].values

# split 80 20 (features, labels and padded text are split with the same shuffle)
X_train, X_val, y_train, y_val, body_train, body_val = train_test_split(
    X_train_combined, y_train, X_train_text, test_size=0.2, random_state=42
)


input_body = Input(shape=(MAX_SEQUENCE_LENGTH,))
# `input_length` is deprecated in Keras 3; the sequence length is already
# fixed by `input_body`, so it is no longer passed to Embedding.
# input_dim is len(word_index) + 1 because index 0 is reserved for padding.
embedding_layer = Embedding(input_dim=len(word_index) + 1, output_dim=EMBEDDING_DIM)(input_body)

# CNN layers
conv1 = Conv1D(filters=64, kernel_size=3, activation='relu')(embedding_layer)
pool1 = MaxPooling1D(pool_size=2)(conv1)

conv2 = Conv1D(filters=128, kernel_size=3, activation='relu')(pool1)
pool2 = MaxPooling1D(pool_size=2)(conv2)

# LSTM layer after CNN
lstm_out = LSTM(32)(pool2)


input_features = Input(shape=(X_train_combined.shape[1],))

# concatenate CNN-LSTM output with other features
concat = Concatenate()([lstm_out, input_features])

# FC layers
dense_out = Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(concat)  # L2 Regularization
dense_out = BatchNormalization()(dense_out)
dense_out = Dropout(0.5)(dense_out)
dense_out = Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(dense_out)  # L2 Regularization
dense_out = BatchNormalization()(dense_out)
dense_out = Dropout(0.3)(dense_out)
dense_out = Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(dense_out)  # L2 Regularization
dense_out = BatchNormalization()(dense_out)
dense_out = Dropout(0.3)(dense_out)

output = Dense(1, activation='sigmoid')(dense_out)

model = Model(inputs=[input_body, input_features], outputs=output)

model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# early stopping: stop once val_loss has not improved for 3 epochs and keep
# the weights of the best epoch
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    [body_train, X_train],
    y_train,
    epochs=10,
    batch_size=64,
    validation_data=([body_val, X_val], y_val),
    callbacks=[early_stopping],
    verbose=1
)


# Report all four metrics from the SAME epoch: the one with the lowest
# val_loss, which is the quantity EarlyStopping monitors. Taking independent
# min()/max() over the history mixed values from different epochs under a
# single "Best Epoch" label.
best_epoch = int(np.argmin(history.history['val_loss']))

print("\nTraining complete.")
print(f"Best Epoch - Training Loss: {history.history['loss'][best_epoch]:.4f}, "
      f"Validation Loss: {history.history['val_loss'][best_epoch]:.4f}, "
      f"Training Accuracy: {history.history['accuracy'][best_epoch]:.4f}, "
      f"Validation Accuracy: {history.history['val_accuracy'][best_epoch]:.4f}")

# predict on test
y_pred = model.predict([X_test_text, X_test_combined])

# Threshold the sigmoid outputs at 0.5. The final Dense(1) layer yields one
# column per sample, so flatten to 1-D to match y_test.
y_pred_binary = (y_pred > 0.5).astype(int).ravel()

accuracy = accuracy_score(y_test, y_pred_binary)
precision = precision_score(y_test, y_pred_binary)
recall = recall_score(y_test, y_pred_binary)
f1 = f1_score(y_test, y_pred_binary)

# Print the evaluation metrics
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')




Epoch 1/10




[1m1584/1584[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m524s[0m 326ms/step - accuracy: 0.8957 - loss: 1.0651 - val_accuracy: 0.9618 - val_loss: 0.1726
Epoch 2/10
[1m1584/1584[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m504s[0m 318ms/step - accuracy: 0.9791 - loss: 0.1155 - val_accuracy: 0.9777 - val_loss: 0.1011
Epoch 3/10
[1m1584/1584[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m504s[0m 318ms/step - accuracy: 0.9862 - loss: 0.0822 - val_accuracy: 0.9820 - val_loss: 0.0879
Epoch 4/10
[1m1584/1584[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m537s[0m 339ms/step - accuracy: 0.9895 - loss: 0.0610 - val_accuracy: 0.9781 - val_loss: 0.0971
Epoch 5/10
[1m1584/1584[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m509s[0m 322ms/step - accuracy: 0.9926 - loss: 0.0473 - val_accuracy: 0.9764 - val_loss: 0.1090
Epoch 6/10
[1m1584/1584[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m502s[0m 316ms/step - accuracy: 0.9932 - loss: 0.0432 - val_accuracy: 0.9738 - val_loss: 0.1488

Tr