In [11]:
import pandas as pd
import numpy as np
import nltk
import re
from tensorflow.keras.layers import Input, Embedding, LSTM, concatenate, Dense, SpatialDropout1D, Bidirectional, Dropout
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import classification_report

# 1) Preparing Data

In [13]:
train_df = pd.read_csv('train_data.csv')
test_df = pd.read_csv('test_data.csv')

In [14]:
# Define features
features = ['num_sentences', 'misspelling_percentage', 'pos_verbs_percentage',
             'spaces_percentage', 'sentiment_score', 'money_score', 'payment_score',
             'celebration_score', 'achievement_score', 'url_presence',
             'phone_number_presence']

train_text_data = train_df['cleaned_text'].astype(str)
train_numerical_features = train_df[features].values
train_labels = train_df['binary_label']

test_text_data = test_df['cleaned_text'].astype(str)
test_numerical_features = test_df[features].values
test_labels = test_df['binary_label']

# Text data preprocessing
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text_data)
X_train_text = tokenizer.texts_to_sequences(train_text_data)
X_test_text = tokenizer.texts_to_sequences(test_text_data)
max_length = 200 # majority of sequences have less than 200 tokens
X_train_text = pad_sequences(X_train_text, maxlen=max_length)
X_test_text = pad_sequences(X_test_text, maxlen=max_length)

# 2) Model architecture

1. **Embedding Layer**:
   - Converts integer-encoded tokens into dense vectors of fixed size.
   - Each token is mapped to a unique vector in a high-dimensional space.
   - Captures semantic relationships between words based on their context.


2. **Spatial Dropout Layer**:
   - Applies dropout regularization specifically designed for 1D input data (e.g., sequences).
   - Randomly sets a fraction of input units to zero during training to prevent overfitting.


3. **Bidirectional LSTM Layer**:
   - Consists of forward and backward LSTM units, allowing it to capture information from both past and future context.
   - Each LSTM unit maintains an internal state and processes the input sequence step by step, updating its state at each time step.


5. **Concatenation Layer**:
   - Combines the outputs of the LSTM layer (both forward and backward representations) with the numerical features.


6. **Dropout Layer**:
   - Applies dropout regularization to the concatenated features.
   - Randomly sets a fraction of input units to zero during training to prevent overfitting.


7. **Dense Output Layer**:
   - A fully connected layer that produces the final output predictions.
   - Uses a sigmoid activation function to output a probability score for binary classification tasks.
   - Output value close to 1 indicates a positive prediction, while a value close to 0 indicates a negative prediction.

In [15]:
# Model architecture
text_input = Input(shape=(max_length,), name='text_input')
numerical_input = Input(shape=(len(features),), name='numerical_input')
embedding_layer = Embedding(len(tokenizer.word_index) + 1, 128, input_length=max_length)(text_input)
spatial_dropout = SpatialDropout1D(0.2)(embedding_layer)
lstm_layer = Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2))(spatial_dropout)
concatenated = concatenate([lstm_layer, numerical_input])
dropout_layer = Dropout(0.2)(concatenated)
output = Dense(1, activation='sigmoid')(dropout_layer)
model = Model(inputs=[text_input, numerical_input], outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Callback
checkpoint_filepath = 'best_lstm.h5'
checkpoint_callback = ModelCheckpoint(filepath=checkpoint_filepath,
                                      monitor='val_loss',
                                      save_best_only=True,
                                      mode='min',
                                      verbose=1)

In [17]:
model.fit([X_train_text, train_numerical_features], train_labels, 
          epochs=10, batch_size=32, validation_split=0.2,
          callbacks=[checkpoint_callback])

Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.10546, saving model to best_lstm.h5
Epoch 2/10

Epoch 00002: val_loss improved from 0.10546 to 0.09136, saving model to best_lstm.h5
Epoch 3/10

Epoch 00003: val_loss did not improve from 0.09136
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.09136
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.09136
Epoch 6/10

Epoch 00006: val_loss did not improve from 0.09136
Epoch 7/10

Epoch 00007: val_loss did not improve from 0.09136
Epoch 8/10

Epoch 00008: val_loss did not improve from 0.09136
Epoch 9/10

Epoch 00009: val_loss did not improve from 0.09136
Epoch 10/10

Epoch 00010: val_loss did not improve from 0.09136


<tensorflow.python.keras.callbacks.History at 0x188a84bdbb0>

# 3) Model Evaluation
The best model is saved from epoch 2 where `val_loss` was lowest. We need to convert the output into binary predictions before doing the classfication report. We find that the model is performing well enough even for the minority `spam` class. Hence we will not proceed with hyperparameter tuning or changing the model architecture due to computational constraints.

In [18]:
# Predict labels for test data
best_model = load_model(checkpoint_filepath)

y_pred = best_model.predict([X_test_text, test_numerical_features])
y_pred = (y_pred > 0.5).astype(int)  # Convert probabilities to binary predictions

# Convert labels to binary values (0 or 1)
test_labels_binary = (test_labels > 0.5).astype(int)

# Print classification report
print(classification_report(test_labels_binary, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97      7169
           1       0.96      0.96      0.96      4704

    accuracy                           0.97     11873
   macro avg       0.97      0.97      0.97     11873
weighted avg       0.97      0.97      0.97     11873

