In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
from scipy.sparse import csr_matrix, vstack, hstack
from sklearn.metrics import roc_auc_score


In [2]:
train = pd.read_pickle("data/train.pkl").dropna()
test = pd.read_pickle("data/test.pkl").dropna()
y_train = np.array(train['label'])
y_test = np.array(test['label'])

In [4]:
train_parent_tdidf_csr = vstack(train["parent_comment_tdidf"])
test_parent_tdidf_csr = vstack(test["parent_comment_tdidf"])

train_tdidf_csr = vstack(train["comment_tdidf_nn"])
test_tdidf_csr = vstack(test["comment_tdidf_nn"])

train_parent_bow_csr = vstack(train["parent_comment_bow"])
test_parent_bow_csr = vstack(test["parent_comment_bow"])

train_bow_csr = vstack(train["comment_bow"])
test_bow_csr = vstack(test["comment_bow"])

In [5]:
train_tdidf_csr

<161010x8160 sparse matrix of type '<class 'numpy.float64'>'
	with 855934 stored elements in Compressed Sparse Row format>

In [6]:
list_of_features = [
       'comment_word_count','parent_comment_word_count',
       'comment_token_count', 'parent_comment_token_count',
       'comment_unique_word_count', 'parent_comment_unique_word_count',
       'comment_unique_token_count', 'parent_comment_unique_token_count',
       'comment_stopword_count', 'parent_comment_stopword_count',
       'comment_mean_word_length', 'parent_comment_mean_word_length',
       'comment_mean_token_length', 'parent_comment_mean_token_length',
       'comment_char_count', 'parent_comment_char_count',
       'comment_punctuation_count', 'parent_comment_punctuation_count',
       'comment_hashtag_count', 'parent_comment_hashtag_count',
       'comment_number_count', 'parent_comment_number_count',
       'weighted_parent_sentiment_score_neutral',
       'weighted_parent_sentiment_score_positive',
       'weighted_comment_sentiment_score_neutral',
       'weighted_comment_sentiment_score_positive']
bool_cols = ['weighted_parent_sentiment_score_neutral',
             'weighted_parent_sentiment_score_positive',
             'weighted_comment_sentiment_score_neutral',
             'weighted_comment_sentiment_score_positive']

for col in bool_cols: #need to convert bool type to integer
    train[col] = train[col].astype(int)
    test[col] = test[col].astype(int)
X_train_gen_features = csr_matrix(train[list_of_features])

X_test_gen_features = csr_matrix(test[list_of_features])
x_train = hstack([X_train_gen_features,train_tdidf_csr]).toarray() #deep learning automates feature selection. from our supervised learning we have learnt that BoW adds no information given tf-idf.
x_test = hstack([X_test_gen_features,test_tdidf_csr]).toarray()

Feedforward NN

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(128, activation='relu', input_dim=x_train.shape[1]))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=10, batch_size=8, validation_split=0.1)
y_pred_test = model.predict(x_test)
roc_auc_test = roc_auc_score(y_test, y_pred_test)
print(f'AUC: {roc_auc_test}')
#comment td-idf only AUC: 0.6628971516171749
#gen features + comment td-idf AUC: 0.7419122793471202


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
AUC: 0.744722806060709


using iterative hyperparameter tuning

In [7]:
from keras_tuner import HyperModel
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
#use tf.keras.optimizers.legacy.Adam if on M1/M2 macbook
class MyHyperModel(HyperModel):
    def __init__(self, input_dim):
        self.input_dim = input_dim

    def build(self, hp):
        model = Sequential()
        # First layer
        model.add(Dense(units=hp.Int('units_first', min_value=128, max_value=512, step=32),
                        activation='relu', input_dim=self.input_dim))
        model.add(Dropout(rate=hp.Float('dropout_first', min_value=0.1, max_value=0.5, step=0.1)))
        
        # Second layer
        model.add(Dense(units=hp.Int('units_second', min_value=64, max_value=256, step=32),
                        activation='relu'))
        model.add(Dropout(rate=hp.Float('dropout_second', min_value=0.1, max_value=0.5, step=0.1)))

        # Third layer
        model.add(Dense(units=hp.Int('units_third', min_value=32, max_value=128, step=32),
                        activation='relu'))
        model.add(Dropout(rate=hp.Float('dropout_third', min_value=0.1, max_value=0.5, step=0.1)))

        # Output layer
        model.add(Dense(1, activation='sigmoid'))

        # Tuning the learning rate
        hp_learning_rate = hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='LOG')
        
        model.compile(optimizer=Adam(learning_rate=hp_learning_rate),
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        return model

Using TensorFlow backend


In [8]:
from kerastuner.tuners import Hyperband

hypermodel = MyHyperModel(input_dim=x_train.shape[1])

tuner = Hyperband(
    hypermodel,
    objective='val_accuracy',
    max_epochs=10,
    factor=3,
    directory='keras_tuner_dir',
    project_name='keras_tuner'
)
from tensorflow.keras.callbacks import EarlyStopping

stop_early = EarlyStopping(monitor='val_loss', patience=5)
tuner.search(x_train, y_train,
             epochs=50,
             validation_split=0.1,
             callbacks=[stop_early])


Reloading Tuner from keras_tuner_dir\keras_tuner\tuner0.json


  from kerastuner.tuners import Hyperband


In [9]:
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
best_model = tuner.hypermodel.build(best_hps)
# Summary of the best model
best_model.summary()
# Optionally, you can retrain the model with the best hyperparameters on the full dataset
best_model.fit(x_train, y_train, epochs=10, batch_size=8, validation_split=0.1)
y_pred_test = best_model.predict(x_test)
roc_auc_test = roc_auc_score(y_test, y_pred_test)
print(f'Best model AUC: {roc_auc_test}')


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 352)               2860000   
                                                                 
 dropout (Dropout)           (None, 352)               0         
                                                                 
 dense_4 (Dense)             (None, 160)               56480     
                                                                 
 dropout_1 (Dropout)         (None, 160)               0         
                                                                 
 dense_5 (Dense)             (None, 64)                10304     
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_6 (Dense)             (None, 1)                

### RNN implementation

# Idea: 
- 2 LSTM branches-> iteratively takes in comment tokens, and parent comment tokens
- Dense branch-> dense layer takes in the other features
- Merge branches
- one more Dense layer


In [3]:
rnn_features = [
       'comment_word_count','parent_comment_word_count',
       'comment_token_count', 'parent_comment_token_count',
       'comment_unique_word_count', 'parent_comment_unique_word_count',
       'comment_unique_token_count', 'parent_comment_unique_token_count',
       'comment_stopword_count', 'parent_comment_stopword_count',
       'comment_mean_word_length', 'parent_comment_mean_word_length',
       'comment_mean_token_length', 'parent_comment_mean_token_length',
       'comment_char_count', 'parent_comment_char_count',
       'comment_punctuation_count', 'parent_comment_punctuation_count',
       'comment_hashtag_count', 'parent_comment_hashtag_count',
       'comment_number_count', 'parent_comment_number_count',
       'weighted_parent_sentiment_score_neutral',
       'weighted_parent_sentiment_score_positive',
       'weighted_comment_sentiment_score_neutral',
       'weighted_comment_sentiment_score_positive', 'documents_comment']#, 'documents_parent_comment'] #if doesnt work we try comment_tdidf_nn
train["documents_comment"] = train['comment_tokens'].apply(lambda tokens_list:' '.join(tokens_list)) #keras needs to use own tokenizer
#train["documents_parent_comment"] = train['parent_comment_tokens'].apply(lambda tokens_list:' '.join(tokens_list))
test["documents_comment"] = test['comment_tokens'].apply(lambda tokens_list:' '.join(tokens_list))
#test["documents_parent_comment"] = test['parent_comment_tokens'].apply(lambda tokens_list:' '.join(tokens_list))

rnn_train = train[rnn_features]
rnn_test = test[rnn_features]

In [4]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Concatenate, TextVectorization
from tensorflow.keras.models import Model


# TextVectorization for comment and parent_comment
max_features = 8000 #follow number of tokens for feedforward


vectorize_layer_comment = TextVectorization(
    max_tokens=max_features,
    split='whitespace',
    ngrams=3
    )
# Prepare dataset for TextVectorization adapt
train_texts = rnn_train['documents_comment'].tolist()
test_texts = rnn_test['documents_comment'].tolist()
vectorize_layer_comment.adapt(train_texts)

In [6]:
from tensorflow.keras.layers import Embedding
#from tensorflow.keras.optimizers import Adam
#use tf.keras.optimizers.legacy.Adam if on M1/M2 macbook
# Assuming max_features is the vocabulary size and embedding_dim is the dimension of the embedding
embedding_dim = 128  # You can choose an appropriate value

# Add an Embedding layer after text vectorization
embedding_layer = Embedding(max_features, embedding_dim)

# LSTM Branch
text_input_comment = Input(shape=(1,), dtype=tf.string, name='text_comment')
text_features_comment = vectorize_layer_comment(text_input_comment)
text_features_comment = embedding_layer(text_features_comment)  # Embedding layer
lstm_comment = LSTM(64)(text_features_comment)

# Dense Features Branch
other_features_input = Input(shape=(len(rnn_features) - 1,), name='other_features')
dense_features = Dense(128, activation='relu')(other_features_input)

# Concatenate
concatenated = Concatenate()([lstm_comment, dense_features])

# Additional Dense Layers
output = Dense(64, activation='relu')(concatenated)
output = Dense(1, activation='sigmoid')(output)

# Build Model
model = Model(inputs=[text_input_comment, other_features_input], outputs=output)
#optimiser = tf.keras.optimizers.legacy.SGD(learning_rate=0.001)
# Compile
model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['accuracy'])#use Adam optimiser


In [7]:

# Prepare text inputs
train_texts = np.array(train_texts)[:, np.newaxis]
test_texts = np.array(test_texts)[:, np.newaxis]

# Prepare other features inputs
feature_columns = [col for col in rnn_features if col != 'documents_comment']

train_other_features = np.array(rnn_train[feature_columns]).astype(np.float32)
test_other_features = np.array(rnn_test[feature_columns]).astype(np.float32)

# Fit the model
history = model.fit(
    [train_texts, train_other_features], y_train,
    epochs=40,
    batch_size=32,
    validation_split=0.25  # or use a validation set
)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate([test_texts, test_other_features], y_test)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

# Predict probabilities for the test set
y_pred_probs = model.predict([test_texts, test_other_features]).ravel()

# Calculate ROC-AUC
roc_auc = roc_auc_score(y_test, y_pred_probs)
print(f"ROC-AUC Score: {roc_auc}")

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test Loss: 1.6900098323822021, Test Accuracy: 0.6306102871894836
ROC-AUC Score: 0.6599739705156451
