In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
from scipy.sparse import csr_matrix, vstack, hstack
from sklearn.metrics import roc_auc_score


In [2]:
train = pd.read_pickle("data/train.pkl").dropna()
test = pd.read_pickle("data/test.pkl").dropna()
y_train = np.array(train['label'])
y_test = np.array(test['label'])

In [3]:
train_parent_tdidf_csr = vstack(train["parent_comment_tdidf"])
test_parent_tdidf_csr = vstack(test["parent_comment_tdidf"])

train_tdidf_csr = vstack(train["comment_tdidf_nn"])
test_tdidf_csr = vstack(test["comment_tdidf_nn"])

train_parent_bow_csr = vstack(train["parent_comment_bow"])
test_parent_bow_csr = vstack(test["parent_comment_bow"])

train_bow_csr = vstack(train["comment_bow"])
test_bow_csr = vstack(test["comment_bow"])

In [4]:
train_tdidf_csr

<160989x8098 sparse matrix of type '<class 'numpy.float64'>'
	with 850705 stored elements in Compressed Sparse Row format>

In [5]:
list_of_features = [
       'comment_word_count','parent_comment_word_count',
       'comment_token_count', 'parent_comment_token_count',
       'comment_unique_word_count', 'parent_comment_unique_word_count',
       'comment_unique_token_count', 'parent_comment_unique_token_count',
       'comment_stopword_count', 'parent_comment_stopword_count',
       'comment_mean_word_length', 'parent_comment_mean_word_length',
       'comment_mean_token_length', 'parent_comment_mean_token_length',
       'comment_char_count', 'parent_comment_char_count',
       'comment_punctuation_count', 'parent_comment_punctuation_count',
       'comment_hashtag_count', 'parent_comment_hashtag_count',
       'comment_number_count', 'parent_comment_number_count',
       'weighted_parent_sentiment_score_neutral',
       'weighted_parent_sentiment_score_positive',
       'weighted_comment_sentiment_score_neutral',
       'weighted_comment_sentiment_score_positive']
bool_cols = ['weighted_parent_sentiment_score_neutral',
             'weighted_parent_sentiment_score_positive',
             'weighted_comment_sentiment_score_neutral',
             'weighted_comment_sentiment_score_positive']

for col in bool_cols: #need to convert bool type to integer
    train[col] = train[col].astype(int)
    test[col] = test[col].astype(int)
X_train_gen_features = csr_matrix(train[list_of_features])

X_test_gen_features = csr_matrix(test[list_of_features])
x_train = hstack([X_train_gen_features,train_tdidf_csr]).toarray() #deep learning automates feature selection. from our supervised learning we have learnt that BoW adds no information given tf-idf.
x_test = hstack([X_test_gen_features,test_tdidf_csr]).toarray()

first implementation of rnn

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(128, activation='relu', input_dim=x_train.shape[1]))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=10, batch_size=8, validation_split=0.1)
y_pred_test = model.predict(x_test)
roc_auc_test = roc_auc_score(y_test, y_pred_test)
print(f'AUC: {roc_auc_test}')
#comment td-idf only AUC: 0.6628971516171749
#gen features + comment td-idf AUC: 0.7419122793471202


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
AUC: 0.744722806060709


using iterative hyperparameter tuning

In [7]:
from keras_tuner import HyperModel
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
#use tf.keras.optimizers.legacy.Adam if on M1/M2 macbook
class MyHyperModel(HyperModel):
    def __init__(self, input_dim):
        self.input_dim = input_dim

    def build(self, hp):
        model = Sequential()
        # First layer
        model.add(Dense(units=hp.Int('units_first', min_value=128, max_value=512, step=32),
                        activation='relu', input_dim=self.input_dim))
        model.add(Dropout(rate=hp.Float('dropout_first', min_value=0.1, max_value=0.5, step=0.1)))
        
        # Second layer
        model.add(Dense(units=hp.Int('units_second', min_value=64, max_value=256, step=32),
                        activation='relu'))
        model.add(Dropout(rate=hp.Float('dropout_second', min_value=0.1, max_value=0.5, step=0.1)))

        # Third layer
        model.add(Dense(units=hp.Int('units_third', min_value=32, max_value=128, step=32),
                        activation='relu'))
        model.add(Dropout(rate=hp.Float('dropout_third', min_value=0.1, max_value=0.5, step=0.1)))

        # Output layer
        model.add(Dense(1, activation='sigmoid'))

        # Tuning the learning rate
        hp_learning_rate = hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='LOG')
        
        model.compile(optimizer=Adam(learning_rate=hp_learning_rate),
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        return model

In [8]:
from kerastuner.tuners import Hyperband

hypermodel = MyHyperModel(input_dim=x_train.shape[1])

tuner = Hyperband(
    hypermodel,
    objective='val_accuracy',
    max_epochs=10,
    factor=3,
    directory='keras_tuner_dir',
    project_name='keras_tuner'
)
from tensorflow.keras.callbacks import EarlyStopping

stop_early = EarlyStopping(monitor='val_loss', patience=5)
tuner.search(x_train, y_train,
             epochs=50,
             validation_split=0.1,
             callbacks=[stop_early])


Reloading Tuner from keras_tuner_dir\keras_tuner\tuner0.json


  from kerastuner.tuners import Hyperband


In [9]:
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
best_model = tuner.hypermodel.build(best_hps)
# Summary of the best model
best_model.summary()
# Optionally, you can retrain the model with the best hyperparameters on the full dataset
best_model.fit(x_train, y_train, epochs=10, batch_size=8, validation_split=0.1)
y_pred_test = best_model.predict(x_test)
roc_auc_test = roc_auc_score(y_test, y_pred_test)
print(f'Best model AUC: {roc_auc_test}')


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 352)               2860000   
                                                                 
 dropout (Dropout)           (None, 352)               0         
                                                                 
 dense_4 (Dense)             (None, 160)               56480     
                                                                 
 dropout_1 (Dropout)         (None, 160)               0         
                                                                 
 dense_5 (Dense)             (None, 64)                10304     
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_6 (Dense)             (None, 1)                