In [1]:
import pandas as pd #pandas 1.5.3
import numpy as np

import tensorflow as tf
from scipy.sparse import csr_matrix, vstack, hstack
from sklearn.metrics import roc_auc_score
from joblib import load
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from lime.lime_tabular import LimeTabularExplainer

In [2]:
train = pd.read_pickle("data/train.pkl")
test = pd.read_pickle("data/test.pkl")
tfidf_vec = load('tdvectorizer_nn.pkl')
y_train = np.array(train['label'])
y_test = np.array(test['label'])


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [3]:
# Assuming tfidf_vec is your TF-IDF vectorizer and train_tfidf_features is your TF-IDF sparse matrix
feature_names = tfidf_vec.get_feature_names_out()

train_tfidf_csr = vstack(train["comment_tfidf_nn"])
test_tfidf_csr = vstack(test["comment_tfidf_nn"])
# Convert the TF-IDF vectors to a DataFrame
train_tfidf_df = pd.DataFrame(train_tfidf_csr.toarray(), columns=feature_names)
test_tfidf_df = pd.DataFrame(test_tfidf_csr.toarray(), columns=feature_names) # Now train_tfidf_df and test_tfidf_df have the original feature names as columns


ValueError: Shape of passed values is (241496, 11639), indices imply (241496, 6000)

In [None]:
list_of_features = [
       'comment_word_count','parent_comment_word_count',
       'comment_token_count', 'parent_comment_token_count',
       'comment_unique_word_count', 'parent_comment_unique_word_count',
       'comment_unique_token_count', 'parent_comment_unique_token_count',
       'comment_stopword_count', 'parent_comment_stopword_count',
       'comment_mean_word_length', 'parent_comment_mean_word_length',
       'comment_mean_token_length', 'parent_comment_mean_token_length',
       'comment_char_count', 'parent_comment_char_count',
       'comment_punctuation_count', 'parent_comment_punctuation_count',
       'comment_hashtag_count', 'parent_comment_hashtag_count',
       'comment_number_count', 'parent_comment_number_count',
       'weighted_parent_sentiment_score_neutral',
       'weighted_parent_sentiment_score_positive',
       'weighted_comment_sentiment_score_neutral',
       'weighted_comment_sentiment_score_positive']
bool_cols = ['weighted_parent_sentiment_score_neutral',
             'weighted_parent_sentiment_score_positive',
             'weighted_comment_sentiment_score_neutral',
             'weighted_comment_sentiment_score_positive']

for col in bool_cols: #need to convert bool type to integer
    train[col] = train[col].astype(int)
    test[col] = test[col].astype(int)

x_train = pd.concat([train[list_of_features].reset_index(drop=True), train_tfidf_df.reset_index(drop=True)], axis=1)
x_test = pd.concat([test[list_of_features].reset_index(drop=True), test_tfidf_df.reset_index(drop=True)], axis=1)


## Feedforward NN
- We have tuned a baseline model using keras tuner, of which the code is below. We managed to achieve about 0.744 AUC at best.

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(128, activation='relu', input_dim=x_train.shape[1]))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=10, batch_size=8, validation_split=0.1)




: 

Visualising ROC AUC curve

In [None]:
y_pred_test = model.predict(x_test)
#roc_auc_test = roc_auc_score(y_test, y_pred_test)
#comment td-idf only AUC: 0.6628971516171749
#gen features + comment td-idf AUC: 0.7419122793471202
fpr, tpr, thresholds = roc_curve(y_test, y_pred_test)
roc_auc_test = auc(fpr, tpr)
print(f'AUC: {roc_auc_test}')

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='No Skill')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()


In [None]:
raw_data = np.hstack([raw_text_data, raw_other_features])
feature_names = ['comment_text'] + list_of_features  # Assuming 'comment_text' is the name of your text column
explainer = LimeTabularExplainer(
    training_data=raw_data,
    feature_names=feature_names,
    class_names=['class_0', 'class_1'],  # Update with your class names
    discretize_continuous=True
)
def predict_fn(data):
    # Split the data back into text and other features
    text_data = data[:, 0]
    other_features_data = data[:, 1:]

    # Preprocess the text data and other features
    # For text, use your tfidf_vec object
    text_data_tfidf = tfidf_vec.transform(text_data)

    # For other features, apply any required preprocessing (e.g., scaling)
    # ...

    # Combine them back
    final_data = hstack([text_data_tfidf, other_features_data]).toarray()

    # Get predictions
    preds = model.predict(final_data)
    return preds
idx_to_explain = 50  # Example index
data_to_explain = raw_data[idx_to_explain]
exp = explainer.explain_instance(
    data_row=data_to_explain,
    predict_fn=predict_fn
)

# Display the explanation
exp.show_in_notebook(show_table=True)


using iterative hyperparameter tuning

In [7]:
from keras_tuner import HyperModel
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
#use tf.keras.optimizers.legacy.Adam if on M1/M2 macbook
class MyHyperModel(HyperModel):
    def __init__(self, input_dim):
        self.input_dim = input_dim

    def build(self, hp):
        model = Sequential()
        # First layer
        model.add(Dense(units=hp.Int('units_first', min_value=128, max_value=512, step=32),
                        activation='relu', input_dim=self.input_dim))
        model.add(Dropout(rate=hp.Float('dropout_first', min_value=0.1, max_value=0.5, step=0.1)))
        
        # Second layer
        model.add(Dense(units=hp.Int('units_second', min_value=64, max_value=256, step=32),
                        activation='relu'))
        model.add(Dropout(rate=hp.Float('dropout_second', min_value=0.1, max_value=0.5, step=0.1)))

        # Third layer
        model.add(Dense(units=hp.Int('units_third', min_value=32, max_value=128, step=32),
                        activation='relu'))
        model.add(Dropout(rate=hp.Float('dropout_third', min_value=0.1, max_value=0.5, step=0.1)))

        # Output layer
        model.add(Dense(1, activation='sigmoid'))

        # Tuning the learning rate
        hp_learning_rate = hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='LOG')
        
        model.compile(optimizer=Adam(learning_rate=hp_learning_rate),
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        return model

Using TensorFlow backend


In [8]:
from kerastuner.tuners import Hyperband

hypermodel = MyHyperModel(input_dim=x_train.shape[1])

tuner = Hyperband(
    hypermodel,
    objective='val_accuracy',
    max_epochs=10,
    factor=3,
    directory='keras_tuner_dir',
    project_name='keras_tuner'
)
from tensorflow.keras.callbacks import EarlyStopping

stop_early = EarlyStopping(monitor='val_loss', patience=5)
tuner.search(x_train, y_train,
             epochs=50,
             validation_split=0.1,
             callbacks=[stop_early])


Reloading Tuner from keras_tuner_dir\keras_tuner\tuner0.json


  from kerastuner.tuners import Hyperband


In [9]:
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
best_model = tuner.hypermodel.build(best_hps)
# Summary of the best model
best_model.summary()
# Optionally, you can retrain the model with the best hyperparameters on the full dataset
#best_model.fit(x_train, y_train, epochs=10, batch_size=8, validation_split=0.1)
#y_pred_test = best_model.predict(x_test)
#roc_auc_test = roc_auc_score(y_test, y_pred_test)
#print(f'Best model AUC: {roc_auc_test}')


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 352)               2860000   
                                                                 
 dropout (Dropout)           (None, 352)               0         
                                                                 
 dense_4 (Dense)             (None, 160)               56480     
                                                                 
 dropout_1 (Dropout)         (None, 160)               0         
                                                                 
 dense_5 (Dense)             (None, 64)                10304     
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_6 (Dense)             (None, 1)                

### RNN implementation

# Idea: 
- LSTM branch-> iteratively takes in comment tokens
- Dense branch-> dense layer takes in the other features
- Merge branches
- one more Dense layer


In [3]:
rnn_features = [
       'comment_word_count','parent_comment_word_count',
       'comment_token_count', 'parent_comment_token_count',
       'comment_unique_word_count', 'parent_comment_unique_word_count',
       'comment_unique_token_count', 'parent_comment_unique_token_count',
       'comment_stopword_count', 'parent_comment_stopword_count',
       'comment_mean_word_length', 'parent_comment_mean_word_length',
       'comment_mean_token_length', 'parent_comment_mean_token_length',
       'comment_char_count', 'parent_comment_char_count',
       'comment_punctuation_count', 'parent_comment_punctuation_count',
       'comment_hashtag_count', 'parent_comment_hashtag_count',
       'comment_number_count', 'parent_comment_number_count',
       'weighted_parent_sentiment_score_neutral',
       'weighted_parent_sentiment_score_positive',
       'weighted_comment_sentiment_score_neutral',
       'weighted_comment_sentiment_score_positive', 'documents_comment']#, 'documents_parent_comment'] #if doesnt work we try comment_tfidf_nn
train["documents_comment"] = train['comment_tokens'].apply(lambda tokens_list:' '.join(tokens_list)) #keras needs to use own tokenizer
#train["documents_parent_comment"] = train['parent_comment_tokens'].apply(lambda tokens_list:' '.join(tokens_list))
test["documents_comment"] = test['comment_tokens'].apply(lambda tokens_list:' '.join(tokens_list))
#test["documents_parent_comment"] = test['parent_comment_tokens'].apply(lambda tokens_list:' '.join(tokens_list))

rnn_train = train[rnn_features]
rnn_test = test[rnn_features]

In [4]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Concatenate, TextVectorization
from tensorflow.keras.models import Model


# TextVectorization for comment and parent_comment
max_features = 8000 #follow number of tokens for feedforward


vectorize_layer_comment = TextVectorization(
    max_tokens=max_features,
    split='whitespace',
    ngrams=3
    )
# Prepare dataset for TextVectorization adapt
train_texts = rnn_train['documents_comment'].tolist()
test_texts = rnn_test['documents_comment'].tolist()
vectorize_layer_comment.adapt(train_texts)

In [None]:
from tensorflow.keras.layers import Embedding
#from tensorflow.keras.optimizers import Adam
#use tf.keras.optimizers.legacy.Adam if on M1/M2 macbook
# Assuming max_features is the vocabulary size and embedding_dim is the dimension of the embedding
embedding_dim = 128  # You can choose an appropriate value

# Add an Embedding layer after text vectorization
embedding_layer = Embedding(max_features, embedding_dim)

# LSTM Branch
text_input_comment = Input(shape=(1,), dtype=tf.string, name='text_comment')
text_features_comment = vectorize_layer_comment(text_input_comment)
text_features_comment = embedding_layer(text_features_comment)  # Embedding layer
lstm_comment = LSTM(64)(text_features_comment)

# Dense Features Branch
other_features_input = Input(shape=(len(rnn_features) - 1,), name='other_features')
dense_features = Dense(128, activation='relu')(other_features_input)

# Concatenate
concatenated = Concatenate()([lstm_comment, dense_features])

# Additional Dense Layers
output = Dense(64, activation='relu')(concatenated)
output = Dense(1, activation='sigmoid')(output)

# Build Model
model = Model(inputs=[text_input_comment, other_features_input], outputs=output)
#optimiser = tf.keras.optimizers.legacy.SGD(learning_rate=0.001)
# Compile
model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['accuracy'])#use Adam optimiser


In [None]:

# Prepare text inputs
train_texts = np.array(train_texts)[:, np.newaxis]
test_texts = np.array(test_texts)[:, np.newaxis]

# Prepare other features inputs
feature_columns = [col for col in rnn_features if col != 'documents_comment']

train_other_features = np.array(rnn_train[feature_columns]).astype(np.float32)
test_other_features = np.array(rnn_test[feature_columns]).astype(np.float32)

# Fit the model
history = model.fit(
    [train_texts, train_other_features], y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.1  # or use a validation set
)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate([test_texts, test_other_features], y_test)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

# Predict probabilities for the test set
y_pred_probs = model.predict([test_texts, test_other_features]).ravel()

# Calculate ROC-AUC
roc_auc = roc_auc_score(y_test, y_pred_probs)
print(f"ROC-AUC Score: {roc_auc}")