In [2]:
import pandas as pd
import numpy as np

import tensorflow as tf
from scipy.sparse import csr_matrix, vstack, hstack


In [4]:
# Load the pickled train/test splits and keep only rows with no missing values.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
train, test = (
    pd.read_pickle(pickle_path).dropna()
    for pickle_path in ("data/train.pkl", "data/test.pkl")
)

In [5]:
# Stack the per-row entries of every text-feature column into one sparse
# matrix per split. Each pair below is produced as (train, test).
# NOTE(review): assumes every cell in these columns is a single-row scipy
# sparse vector of equal width -- TODO confirm against the preprocessing step.

# TF-IDF of the parent comment.
train_parent_tdidf_csr, test_parent_tdidf_csr = (
    vstack(split["parent_comment_tdidf"]) for split in (train, test)
)

# TF-IDF of the comment itself.
train_tdidf_csr, test_tdidf_csr = (
    vstack(split["comment_tdidf"]) for split in (train, test)
)

# Bag-of-words of the parent comment.
train_parent_bow_csr, test_parent_bow_csr = (
    vstack(split["parent_comment_bow"]) for split in (train, test)
)

# Bag-of-words of the comment itself.
train_bow_csr, test_bow_csr = (
    vstack(split["comment_bow"]) for split in (train, test)
)

In [6]:
# Sentiment indicator columns; the original author notes these are stored as
# booleans and must be cast to integers before building the feature matrix.
bool_cols = [
    'weighted_parent_sentiment_score_neutral',
    'weighted_parent_sentiment_score_positive',
    'weighted_comment_sentiment_score_neutral',
    'weighted_comment_sentiment_score_positive',
]

# Hand-crafted features: one comment/parent pair per text statistic, followed
# by the sentiment indicator columns in the same order as `bool_cols`.
list_of_features = [
    'comment_word_count', 'parent_comment_word_count',
    'comment_token_count', 'parent_comment_token_count',
    'comment_unique_word_count', 'parent_comment_unique_word_count',
    'comment_unique_token_count', 'parent_comment_unique_token_count',
    'comment_stopword_count', 'parent_comment_stopword_count',
    'comment_mean_word_length', 'parent_comment_mean_word_length',
    'comment_mean_token_length', 'parent_comment_mean_token_length',
    'comment_char_count', 'parent_comment_char_count',
    'comment_punctuation_count', 'parent_comment_punctuation_count',
    'comment_hashtag_count', 'parent_comment_hashtag_count',
    'comment_number_count', 'parent_comment_number_count',
] + bool_cols

# Cast the indicator columns to int in both splits (mutates the frames).
for split in (train, test):
    for col in bool_cols:
        split[col] = split[col].astype(int)

# Targets.
y_train = train['label']
y_test = test['label']

# Sparse matrices holding only the hand-crafted features.
X_train_gen_features = csr_matrix(train[list_of_features])
X_test_gen_features = csr_matrix(test[list_of_features])

# Combined training matrix: hand-crafted features + comment TF-IDF + parent
# TF-IDF. The BoW matrices are left out on purpose: per the authors' earlier
# supervised-learning experiments, BoW adds no information given TF-IDF, and
# the deep model is expected to handle feature selection itself.
X_train = hstack([X_train_gen_features, train_tdidf_csr, train_parent_tdidf_csr])

In [7]:
# Prepare the RNN inputs.
# Labels as a plain NumPy array.
y_train_rnn = np.array(y_train)
# Densify the comment TF-IDF matrix and insert a length-1 axis in the middle,
# giving shape (n_samples, 1, n_features): every sample becomes a sequence
# with a single timestep.
# NOTE(review): .toarray() materialises the whole matrix in memory.
X_train_rnn = np.expand_dims(train_tdidf_csr.toarray(), axis=1)

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, SimpleRNN

# Binary classifier: one SimpleRNN layer (50 units, final state only) feeding
# a single sigmoid output unit.
model = Sequential()
# Declare the input shape with an explicit Input object instead of the
# `input_shape=` layer kwarg, which Keras 3 deprecates (it emits a warning).
# The shape is (timesteps, features), taken from the prepared training array.
model.add(Input(shape=X_train_rnn.shape[1:]))
model.add(SimpleRNN(50, return_sequences=False))
model.add(Dense(1, activation='sigmoid'))

# Adam + binary cross-entropy, tracking accuracy during fit/evaluate.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [9]:
# Train for 10 epochs with mini-batches of 32, holding out 10% of the
# training arrays for validation.
# NOTE(review): per the Keras docs the validation slice is taken from the end
# of the arrays before shuffling -- confirm the rows are not ordered by label.
model.fit(
    x=X_train_rnn,
    y=y_train_rnn,
    batch_size=32,
    epochs=10,
    validation_split=0.1,
)

: 

In [None]:
# Build the test inputs the same way as the training inputs: densify the
# comment TF-IDF matrix and add a length-1 timestep axis, giving shape
# (n_samples, 1, n_features).
X_test_rnn = np.expand_dims(test_tdidf_csr.toarray(), axis=1)
y_test_rnn = np.array(y_test)

# Evaluate on the held-out split; with the compiled metrics this returns the
# loss followed by accuracy.
loss, accuracy = model.evaluate(x=X_test_rnn, y=y_test_rnn)
print(f"Test Accuracy: {accuracy * 100:.2f}%")