In [0]:
# Expose Google Drive inside the Colab runtime under /content/gdrive.
from google.colab import drive

drive.mount(mountpoint='/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import sys

# Shared project folder on the mounted Drive; later cells build data/model/output
# file names by appending to this string.
path = '/content/gdrive/Team Drives/cs273p project'

# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if path not in sys.path:
    sys.path.append(path)

In [0]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

import re
from nltk.stem import WordNetLemmatizer 
import nltk
nltk.download('wordnet') 

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix, hstack
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU
from keras.layers import Bidirectional, GlobalMaxPool1D, Conv1D, MaxPooling1D
from keras.models import Model, Sequential, load_model
from keras.callbacks import EarlyStopping

from keras import initializers, regularizers, constraints, optimizers, layers

from sklearn.metrics import roc_auc_score

Using TensorFlow backend.


In [0]:
import pickle

def saveobj(save_list, filename, base_dir=None):
    """Pickle ``save_list`` to the file ``<base_dir>/<filename>``.

    Parameters
    ----------
    save_list : object
        Any picklable object.
    filename : str
        File name (or relative path) appended to ``base_dir`` after a ``/``.
    base_dir : str, optional
        Directory prefix. When omitted, the notebook-level ``path`` is used,
        so existing two-argument calls behave exactly as before.
    """
    if base_dir is None:
        # Default to the project folder configured at the top of the notebook.
        base_dir = path
    with open(base_dir + '/' + filename, 'wb') as f:  # pickle needs binary mode
        pickle.dump(save_list, f)
        
# restore object        
def loadobj(filename, base_dir=None):
    """Load and return the object pickled at ``<base_dir>/<filename>``.

    Parameters
    ----------
    filename : str
        File name (or relative path) appended to ``base_dir`` after a ``/``.
    base_dir : str, optional
        Directory prefix. When omitted, the notebook-level ``path`` is used,
        so existing one-argument calls behave exactly as before.

    Returns
    -------
    object
        Whatever object was stored in the pickle file.
    """
    if base_dir is None:
        # Default to the project folder configured at the top of the notebook.
        base_dir = path
    # SECURITY NOTE: pickle.load can execute arbitrary code embedded in the
    # file. Only call this on files produced by this project / trusted sources.
    with open(base_dir + '/' + filename, 'rb') as f:  # pickle needs binary mode
        return pickle.load(f)

In [0]:
def nn_submission(isTest, filename, Y_t, Y_t_p, Y_t_result):
    """Print per-class ROC AUC scores and write two submission CSV files.

    Relies on the notebook-level names ``test_df``, ``valid_df``, ``classes``
    and ``path``.

    Parameters
    ----------
    isTest : bool
        Truthy -> ids come from ``test_df['id']``; otherwise ``valid_df['id']``.
    filename : str
        Label printed in the header and embedded in both output file names.
    Y_t : 2-D array
        True labels; column ``i`` is scored against column ``i`` of ``Y_t_p``.
    Y_t_p : 2-D array
        Predicted scores, one column per entry of ``classes``; written to the
        ``prob_*`` file.
    Y_t_result : 2-D array
        Values written to the ``bin_*`` file, one column per entry of ``classes``.
    """
    print(filename + ' submission:')

    # Both output frames start from the id column of the selected split.
    id_column = (test_df if isTest else valid_df)['id']
    prob_frame = pd.DataFrame.from_dict({'id': id_column})
    binary_frame = pd.DataFrame.from_dict({'id': id_column})

    auc_per_class = []
    for col, class_name in enumerate(classes):
        auc = roc_auc_score(Y_t[:, col], Y_t_p[:, col])
        auc_per_class.append(auc)
        print (class_name, ': ', auc)

        prob_frame[class_name] = Y_t_p[:, col]
        binary_frame[class_name] = Y_t_result[:, col]

    # Unweighted mean of the per-class scores.
    print('Total CV score is {}'.format(np.mean(auc_per_class)))
    print()

    prob_csv = path + r'/output/neural network/prob_{}_submission.csv'.format(filename)
    binary_csv = path + r'/output/neural network/bin_{}_submission.csv'.format(filename)
    prob_frame.to_csv(prob_csv, index=False)
    binary_frame.to_csv(binary_csv, index=False)

In [0]:
# Label columns; their order fixes which prediction column belongs to which class.
classes = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

## Preprocessing

In [0]:
# Restore the three pickled objects (train / validation / test frames)
# from the project's data folder.
train_df, valid_df, test_df = loadobj(filename='data/filtered_comment_pickle')

## Vectorize and Transform

### NN Tokenizing/Indexing/Index Representation

In [0]:
# Label matrices: the `classes` columns of each split, as plain arrays.
Y_tr, Y_v, Y_te = (
    split_df[classes].values for split_df in (train_df, valid_df, test_df)
)

# The `filt_comment` text column of each split, in the same train/valid/test order.
list_sentences_train, list_sentences_valid, list_sentences_test = (
    split_df["filt_comment"] for split_df in (train_df, valid_df, test_df)
)

In [0]:
# Batch size handed to model.predict(...) in the prediction cells of this notebook.
batch_size = 32

### RNN Model
- Predict Validation/Test dataset

In [0]:
# Tokenizer for the RNN input; `rnn_max_features` is passed as `num_words`.
rnn_max_features = 20000
rnn_tokenizer = Tokenizer(num_words=rnn_max_features)

# Fit on the training text only, then reuse the fitted tokenizer for every split.
rnn_tokenizer.fit_on_texts(list(list_sentences_train))
rnn_list_tokenized_train, rnn_list_tokenized_valid, rnn_list_tokenized_test = [
    rnn_tokenizer.texts_to_sequences(texts)
    for texts in (list_sentences_train, list_sentences_valid, list_sentences_test)
]

In [0]:
# Pad/truncate every tokenized comment to a common length of `rnn_maxlen`.
rnn_maxlen = 200
rnn_X_tr, rnn_X_v, rnn_X_te = [
    pad_sequences(sequences, maxlen=rnn_maxlen)
    for sequences in (
        rnn_list_tokenized_train,
        rnn_list_tokenized_valid,
        rnn_list_tokenized_test,
    )
]

In [0]:
# Load the saved GRU model from the project output folder and print its layers.
gru_model = load_model(filepath=path + '/output/neural network/gru_model.h5')
gru_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 200, 128)          2560000   
_________________________________________________________________
GRU_layer (GRU)              (None, 200, 60)           34020     
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 60)                0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 60)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 6)                 366       
Total params: 2,594,386
Trainable params: 2,594,386
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Run the GRU model on the test inputs first, then on the validation inputs.
rnn_Y_te_p, rnn_Y_v_p = [
    gru_model.predict(features, batch_size=batch_size, verbose=1)
    for features in (rnn_X_te, rnn_X_v)
]



rnn test/validation prediction data ready

In [0]:
# Binarize the predicted scores: 1 where the score is strictly above 0.5, else 0.
rnn_Y_te_result, rnn_Y_v_result = [
    np.where(scores > 0.5, 1, 0) for scores in (rnn_Y_te_p, rnn_Y_v_p)
]

In [0]:
# Score and export the RNN predictions: test split first, then validation split.
nn_submission(isTest=True, filename='RNN_test', Y_t=Y_te,
              Y_t_p=rnn_Y_te_p, Y_t_result=rnn_Y_te_result)
nn_submission(isTest=False, filename='RNN_valid', Y_t=Y_v,
              Y_t_p=rnn_Y_v_p, Y_t_result=rnn_Y_v_result)

RNN_test submission:
toxic :  0.9673779461795202
severe_toxic :  0.9882611814992497
obscene :  0.9769079036855035
threat :  0.9832581026436812
insult :  0.9743989175911283
identity_hate :  0.9841695572324024
Total CV score is 0.9790622681385809

RNN_valid submission:
toxic :  0.974369260879046
severe_toxic :  0.9906496431068403
obscene :  0.9915549326274465
threat :  0.9802749825416996
insult :  0.984592059423264
identity_hate :  0.9835624680711166
Total CV score is 0.984167224441569



### CNN Model
- Predict Validation/Test dataset

In [0]:
# Character-level tokenizer for the CNN input; `cnn_max_features` is passed as `num_words`.
cnn_max_features = 20000
cnn_tokenizer = Tokenizer(num_words=cnn_max_features,char_level=True)

# Fit on the training text only, then reuse the fitted tokenizer for every split.
cnn_tokenizer.fit_on_texts(list(list_sentences_train))
cnn_list_tokenized_train, cnn_list_tokenized_valid, cnn_list_tokenized_test = [
    cnn_tokenizer.texts_to_sequences(texts)
    for texts in (list_sentences_train, list_sentences_valid, list_sentences_test)
]

In [0]:
# Pad/truncate every tokenized comment to a common length of `cnn_maxlen`.
cnn_maxlen = 500
cnn_X_tr, cnn_X_v, cnn_X_te = [
    pad_sequences(sequences, maxlen=cnn_maxlen)
    for sequences in (
        cnn_list_tokenized_train,
        cnn_list_tokenized_valid,
        cnn_list_tokenized_test,
    )
]

In [0]:
# Load the saved CNN model from the project output folder and print its layers.
cnn_model = load_model(filepath=path + '/output/neural network/cnn_model.h5')
cnn_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 500)               0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 500, 240)          365280    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 500, 100)          96100     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 125, 100)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 125, 120)          57960     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 120)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                6050      
__________

In [0]:
# Run the CNN model on the test inputs first, then on the validation inputs.
cnn_Y_te_p, cnn_Y_v_p = [
    cnn_model.predict(features, batch_size=batch_size, verbose=1)
    for features in (cnn_X_te, cnn_X_v)
]



cnn test/validation prediction data ready

In [0]:
# Binarize the predicted scores: 1 where the score is strictly above 0.5, else 0.
cnn_Y_te_result, cnn_Y_v_result = [
    np.where(scores > 0.5, 1, 0) for scores in (cnn_Y_te_p, cnn_Y_v_p)
]

In [0]:
# Score and export the CNN predictions: test split first, then validation split.
nn_submission(isTest=True, filename='CNN_test', Y_t=Y_te,
              Y_t_p=cnn_Y_te_p, Y_t_result=cnn_Y_te_result)
nn_submission(isTest=False, filename='CNN_valid', Y_t=Y_v,
              Y_t_p=cnn_Y_v_p, Y_t_result=cnn_Y_v_result)

CNN_test submission:
toxic :  0.9594618800156307
severe_toxic :  0.9859726418712305
obscene :  0.9752332625576052
threat :  0.9788533298471025
insult :  0.9695588094541403
identity_hate :  0.9700995165054841
Total CV score is 0.9731965733751989

CNN_valid submission:
toxic :  0.9629010489085756
severe_toxic :  0.9890533795099427
obscene :  0.9880136804312376
threat :  0.9665759407448111
insult :  0.9784002154424853
identity_hate :  0.9737213936035453
Total CV score is 0.9764442764400997

