Notes:
- Important parameters: kernel size, no. of feature maps
- 1-max pooling generally outperforms other types of pooling
- Dropout has little effect
- Gridsearch across kernel size in the range 1-10
- Search no. of filters from 100-600 and dropout of 0.0-0.5
- Explore tanh, relu, linear activation functions

In [17]:
# Base name shared by the saved model files and the submission CSV.
model_name = 'CNN_single_'
# Name of the DataFrame column whose text is fed to the tokenizer.
corpus = 'comment_text'

## Import data

In [2]:
import os
import numpy as np
import pandas as pd

In [3]:
# Absolute path of the parent of the current working directory; the data,
# model and submission paths are joined onto it.
dir_path = os.path.realpath('..')

In [4]:
# Processed test split, located relative to dir_path.
path = 'data/processed/test.csv'
full_path = os.path.join(dir_path, path)

# The first CSV row supplies the column names and the first column the index.
df = pd.read_csv(full_path, header=0, index_col=0)

# Report the size of what was loaded.
n_rows, n_cols = df.shape
print("Dataset has {} rows, {} columns.".format(n_rows, n_cols))

Dataset has 226998 rows, 1 columns.


## Pre-processing

In [9]:
import pickle
from keras.preprocessing.sequence import pad_sequences

In [10]:
# Restore the pickled tokenizer; the file name is resolved against the
# current working directory.
with open('tokenizer.pickle', 'rb') as tokenizer_file:
    t = pickle.load(tokenizer_file)

# Length every encoded document is padded to.
max_length = 1000
# One more than the number of entries in the tokenizer's word index.
vocab_size = len(t.word_index) + 1

# Cast the text column to str, integer-encode each document, then pad the
# sequences at their end.
test_texts = df[corpus].astype(str)
encoded_test = t.texts_to_sequences(test_texts)
padded_test = pad_sequences(
    encoded_test,
    maxlen=max_length,
    padding='post',
)

## Evaluation

In [11]:
from keras.models import model_from_json
from sklearn.metrics import log_loss

In [12]:
def load_model(model_path):
    """Rebuild a Keras model from a JSON architecture file plus a weights file.

    Args:
        model_path: Path prefix shared by both files, without extension.
            The architecture is read from ``model_path + '.json'`` and the
            weights are loaded from ``model_path + '.h5'``.

    Returns:
        The model created by ``model_from_json`` with the weights loaded.
    """
    # load json and create model
    # The context manager closes the file even if reading it raises.
    with open(model_path + '.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights(model_path + ".h5")
    print("Loaded model from disk")
    return loaded_model

In [18]:
# load the model
# The model files are looked up under <dir_path>/models/ using the
# model_name prefix.
model_path = os.path.join(dir_path, 'models', model_name)
model = load_model(model_path)

# NOTE(review): batch_size=1 feeds one sample per prediction step over the
# whole padded test set; a larger batch would presumably give the same
# predictions much faster if memory allows — confirm before changing.
y_pred_proba = model.predict(padded_test, verbose=1, batch_size=1)

Loaded model from disk


In [29]:
# Column labels for the submission; assumes the model emits its outputs in
# this same order — TODO confirm against the training notebook.
target = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
# Wrap the predictions in a frame that reuses the test set's index.
submission = pd.DataFrame(data=y_pred_proba, columns=target, index=df.index)

In [30]:
## Output submissions

# The submission file is named after the model.
path = 'data/submissions/' + model_name + '.csv'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)

# Create the submissions folder if it is missing, so the predictions are
# not lost to a write error after the prediction run.
os.makedirs(os.path.dirname(full_path), exist_ok=True)

# Write the column header and the index alongside the predictions.
submission.to_csv(full_path, header=True, index=True)