Bag of words MLP model with Keras

In [25]:
import os
import numpy as np
import pandas as pd

## Import model

In [26]:
from keras.models import model_from_json

## Import data

In [27]:
dir_path = os.path.realpath('..')

In [28]:
path = 'data/processed/test.csv'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)
df_test = pd.read_csv(full_path, header=0, index_col=0)
print("Dataset has {} rows, {} columns.".format(*df_test.shape))

Dataset has 226998 rows, 1 columns.


In [29]:
# load doc into memory
def load_doc(filename):
    # open the file as read only 
    file = open(filename, 'r')
    # read all text
    text = file.read()
    text = text.splitlines()
    # close the file file.close()
    return text

In [30]:
path = 'data/processed/vocab.txt'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)
vocab = load_doc(full_path)

## Encoding

In [31]:
from keras.preprocessing.text import Tokenizer

In [32]:
# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [33]:
%%time
tokenizer = create_tokenizer(vocab)

CPU times: user 239 ms, sys: 17.2 ms, total: 256 ms
Wall time: 256 ms


In [34]:
%%time
# encode data
Xtest = tokenizer.texts_to_matrix(df_test.comment_text, mode='freq')

CPU times: user 7.29 s, sys: 167 ms, total: 7.45 s
Wall time: 7.5 s


In [35]:
print(Xtest.shape)

(226998, 24280)


## Predict model

In [36]:
def load_model(model_name):
    # load json and create model
    json_file = open(model_name+'.json', 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights(model_name+".h5")
    print("Loaded model from disk")
    return loaded_model

In [37]:
%%time
target = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
submission = pd.DataFrame(index=df_test.index, columns=target)

for label in target:
    print('... Processing {}'.format(label))
    model_name = 'model_' + label

    # load the model
    loaded_model = load_model(model_name)
    
    y_pred_proba = loaded_model.predict(Xtest, verbose=0, batch_size=1)
    submission[label] = y_pred_proba.flatten()

... Processing toxic
Loaded model from disk
... Processing severe_toxic
Loaded model from disk
... Processing obscene
Loaded model from disk
... Processing threat
Loaded model from disk
... Processing insult
Loaded model from disk
... Processing identity_hate
Loaded model from disk
CPU times: user 19min 14s, sys: 4min 5s, total: 23min 19s
Wall time: 19min 56s


In [38]:
path = 'data/processed/submission.csv'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)

submission.to_csv(full_path, header=True, index=True)