Bag of words MLP model with Keras

In [1]:
import os
import numpy as np
import pandas as pd

## Import model

In [2]:
from keras.models import model_from_json

Using TensorFlow backend.


## Import data

In [3]:
# load doc into memory
def load_doc(filename):
    """Read a whole text file and return its contents as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The complete text of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even if read() raises; the
    # previous version never called close() at all.
    with open(filename, 'r') as file:
        return file.read()

In [4]:
# Canonical path of the parent of the current working directory; the relative
# data paths in the following cells are joined onto it.
dir_path = os.path.realpath('..')

In [5]:
# Training CSV location, relative to dir_path.
path = 'data/interim/train.csv'

full_path = os.path.join(dir_path, path)
# The first CSV row supplies the column names and the first column is used as the index.
df_train = pd.read_csv(full_path, header=0, index_col=0)
print("Dataset has {} rows, {} columns.".format(*df_train.shape))

Dataset has 95851 rows, 7 columns.


In [6]:
# Test CSV location, relative to dir_path.
path = 'data/interim/test.csv'

# dir_path is recomputed here so the cell does not rely on the earlier assignment.
dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)
# The first CSV row supplies the column names and the first column is used as the index.
df_test = pd.read_csv(full_path, header=0, index_col=0)
print("Dataset has {} rows, {} columns.".format(*df_test.shape))

Dataset has 226998 rows, 1 columns.


In [7]:
# Vocabulary file location, relative to dir_path.
path = 'data/interim/vocab.txt'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)
# vocab holds the raw file text as a single string.
# NOTE(review): no later cell visible here reads `vocab` — confirm whether the
# tokenizer was meant to be restricted to it.
vocab = load_doc(full_path)

## Encoding

In [8]:
from keras.preprocessing.text import Tokenizer

In [9]:
# fit a tokenizer
def create_tokenizer(lines):
    """Return a Keras ``Tokenizer`` fitted on the given texts.

    Args:
        lines: The texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance (created with default settings).
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [10]:
%%time
# Fit the tokenizer on the training comments only; the test comments are not used here.
tokenizer = create_tokenizer(df_train.comment_text)

CPU times: user 4.56 s, sys: 30.1 ms, total: 4.59 s
Wall time: 4.66 s


In [11]:
%%time
# Encode both comment sets as matrices with the tokenizer fitted on the
# training comments, using the 'freq' mode of texts_to_matrix.
Xtrain = tokenizer.texts_to_matrix(df_train.comment_text, mode='freq')
Xtest = tokenizer.texts_to_matrix(df_test.comment_text, mode='freq')

CPU times: user 24.5 s, sys: 3.01 s, total: 27.5 s
Wall time: 27.8 s


In [12]:
# Sanity check: show the shapes of both encoded matrices.
print(Xtrain.shape, Xtest.shape)

(95851, 46666) (226998, 46666)


## Predict with the saved models

In [13]:
def load_model(model_name):
    """Rebuild a saved Keras model from its JSON architecture and HDF5 weights.

    Args:
        model_name: Path prefix of the saved model; the architecture is read
            from ``model_name + '.json'`` and the weights from
            ``model_name + '.h5'``.

    Returns:
        The model created by ``model_from_json`` with the weights loaded.

    Raises:
        OSError: If the JSON file cannot be opened or read.
    """
    # Read the architecture; the context manager closes the handle even if
    # read() raises, which the manual open()/close() pair did not guarantee.
    with open(model_name + '.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights(model_name + ".h5")
    print("Loaded model from disk")
    return loaded_model

In [14]:
# Output location for the submission CSV, relative to dir_path.
path = 'data/processed/submission.csv'

dir_path = os.path.realpath('..')
# full_path is read by the prediction cell when it writes the submission file.
full_path = os.path.join(dir_path, path)

In [15]:
# NOTE(review): this loop only prints 0..3 and nothing later depends on it —
# it looks like a leftover scratch cell and is a candidate for removal.
for i in range(4):
    print(i)

0
1
2
3


In [None]:
%%time
# One saved binary model per label; each fills one column of the submission.
target = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
submission = pd.DataFrame(index=df_test.index, columns=target)
n_splits = 10
# Rows per forward pass inside predict(). The previous value of 1 ran one
# forward pass per test row; 32 is the Keras default.
# NOTE(review): lower this again if prediction runs out of memory.
predict_batch_size = 32

# Chunk boundaries are identical for every label, so compute them once:
# n_splits - 1 chunks of `size` rows, and a final chunk that takes the rest.
n_rows = len(df_test)
size = round(n_rows / n_splits)
starts = [k * size for k in range(n_splits)]
stops = starts[1:] + [n_rows]

for label in target:
    print('... Processing {}'.format(label))
    model_name = 'model_' + label

    # load the model
    loaded_model = load_model(model_name)

    # Positional index of this label's column, so each chunk is written with a
    # single .iloc call. The former chained form `submission[label].iloc[...] = ...`
    # may assign into a temporary object and leave `submission` unchanged.
    col = submission.columns.get_loc(label)

    # compute the predicted probabilities - split due to lack of memory
    for k, (start, stop) in enumerate(zip(starts, stops)):
        if start >= stop:
            # Rounding can leave trailing chunks empty on small inputs.
            continue
        y_pred_proba = loaded_model.predict(
            Xtest[start:stop], verbose=0, batch_size=predict_batch_size)
        submission.iloc[start:stop, col] = y_pred_proba.flatten()
        if k == n_splits - 1:
            print('last complete!')
        else:
            print(stop)

    # Write after every label so finished columns survive an interrupted run.
    submission.to_csv(full_path, header=True, index=True)

... Processing toxic
Loaded model from disk
22700
45400
68100
90800
113500
136200
158900
181600


In [17]:
# Rebuild the submission path and write the predictions once more. The
# prediction loop already writes this same file after each label, so this cell
# only matters if `submission` changed afterwards.
path = 'data/processed/submission.csv'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)

submission.to_csv(full_path, header=True, index=True)