In [1]:
import os
import numpy as np
import pandas as pd

## Import data

In [2]:
# load doc into memory
def load_doc(filename):
    # open the file as read only 
    file = open(filename, 'r')
    # read all text
    text = file.read()
    text = text.splitlines()
    # close the file file.close()
    return text

In [3]:
dir_path = os.path.realpath('..')

In [4]:
path = 'data/raw/train.csv'

full_path = os.path.join(dir_path, path)
df_train = pd.read_csv(full_path, header=0, index_col=0)
print("Dataset has {} rows, {} columns.".format(*df_train.shape))

Dataset has 95851 rows, 7 columns.


In [5]:
path = 'data/raw/test.csv'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)
df_test = pd.read_csv(full_path, header=0, index_col=0)
print("Dataset has {} rows, {} columns.".format(*df_test.shape))

Dataset has 226998 rows, 1 columns.


In [6]:
path = 'data/processed/vocab.txt'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)
vocab = load_doc(full_path)

In [7]:
# fill NaN with string "unknown"
df_train.fillna('unknown',inplace=True)
df_test.fillna('unknown',inplace=True)

## Glove

In [11]:
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

In [23]:
%%time
# integer encode the documents
vocab_size = 20000
Xtrain = [one_hot(d, vocab_size) for d in df_train.comment_text]
Xtest = [one_hot(d, vocab_size) for d in df_test.comment_text]

CPU times: user 27.4 s, sys: 790 ms, total: 28.2 s
Wall time: 29.2 s


In [24]:
# pad documents to a max length of 4 words
max_length = 4
padded_train = pad_sequences(Xtrain, maxlen=max_length, padding='post')
padded_test = pad_sequences(Xtest, maxlen=max_length, padding='post')

In [25]:
# define the model
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

In [26]:
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 4, 8)              160000    
_________________________________________________________________
flatten_2 (Flatten)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
_________________________________________________________________


In [27]:
def save_model(model, model_name):
    # serialize model to JSON
    model_json = model.to_json()
    with open(model_name + ".json", "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    model.save_weights(model_name + ".h5")
    print("Saved model to disk")

In [28]:
%%time
# fit the model
target = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

for label in target:
    print('... Processing {}'.format(label))
    ytrain = df_train[label]
    model_name = 'keras_model_' + label
    
    # train the model
    model.fit(padded_train, ytrain, epochs=1, verbose=2)
    
    # save the model
    save_model(model, model_name)

... Processing toxic
Epoch 1/1
 - 11s - loss: 0.2726 - acc: 0.9181
Saved model to disk
... Processing severe_toxic
Epoch 1/1
 - 11s - loss: 0.0474 - acc: 0.9867
Saved model to disk
... Processing obscene
Epoch 1/1
 - 11s - loss: 0.1290 - acc: 0.9611
Saved model to disk
... Processing threat
Epoch 1/1
 - 11s - loss: 0.0254 - acc: 0.9931
Saved model to disk
... Processing insult
Epoch 1/1
 - 10s - loss: 0.1236 - acc: 0.9599
Saved model to disk
... Processing identity_hate
Epoch 1/1
 - 10s - loss: 0.0371 - acc: 0.9894
Saved model to disk
CPU times: user 1min 52s, sys: 28 s, total: 2min 20s
Wall time: 1min 3s


## Prediction

In [29]:
from keras.models import model_from_json

In [30]:
def load_model(model_name):
    # load json and create model
    json_file = open(model_name+'.json', 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights(model_name+".h5")
    print("Loaded model from disk")
    return loaded_model

In [31]:
%%time
target = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
submission = pd.DataFrame(index=df_test.index, columns=target)

for label in target:
    print('... Processing {}'.format(label))
    model_name = 'keras_model_' + label

    # load the model
    loaded_model = load_model(model_name)
    
    y_pred_proba = loaded_model.predict(padded_test, verbose=0, batch_size=1)
    submission[label] = y_pred_proba.flatten()

... Processing toxic
Loaded model from disk
... Processing severe_toxic
Loaded model from disk
... Processing obscene
Loaded model from disk
... Processing threat
Loaded model from disk
... Processing insult
Loaded model from disk
... Processing identity_hate
Loaded model from disk
CPU times: user 12min 39s, sys: 1min 28s, total: 14min 8s
Wall time: 10min 42s


In [33]:
path = 'data/submissions/keras_learnt_embedding.csv'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)

submission.to_csv(full_path, header=True, index=True)

## Evaluation

In [None]:
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))