https://codekansas.github.io/blog/2016/gensim.html

In [1]:
import os
import numpy as np
import pandas as pd

## Import data

In [2]:
# load doc into memory
def load_doc(filename):
    # open the file as read only 
    file = open(filename, 'r')
    # read all text
    text = file.read()
    text = text.splitlines()
    # close the file file.close()
    return text

In [3]:
dir_path = os.path.realpath('..')

In [4]:
path = 'data/raw/train.csv'

full_path = os.path.join(dir_path, path)
df_train = pd.read_csv(full_path, header=0, index_col=0)
print("Dataset has {} rows, {} columns.".format(*df_train.shape))

Dataset has 95851 rows, 7 columns.


In [5]:
path = 'data/raw/test.csv'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)
df_test = pd.read_csv(full_path, header=0, index_col=0)
print("Dataset has {} rows, {} columns.".format(*df_test.shape))

Dataset has 226998 rows, 1 columns.


path = 'data/processed/vocab.txt'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)
vocab = load_doc(full_path)

In [6]:
# fill NaN with string "unknown"
df_train.fillna('unknown',inplace=True)
df_test.fillna('unknown',inplace=True)

## Load trained w2v model, embeddings and vocab

In [7]:
from gensim.models import Word2Vec
import json

In [8]:
def load_vocab(vocab_path):
    with open(vocab_path, 'r') as f:
        data = json.loads(f.read())
    word2idx = data
    idx2word = dict([(v, k) for k, v in data.items()])
    return word2idx, idx2word

In [9]:
from keras.layers import Embedding
from keras.engine import Input

def word2vec_embedding_layer(embeddings_path='embeddings.npz'):
    """
    Generate an embedding layer word2vec embeddings
    :param embeddings_path: where the embeddings are saved (as a numpy file)
    :return: the generated embedding layer
    """
    
    weights = np.load(open(embeddings_path, 'rb'))
    layer = Embedding(input_dim=weights.shape[0],
                      output_dim=weights.shape[1],
                      weights=[weights])
    return layer

Using TensorFlow backend.


In [10]:
# load model
w2v_model = Word2Vec.load('w2v_model.bin')
print(w2v_model)

Word2Vec(vocab=48349, size=100, alpha=0.025)


In [11]:
embeddings_path = 'embeddings.npy'
weights = np.load(open(embeddings_path, 'rb'))

In [12]:
embedding = w2v_model.wv.get_keras_embedding()

In [13]:
w2v_vocab = load_vocab('vocab.json')

In [14]:
path = 'data/processed/vocab.txt'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)
vocab = load_doc(full_path)

## Training

In [15]:
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding

In [16]:
%%time
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(vocab)
vocab_size = len(t.word_index) + 1

CPU times: user 235 ms, sys: 9.12 ms, total: 245 ms
Wall time: 243 ms


In [17]:
%%time
# integer encode the documents
Xtrain = t.texts_to_sequences(df_train.comment_text)
Xtest = t.texts_to_sequences(df_test.comment_text)

CPU times: user 30.3 s, sys: 384 ms, total: 30.7 s
Wall time: 32.1 s


In [18]:
# create a weight matrix for words in training docs
embedding_matrix = zeros((vocab_size, 100))
for word, i in t.word_index.items():
	embedding_vector = w2v_vocab[0].get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector

In [19]:
embedding_matrix[0]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [20]:
# pad documents to a max length of 4 words
max_length = 4
padded_train = pad_sequences(Xtrain, maxlen=max_length, padding='post')
padded_test = pad_sequences(Xtest, maxlen=max_length, padding='post')

In [21]:
# define model
model = Sequential()
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=4, trainable=False)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

In [22]:
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 4, 100)            2428000   
_________________________________________________________________
flatten_1 (Flatten)          (None, 400)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 401       
Total params: 2,428,401
Trainable params: 401
Non-trainable params: 2,428,000
_________________________________________________________________


In [23]:
def save_model(model, model_name):
    # serialize model to JSON
    model_json = model.to_json()
    with open(model_name + ".json", "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    model.save_weights(model_name + ".h5")
    print("Saved model to disk")

In [24]:
%%time
# fit the model
target = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

for label in target:
    print('... Processing {}'.format(label))
    ytrain = df_train[label]
    model_name = 'learnt_model_' + label
    
    # train the model
    model.fit(padded_train, ytrain, epochs=1, verbose=2)
    
    # save the model
    save_model(model, model_name)

... Processing toxic
Epoch 1/1
 - 7s - loss: 1.7297 - acc: 0.8863
Saved model to disk
... Processing severe_toxic
Epoch 1/1
 - 7s - loss: 0.1567 - acc: 0.9899
Saved model to disk
... Processing obscene
Epoch 1/1
 - 6s - loss: 0.8773 - acc: 0.9421
Saved model to disk
... Processing threat
Epoch 1/1
 - 6s - loss: 0.0520 - acc: 0.9968
Saved model to disk
... Processing insult
Epoch 1/1
 - 6s - loss: 0.7597 - acc: 0.9503
Saved model to disk
... Processing identity_hate
Epoch 1/1
 - 6s - loss: 0.1313 - acc: 0.9915
Saved model to disk
CPU times: user 42 s, sys: 18 s, total: 60 s
Wall time: 39.7 s


## Prediction

In [25]:
from keras.models import model_from_json

In [30]:
def load_model(model_name):
    # load json and create model
    json_file = open(model_name+'.json', 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights(model_name+".h5")
    print("Loaded model from disk")
    return loaded_model

In [31]:
%%time
target = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
submission = pd.DataFrame(index=df_test.index, columns=target)

for label in target:
    print('... Processing {}'.format(label))
    model_name = 'learnt_model_' + label

    # load the model
    loaded_model = load_model(model_name)
    
    y_pred_proba = loaded_model.predict(padded_test, verbose=0, batch_size=1)
    submission[label] = y_pred_proba.flatten()

... Processing toxic
Loaded model from disk
... Processing severe_toxic
Loaded model from disk
... Processing obscene
Loaded model from disk
... Processing threat
Loaded model from disk
... Processing insult
Loaded model from disk
... Processing identity_hate
Loaded model from disk
CPU times: user 11min 14s, sys: 1min 23s, total: 12min 38s
Wall time: 9min 1s


In [32]:
path = 'data/submissions/w2v_keras.csv'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)

submission.to_csv(full_path, header=True, index=True)

## Evaluation

In [33]:
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

NameError: name 'padded_docs' is not defined