In [1]:
import re
import string
import nltk
import pickle as pk
import gensim
import numpy as np
from nltk.stem import *
stemmer = PorterStemmer()
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
import pandas as pd
from sklearn.metrics import accuracy_score  
from xgboost import XGBClassifier
import data_helpers 
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers.core import Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Embedding,Input,BatchNormalization,Dense,Bidirectional,LSTM,Dropout
from keras.callbacks import History ,ModelCheckpoint, EarlyStopping
%env CUDA_VISIBLE_DEVICES=2

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Load Data

In [2]:
def _shuffle(X, Y):
    randomize = np.arange(len(X))
    np.random.shuffle(randomize)
    return (X[randomize], Y[randomize])

In [4]:
# Each loader call yields four parallel values for one split; they are unpacked
# below as (texts, pos1, pos2, labels).
train_bundle = data_helpers.load_data_and_labels('data/TRAIN_FILE.txt')
test_bundle = data_helpers.load_data_and_labels('data/TEST_FILE_FULL.txt')

x_train_text, pos1_train, pos2_train, labels_train = train_bundle
x_test_text, pos1_test, pos2_test, labels_test = test_bundle

# Train and test texts concatenated; the tokenizer is fitted on this.
x_total = x_train_text + x_test_text

## Tokenizer

In [10]:
# Word-level, lower-casing tokenizer fitted on the combined train + test texts.
tokenizer = Tokenizer(
    num_words=25000,
    lower=True,
    split=' ',
    char_level=False,
)
tokenizer.fit_on_texts(x_total)

# Mapping word -> integer index learned by the tokenizer.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 22433 unique tokens.


In [22]:
# Turn every sentence into its list of integer word indices.
train_sentence_seq = tokenizer.texts_to_sequences(x_train_text)
test_sentence_seq = tokenizer.texts_to_sequences(x_test_text)

# Longest sentence across both splits; used as the common padded length.
sequence_lengths = [len(seq) for seq in train_sentence_seq + test_sentence_seq]
max_length = np.max(sequence_lengths)
print("max length:", max_length)

# Pad both splits to the same fixed length.
x_train_seq, x_test_seq = [
    sequence.pad_sequences(seqs, maxlen=max_length)
    for seqs in (train_sentence_seq, test_sentence_seq)
]

max length: 87


## Build embedding_matrix

In [29]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Pretrained vectors in word2vec text format.
# NOTE(review): machine-specific absolute path — adjust for your environment.
# get_tmpfile() is intentionally not used here: it builds a path inside a
# temporary directory, and when handed an absolute path it just returns that
# path unchanged, so it added nothing.
tmp_file = "/home/thtang/LifeLog/data/glove_pretrained/gensim_crawl_300d.txt"

w2vModel = KeyedVectors.load_word2vec_format(tmp_file)

In [34]:
# prepare embedding matrix
# Row i holds the pretrained vector of the word whose tokenizer index is i.
# The width is taken from the loaded vectors so this cell does not depend on a
# notebook-level `embedding_size` that no other cell defines.
embedding_size = w2vModel.vector_size
# +1 leaves room for row 0, which has no entry in word_index — presumably
# indices start at 1; TODO confirm.
num_words = len(word_index)+1
embedding_matrix = np.zeros((num_words, embedding_size))
oov = 0
for word, i in word_index.items():
    # Membership test on the KeyedVectors object itself; this avoids the
    # `.wv.vocab` attribute chain, which newer gensim releases removed.
    if word in w2vModel:
        embedding_matrix[i] = w2vModel[word]
    else:
        # words not found in embedding index will be all-zeros.
        oov+=1
print("OOV:",oov)

  


OOV: 990


## GRU training

In [73]:
def train_GRU(x_train,y_train,x_val,y_val,embedding_matrix, batch_size=1000, epochs=30):
    """Train a two-layer GRU classifier on top of frozen pretrained embeddings.

    Parameters
    ----------
    x_train, x_val : 2-D integer arrays of shape (samples, sequence length)
        Padded word-index sequences. The sequence length of ``x_train``
        defines the model's input length.
    y_train, y_val : 2-D arrays with 19 columns
        One-hot labels, as required by the 19-unit softmax output and the
        categorical cross-entropy loss.
    embedding_matrix : 2-D array of shape (vocabulary size, embedding size)
        Initial (non-trainable) weights of the Embedding layer.
    batch_size : int, optional
        Mini-batch size passed to ``model.fit``.
    epochs : int, optional
        Number of training epochs.

    Returns
    -------
    keras.models.Sequential
        The model in its state after the last epoch. The weights with the
        best validation accuracy are saved to ``model/GRU_model.hdf5``.
    """
    # Take the layer dimensions from the actual inputs instead of hard-coding
    # them, so the Embedding layer always matches the vocabulary and the
    # padded sequence length produced earlier in the notebook.
    max_features, embedding_size = np.shape(embedding_matrix)
    max_length = np.shape(x_train)[1]
    gru_output_size = 128

    model = Sequential()
    model.add(Embedding(input_dim=max_features, output_dim=embedding_size,
                        weights=[embedding_matrix], input_length=max_length, trainable=False))
    model.add(Dropout(0.3))

    model.add(GRU(embedding_size, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))
    model.add(GRU(gru_output_size, dropout=0.2, recurrent_dropout=0.2))

    model.add(Dense(units=64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(units=19, activation='softmax'))

    # Multi-class softmax output -> categorical cross-entropy. With
    # binary_crossentropy the reported accuracy is computed per output unit
    # and is misleadingly high. The metric is named 'acc' so that the history
    # keys and the checkpoint monitor below ('acc' / 'val_acc') match.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

    filepath="model/GRU_model.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=2, save_best_only=True, mode='max')
    callbacks_list = [checkpoint]

    history = model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=epochs, batch_size=batch_size, callbacks=callbacks_list)

    # Note: this is the accuracy on the training data.
    scores = model.evaluate(x_train, y_train, verbose=0)
    print("Accuracy: %.2f%%" % (scores[1]*100))

    model.summary()
    #summarize history for accuracy
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
    plt.title('Accuracy plot of GRU model')
    plt.ylabel('Accuracy')
    plt.xlabel('# of epoch')
    plt.legend(['train', 'valid'], loc='upper left')
    plt.show()

    return model

## BiLSTM training (resource: https://github.com/thtang/ML2017FALL/blob/master/hw4/hw4_w2v_2.py)

In [37]:
from keras.utils.generic_utils import get_custom_objects
from keras.layers import Embedding, Input,InputLayer,BatchNormalization, Dense, Bidirectional,LSTM,Dropout,GRU,Activation
from keras import backend as K
def swish(x):
    """Swish activation: the input multiplied element-wise by its own sigmoid."""
    gate = K.sigmoid(x)
    return gate * x

# Register the activation under the name 'swish' in Keras' custom-object table.
get_custom_objects()['swish'] = Activation(swish)

def train_BiLSTM(x_train,y_train,x_val,y_val,embedding_matrix, i, batch_size=64, epochs=100):
    """Train a three-layer bidirectional LSTM classifier on frozen embeddings.

    Parameters
    ----------
    x_train, x_val : 2-D integer arrays of shape (samples, sequence length)
        Padded word-index sequences. The sequence length of ``x_train``
        defines the model's input length.
    y_train, y_val : 2-D arrays with 19 columns
        One-hot labels, as required by the 19-unit softmax output and the
        categorical cross-entropy loss.
    embedding_matrix : 2-D array of shape (vocabulary size, embedding size)
        Initial (non-trainable) weights of the Embedding layer.
    i : int or str
        Run identifier appended to the checkpoint file names.
    batch_size : int, optional
        Mini-batch size passed to ``model.fit``.
    epochs : int, optional
        Number of training epochs.

    Returns
    -------
    keras.models.Model
        The model in its state after the last epoch. Every epoch that
        improves ``val_acc`` is also saved under ``model/``.
    """
    # Take the layer dimensions from the actual inputs instead of hard-coding
    # them, so the Embedding/Input layers always match the vocabulary and the
    # padded sequence length produced earlier in the notebook.
    max_features, embedding_size = np.shape(embedding_matrix)
    max_length = np.shape(x_train)[1]

    embedding_layer = Embedding(max_features,output_dim= embedding_size,
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=False)
    sequence_input = Input(shape=(max_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    lstm0 = Bidirectional(LSTM(256,activation="tanh",dropout=0.2,return_sequences = True,
                kernel_initializer='he_uniform'))(embedded_sequences)
    lstm1 = Bidirectional(LSTM(128,activation="tanh",dropout=0.2,return_sequences = True,
                kernel_initializer='he_uniform'))(lstm0)
    lstm2 = Bidirectional(LSTM(64,activation="tanh",dropout=0.2,return_sequences = False,
                kernel_initializer='he_uniform'))(lstm1)
    bn1 = BatchNormalization()(lstm2)
    dense1 = Dense(64, activation=swish)(bn1)
    dropout1 = Dropout(0.5)(dense1)
    dense2 = Dense(32, activation=swish)(dropout1)
    dropout2 = Dropout(0.5)(dense2)
    preds = Dense(19, activation='softmax')(dropout2)
    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    # Checkpoints are written to "model/", the directory the rest of the
    # notebook saves to and loads from.
    filepath = "model/BiLSTM-{epoch:05d}-{val_acc:.5f}-" + str(i) + ".hdf5"
    checkpoint = ModelCheckpoint(filepath,monitor='val_acc',save_best_only=True,mode='max')
    callbacks_list = [checkpoint]

    history = model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=epochs, batch_size=batch_size, callbacks=callbacks_list)

    # Note: this is the accuracy on the training data.
    scores = model.evaluate(x_train, y_train, verbose=0)
    print("Accuracy: %.2f%%" % (scores[1]*100))

    # Callers assign the result (`model = train_BiLSTM(...)`), so hand it back.
    return model

In [38]:
# Train 20 BiLSTM models, each on a different random train/validation split.
# The validation set is carved out of the TRAINING data: checkpoints are chosen
# by val_acc, so validating on the test set would leak test labels into model
# selection and inflate the accuracy reported in the Testing section.
for i in range(20):
    # train_test_split shuffles before splitting; seeding with the run index
    # gives a different but reproducible split for every run.
    x_train, x_val, y_train, y_val = train_test_split(
        x_train_seq, labels_train, test_size=0.07, random_state=i)

    model = train_BiLSTM(x_train, y_train, x_val, y_val, embedding_matrix, i)

KeyboardInterrupt: 

## Testing

In [79]:
from keras.models import load_model

# Integer class id of every test sample: position of the first 1 in its label row.
y_test = [np.where(label_row == 1)[0][0] for label_row in labels_test]

# Load the four saved BiLSTM checkpoints that make up the ensemble.
model4, model3, model2, model1 = [
    load_model(checkpoint_path)
    for checkpoint_path in (
        'model/BiLSTM-00173-0.64643.hdf5',
        'model/BiLSTM-00106-0.64286.hdf5',
        'model/BiLSTM-00100-0.63036.hdf5',
        'model/BiLSTM-00062-0.62500.hdf5',
    )
]

# Predictions of each ensemble member on the test set.
prediction4, prediction3, prediction2, prediction1 = [
    member.predict(x_test_seq, batch_size=1000)
    for member in (model4, model3, model2, model1)
]

# Average the four predictions, then take the highest-scoring class per sample.
pred_y_prob = (prediction3+prediction2+prediction1+prediction4)/4
pred_y = pred_y_prob.argmax(axis=1)

In [80]:
# Accuracy of the averaged-ensemble predictions against the test labels.
ensemble_accuracy = accuracy_score(pred_y.tolist(), y_test)
print(ensemble_accuracy)

0.6547662863452337


In [76]:
# Single-model baseline for comparison with the ensemble: score on its own the
# checkpoint whose file name carries the highest val_acc.
# The results get their own names so that the ensemble cell's `model4`,
# `prediction3` and `pred_y` are not overwritten.
single_model = load_model('model/BiLSTM-00173-0.64643.hdf5')
single_prediction = single_model.predict(x_test_seq, batch_size=1000)
single_pred_y = np.argmax(single_prediction,axis=1)
print(accuracy_score(single_pred_y.tolist(), y_test))

0.646301067353699
