In [154]:
'''This script loads pre-trained word embeddings (GloVe embeddings)
into a frozen Keras Embedding layer, and uses it to
train a text classification model on the 20 Newsgroup dataset
(classification of newsgroup messages into 20 different categories).
GloVe embedding data can be found at:
http://nlp.stanford.edu/data/glove.6B.zip
(source page: http://nlp.stanford.edu/projects/glove/)
'''

'This script loads pre-trained word embeddings (GloVe embeddings)\ninto a frozen Keras Embedding layer, and uses it to\ntrain a text classification model on the 20 Newsgroup dataset\n(classification of newsgroup messages into 20 different categories).\nGloVe embedding data can be found at:\nhttp://nlp.stanford.edu/data/glove.6B.zip\n(source page: http://nlp.stanford.edu/projects/glove/)\n'

In [1]:
from __future__ import print_function

import os
import sys
import numpy as np
from keras.utils import to_categorical
from keras.layers import Dense, Input
from keras.layers import Embedding, Flatten
from keras.models import Model
from keras.optimizers import RMSprop
import scipy.io as sio

Using TensorFlow backend.


In [2]:
BASE_DIR = '/Users/kalpeshpatel/Downloads/'
GLOVE_DIR = os.path.join(BASE_DIR, 'Glove')
MAX_SEQUENCE_LENGTH = 3
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [3]:
# first, build index mapping words in the embeddings set
# to their embedding vector

print('Indexing word vectors.')

embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [4]:
mat_contents = sio.loadmat('/users/kalpeshpatel/Downloads/data.mat')
xx = mat_contents['data']

In [5]:
## Extract vocabulary
yy = xx['vocab'][0,0]
index_to_word = {}
word_to_index = {}
vocab_size = yy.shape[1]
print("vocab size:" + str(vocab_size))
for i in range(vocab_size):
    word = yy[0][i][0]
    #print(word)
    index_to_word[i] = word
    word_to_index[word] = i
#print(word_to_index['just'])
#print(index_to_word[11])

vocab size:250


In [6]:
# Create Embedding matrix
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in word_to_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [8]:
#load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(vocab_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
# Build the embedding layer, it is required before setting the weights of the embedding layer. Do not modify the "None".
embedding_layer.build((None,))
# Set the weights of the embedding layer to the embedding matrix. Your layer is now pretrained.
embedding_layer.set_weights([embedding_matrix])

In [9]:
xx = mat_contents['data']
training = xx['trainData'][0,0]
training_x = training[0:3,].T
training_x = training_x -1
print("training_x:" + str(training_x.shape))
training_y = training[3,:].T
training_y = training_y -1
training_y_one = to_categorical(training_y)
training_y_one.shape

training_x:(372550, 3)


(372550, 250)

In [10]:
#Load Test data
# Subtract one to get index between 0 and vocab - 1

test = xx['testData'][0,0]
test_x = test[0:3,].T
test_x = test_x - 1
print("test_x:" + str(test_x.shape))
test_y = test[3].T
test_y = test_y - 1
test_y_one = to_categorical(test_y)
test_y_one.shape
test_y.shape

test_x:(46568, 3)


(46568,)

In [11]:
# Load validation data

valid = xx['validData'][0,0]
valid_x = valid[0:3,].T
valid_x = valid_x - 1
valid_x.shape
valid_y = (valid[3,].T) - 1
valid_y_one = to_categorical(valid_y)
valid_y.shape

(46568,)

In [12]:
cache = (training_x,training_y,yy)

In [13]:
def buildModel(cache):
    
    training_x,training_y,vocab = cache
    vocabSize = vocab.shape[1]
    #input_length = training_x.shape[1] -- same as MAX_SEQ_LENGTH
    denseLayer = 200
        
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    x = Flatten()(embedded_sequences)
    x = Dense(denseLayer, activation = 'tanh')(x)
    preds = Dense(vocabSize,activation = 'softmax')(x)
    model = Model(sequence_input, preds)
    model.summary()
    return model

In [14]:
model = buildModel(cache)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 3)                 0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 3, 100)            25000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 300)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 200)               60200     
_________________________________________________________________
dense_2 (Dense)              (None, 250)               50250     
Total params: 135,450
Trainable params: 110,450
Non-trainable params: 25,000
_________________________________________________________________


In [15]:
def validation_loop(learning_rate):
    complete_hist = {}
    for lr in learning_rate:
        print(lr)
        opt = RMSprop(lr=lr, rho=0.9, epsilon=None, decay=0.0)
        model.compile(loss='categorical_crossentropy',
              optimizer= opt,
              metrics=['acc'])
        hist = model.fit(training_x,training_y_one,epochs=10, batch_size=32,
                         validation_data = (valid_x,valid_y_one))
        complete_hist[lr] = hist.history
    return(complete_hist)

In [16]:
#hist = model.fit(training_x,training_y_one,epochs=2, batch_size=32,validation_data = (valid_x,valid_y_one))
#print(hist.history)
return_hist = validation_loop([0.001,0.01])

0.001
Train on 372550 samples, validate on 46568 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.01
Train on 372550 samples, validate on 46568 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [17]:
for i in return_hist.keys():
    print("lr: " + str(i) + " Validation accuracy: " + str (np.average(return_hist[i]['val_acc'])))

lr: 0.001 Validation accuracy: 0.336333963237
lr: 0.01 Validation accuracy: 0.218967960831


In [18]:
from prettytable import PrettyTable
table = PrettyTable()
accuracy = 0
table.field_names = ["#", "Word1", "word2", "word3", "expected", "actual1", "actual2","actual3"]
classes = model.predict(test_x,batch_size = 32)
for  i in range(classes.shape[0]):
    output = np.random.choice(a= yy[0],size = 3,p = classes[i,:], replace = False)
    if (index_to_word[test_y[i]] in output):
        accuracy += 1
    #table.add_row([i,index_to_word[test_x[i,0]],index_to_word[test_x[i,1]], index_to_word[test_x[i,2]],index_to_word[test_y[i]],output[0],output[1],output[2]])    
print("Accuracy: "+ str(accuracy/classes.shape[0]))
#print(table) 

Accuracy: 0.320155471568459
