In [3]:
from __future__ import print_function

import os
import sys
import numpy as np
import scipy.io as sio
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as dataUtil

In [4]:
BASE_DIR = '/Users/kalpeshpatel/Downloads/'
GLOVE_DIR = os.path.join(BASE_DIR, 'Glove')
MAX_SEQUENCE_LENGTH = 3
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
BATCH_SIZE = 4

In [5]:
mat_contents = sio.loadmat('/users/kalpeshpatel/Downloads/data.mat')
xx = mat_contents['data']

In [6]:
## Extract vocabulary
yy = xx['vocab'][0,0]
index_to_word = {}
word_to_index = {}
vocab_size = yy.shape[1]
print("vocab size:" + str(vocab_size))
for i in range(vocab_size):
    word = yy[0][i][0]
    #print(word)
    index_to_word[i] = word
    word_to_index[word] = i
#print(word_to_index['just'])
#print(index_to_word[11])

vocab size:250


In [7]:
xx = mat_contents['data']
training = xx['trainData'][0,0]
training_x = training[0:3,].T
training_x = training_x -1
print("training_x:" + str(training_x.shape))
training_y = training[3,:].T
training_y = training_y -1

training_x:(372550, 3)


In [8]:
#Load Test data
# Subtract one to get index between 0 and vocab - 1

test = xx['testData'][0,0]
test_x = test[0:3,].T
test_x = test_x - 1
print("test_x:" + str(test_x.shape))
test_y = test[3].T
test_y = test_y - 1
test_y.shape

test_x:(46568, 3)


(46568,)

In [9]:
# Load validation data

valid = xx['validData'][0,0]
valid_x = valid[0:3,].T
valid_x = valid_x - 1
valid_x.shape
valid_y = (valid[3,].T) - 1
#valid_y_one = to_categorical(valid_y)
valid_y.shape

(46568,)

In [16]:
class buildModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(buildModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size,embedding_dim)
        self.linear1 = nn.Linear(3*embedding_dim,128) # 3 is the size of context and 128 are number of output
        self.linear2 = nn.Linear(128, vocab_size)
        self.embedding_dim = embedding_dim
    def forward(self,inputs):
        embeds = self.embeddings(inputs).view((-1,3*self.embedding_dim))
        #embeds = self.embeddings(inputs)
        x = F.tanh(self.linear1(embeds))
        x = self.linear2(x)
        out = F.softmax(x,dim = 1)
        return(out)

In [17]:
losses = []
loss_function = nn.CrossEntropyLoss()
model = buildModel(vocab_size,EMBEDDING_DIM)
opt = optim.SGD(model.parameters(),lr = 0.01,momentum = 0.5)

In [24]:
def training(epochs):
    combined_data = dataUtil.TensorDataset(torch.tensor(training_x,dtype = torch.long),torch.tensor(training_y,dtype = torch.long))
    dataloader = dataUtil.DataLoader(combined_data,batch_size = BATCH_SIZE,shuffle = True)
    for epoch in range(epochs):
        running_loss = 0
        total_loss = torch.Tensor([0])
    
        for i,input_x in enumerate(dataloader):
            input_tensor = input_x[0]
            target_tensor = input_x[1]
            model.zero_grad()
            log_probs = model(input_tensor)
            loss = loss_function(log_probs, target_tensor)
            loss.backward()
            opt.step()
            if i % 1000 == 0:
                print("Loss for batch: is:", i, loss.item())

In [None]:
training(1)

Loss for batch: is: 0 5.528280258178711
Loss for batch: is: 1000 5.278372764587402
Loss for batch: is: 2000 5.528309345245361
Loss for batch: is: 3000 5.278292655944824
Loss for batch: is: 4000 5.528310775756836
Loss for batch: is: 5000 5.5277419090271
Loss for batch: is: 6000 5.0282206535339355
Loss for batch: is: 7000 5.527736663818359
Loss for batch: is: 8000 5.528310775756836
Loss for batch: is: 9000 5.028300762176514
Loss for batch: is: 10000 5.5283098220825195
Loss for batch: is: 11000 5.028310775756836
Loss for batch: is: 12000 5.528309345245361
Loss for batch: is: 13000 5.278310775756836
Loss for batch: is: 14000 5.528065204620361
Loss for batch: is: 15000 5.52830696105957
Loss for batch: is: 16000 5.526644229888916
Loss for batch: is: 17000 5.5277099609375
Loss for batch: is: 18000 5.278254508972168
Loss for batch: is: 19000 5.278310298919678
Loss for batch: is: 20000 5.526927471160889
Loss for batch: is: 21000 5.278310298919678
Loss for batch: is: 22000 5.029082775115967
Loss

In [15]:
def validation_loop(learning_rate):
    complete_hist = {}
    for lr in learning_rate:
        print(lr)
        opt = RMSprop(lr=lr, rho=0.9, epsilon=None, decay=0.0)
        model.compile(loss='categorical_crossentropy',
              optimizer= opt,
              metrics=['acc'])
        hist = model.fit(training_x,training_y_one,epochs=10, batch_size=32,
                         validation_data = (valid_x,valid_y_one))
        complete_hist[lr] = hist.history
    return(complete_hist)

In [16]:
#hist = model.fit(training_x,training_y_one,epochs=2, batch_size=32,validation_data = (valid_x,valid_y_one))
#print(hist.history)
return_hist = validation_loop([0.001,0.01])

0.001
Train on 372550 samples, validate on 46568 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.01
Train on 372550 samples, validate on 46568 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [17]:
for i in return_hist.keys():
    print("lr: " + str(i) + " Validation accuracy: " + str (np.average(return_hist[i]['val_acc'])))

lr: 0.001 Validation accuracy: 0.336333963237
lr: 0.01 Validation accuracy: 0.218967960831


In [18]:
from prettytable import PrettyTable
table = PrettyTable()
accuracy = 0
table.field_names = ["#", "Word1", "word2", "word3", "expected", "actual1", "actual2","actual3"]
classes = model.predict(test_x,batch_size = 32)
for  i in range(classes.shape[0]):
    output = np.random.choice(a= yy[0],size = 3,p = classes[i,:], replace = False)
    if (index_to_word[test_y[i]] in output):
        accuracy += 1
    #table.add_row([i,index_to_word[test_x[i,0]],index_to_word[test_x[i,1]], index_to_word[test_x[i,2]],index_to_word[test_y[i]],output[0],output[1],output[2]])    
print("Accuracy: "+ str(accuracy/classes.shape[0]))
#print(table) 

Accuracy: 0.320155471568459
