In [1]:
import os
import time
import tqdm
import datetime
import pickle as pkl
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset

In [10]:
# Filesystem locations. These are string fragments that later cells join by plain
# concatenation (CURR_PATH + DATA_PATH + filename), so the slashes are significant.
# Working directory of the kernel at the moment this cell is executed.
CURR_PATH = os.getcwd()
# Sub-directory holding the SNLI .tsv files, relative to CURR_PATH.
DATA_PATH = '/data/'
# Presumably the pre-trained word-vector file; not referenced by the visible cells — TODO confirm use.
VEC_PATH = '/wiki-news-300d-1M.vec'

In [5]:
import load_data
from load_data import create_weights, create_emb_layer

In [6]:
## added three cat_mode into model function
import models_yi
from models_yi import LogisticRegression, NeuralNetwork

In [7]:
# added training accuracy, need to add training and validation loss later
import training_yi
from training_yi import acc, train_model

In [8]:
# Hyperparameters shared by every run of the grid search below.
HIDDEN_DIM = 100  # fourth positional argument given to NeuralNetwork
MAX_SENTENCE_LENGTH = 30  # passed to load_data.token2index_dataset
BATCH_SIZE = 32  # batch size of both the training and the validation DataLoader
# NOTE(review): given to both models as their third positional argument; SNLI
# normally has 3 labels, so confirm that 20 is intended.
NUM_CLASS = 20
LEARNING_RATE = 0.01  # learning rate of the Adam optimizers
NUM_EPOCHES = 10  # n_epochs handed to train_model

In [11]:
## Read the raw SNLI train and validation splits (tab-separated files)
data_dir = CURR_PATH + DATA_PATH
snli_train = pd.read_csv(data_dir + "snli_train.tsv", sep='\t')
snli_val = pd.read_csv(data_dir + "snli_val.tsv", sep='\t')

In [12]:
## Preprocess the raw data sets
# Both splits go through the same load_data.prepare_data step; the results are
# later converted to index sequences by load_data.token2index_dataset.
train_data = load_data.prepare_data(snli_train)
val_data = load_data.prepare_data(snli_val)

In [13]:
# Tracking state for the grid search.
# One entry is appended per trained configuration, in loop order: training
# accuracy, validation accuracy, and the object returned by train_model.
lr_train_acc, lr_val_acc, lr_param = [], [], []
nn_train_acc, nn_val_acc, nn_param = [], [], []

# Best validation accuracy seen so far for each model family, together with
# the object train_model returned for that run.
best_lr, best_lr_param = 0, None
best_nn, best_nn_param = 0, None

In [None]:
def _load_pickle(path):
    """Unpickle the object stored at `path`, closing the file handle afterwards.

    NOTE: unpickling can execute arbitrary code, so only load files that were
    generated by this project.
    """
    with open(path, 'rb') as handle:
        return pkl.load(handle)


def _fresh_embedding(num_embed, embed_dim):
    """Return a newly initialised, trainable embedding layer.

    The global torch RNG is re-seeded with `embed_dim` first, so every model
    built for the same embedding size starts from the same initial state and
    no model inherits weights already trained by a previous run.
    """
    torch.manual_seed(embed_dim)
    return nn.Embedding(num_embed, embed_dim)


# Files that hold the best model of each family across the whole grid.
BEST_LR_FILE = 'model_lr.pt'
BEST_NN_FILE = 'model_nn.pt'
# Scratch files handed to train_model. Its log output suggests it writes its
# own checkpoints to `save_file`; using separate names keeps a later, worse
# configuration from overwriting the global best files above.
RUN_LR_FILE = 'model_lr_run.pt'
RUN_NN_FILE = 'model_nn_run.pt'

# The loss does not depend on the configuration, so build it once.
criterion = nn.NLLLoss()

# Grid search over vocabulary size x embedding size x sentence-combination mode.
# For every configuration one LogisticRegression and one NeuralNetwork are
# trained; accuracies are appended to the tracking lists in loop order.
for VOCAB_SIZE in [10000, 20000, 40000]:
    vectors = _load_pickle('pickle/'+str(VOCAB_SIZE)+'_vectors.pkl')
    id2token = _load_pickle('pickle/'+str(VOCAB_SIZE)+'_id2token.pkl')
    token2id = _load_pickle('pickle/'+str(VOCAB_SIZE)+'_token2id.pkl')

    ## Convert token lists to lists of corresponding indices
    indiced_train_data, train_target = load_data.token2index_dataset(train_data, token2id, MAX_SENTENCE_LENGTH)
    indiced_val_data, val_target = load_data.token2index_dataset(val_data, token2id, MAX_SENTENCE_LENGTH)

    train_dataset = load_data.SNLIDataset(indiced_train_data, train_target)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=BATCH_SIZE,
                                               collate_fn=load_data.SNLI_collate_func,
                                               shuffle=True)

    val_dataset = load_data.SNLIDataset(indiced_val_data, val_target)
    # Validation does not need shuffling; a fixed order keeps evaluation deterministic.
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=BATCH_SIZE,
                                             collate_fn=load_data.SNLI_collate_func,
                                             shuffle=False)

    # The vocabulary is fixed for this VOCAB_SIZE, so compute its size once.
    num_embed = len(set(id2token))

    for embed_dim in [200, 300, 500]:
        # To use pre-trained embeddings instead, build the layer with:
        # emb_layer, num_embed, embed_dim = create_emb_layer(create_weights(vectors, id2token), non_trainable = True)
        for cat_mode in ["DIRECT", "MUL", "SUB"]:
            print('Vocab_size:{}, Embed_dim:{}, cat_mode:{}'.format(VOCAB_SIZE, embed_dim, cat_mode))

            # --- logistic regression model ---
            # A fresh embedding per model keeps the runs independent.
            emb_layer = _fresh_embedding(num_embed, embed_dim)
            lr_model = LogisticRegression(emb_layer, embed_dim, NUM_CLASS, cat_mode)
            optimizer = optim.Adam(lr_model.parameters(), lr=LEARNING_RATE)
            param, train_acc, val_acc = train_model(model=lr_model,
                                                    train_loader=train_loader,
                                                    val_loader=val_loader,
                                                    optimizer=optimizer,
                                                    criterion=criterion,
                                                    n_epochs=NUM_EPOCHES,
                                                    save_file=RUN_LR_FILE)
            lr_train_acc.append(train_acc)
            lr_val_acc.append(val_acc)
            # NOTE: every returned object is kept alive in this list, so memory
            # use grows with the number of configurations.
            lr_param.append(param)
            if val_acc > best_lr:
                best_lr = val_acc
                best_lr_param = param
                torch.save(param.state_dict(), BEST_LR_FILE)
                print("New best LR model found after {} epochs, saving at {}".format(NUM_EPOCHES, BEST_LR_FILE))

            # --- neural network model ---
            emb_layer = _fresh_embedding(num_embed, embed_dim)
            nn_model = NeuralNetwork(emb_layer, embed_dim, NUM_CLASS, HIDDEN_DIM, cat_mode)
            optimizer = optim.Adam(nn_model.parameters(), lr=LEARNING_RATE)
            param, train_acc, val_acc = train_model(model=nn_model,
                                                    train_loader=train_loader,
                                                    val_loader=val_loader,
                                                    optimizer=optimizer,
                                                    criterion=criterion,
                                                    n_epochs=NUM_EPOCHES,
                                                    save_file=RUN_NN_FILE)
            nn_train_acc.append(train_acc)
            nn_val_acc.append(val_acc)
            nn_param.append(param)
            if val_acc > best_nn:
                best_nn = val_acc
                best_nn_param = param
                torch.save(param.state_dict(), BEST_NN_FILE)
                print("New best NN model found after {} epochs, saving at {}".format(NUM_EPOCHES, BEST_NN_FILE))
                


Vocab_size:10000, Embed_dim:200, cat_mode:DIRECT
Starting epoch 0
Epoch: [1/10], Step: [500/3125],Training Acc: 55.639, Validation Acc: 54.6, Time: 32.8288369178772 sec
Epoch: [1/10], Step: [1000/3125],Training Acc: 55.766, Validation Acc: 53.4, Time: 70.76151084899902 sec
Epoch: [1/10], Step: [1500/3125],Training Acc: 60.774, Validation Acc: 58.5, Time: 113.03145503997803 sec
Epoch: [1/10], Step: [2000/3125],Training Acc: 59.124, Validation Acc: 57.0, Time: 156.93732380867004 sec
Epoch: [1/10], Step: [2500/3125],Training Acc: 57.667, Validation Acc: 54.5, Time: 201.06663298606873 sec
Epoch: [1/10], Step: [3000/3125],Training Acc: 65.066, Validation Acc: 60.8, Time: 244.7548167705536 sec
End of epoch 1, Training Acc: 64.698,Validation Acc: 60.3, Time: 268.79166293144226 sec
New best model found, saving at model_lr.pt

Starting epoch 1


In [13]:
# Append training and validation accuracy to csv files, one row per trained
# configuration with the columns (train_acc, val_acc). Loss still needs adding.
# np.savetxt accepts a single array and its third positional argument is `fmt`,
# so the two accuracy lists are stacked into a 2-column array before writing.
with open('lr_model.csv', 'ab') as f:
    np.savetxt(f, np.column_stack((lr_train_acc, lr_val_acc)), delimiter=",")

with open('nn_model.csv', 'ab') as f:
    np.savetxt(f, np.column_stack((nn_train_acc, nn_val_acc)), delimiter=",")


In [34]:
# Build the reporting data frames: one row per grid configuration.
def _grid_report(train_acc, val_acc):
    """Return the column dict for one model family's report.

    The three configuration columns spell out the grid in the order the search
    loop visits it (vocabulary size outermost, combination mode innermost);
    the accuracy lists are attached unchanged.
    """
    return {
        'Vocab_size': np.repeat([10000, 20000, 40000], 9),
        'Embed_dim': np.tile(np.repeat([200, 300, 500], 3), 3),
        'Cat_mode': ['DIRECT', 'MUL', 'SUB'] * 9,
        'train_acc': train_acc,
        'val_acc': val_acc,
    }


dat_lr = _grid_report(lr_train_acc, lr_val_acc)
df_lr = pd.DataFrame(dat_lr)

dat_nn = _grid_report(nn_train_acc, nn_val_acc)
df_nn = pd.DataFrame(dat_nn)