In [1]:
import os
import time
import tqdm
import datetime
import pickle as pkl
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset

In [2]:
# Filesystem locations used by the notebook.
# DATA_PATH keeps its leading and trailing slash because the data-loading
# cell builds file names with plain string concatenation
# (CURR_PATH + DATA_PATH + <file>), not os.path.join.
DATA_PATH, VEC_PATH = '/data/', '/wiki-news-300d-1M.vec'
# Working directory at the moment this cell is executed.
CURR_PATH = os.getcwd()

In [3]:
import load_data
from load_data import create_weights, create_emb_layer

In [4]:
## The model classes take a `cat_mode` argument; three modes are used in this notebook (DIRECT, MUL, SUB)
from models import LogisticRegression, NeuralNetwork

In [5]:
# train_model now reports training accuracy. TODO: also track training and validation loss.
from training import acc, train_model

In [6]:
## Read the raw SNLI train / validation splits (tab-separated files)
snli_train, snli_val = (
    pd.read_csv(CURR_PATH + DATA_PATH + tsv_name, sep='\t')
    for tsv_name in ("snli_train.tsv", "snli_val.tsv")
)

In [7]:
## Run both raw frames through the same preprocessing step (train first, then validation)
train_data, val_data = map(load_data.prepare_data, (snli_train, snli_val))

In [9]:
# --- Fixed training settings -------------------------------------------
BATCH_SIZE = 32
MAX_SENTENCE_LENGTH = 30   # handed to load_data.token2index_dataset
HIDDEN_DIM = 100           # only the NeuralNetwork constructor receives this
# NOTE(review): SNLI is normally a 3-way classification task, so 20 output
# classes looks larger than necessary -- confirm against the label encoding
# produced by load_data before changing it.
NUM_CLASS = 20
LEARNING_RATE = 0.01
NUM_EPOCHES = 10

# --- Hyperparameter grid swept by the training cell ---------------------
VOCAB_SIZES = [10000, 20000, 40000]
EMB_DIMS = [100, 200, 300, 500]
CAT_MODES = ["DIRECT", "MUL", "SUB"]
# Insertion order is the order in which the model types are trained.
MODEL_TYPES = {
    'log-reg': LogisticRegression,
    'neural-net': NeuralNetwork,
}

# NLLLoss expects log-probabilities as input -- presumably the models end in
# a log_softmax; verify in models.py.
CRITERION = nn.NLLLoss()

In [10]:
# Grid search over vocabulary size x embedding dimension x concat mode x model
# type. Every combination is trained with train_model and its output object is
# flattened into one row of `results_df`.
torch.manual_seed(0)
results = []  # one dict per trained model; converted to a DataFrame at the end

# Make sure the checkpoint directory exists before any model is saved into it.
os.makedirs(os.path.join('models', 'snli'), exist_ok=True)

for vocab_size in VOCAB_SIZES:
    # Load the vocabulary artefacts pickled for this vocabulary size.
    # SECURITY NOTE: pickle.load can execute arbitrary code; only load files
    # that were generated by this project.
    with open('pickle/'+str(vocab_size)+'_vectors.pkl', 'rb') as vectors_file:
        vectors = pkl.load(vectors_file)  # only needed for the pre-trained embedding variant below
    with open('pickle/'+str(vocab_size)+'_id2token.pkl', 'rb') as id2token_file:
        id2token = pkl.load(id2token_file)
    with open('pickle/'+str(vocab_size)+'_token2id.pkl', 'rb') as token2id_file:
        token2id = pkl.load(token2id_file)

    ## Convert token lists to lists of corresponding indices
    indiced_train_data, train_target = load_data.token2index_dataset(train_data, token2id, MAX_SENTENCE_LENGTH)
    indiced_val_data, val_target = load_data.token2index_dataset(val_data, token2id, MAX_SENTENCE_LENGTH)

    train_dataset = load_data.SNLIDataset(indiced_train_data, train_target)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=BATCH_SIZE,
                                               collate_fn=load_data.SNLI_collate_func,
                                               shuffle=True)
    val_dataset = load_data.SNLIDataset(indiced_val_data, val_target)
    # Validation batches are only evaluated, so shuffling them buys nothing
    # and needlessly consumes random numbers.
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=BATCH_SIZE,
                                             collate_fn=load_data.SNLI_collate_func,
                                             shuffle=False)

    # One embedding row per id. len(set(...)) would undersize the table if
    # id2token ever contained a duplicated token.
    num_embed = len(id2token)

    for embed_dim in EMB_DIMS:
        for cat_mode in CAT_MODES:
            print('Vocab_size:{}, Embed_dim:{}, cat_mode:{}'.format(vocab_size, embed_dim, cat_mode))

            for model_str, model_class in MODEL_TYPES.items():
                # Generate filename to save model
                # Will need to change for pretrained vectors
                filename = '{}_{}_{}_{}.pt'.format(vocab_size, embed_dim, cat_mode, model_str)
                save_path = os.path.join('models', 'snli', filename)

                # Build a FRESH embedding layer for every model. Sharing one
                # layer across models would let each run start from the
                # embeddings already trained by the previous run, which
                # contaminates the comparison between configurations.
                emb_layer = nn.Embedding(num_embed, embed_dim)
                # Pre-trained alternative kept from the original notebook
                # (untested here; note that it also overrides embed_dim):
                # emb_layer, num_embed, embed_dim = create_emb_layer(create_weights(vectors, id2token), non_trainable = True)

                if model_class is NeuralNetwork:
                    model = model_class(emb_layer, embed_dim, NUM_CLASS, HIDDEN_DIM, cat_mode)
                elif model_class is LogisticRegression:
                    model = model_class(emb_layer, embed_dim, NUM_CLASS, cat_mode)
                else:
                    # Without this branch an unknown class would silently
                    # retrain the model left over from the previous iteration.
                    raise ValueError('Unsupported model type: {}'.format(model_str))

                # Reuse the loss object defined in the configuration cell.
                criterion = CRITERION
                optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

                print('Training model.\nVocab Size: {}\nEmbedding Dimension: {}\nConcat Mode: {}\nModel: {}'.format(vocab_size, embed_dim, cat_mode, model_str))
                train_output = train_model(model=model,
                                           train_loader=train_loader,
                                           val_loader=val_loader,
                                           optimizer=optimizer,
                                           criterion=criterion,
                                           n_epochs=NUM_EPOCHES,
                                           save_file=save_path)

                # Copy the attributes of the training output and tag the row
                # with its configuration so rows stay identifiable in
                # results_df. setdefault never overrides an attribute that
                # train_output already provides.
                record = dict(vars(train_output))
                for key, value in (('vocab_size', vocab_size),
                                   ('embed_dim', embed_dim),
                                   ('cat_mode', cat_mode),
                                   ('model_type', model_str)):
                    record.setdefault(key, value)
                results.append(record)

results_df = pd.DataFrame(results)

Vocab_size:50000, Embed_dim:100, cat_mode:DIRECT
Training model.
Vocab Size: 50000
Embedding Dimension: 100
Concat Mode: DIRECT
Model: log-reg
Starting epoch 0
Epoch: [1/1], Step: [500/3125],Training Loss: 1.0192272663116455, Validation Acc: 52.1, Time: 25.08227777481079 sec
Epoch: [1/1], Step: [1000/3125],Training Loss: 1.0992815494537354, Validation Acc: 56.9, Time: 51.9400954246521 sec
Epoch: [1/1], Step: [1500/3125],Training Loss: 0.9212397933006287, Validation Acc: 60.1, Time: 85.02800846099854 sec
Epoch: [1/1], Step: [2000/3125],Training Loss: 0.8241577744483948, Validation Acc: 60.3, Time: 123.95779061317444 sec
Epoch: [1/1], Step: [2500/3125],Training Loss: 0.9588789939880371, Validation Acc: 61.5, Time: 166.792498588562 sec
Epoch: [1/1], Step: [3000/3125],Training Loss: 0.9739933609962463, Validation Acc: 59.7, Time: 215.07045888900757 sec
End of epoch 1, Training Acc: 65.23,Validation Acc: 60.5, Time: 236.84411311149597 sec
New best model found, saving at models/snli/50000_10

KeyboardInterrupt: 