# Model

#### Date: 09/14/2019
#### Implement Logistic Regression and a 2-layer Neural Network on top of the embedding vectors

## Initialization

#### Packages

In [1]:
import os
import time
import tqdm
import datetime
import pickle as pkl
import numpy as np
import pandas as pd

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset

In [3]:
# The customized util package
import load_data
from load_data import create_weights, create_emb_layer

#### Initialize global variables

In [64]:
# Directory the notebook is run from; the data file paths are built by appending to it.
CURR_PATH = os.getcwd()

# Path fragments joined onto CURR_PATH with plain string `+`, hence the leading slashes.
DATA_PATH = '/data/'
VEC_PATH = '/wiki-news-300d-1M.vec'  # NOTE(review): not referenced in the cells shown here

VOCAB_SIZE = 50000  # selects which 'pickle/<VOCAB_SIZE>_*.pkl' files are loaded
EMBED_DIM = 300  # NOTE(review): the models use the embed_dim returned by create_emb_layer, not this value
HIDDEN_DIM = 100  # hidden layer width passed to NeuralNetwork
MAX_SENTENCE_LENGTH = 30  # padding width in SNLI_collate_func; also passed to token2index_dataset
BATCH_SIZE = 32
NUM_CLASS = 20  # output size of both models. NOTE(review): SNLI is usually 3-way — confirm the label encoding
LEARNING_RATE = 0.01  # learning rate handed to Adam
NUM_EPOCHES = 5  # number of passes over train_loader
CONCAT_MODE = "DIRECT" ## Possible values: "DIRECT", "AVERAGE"

## Load Data

In [5]:
## Read the raw SNLI train / validation splits (tab-separated files) into DataFrames
snli_train, snli_val = (
    pd.read_csv(CURR_PATH + DATA_PATH + file_name, sep='\t')
    for file_name in ("snli_train.tsv", "snli_val.tsv")
)

In [6]:
## Preprocess raw data sets
train_data = load_data.prepare_data(snli_train)
val_data = load_data.prepare_data(snli_val)


def _load_pickle(suffix):
    """Load ``pickle/<VOCAB_SIZE>_<suffix>.pkl`` and close the file afterwards.

    NOTE: unpickling can execute arbitrary code; only load pickle files that
    were produced locally by a trusted process.

    @param suffix: trailing part of the file name, e.g. 'vectors'
    @return: the unpickled object
    """
    with open('pickle/' + str(VOCAB_SIZE) + '_' + suffix + '.pkl', 'rb') as pickle_file:
        return pkl.load(pickle_file)


## Vocabulary artefacts for the chosen vocabulary size
vectors = _load_pickle('vectors')
id2token = _load_pickle('id2token')
token2id = _load_pickle('token2id')

In [7]:
## Convert the token lists of each split into lists of the corresponding vocabulary indices
(indiced_train_data, train_target), (indiced_val_data, val_target) = (
    load_data.token2index_dataset(split, token2id, MAX_SENTENCE_LENGTH)
    for split in (train_data, val_data)
)

## Pytorch DataLoader

#### Customize dataloader

In [8]:
class SNLIDataset(Dataset):
    """Sentence-pair dataset (train/validation/test) in a form PyTorch can consume.

    Subclasses torch.utils.data.Dataset so that it can be handed to a DataLoader.
    """

    def __init__(self, data, target):
        """
        @param data: mapping whose `sentence1` and `sentence2` entries hold the indexed sentences
        @param target: sequence of labels, one per sentence pair
        """
        self.x1, self.x2 = data['sentence1'], data['sentence2']
        self.y = target
        # All three sequences must describe the same number of examples.
        assert (len(self.x1) == len(self.x2) == len(self.y))

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.x1)

    def __getitem__(self, key):
        """
        Triggered by dataset[key].
        Returns [sentence1 indices, their count, sentence2 indices, their count, label].
        """
        first, second, label = self.x1[key], self.x2[key], self.y[key]
        return [first, len(first), second, len(second), label]

In [9]:
def SNLI_collate_func(batch, max_length=None):
    """Customized collate function for DataLoader: brings every sentence of the
    batch to the same fixed width so the batch can be stacked into tensors.

    Sequences shorter than `max_length` are right-padded with 0; longer ones are
    truncated (previously they caused a negative pad width and a crash), and the
    reported length is capped accordingly.

    @param batch: iterable of [tokens1, len1, tokens2, len2, label] items, as
                  produced by SNLIDataset.__getitem__
    @param max_length: target width; defaults to the global MAX_SENTENCE_LENGTH
    @return: list [x1, len1, x2, len2, labels] of LongTensors, where x1/x2 have
             shape (batch size, max_length) and the others shape (batch size,)
    """
    if max_length is None:
        # Looked up at call time so the DataLoader can keep calling collate_fn(batch).
        max_length = MAX_SENTENCE_LENGTH

    def _fit(tokens):
        """Truncate / zero-pad one index sequence to exactly `max_length` entries."""
        # Explicit int64 keeps empty sentences from turning the batch into floats.
        clipped = np.asarray(tokens[:max_length], dtype=np.int64)
        return np.pad(clipped,
                      pad_width=(0, max_length - len(clipped)),
                      mode="constant", constant_values=0)

    x1_list = []
    x1_length_list = []
    x2_list = []
    x2_length_list = []
    label_list = []
    for tokens_1, length_1, tokens_2, length_2, label in batch:
        x1_list.append(_fit(tokens_1))
        x1_length_list.append(min(length_1, max_length))

        x2_list.append(_fit(tokens_2))
        x2_length_list.append(min(length_2, max_length))

        label_list.append(label)

    # reshape keeps a well-formed (0, max_length) shape even for an empty batch
    x1 = np.array(x1_list, dtype=np.int64).reshape(-1, max_length)
    x2 = np.array(x2_list, dtype=np.int64).reshape(-1, max_length)
    return [torch.from_numpy(x1), torch.LongTensor(x1_length_list),
            torch.from_numpy(x2), torch.LongTensor(x2_length_list),
            torch.LongTensor(label_list)]

#### Create dataloader

In [10]:
# NOTE(review): the Dataset class and collate function used here come from the `load_data`
# module, not from the SNLIDataset / SNLI_collate_func defined in this notebook — confirm
# that the two definitions are meant to be the same.
train_dataset = load_data.SNLIDataset(indiced_train_data, train_target)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=load_data.SNLI_collate_func,
                                           shuffle=True)

# Validation accuracy does not depend on batch order, so the validation set is not
# shuffled; this keeps evaluation deterministic.
val_dataset = load_data.SNLIDataset(indiced_val_data, val_target)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=load_data.SNLI_collate_func,
                                           shuffle=False)

## Logistic Regression Model

#### Define the model class

In [79]:
class LogisticRegression(nn.Module):
    '''
    Logistic regression classification model over pooled sentence embeddings.

    Each sentence is embedded, summed over dim 1 and divided by its length. The two
    sentence vectors are concatenated (cat_mode 'DIRECT') or averaged (any other
    value), passed through one linear layer and turned into log-probabilities.
    '''
    def __init__(self, embed_layer, embed_dim, num_class, cat_mode='DIRECT'):
        '''
        @param embed_layer: module mapping index tensors to embedding vectors
        @param embed_dim: size of the embedding vectors produced by `embed_layer`
        @param num_class: number of output classes
        @param cat_mode: 'DIRECT' concatenates the two sentence vectors; any other
                         value (e.g. 'AVERAGE') averages them
        '''
        super(LogisticRegression, self).__init__()
        
        self.embed = embed_layer
        # Attribute name (`cat_model`) kept unchanged for backward compatibility.
        self.cat_model = cat_mode
        # Concatenation doubles the width of the input to the linear layer.
        in_features = 2*embed_dim if cat_mode == 'DIRECT' else embed_dim
        self.linear = nn.Linear(in_features, num_class)
        self.init_weights()

    def _mean_pool(self, data, lengths):
        '''
        Embed `data`, sum over dim 1 and divide each row by its sentence length.

        NOTE(review): padded positions are included in the sum, so this is a true
        mean only if the padding index embeds to the zero vector — confirm in
        create_emb_layer.
        '''
        summed = torch.sum(self.embed(data), dim=1)
        # A zero length would give 0/0 = NaN and poison the loss; clamp to 1 instead.
        denom = lengths.view(lengths.size()[0], 1).clamp(min=1).to(summed.dtype)
        return summed / denom
        
    def forward(self, data_pre, data_post, len_pre, len_post):
        '''
        @param data_pre: index tensor of the first sentences, one row per example
        @param data_post: index tensor of the second sentences, one row per example
        @param len_pre: lengths of the first sentences, one per example
        @param len_post: lengths of the second sentences, one per example
        @return: log-probabilities, one row per example and one column per class
        '''
        out_pre = self._mean_pool(data_pre, len_pre)
        out_post = self._mean_pool(data_post, len_post)
        if self.cat_model == 'DIRECT':
            out = torch.cat((out_pre, out_post), 1)
        else:
            out = (out_pre + out_post) / 2.0
        return F.log_softmax(self.linear(out), dim=1)
    
    def init_weights(self):
        # Xavier-normal weights, uniform [0, 1) biases
        nn.init.xavier_normal_(self.linear.weight)
        nn.init.uniform_(self.linear.bias)

#### Define testing functions

In [80]:
# Function for testing the model
def test_lr(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data_pre, len_pre, data_post, len_post, labels in loader:
        outputs = model(data_pre, data_post, len_pre, len_post)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

#### Create the pre-trained embedding layer

In [81]:
# Build the embedding layer from the loaded vectors. create_emb_layer returns three values,
# unpacked here as (embed_layer, num_embed, embed_dim).
# NOTE(review): non_trainable=True presumably freezes the embedding weights — confirm in load_data.
embed_layer, num_embed, embed_dim = create_emb_layer(create_weights(vectors, id2token), non_trainable = True)

#### initialize model

In [82]:
# Use the CONCAT_MODE setting from the global configuration instead of a hard-coded mode,
# so that changing the constant actually changes how the two sentence vectors are combined.
lr_model = LogisticRegression(embed_layer, embed_dim, NUM_CLASS, CONCAT_MODE)

#### Initialize the loss function and optimizer

In [83]:
# Negative log-likelihood loss: the model's forward() already returns log-probabilities
# (F.log_softmax), so NLLLoss is applied to its output directly.
criterion = nn.NLLLoss()

# Adam over the parameters exposed by lr_model.parameters().
optimizer = optim.Adam(lr_model.parameters(), lr = LEARNING_RATE)

#### Train the model

In [84]:
start = time.time()
for epoch in range(NUM_EPOCHES):
    for i, (data_pre, len_pre, data_post, len_post, label) in enumerate(train_loader):
        # The periodic validation below switches the model to eval mode,
        # so training mode is re-enabled at every step.
        lr_model.train()
        optimizer.zero_grad()

        # Forward pass, loss, backward pass, parameter update.
        y_hat = lr_model(data_pre, data_post, len_pre, len_post)
        train_loss = criterion(y_hat, label)
        train_loss.backward()
        optimizer.step()

        # Validation accuracy is only reported on every 500th step.
        if (i+1) % 500 != 0:
            continue
        val_acc = test_lr(val_loader, lr_model)
        print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}, Time: {} sec'.format( 
                   epoch+1, NUM_EPOCHES, i+1, len(train_loader), val_acc, time.time()-start))
print("Done in {} sec".format(time.time()-start))

Epoch: [1/5], Step: [500/3125], Validation Acc: 54.2, Time: 2.26324200630188 sec
Epoch: [1/5], Step: [1000/3125], Validation Acc: 55.6, Time: 4.430830240249634 sec
Epoch: [1/5], Step: [1500/3125], Validation Acc: 55.9, Time: 6.595863103866577 sec
Epoch: [1/5], Step: [2000/3125], Validation Acc: 58.1, Time: 8.963868141174316 sec
Epoch: [1/5], Step: [2500/3125], Validation Acc: 57.2, Time: 11.220885992050171 sec
Epoch: [1/5], Step: [3000/3125], Validation Acc: 57.0, Time: 13.408932209014893 sec
Epoch: [2/5], Step: [500/3125], Validation Acc: 57.1, Time: 16.179883003234863 sec
Epoch: [2/5], Step: [1000/3125], Validation Acc: 57.9, Time: 18.47706437110901 sec
Epoch: [2/5], Step: [1500/3125], Validation Acc: 57.4, Time: 20.72262930870056 sec
Epoch: [2/5], Step: [2000/3125], Validation Acc: 57.9, Time: 22.90026021003723 sec
Epoch: [2/5], Step: [2500/3125], Validation Acc: 57.4, Time: 25.146512269973755 sec
Epoch: [2/5], Step: [3000/3125], Validation Acc: 57.0, Time: 27.330843210220337 sec
Ep

## Neural Network Model

#### Define the model class

In [67]:
class NeuralNetwork(nn.Module):
    '''
    Two-layer neural network classification model over pooled sentence embeddings.

    Each sentence is embedded, summed over dim 1 and divided by its length. The two
    sentence vectors are concatenated (cat_mode 'DIRECT') or averaged (any other
    value), then passed through linear -> ReLU -> linear -> log-softmax.
    '''
    def __init__(self, embed_layer, embed_dim, num_class, hidden_dim, cat_mode='DIRECT'):
        '''
        @param embed_layer: module mapping index tensors to embedding vectors
        @param embed_dim: size of the embedding vectors produced by `embed_layer`
        @param num_class: number of output classes
        @param hidden_dim: width of the hidden layer
        @param cat_mode: 'DIRECT' concatenates the two sentence vectors; any other
                         value (e.g. 'AVERAGE') averages them
        '''
        super(NeuralNetwork, self).__init__()
        
        self.embed = embed_layer
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        # Attribute name (`cat_model`) kept unchanged for backward compatibility.
        self.cat_model = cat_mode
        # Concatenation doubles the width of the input to the first linear layer.
        in_features = 2*embed_dim if cat_mode == 'DIRECT' else embed_dim
        self.linear1 = nn.Linear(in_features, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, num_class)
        self.init_weights()

    def _mean_pool(self, data, lengths):
        '''
        Embed `data`, sum over dim 1 and divide each row by its sentence length.

        NOTE(review): padded positions are included in the sum, so this is a true
        mean only if the padding index embeds to the zero vector — confirm in
        create_emb_layer.
        '''
        summed = torch.sum(self.embed(data), dim=1)
        # A zero length would give 0/0 = NaN and poison the loss; clamp to 1 instead.
        denom = lengths.view(lengths.size()[0], 1).clamp(min=1).to(summed.dtype)
        return summed / denom
        
    def forward(self, data_pre, data_post, len_pre, len_post):
        '''
        @param data_pre: index tensor of the first sentences, one row per example
        @param data_post: index tensor of the second sentences, one row per example
        @param len_pre: lengths of the first sentences, one per example
        @param len_post: lengths of the second sentences, one per example
        @return: log-probabilities, one row per example and one column per class
        '''
        out_pre = self._mean_pool(data_pre, len_pre)
        out_post = self._mean_pool(data_post, len_post)
        if self.cat_model == 'DIRECT':
            out = torch.cat((out_pre, out_post), 1)
        else:
            out = (out_pre + out_post) / 2
        a1 = F.relu(self.linear1(out))
        logit = self.linear2(a1)
        return F.log_softmax(logit, dim=1)
    
    def init_weights(self):
        # Xavier-normal weights, uniform [0, 1) biases, for both linear layers
        nn.init.xavier_normal_(self.linear1.weight)
        nn.init.uniform_(self.linear1.bias)
        nn.init.xavier_normal_(self.linear2.weight)
        nn.init.uniform_(self.linear2.bias)

#### Define testing functions

In [68]:
# Function for testing the model
def test_nn(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data_pre, len_pre, data_post, len_post, labels in loader:
        outputs = model(data_pre, data_post, len_pre, len_post)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

#### Create the pre-trained embedding layer

In [69]:
# Re-create the embedding layer for the neural network (same call as for the logistic
# regression model); this rebinds embed_layer, num_embed and embed_dim.
# NOTE(review): non_trainable=True presumably freezes the embedding weights — confirm in load_data.
embed_layer, num_embed, embed_dim = create_emb_layer(create_weights(vectors, id2token), non_trainable = True)

#### initialize model

In [74]:
# Use the CONCAT_MODE setting from the global configuration instead of a hard-coded mode,
# so that changing the constant actually changes how the two sentence vectors are combined.
nn_model = NeuralNetwork(embed_layer, embed_dim, NUM_CLASS, HIDDEN_DIM, CONCAT_MODE)

#### Initialize the loss function and optimizer

In [75]:
# Negative log-likelihood loss: the model's forward() already returns log-probabilities
# (F.log_softmax), so NLLLoss is applied to its output directly.
# Note: this rebinds the `criterion` and `optimizer` names used for the logistic regression model.
criterion = nn.NLLLoss()

# Adam over the parameters exposed by nn_model.parameters().
optimizer = optim.Adam(nn_model.parameters(), lr = LEARNING_RATE)

#### Train the model

In [76]:
start = time.time()
for epoch in range(NUM_EPOCHES):
    for i, (data_pre, len_pre, data_post, len_post, label) in enumerate(train_loader):
        optimizer.zero_grad()
        # The periodic validation below leaves the model in eval mode,
        # so training mode is re-enabled at every step.
        nn_model.train()
        
        # Forward pass: log-probabilities for the batch
        y_hat = nn_model(data_pre, data_post, len_pre, len_post)
        
        train_loss = criterion(y_hat, label)
        
        # Backward pass and parameter update
        train_loss.backward()
        optimizer.step()
        
        # Report validation accuracy on every 500th step
        if (i+1) % 500 == 0:
            # Evaluate with the neural-network test helper (was: test_lr)
            val_acc = test_nn(val_loader, nn_model)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}, Time: {} sec'.format( 
                       epoch+1, NUM_EPOCHES, i+1, len(train_loader), val_acc, time.time()-start))
print("Done in {} sec".format(time.time()-start))

Epoch: [1/5], Step: [500/3125], Validation Acc: 52.1, Time: 2.5380640029907227 sec
Epoch: [1/5], Step: [1000/3125], Validation Acc: 55.0, Time: 5.583797931671143 sec
Epoch: [1/5], Step: [1500/3125], Validation Acc: 55.7, Time: 8.719244956970215 sec
Epoch: [1/5], Step: [2000/3125], Validation Acc: 55.6, Time: 11.921544075012207 sec
Epoch: [1/5], Step: [2500/3125], Validation Acc: 60.3, Time: 15.1805100440979 sec
Epoch: [1/5], Step: [3000/3125], Validation Acc: 58.5, Time: 18.396465063095093 sec
Epoch: [2/5], Step: [500/3125], Validation Acc: 59.1, Time: 22.46944808959961 sec
Epoch: [2/5], Step: [1000/3125], Validation Acc: 61.1, Time: 25.95941400527954 sec
Epoch: [2/5], Step: [1500/3125], Validation Acc: 61.2, Time: 29.487083911895752 sec
Epoch: [2/5], Step: [2000/3125], Validation Acc: 62.6, Time: 32.767805099487305 sec
Epoch: [2/5], Step: [2500/3125], Validation Acc: 63.0, Time: 36.29167294502258 sec
Epoch: [2/5], Step: [3000/3125], Validation Acc: 62.5, Time: 39.74349308013916 sec
Ep