## Importing data, preparing batch processing

In [9]:
import copy
from itertools import product
import torch
import torch.nn as nn
import torch.functional as F
import torch.optim as optim
import torchtext.data as data
import torchtext.vocab as vocab

import preprocessing as pre

In [2]:
train_data, test_data, val_data, TEXT, LABEL = pre.get_data('train_small.csv', 'val_small.csv', 'test_small.csv', None)

Connected!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  apply(lambda x: x[0: 1000000])


In [78]:
vectors = vocab.Vectors('Law2Vec.100d.txt') # Law2Vec available from https://archive.org/details/Law2Vec
TEXT.build_vocab(train_data, vectors=vectors, unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train_data)

In [79]:
USE_CUDA = torch.cuda.is_available()

In [80]:
BATCH_SIZE = 5

device = torch.device('cuda' if USE_CUDA else 'cpu')

train_it, test_it, val_it = data.BucketIterator.splits(
    (train_data, test_data, val_data), 
    batch_size = BATCH_SIZE,
    sort_key=lambda x: len(x.alj_text),
    sort_within_batch=True,
    device = device)

In [85]:
vectors['medicare']

tensor([ 0.6415, -0.5367, -0.3537, -0.0634, -0.1798,  0.0626, -0.1836, -0.2705,
         0.2504,  0.5061,  0.4746, -0.2351, -0.0465,  0.3184,  0.8974,  0.0470,
        -0.2594,  0.3485, -0.3356,  0.1163,  0.2207,  0.2707,  0.4748,  0.1122,
        -0.1188, -0.0790,  0.4377, -0.4711,  0.1401, -0.0234, -0.2009, -0.2143,
         0.1335, -0.4407,  0.4077, -0.0634,  0.5104,  0.1820, -0.4729, -0.1758,
         0.6194,  0.5708, -0.3034, -0.3658,  0.1609,  0.0753, -0.2024, -0.1472,
         0.0665,  0.1823,  0.3091, -0.0913,  0.2495,  0.0777, -0.1873, -0.5850,
        -0.3243,  0.1540, -0.5094,  0.6227,  0.1163, -0.6202, -0.4416, -0.3509,
        -0.5760, -0.4837, -0.6283,  0.0938,  0.3528, -0.0674, -0.7097, -0.2053,
        -0.6007, -0.1306,  0.0146, -0.0830,  0.5486, -0.2328, -0.3193,  0.1496,
        -0.1635,  0.0755, -0.2594, -0.0317,  0.1249, -0.5599,  0.0722, -0.0369,
         0.3139,  0.0102, -0.3353,  0.1142, -0.1163,  0.1505,  0.0952,  0.0206,
        -0.0733, -0.4851,  0.4995,  0.04

In [144]:
TEXT.vocab.vectors[TEXT.vocab.stoi['medicare']]

tensor([ 0.6415, -0.5367, -0.3537, -0.0634, -0.1798,  0.0626, -0.1836, -0.2705,
         0.2504,  0.5061,  0.4746, -0.2351, -0.0465,  0.3184,  0.8974,  0.0470,
        -0.2594,  0.3485, -0.3356,  0.1163,  0.2207,  0.2707,  0.4748,  0.1122,
        -0.1188, -0.0790,  0.4377, -0.4711,  0.1401, -0.0234, -0.2009, -0.2143,
         0.1335, -0.4407,  0.4077, -0.0634,  0.5104,  0.1820, -0.4729, -0.1758,
         0.6194,  0.5708, -0.3034, -0.3658,  0.1609,  0.0753, -0.2024, -0.1472,
         0.0665,  0.1823,  0.3091, -0.0913,  0.2495,  0.0777, -0.1873, -0.5850,
        -0.3243,  0.1540, -0.5094,  0.6227,  0.1163, -0.6202, -0.4416, -0.3509,
        -0.5760, -0.4837, -0.6283,  0.0938,  0.3528, -0.0674, -0.7097, -0.2053,
        -0.6007, -0.1306,  0.0146, -0.0830,  0.5486, -0.2328, -0.3193,  0.1496,
        -0.1635,  0.0755, -0.2594, -0.0317,  0.1249, -0.5599,  0.0722, -0.0369,
         0.3139,  0.0102, -0.3353,  0.1142, -0.1163,  0.1505,  0.0952,  0.0206,
        -0.0733, -0.4851,  0.4995,  0.04

## Setting up model

In [135]:
class RNN(nn.Module):
    def __init__(self, rnn_type, input_size, embedding_size, hidden_size, output_size,
                 num_layers, dropout, bidirectional, padding_idx):
        super().__init__()
        # self.embedding = nn.Embedding(input_size, embedding_size, padding_idx=padding_idx)
        self.embedding = nn.Embedding.from_pretrained(TEXT.vocab.vectors) # can abstract some
        self.rnn = getattr(nn, rnn_type.upper())(embedding_size, hidden_size, num_layers,
                                                 dropout=(droopout if num_layers > 1 else 0),
                                                 bidirectional=bidirectional)

        self.dropout = nn.Dropout(dropout)
        self.leakyrelu = nn.LeakyReLU()
        linear_inp = (hidden_size * 2 if bidirectional else hidden_size)
        self.linear = nn.Linear(linear_inp, output_size)
             
    def forward(self, input):
        embed = self.embedding(input)
        rnn_out, hidden = self.rnn(embed)
        rnn_out = rnn_out[-1]
        rnn_out = self.leakyrelu(rnn_out)
        dropped_rnn_out = self.dropout(rnn_out)
        linear_out = self.linear(rnn_out)
        return linear_out
    
    def evaluate(self, preds, labels):
        return self.loss_fn(pred, label)


In [136]:
def binary_accuracy(preds, y):
    """
    Return accuracy per batch
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y) # convert into float for division 
    acc = correct.sum().item() / len(correct)
    return acc

class Training_module():

    def __init__(self, model, lr, pos_weight, use_cuda, epochs):
        self.model = model
        self.use_cuda = use_cuda
        if self.use_cuda:
            model = model.cuda()
            
        self.epochs = epochs
       
        ##YOUR CODE HERE##
        # Choose an optimizer. optim.Adam is a popular choice
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        

    
    def train_epoch(self, iterator):
        '''
        Train the model for one epoch. For this repeat the following, 
        going through all training examples.
        1. Get the next batch of inputs from the iterator.
        2. Determine the predictions using a forward pass.
        3. Compute the loss.
        4. Compute gradients using a backward pass.
        5. Execute one step of the optimizer to update the model paramters.
        '''
        epoch_loss = 0
        epoch_acc = 0
        self.model.train()
        
        for batch in iterator:
          # batch.text has the texts and batch.label has the labels.

            self.optimizer.zero_grad()
                
            ##YOUR CODE HERE##
            text = batch.alj_text
            target = batch.decision_binary
            if self.use_cuda:
                text = text.cuda()
                target = target.cuda()
            predictions = self.model.forward(text).squeeze()
            loss = self.loss_fn(predictions, target)
            accuracy = binary_accuracy(predictions, target)
        
            loss.backward()
            self.optimizer.step()
            epoch_loss += loss.item()
            epoch_acc += accuracy
        
        return epoch_loss / len(iterator), epoch_acc / len(iterator)
    
    def train_model(self, train_iterator, dev_iterator):
        """
        Train the model for multiple epochs, and after each evaluate on the
        development set.  Return the best performing model.
        """  
        dev_accs = [0.]
        best_model = None
        for epoch in range(self.epochs):
            self.train_epoch(train_iterator)
            dev_acc = self.evaluate(dev_iterator)
            print(f"Epoch {epoch}: Dev Accuracy: {dev_acc[1]} Dev Loss:{dev_acc[0]}")
            if dev_acc[1] > max(dev_accs) or best_model is None:
                best_model = copy.deepcopy(self.model)
                #best_model.flatten_parameters() # would be good for RNNs
            dev_accs.append(dev_acc[1])

        return best_model
                
    def evaluate(self, iterator):
        '''
        Evaluate the performance of the model on the given examples.
        '''
        epoch_loss = 0
        epoch_acc = 0
        self.model.eval()
    
        with torch.no_grad():
    
            for batch in iterator:

                ##YOUR CODE HERE##
                text = batch.alj_text
                target = batch.decision_binary
                if self.use_cuda:
                    text = text.cuda()
                    target = target.cuda()
                predictions = self.model.forward(text).squeeze()
                loss = self.loss_fn(predictions, target)
                acc = binary_accuracy(predictions, target)
                epoch_loss += loss.item()
                epoch_acc += acc
        
        return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [138]:
# Model architecture parameters
RNN_TYPES = ['RNN', 'LSTM']
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_SIZE = vectors.vectors.size()[1]
HIDDEN_SIZES = [1/3, 2/3]
OUTPUT_SIZE = 1
NUM_LAYERS = [1, 2]
DROPOUTS = [0.5, 0.75]
BIDIRECTIONALS = [False, True]
PADDING_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# Model training hyperparameters
LEARNING_RATE = [0.01, 0.0001]
train_len = 0
train_pos = 0
for batch in train_it:
    train_len += len(batch.decision_binary)
    train_pos += batch.decision_binary.sum().item()
POS_WEIGHT = torch.tensor([(train_len - train_pos) / train_pos])
if USE_CUDA:
    POS_WEIGHT = POS_WEIGHT.cuda()
EPOCHS = 10

# Iterator over various model parameters
param_iter = product (RNN_TYPES, HIDDEN_SIZES, NUM_LAYERS, DROPOUTS,
                      BIDIRECTIONALS, LEARNING_RATE)

# Magic loop
best_models = []
for rnn_type, hidden_size, num_layers, dropout, bidirectional, lr in param_iter:
    # Print out model type here
    model = RNN(rnn_type, INPUT_DIM, EMBEDDING_SIZE, int(hidden_size * embed_size),
                OUTPUT_SIZE, num_layers, dropout, bidirectional, PADDING_IDX)
    tm = Training_module(model, lr, POS_WEIGHT, USE_CUDA, EPOCHS)
    best_model = tm.train_model(train_it, val_it)
    best_models.append(best_model) # might be nice to save accuracy and recall numbers here as well

Epoch 0: Dev Accuracy: 0.2 Dev Loss:0.932163655757904
Epoch 1: Dev Accuracy: 0.6 Dev Loss:0.9430160522460938
Epoch 2: Dev Accuracy: 0.6 Dev Loss:0.9668230414390564
Epoch 3: Dev Accuracy: 0.6 Dev Loss:0.9841375350952148
Epoch 4: Dev Accuracy: 0.6 Dev Loss:1.0054779052734375
Epoch 5: Dev Accuracy: 0.6 Dev Loss:1.0226577520370483
Epoch 6: Dev Accuracy: 0.6 Dev Loss:1.0352592468261719
Epoch 7: Dev Accuracy: 0.6 Dev Loss:1.0628145933151245
Epoch 8: Dev Accuracy: 0.6 Dev Loss:1.0771770477294922
Epoch 9: Dev Accuracy: 0.6 Dev Loss:1.098939061164856
