 - [ ] try bigrams
 - [ ] use pretrained embeddings

In [1]:
import pandas as pd
import os
import numpy as np

In [9]:
# Directory that holds the raw text files and the derived TSV files used below.
input_path = '.'

# One-off loading of the raw dev text (one example per line); kept for reference.
# with open(os.path.join(input_path, 'dev_text.txt'), 'r', encoding='utf-8') as f:
#     dev_text = f.read().strip().split('\n')

# Same for the unlabelled held-out text.
# with open(os.path.join(input_path, 'heldout_text.txt'), 'r', encoding='utf-8') as f:
#     heldout_text = f.read().strip().split('\n')

In [3]:
# dev_label_path = os.path.join(input_path,'dev_label.txt')
# with open(dev_label_path, 'r', encoding='utf-8') as f:
#     dev_y = f.read().split('\n')

# Prepare data

In [4]:
# dev_data = pd.DataFrame({'text':dev_text, 'label':dev_y})

# dev_data.to_csv(os.path.join(input_path, 'dev_data.tsv'), sep='\t',index=False)

In [5]:
# test_data = pd.DataFrame({'text':heldout_text})
# test_data.to_csv(os.path.join(input_path, 'test_data.tsv'), sep='\t',index=False)

# Load data

In [7]:
import torch
from torchtext import data
import random
from torchtext.data import TabularDataset
import torch.nn as nn



# Fixed seed so that weight initialisation and the train/validation split are repeatable.
SEED = 1234

torch.manual_seed(SEED)
# torch.backends.cudnn.deterministic = True

# Tokenise with spaCy. include_lengths=True makes every text batch a
# (padded token ids, lengths) pair, which the model uses to pack the sequences.
TEXT = data.Field(tokenize = 'spacy', include_lengths = True)
# The label field's dtype is used when label indices are turned into a tensor, so it
# has to be a torch dtype; the builtin `str` is not one and makes batch creation fail.
# Float targets fit the single-logit (output_dim=1) classifier head.
LABEL = data.LabelField(dtype=torch.float)

In [10]:
# Labelled development set: tab-separated `text` / `label` columns with a header row.
dev_datafields = [("text", TEXT), ("label", LABEL)]
dev_dataset = TabularDataset(
    path=os.path.join(input_path, 'dev_data.tsv'),
    format='tsv',
    skip_header=True,
    fields=dev_datafields)

# Unlabelled held-out set with a single `text` column. The file is written with
# sep='\t' like the dev set, so it must be parsed as 'tsv': parsing it as 'csv'
# splits every line at its commas instead of treating it as one text column.
test_datafields = [("text", TEXT)]
test_dataset = TabularDataset(
    path=os.path.join(input_path, 'test_data.tsv'),
    format='tsv',
    skip_header=True,
    fields=test_datafields)

In [11]:
# 70/30 train/validation split. random.seed() returns None, so passing its result as
# random_state only seeded the split through the call's side effect; seed explicitly
# and hand over the resulting RNG state instead.
random.seed(SEED)
train_dataset, valid_dataset = dev_dataset.split(split_ratio=0.7, random_state=random.getstate())

In [16]:
# Both vocabularies are built from the training split only.
# max_size = 40000 caps the size of the text vocabulary.
# To initialise from pretrained embeddings, additionally pass:
#     vectors = "glove.6B.100d", unk_init = torch.Tensor.normal_
TEXT.build_vocab(train_dataset, max_size=40000)
LABEL.build_vocab(train_dataset)

In [19]:
BATCH_SIZE = 64

# Run on the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The datasets expose the tokens under the field name `text`, so the sort key has to
# read `x.text`; any other attribute name raises AttributeError on the first batch.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_dataset, valid_dataset),
    batch_size = BATCH_SIZE,
    device = device,
    sort = True,
    sort_key = lambda x: len(x.text),
    sort_within_batch = True) #sort by length for padding
# NOTE(review): sort_within_batch reorders the examples inside each test batch, so
# predictions may not come out in file order - confirm downstream code handles this.
test_iterator = data.Iterator(
    test_dataset,
    batch_size = BATCH_SIZE,
    device = device,
    sort = False,
    sort_key = lambda x: len(x.text),
    sort_within_batch = True,
    repeat = False)

In [20]:
# Peek at the first five tokens of the first training example.
train_dataset[0].text[:5]

['A', 'very', 'well', 'made', 'film']

In [21]:
# Most frequent token counted while building the text vocabulary, with its count.
TEXT.vocab.freqs.most_common(1)

[('the', 15700)]

In [22]:
# Show the label-string -> index mapping of the label vocabulary.
print(LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7f31ec5ab158>, {'neg': 0, 'pos': 1})


# Define model

In [23]:
class BaselineLstm(nn.Module):
    """Bidirectional multi-layer LSTM classifier over token-id sequences.

    Tokens are embedded, run through the LSTM as a packed sequence, and the
    last two entries of the final hidden state (forward and backward direction
    of the top layer) are concatenated and projected by a linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_size: embedding dimension (LSTM input size).
        hidden_size: LSTM hidden size per direction.
        output_dim: number of outputs of the final linear layer.
        nlayers: number of stacked LSTM layers.
        dropout: dropout probability applied to the embeddings, between LSTM
            layers, and to the concatenated final hidden state.
        pad_idx: index of the padding token; passed as ``padding_idx`` so the
            padding embedding is not trained.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_dim, nlayers, dropout, pad_idx):
        super().__init__()
        # input padding index to embedding to prevent training embedding for paddings
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embed_size,
                            hidden_size,
                            num_layers=nlayers,
                            bidirectional=True,
                            dropout=dropout)
        # Forward and backward states are concatenated, hence hidden_size * 2.
        self.fc = nn.Linear(hidden_size * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return the classifier output for a batch of padded sequences.

        Args:
            text: token ids, shape [sent len, batch size].
            text_lengths: true length of every sequence in the batch (tensor or
                list), ordered from longest to shortest as required by
                ``pack_padded_sequence`` with its default settings.

        Returns:
            Tensor of shape [batch size, output_dim].
        """
        # [sent len, batch size] -> [sent len, batch size, emb dim]
        embedded = self.dropout(self.embedding(text))
        # pack_padded_sequence requires tensor lengths to live on the CPU.
        if torch.is_tensor(text_lengths):
            text_lengths = text_lengths.cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
        _, (hidden, _) = self.lstm(packed_embedded)
        # hidden is laid out as
        # [forward_layer_0, backward_layer_0, ..., forward_layer_n, backward_layer_n];
        # use the two entries of the top layer.
        hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        # hidden is already [batch size, hid dim * 2]; squeezing dim 0 here would
        # drop the batch dimension whenever the batch holds a single example.
        return self.fc(hidden)

In [24]:
# Build the classifier: 400-d embeddings feeding a 3-layer bidirectional LSTM
# (256 hidden units per direction) with a single output and dropout of 0.2.
# The padding index is looked up from the text vocabulary.
model = BaselineLstm(
    vocab_size=len(TEXT.vocab),
    pad_idx=TEXT.vocab.stoi[TEXT.pad_token],
    embed_size=400,
    hidden_size=256,
    nlayers=3,
    dropout=0.2,
    output_dim=1,
)

In [25]:
# Echo the model so the notebook displays its module structure.
model

BaselineLstm(
  (embedding): Embedding(27331, 400, padding_idx=1)
  (lstm): LSTM(400, 256, num_layers=3, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.2)
)

In [None]:
def run(model, optimizer, train_dataloader, valid_dataloader, best_epoch, best_vali_loss, start_epoch=None):
    """Train ``model`` with a CTC criterion, logging and checkpointing every epoch.

    Each epoch appends to ``metrics.txt`` under ``paths.output_path``, runs one
    pass over ``train_dataloader``, evaluates the train and validation loss,
    computes a validation distance with ``run_eval`` and saves a checkpoint.
    A summary line is appended to the metrics file once all epochs are done.

    NOTE(review): the loop calls ``model(data_batch)`` and expects
    ``(phoneme, input_lengths)`` back, which does not match
    ``BaselineLstm.forward(text, text_lengths)`` - it looks carried over from a
    different (CTC) task; confirm before using it with that model.

    Relies on names that are not defined in the visible part of this file:
    ``config``, ``paths``, ``CTCCriterion``, ``print_file_and_screen``,
    ``test_validation``, ``run_eval`` and ``save_checkpoint``.

    Args:
        model: module to train; moved to the GPU when CUDA is available.
        optimizer: optimizer stepping the model's parameters.
        train_dataloader: iterable yielding ``(data_batch, label_batch)`` pairs.
        valid_dataloader: loader passed to ``test_validation`` and ``run_eval``.
        best_epoch: epoch of the best validation loss seen so far.
        best_vali_loss: best validation loss seen so far.
        start_epoch: first epoch index to run; ``None`` means 0.

    Returns:
        None.
    """
    import time  # local import: used for the per-epoch timing below

    start_epoch = 0 if start_epoch is None else start_epoch
    max_epoch = config.max_epoch

    model = model.cuda() if torch.cuda.is_available() else model

    ctc = CTCCriterion(size_average=True)
    # ctc = nn.CTCLoss()
    metrics_path = os.path.join(paths.output_path, 'metrics.txt')
    for epoch in range(start_epoch, max_epoch+1):
        start_time = time.time()
        model.train()
        # outputs records; the context manager closes the file even if the epoch fails
        with open(metrics_path, 'a') as f:
            print_file_and_screen('### Epoch %5d' % (epoch), f=f)

            avg_loss = 0
            for batch, (data_batch, label_batch) in enumerate(train_dataloader): # lists, presorted, preloaded on GPU
                optimizer.zero_grad()
                phoneme, input_lengths = model(data_batch)
                target_lengths = torch.tensor([len(seq_labels) for seq_labels in label_batch])
                loss = ctc.forward((phoneme, input_lengths, target_lengths), torch.cat(label_batch))

                loss.backward()
                optimizer.step()
                avg_loss += loss.item()
                if batch%50 == 49:
                    # running average over the last 50 batches
                    print_file_and_screen('Epoch: {}\tBatch: {}\tAvg-Loss: {:.4f}'.format(epoch, batch+1, avg_loss/50), f = f)
                    avg_loss = 0.0
                # clear memory: drop the references first so the cache can actually
                # be released. The batches are treated as lists above (len() per
                # element, torch.cat), and lists have no .detach(), so none is called.
                del data_batch
                del label_batch
                del loss
                torch.cuda.empty_cache()

            train_loss = test_validation(model, train_dataloader)
            val_loss = test_validation(model, valid_dataloader)
            print_file_and_screen('Train Loss: {:.4f}\tVal Loss: {:.4f}\t'.format(train_loss, val_loss), f=f)

            # check whether the best
            is_best = val_loss < best_vali_loss
            if is_best:
                best_vali_loss = val_loss
                best_epoch = epoch

            with torch.no_grad():
                avg_ldistance = run_eval(model, valid_dataloader)
            print_file_and_screen('vali_distance: {:.4f}\t'.format(avg_ldistance), f=f)

            save_checkpoint({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'val_loss': val_loss,
                'best_vali_loss': best_vali_loss,
                'best_epoch': best_epoch,
                'optimizer_label_state_dict' : optimizer.state_dict()
            }, is_best, paths.output_path, filename='4bilstm_adam_'+str(epoch)+'.pth.tar')

            end_time = time.time()
            print_file_and_screen('Epoch time used: ', end_time - start_time, 's', f=f)

    # print summary to the file; the tracked value is a loss, so label it as such
    # and keep its fractional part instead of truncating it with an integer format
    with open(metrics_path, 'a') as f:
        print_file_and_screen('Summary:', f=f)
        print_file_and_screen('- Best Epoch: %1d | - Best Val Loss: %.4f'%(best_epoch, best_vali_loss), f=f)
