 - [ ] try bigram features
 - [ ] use pretrained embeddings

In [1]:
import pandas as pd
import os
import numpy as np

In [4]:
input_path = '.'

# Each corpus file is read whole, stripped of surrounding whitespace,
# and split on newlines into a list with one entry per line.
dev_text_path = os.path.join(input_path, 'dev_text.txt')
with open(dev_text_path, mode='r', encoding='utf-8') as f:
    dev_text = f.read().strip().split('\n')

heldout_text_path = os.path.join(input_path, 'heldout_text.txt')
with open(heldout_text_path, mode='r', encoding='utf-8') as f:
    heldout_text = f.read().strip().split('\n')

In [3]:
# Read one label per line. The content is stripped before splitting (same as
# the text loaders) so a trailing newline does not produce a spurious empty
# label, which would make the label list longer than the text list.
dev_label_path = os.path.join(input_path, 'dev_label.txt')
with open(dev_label_path, 'r', encoding='utf-8') as f:
    dev_y = f.read().strip().split('\n')

# Prepare data

In [11]:
# Pair every dev text with its label and export the table as a
# tab-separated file (header row included, index column omitted).
dev_columns = {'text': dev_text, 'label': dev_y}
dev_data = pd.DataFrame(dev_columns)

dev_tsv_path = os.path.join(input_path, 'dev_data.tsv')
dev_data.to_csv(dev_tsv_path, sep='\t', index=False)

In [17]:
# Only the text column is exported for the held-out set, again tab-separated.
test_data = pd.DataFrame(data={'text': heldout_text})
test_tsv_path = os.path.join(input_path, 'test_data.tsv')
test_data.to_csv(test_tsv_path, sep='\t', index=False)

# Load data

In [46]:
import torch
from torchtext import data
import random
from torchtext.data import TabularDataset
import torch.nn as nn



SEED = 1234

torch.manual_seed(SEED)
# torch.backends.cudnn.deterministic = True

# include_lengths=True makes each text batch a (token_ids, lengths) pair,
# which the model needs for packing padded sequences.
TEXT = data.Field(tokenize = 'spacy', include_lengths = True)
# dtype must be a torch dtype: the field builds its batch tensor with
# torch.tensor(..., dtype=dtype), so the previous dtype=str raised at batching time.
# NOTE(review): torch.float assumes a single-logit binary setup (OUTPUT_DIM = 1
# with a BCE-style loss); switch to torch.long for CrossEntropyLoss.
LABEL = data.LabelField(dtype = torch.float)

In [20]:
# Labelled dev set: columns are (text, label), matching the header written to dev_data.tsv.
dev_datafields = [("text", TEXT), ("label", LABEL)]
dev_dataset = TabularDataset(
               path=os.path.join(input_path, 'dev_data.tsv'),
               format='tsv',
               skip_header=True,
               fields=dev_datafields)

# Held-out set: a single text column. The file is written tab-separated, so it
# must be parsed as 'tsv'; parsing it as 'csv' would split each line on commas
# and keep only the part of the text before the first comma.
test_datafields = [("text", TEXT)]
test_dataset = TabularDataset(
           path=os.path.join(input_path, 'test_data.tsv'),
           format='tsv',
           skip_header=True,
           fields=test_datafields)

In [27]:
# random.seed() returns None, so passing its result as random_state handed
# split() no state at all. Seed the global RNG first, then pass the resulting
# state explicitly (split() documents random_state as a random.getstate() value).
random.seed(SEED)
train_dataset, valid_dataset = dev_dataset.split(split_ratio=0.7, random_state=random.getstate())

In [62]:
# Both vocabularies are built from train_dataset only; the text vocabulary is
# capped at 40000 entries.
# Options left disabled for a later pretrained-embedding experiment:
#   vectors = "glove.6B.100d", unk_init = torch.Tensor.normal_
TEXT.build_vocab(train_dataset, max_size=40000)

LABEL.build_vocab(train_dataset)

In [70]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The text field is named "text" in every dataset's field list, so the sort key
# must read x.text; the previous x.comment_text raised AttributeError as soon as
# an iterator tried to sort.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_dataset, valid_dataset),
    batch_size = BATCH_SIZE,
    device = device,
    sort = True,
    sort_key = lambda x: len(x.text),
    sort_within_batch = True) #sort by length for padding
# NOTE(review): sort_within_batch reorders examples inside each test batch, so
# predictions presumably do not come out in file order — confirm before
# writing results back against heldout_text.
test_iterator = data.Iterator(
    test_dataset,
    batch_size = BATCH_SIZE,
    device = device,
    sort = False,
    sort_key = lambda x: len(x.text),
    sort_within_batch = True,
    repeat = False)

In [80]:
# Peek at the first five tokens of the first training example.
train_dataset[0].text[:5]

['A', 'very', 'well', 'made', 'film']

In [73]:
# TEXT.vocab.freqs.most_common(1)

In [74]:
# print(LABEL.vocab.stoi)

# Define model

In [55]:
class BaselineLstm(nn.Module):
    """Bidirectional, multi-layer LSTM classifier over padded token-index batches.

    The final forward and backward hidden states of the top LSTM layer are
    concatenated, passed through dropout and projected by a linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_size: dimensionality of each token embedding.
        hidden_size: LSTM hidden size per direction.
        output_dim: number of outputs of the final linear layer.
        nlayers: number of stacked LSTM layers.
        dropout: dropout probability used on the embeddings, between LSTM
            layers, and on the concatenated final hidden state.
        pad_idx: vocabulary index of the padding token.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_dim, nlayers, dropout, pad_idx):
        super(BaselineLstm, self).__init__()
        # input padding index to embedding to prevent training embedding for paddings
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx = pad_idx)
        self.lstm = nn.LSTM(embed_size,
                            hidden_size,
                            num_layers=nlayers,
                            bidirectional=True,
                            # inter-layer dropout is a no-op (and triggers a warning) with one layer
                            dropout=dropout if nlayers > 1 else 0)

        # forward + backward final states are concatenated, hence hidden_size * 2
        self.fc = nn.Linear(hidden_size * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Compute one output vector per sequence in the batch.

        Args:
            text: token indices of shape [sent len, batch size].
            text_lengths: unpadded length of each sequence (tensor or list),
                sorted in decreasing order as pack_padded_sequence requires
                by default.

        Returns:
            Tensor of shape [batch size, output_dim].
        """
        embedded = self.dropout(self.embedding(text)) # [sent len, batch size, emb dim]

        # Recent PyTorch versions require the lengths to live on the CPU even
        # when the batch itself is on the GPU; as_tensor also accepts plain lists.
        lengths = torch.as_tensor(text_lengths, dtype=torch.int64).cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths)
        _, (hidden, _) = self.lstm(packed_embedded)

        # hidden is ordered
        # [forward_layer_0, backward_layer_0, ..., forward_layer_n, backward_layer_n];
        # take the last layer's forward (-2) and backward (-1) states.
        hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim = 1))

        # hidden is already [batch size, hidden_size * 2]. Do not squeeze it:
        # squeeze(0) would drop the batch dimension for a batch of one.
        return self.fc(hidden)

In [None]:
# Hyperparameters for the baseline model.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
# BaselineLstm is always bidirectional and takes no such argument; this
# constant is kept for reference only and is not passed to the model.
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# The class defined in this notebook is BaselineLstm (there is no RNN), and its
# constructor takes seven arguments; keywords guard against positional mix-ups.
model = BaselineLstm(vocab_size=INPUT_DIM,
                     embed_size=EMBEDDING_DIM,
                     hidden_size=HIDDEN_DIM,
                     output_dim=OUTPUT_DIM,
                     nlayers=N_LAYERS,
                     dropout=DROPOUT,
                     pad_idx=PAD_IDX)