The link to this challenge: https://www.kaggle.com/c/word2vec-nlp-tutorial/overview

Things to do to improve the model:
    - Preprocess the data (remove common stop words such as "the", "a", ...)
    - Increase the vocabulary size

In [1]:
import csv
import os
import random

import pandas as pd
import torch
import torch.nn as nn
import torchtext
import torchtext.data as data
from IPython.display import Image

In [2]:
# Select the compute device: the first CUDA GPU when one is available, otherwise the CPU.
# The device string has to be exactly "cuda:0"; whitespace ("cuda: 0") is not part of
# the documented device-string format and stricter PyTorch versions reject it.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [3]:
# Path of the labelled training TSV file (despite the name, this is a file, not a directory).
DATA_DIR = "data/labeledTrainData.tsv"

In [6]:
# Field definitions:
#   TEXT  - review text, tokenized with spaCy; include_lengths=True makes each batch
#           carry the sequence lengths alongside the token ids.
#   LABEL - sentiment label, stored as a float tensor.
TEXT = data.Field(
    tokenize="spacy",
    include_lengths=True,
)
LABEL = data.LabelField(dtype=torch.float)

In [7]:
# Map the table columns, in order, to fields: the first column is ignored,
# the second is parsed as the label and the third as the text.
fields = [
    (None, None),
    ("label", LABEL),
    ("text", TEXT),
]

In [8]:
# Read the tab-separated training file into a torchtext dataset, skipping the header row.
dataset = data.TabularDataset(
    path=DATA_DIR,
    format="tsv",
    fields=fields,
    skip_header=True,
)

In [9]:
# Split the dataset into a training set (90%) and a validation set (10%).
# The split is seeded so that it is the same on every run: the vocabulary is built
# from the training set, so a different split changes the token -> index mapping and
# the embedding rows of a previously saved checkpoint would no longer line up.
SPLIT_SEED = 1234
random.seed(SPLIT_SEED)
trainset, valset = dataset.split(split_ratio=0.9, random_state=random.getstate())
print(f"Number of training samples: {len(trainset)}")
print(f"Number of validating samples: {len(valset)}")

Number of training samples: 22500
Number of validating samples: 2500


In [10]:
# Check if data have been read properly
#print(vars(trainset.examples[0]))

In [11]:
# Build the vocabulary from the training set only, keeping at most MAX_VOCAB_SIZE
# tokens. Each token is paired with its pretrained GloVe vector ("glove.6B.100d");
# unk_init (torch.Tensor.normal_) initialises tokens that have no pretrained vector.
MAX_VOCAB_SIZE = 20000
TEXT.build_vocab(
    trainset,
    max_size=MAX_VOCAB_SIZE,
    vectors="glove.6B.100d",
    unk_init=torch.Tensor.normal_,
)

LABEL.build_vocab(trainset)

In [12]:
# The vocabulary automatically adds the <unk> and <pad> tokens, so the TEXT
# vocabulary ends up two entries larger than MAX_VOCAB_SIZE.
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 20002
Unique tokens in LABEL vocabulary: 2


In [13]:
# Show 10 most common words on the training vocabulary
#print(TEXT.vocab.freqs.most_common(10))

In [14]:
BATCH_SIZE = 64
# Wrap both splits in bucketing iterators. Examples are sorted by text length
# inside every batch, which the packed-sequence step of the model relies on.
train_iter, valid_iter = data.BucketIterator.splits(
    (trainset, valset),
    batch_size=BATCH_SIZE,
    sort_key=lambda example: len(example.text),
    sort_within_batch=True,
    device=device,
)

![architecture.png](attachment:architecture.png)

In [15]:
class RNN(nn.Module):
    """LSTM text classifier: embedding -> (bi)LSTM -> dropout -> linear layer.

    Args:
        vocab_size: number of rows of the embedding table.
        embed_dim: size of each embedding vector.
        hidden_dim: size of the LSTM hidden state (per direction).
        output_dim: number of outputs of the final linear layer.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability, used on the embeddings, between stacked
            LSTM layers and on the final hidden state.
        pad_idx: index of the padding token in the embedding table.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim,
                 n_layers, bidirectional, dropout, pad_idx):
        super(RNN, self).__init__()
        # Remembered so that forward() knows how to read the final hidden state.
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embed_dim,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            # nn.LSTM only applies dropout between stacked layers;
                            # passing it with a single layer just emits a warning.
                            dropout=dropout if n_layers > 1 else 0.0)
        # A bidirectional LSTM yields one final hidden state per direction, so the
        # classifier input is twice as wide in that case.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Compute one raw (pre-sigmoid) score vector per sequence.

        Args:
            text: token ids of shape (seq_len, batch), padded with ``pad_idx``.
            text_lengths: true length of every sequence in the batch, sorted in
                decreasing order (required by ``pack_padded_sequence``).

        Returns:
            Tensor of shape (batch, output_dim).
        """
        embedded = self.dropout(self.embedding(text))
        # Packing lets the LSTM skip the padded positions. The lengths have to live
        # on the CPU even when the batch itself is on the GPU.
        lengths = torch.as_tensor(text_lengths).cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths)
        # The LSTM returns the packed outputs plus the final hidden and cell states;
        # only the final hidden state is needed here.
        _, (hidden, _) = self.lstm(packed_embedded)
        if self.bidirectional:
            # The last two entries are the top layer's forward and backward states.
            last_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            last_hidden = hidden[-1, :, :]
        return self.fc(self.dropout(last_hidden))

In [16]:
# Model hyperparameters.
VOCAB_SIZE = len(TEXT.vocab)
EMBED_DIM = 100
HIDDEN_DIM = 100
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
# Index of the <pad> token, handed to the embedding layer as its padding index.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = RNN(
    vocab_size=VOCAB_SIZE,
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)
print(model)

RNN(
  (embedding): Embedding(20002, 100)
  (lstm): LSTM(100, 100, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=200, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [17]:
def count_parameters(model):
    """Return the total number of elements over all trainable parameters of ``model``."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad=False) are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable parameter count, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,403,601 trainable parameters


In [18]:
# Initialise the embedding layer with the pretrained vectors attached to the vocabulary.
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

# Zero the embedding rows of the <unk> and <pad> tokens so that they start out
# carrying no information.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
for special_idx in (UNK_IDX, PAD_IDX):
    model.embedding.weight.data[special_idx] = torch.zeros(EMBED_DIM)

In [19]:
# Move the model to the target device, then set up the loss and the optimizer.
# BCEWithLogitsLoss takes raw scores, so the model itself applies no sigmoid.
model = model.to(device)
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

In [20]:
def binary_accuracy(predictions, y):
    """Return the fraction of predictions that match the binary targets.

    Args:
        predictions: raw (pre-sigmoid) scores.
        y: targets (0.0 or 1.0) with the same shape as ``predictions``.

    Returns:
        A 0-d tensor holding the accuracy in [0, 1].
    """
    # Squash the scores to probabilities and round them to a hard 0/1 decision.
    rounded_preds = torch.round(torch.sigmoid(predictions))
    correct = (rounded_preds == y).float().sum()
    # numel() instead of len(): len() raises on a 0-d tensor, which is what a
    # single-example batch turns into after an unqualified squeeze().
    acc = correct / predictions.numel()
    return acc

In [21]:
from tqdm import tqdm
def train_step(model, train_iter, optimizer, criterion):
    """Train the model for one pass over ``train_iter``.

    Args:
        model: network called as ``model(text, text_lengths)``.
        train_iter: iterator of batches exposing ``batch.text`` (a
            ``(text, text_lengths)`` pair) and ``batch.label``.
        optimizer: optimizer updating the model's parameters.
        criterion: loss taking ``(predictions, labels)``.

    Returns:
        ``(mean batch loss, mean batch accuracy)`` over the pass.
    """
    total_loss = 0
    total_acc = 0
    model.train()
    for batch in tqdm(train_iter):
        optimizer.zero_grad()
        text, text_lengths = batch.text
        # squeeze(1) drops only the output dimension. A bare squeeze() would also
        # drop the batch dimension of a single-example batch, and the loss would
        # then fail on the shape mismatch with batch.label.
        predictions = model(text, text_lengths).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        total_acc += acc.item()
    return total_loss / len(train_iter), total_acc / len(train_iter)

In [22]:
def val_step(model, iterator, criterion):
    """Evaluate the model on ``iterator`` without updating any parameter.

    Args:
        model: network called as ``model(text, text_lengths)``.
        iterator: iterator of batches exposing ``batch.text`` (a
            ``(text, text_lengths)`` pair) and ``batch.label``.
        criterion: loss taking ``(predictions, labels)``.

    Returns:
        ``(mean batch loss, mean batch accuracy)`` over the iterator.
    """
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.text
            # squeeze(1) drops only the output dimension. A bare squeeze() would
            # also drop the batch dimension of a single-example batch, and the
            # loss would then fail on the shape mismatch with batch.label.
            predictions = model(text, text_lengths).squeeze(1)
            loss = criterion(predictions, batch.label)

            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [23]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)``, both truncated to integers.
    """
    duration = end_time - start_time
    minutes = int(duration / 60)
    seconds = int(duration - 60 * minutes)
    return minutes, seconds

In [24]:
def train(model, optimizer, criterion, train_iter, val_iter,
          epochs=10, save_path='model/model.pt'):
    """Train ``model`` and checkpoint it whenever the validation accuracy improves.

    Args:
        model: network to train.
        optimizer: optimizer updating the model's parameters.
        criterion: loss function.
        train_iter: iterator over the training batches.
        val_iter: iterator over the validation batches.
        epochs: number of passes over the training data.
        save_path: file that receives the ``state_dict`` of the best model so far.

    Returns:
        The model in its state after the last epoch. This is not necessarily the
        best checkpoint; the best one is the file at ``save_path``.
    """
    best_valid_acc = 0
    # Make sure the checkpoint directory exists; torch.save does not create it.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    print("Start training...")
    for epoch in range(epochs):
        start_time = time.time()
        train_loss, train_acc = train_step(model, train_iter, optimizer, criterion)
        val_loss, val_acc = val_step(model, val_iter, criterion)
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        # Keep only the weights with the best validation accuracy seen so far.
        if val_acc > best_valid_acc:
            best_valid_acc = val_acc
            torch.save(model.state_dict(), save_path)
            print("Model saved!")
        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {val_loss:.3f} |  Val. Acc: {val_acc*100:.2f}%')
    print("Training completed.")
    return model

In [25]:
# Switch: set to a truthy value to train the model in this run. When it is falsy,
# training is skipped and saved weights are loaded in a later cell instead.
TRAIN_DATA = 0
if TRAIN_DATA:
    model = train(model,optimizer,criterion,train_iter, valid_iter)

In [26]:
import spacy
# NOTE(review): the "en" shortcut name is presumably only resolvable on older spaCy
# releases; newer ones need the full package name (e.g. "en_core_web_sm") - confirm
# against the installed version.
nlp = spacy.load("en")
# Predict a sentence
def predict_sentence(model, sentence):
    """Return the model's sigmoid output (a value in [0, 1]) for a raw sentence.

    Args:
        model: trained network called as ``model(text, text_lengths)``.
        sentence: raw text to classify.

    Returns:
        The prediction as a Python float.

    Raises:
        ValueError: if the sentence contains no token.
    """
    model.eval()
    # Tokenize with the spaCy tokenizer.
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if not tokenized:
        # A zero-length sequence cannot be packed; fail with a clear message.
        raise ValueError("sentence must contain at least one token")

    # Convert the tokens into vocabulary indices.
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]

    # Shape the ids as (seq_len, 1): a batch holding a single sequence.
    tensor = torch.LongTensor(indexed).unsqueeze(1).to(device)
    # The length stays on the CPU.
    length_tensor = torch.LongTensor([len(indexed)])

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()

In [28]:
# When training is skipped, restore previously saved weights. map_location remaps the
# stored tensors onto the current device, so a checkpoint written on a GPU machine
# also loads on a CPU-only one.
if not TRAIN_DATA:
    model.load_state_dict(torch.load("model/model 89,30.pt", map_location=device))

In [29]:
# Path of the unlabelled test TSV file.
TEST_DIR = "data/testData.tsv"
# The ids are read through their own field so that every prediction can be traced
# back to its row; the text reuses the TEXT field and its existing vocabulary.
ID = data.LabelField(sequential=False)
test_fields = [("id", ID), ("text", TEXT)]
testset = data.TabularDataset(
    path=TEST_DIR,
    format="tsv",
    fields=test_fields,
    skip_header=True,
)

In [30]:
# Build the id vocabulary from the test set; predict_test uses ID.vocab.itos to turn
# the numericalized ids of a batch back into the original id strings.
ID.build_vocab(testset)

In [31]:
# Batch the test set the same way as the training data: examples are sorted by text
# length inside every batch.
test_iter = data.BucketIterator(
    testset,
    batch_size=BATCH_SIZE,
    sort_key=lambda example: len(example.text),
    sort_within_batch=True,
    device=device,
)

In [36]:
def predict_test(model, test_iter, threshold=0.6):
    """Predict a binary label for every example served by ``test_iter``.

    Args:
        model: trained network called as ``model(text, text_lengths)``.
        test_iter: iterator of batches exposing ``batch.text`` (a
            ``(text, text_lengths)`` pair) and ``batch.id``.
        threshold: minimum sigmoid output for an example to be labelled positive.

    Returns:
        A list of ``[id string, boolean label]`` pairs, in iteration order.
    """
    model.eval()
    result = []
    with torch.no_grad():
        for batch in tqdm(test_iter):
            text, text_lengths = batch.text
            # squeeze(1) drops only the output dimension. A bare squeeze() would
            # turn a single-example batch into a 0-d tensor, which cannot be
            # zipped with the ids below.
            output = model(text, text_lengths).squeeze(1)
            predictions = torch.sigmoid(output)
            labels = predictions >= threshold
            ids_list = batch.id.cpu().numpy()
            labels_list = labels.cpu().numpy()
            for i, j in zip(ids_list, labels_list):
                # Map the numericalized id back to its original string.
                result.append([ID.vocab.itos[i], j])
    return result

In [37]:
# Run inference over the whole test set; each entry of result is an [id, label] pair.
result = predict_test(model, test_iter)

100%|████████████████████████████████████████████████████████████████████████████████| 391/391 [00:08<00:00, 48.78it/s]


In [38]:
# Write the predictions to the submission file: a header row followed by one
# "id,sentiment" row per test example, with the label stored as 0 or 1.
with open("data/submission.csv", "w", newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["id", "sentiment"])
    for review_id, label in result:
        writer.writerow([str(review_id), int(label)])

In [35]:
# Quick sanity check: print the model's sigmoid output for a hand-written sentence.
print(predict_sentence(model,"This film sucks"))

0.6940540671348572
