In order to classify sentences with LSTM, I am going to use Many-to-one architecture from LSTM modules.

In [1]:
CUDA_LAUNCH_BLOCKING=1

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.legacy import data,datasets
import torchtext.legacy
import torch.optim as optim
import time

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Definition of Tokenizer

In [3]:
def tokenizer(text):
  text = text.split(" ")
  return text

TEXT and LABEL creation

In [4]:
# In this part, basic tokenizer ais used and stop words are removed.
TEXT = data.Field(tokenize = tokenizer, batch_first= True, stop_words=stopwords.words('english'))
LABEL = data.LabelField(dtype = torch.float)

data is splitted to test and train sets

In [5]:
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:06<00:00, 13.6MB/s]


Maximum and average sentence lengths are found

In [6]:
max_size=0  ## this part of the code find maximum length of the network
count=0
sum= 0
for i in  range(len(train_data)):
  if max_size < len(train_data[i].text):
    max_size =len(train_data[i].text)
    print(max_size)
  count +=1
  sum +=len(train_data[i].text)
print("avarage: ", sum/count)

202
211
272
288
299
428
477
522
592
642
1001
1035
1532
avarage:  137.90252


#Splitting data as train,valid, test data

In [7]:
TEXT = torchtext.legacy.data.Field(tokenize=tokenizer, batch_first=True,fix_length= 137 # it is filled with average size 
                            , stop_words=stopwords.words('english')) # preprocessing parameters can be used to add aditional  preprocessing steps
LABEL = torchtext.legacy.data.LabelField(dtype = torch.float)
train_data, test_data = torchtext.legacy.datasets.IMDB.splits(TEXT, LABEL) 
test_data, valid_data = test_data.split()

The following cell is taken from Lab5

In [8]:
# Build vocabularies
TEXT.build_vocab(train_data, 
                 max_size = 30000, # Select only the most important 30000 words
                 # Load pretrained embeddings
                 vectors = "glove.6B.100d", 
                 # Set unknown vectors
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [02:43, 5.28MB/s]                           
100%|█████████▉| 399692/400000 [00:21<00:00, 19280.78it/s]

# Creating Iterators

In [9]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

In [10]:
input_dim = len(TEXT.vocab)
embedding_dim = 100 # in order to create a emb vector with dimension 100
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
unk_idx = TEXT.vocab.stoi[TEXT.unk_token]

# LSTM Network

In [11]:
# In this part, 1-layer LSTM network is created. Before LSTM, embedding layer is used
# with dimension 100. At the output of LSTM, there is a dropout layer with prob 0.5
# Finally there is a linear layer, at the output

class LSTMNetwork(nn.Module):

  def __init__(self,input_dim,embedding_dim,pad_idx):
    super().__init__()
    self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx = pad_idx)
    self.lstm = nn.LSTM(input_size =embedding_dim, hidden_size= 100, batch_first = True)
    self.dropout = nn.Dropout(0.5)
    self.linear = nn.Linear(100,1)

  def forward(self,input_ids):
        x = self.embedding(input_ids)
        lstm_out, (ht, ct) = self.lstm(x)
        ht = self.dropout(ht)
        out = self.linear(ht[-1])
        return out 

In [12]:
# Model is created and weigths are copied 
# Finally model is transferred to GPU
model = LSTMNetwork(input_dim, embedding_dim, pad_idx)
model.embedding.weight.data.copy_(TEXT.vocab.vectors)

model.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim)
model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)
model = model.to(device)

# Optimizer and Loss function

In [13]:
optimizer = optim.Adam(model.parameters(), lr = 0.001)
criterion = nn.BCEWithLogitsLoss()
model = model.to(device)
criterion = criterion.to(device)

Following parts are taken from IS784 - Lab5

In [14]:
# Helper functions
def accuracy(preds, y):
    """ Return accuracy per batch. """
    correct = (torch.round(torch.sigmoid(preds)) == y).float() 
    return correct.sum() / len(correct)

def epoch_time(start_time, end_time):
    '''Track training time. '''
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs
  
def train(model, iterator, optimizer, criterion):
    '''Train the model with specified data, optimizer, and loss function. '''
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        # Reset the gradient to not use them in multiple passes 
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = accuracy(predictions, batch.label)
        
        # Backprop
        loss.backward()
        
        # Optimize the weights
        optimizer.step()
        
        # Record accuracy and loss
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def evaluate(model, iterator, criterion):
    '''Evaluate model performance. '''
    epoch_loss = 0
    epoch_acc = 0
    
    # Turm off dropout while evaluating
    model.eval()
    
    # No need to backprop in eval
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            
            acc = accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [15]:
# Training loop
N_EPOCHS = 1

best_valid_loss = float('inf')
val_loss = []
val_acc = []
tr_loss = []
tr_acc = []

for epoch in range(N_EPOCHS):
    
    # Calculate training time
    start_time = time.time()
    # Get epoch losses and accuracies 
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Save training metrics
    val_loss.append(valid_loss)
    val_acc.append(valid_acc)
    tr_loss.append(train_loss)
    tr_acc.append(train_acc)
    
    print(f'Epoch: {epoch+1:2} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

100%|█████████▉| 399692/400000 [00:40<00:00, 19280.78it/s]

Epoch:  1 | Epoch Time: 0m 16s
	Train Loss: 0.690 | Train Acc: 51.93%
	 Val. Loss: 0.691 |  Val. Acc: 51.92%


In [16]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)

In [17]:
print(f"Test accuracy for 1Epoch is {test_acc}")

Test accuracy for 1Epoch is 0.5178408498746635


In [18]:
N_EPOCHS = 9
for epoch in range(N_EPOCHS):
    
    # Calculate training time
    start_time = time.time()
    # Get epoch losses and accuracies 
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Save training metrics
    val_loss.append(valid_loss)
    val_acc.append(valid_acc)
    tr_loss.append(train_loss)
    tr_acc.append(train_acc)
    
    print(f'Epoch: {epoch+1:2} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch:  1 | Epoch Time: 0m 16s
	Train Loss: 0.680 | Train Acc: 56.19%
	 Val. Loss: 0.692 |  Val. Acc: 51.37%
Epoch:  2 | Epoch Time: 0m 14s
	Train Loss: 0.667 | Train Acc: 57.27%
	 Val. Loss: 0.690 |  Val. Acc: 51.96%
Epoch:  3 | Epoch Time: 0m 14s
	Train Loss: 0.654 | Train Acc: 60.90%
	 Val. Loss: 0.684 |  Val. Acc: 54.27%
Epoch:  4 | Epoch Time: 0m 14s
	Train Loss: 0.648 | Train Acc: 61.09%
	 Val. Loss: 0.700 |  Val. Acc: 49.95%
Epoch:  5 | Epoch Time: 0m 12s
	Train Loss: 0.674 | Train Acc: 56.36%
	 Val. Loss: 0.684 |  Val. Acc: 53.77%
Epoch:  6 | Epoch Time: 0m 12s
	Train Loss: 0.611 | Train Acc: 67.56%
	 Val. Loss: 0.714 |  Val. Acc: 50.81%
Epoch:  7 | Epoch Time: 0m 12s
	Train Loss: 0.667 | Train Acc: 56.86%
	 Val. Loss: 0.686 |  Val. Acc: 53.20%
Epoch:  8 | Epoch Time: 0m 12s
	Train Loss: 0.627 | Train Acc: 62.40%
	 Val. Loss: 0.730 |  Val. Acc: 55.27%
Epoch:  9 | Epoch Time: 0m 12s
	Train Loss: 0.501 | Train Acc: 77.56%
	 Val. Loss: 0.640 |  Val. Acc: 65.91%


In [19]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)

In [20]:
print(f"Test accuracy for 10 Epoch is {test_acc}")

Test accuracy for 10 Epoch is 0.6628894031482891
