In [37]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [38]:
from google.colab import drive
drive.mount('/content/drive')

data_path = '/content/drive/MyDrive/University/NLP/Final-Project/data'
# data_path = '/content/drive/MyDrive/data'

train_data_path = f'{data_path}/train'
true_train_path = f'{train_data_path}/true.csv'
false_train_path = f'{train_data_path}/false.csv'

test_data_path = f'{data_path}/test'
true_test_path = f'{test_data_path}/true.csv'
false_test_path = f'{test_data_path}/false.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [39]:
# Pytorch's nn module has lots of useful feature
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from nltk.tokenize import sent_tokenize, word_tokenize
from torch.nn.utils.rnn import pad_sequence
import csv

def pad_batched_sequence(batch):
    tweets = []
    tweets_lengths = []
    labels = []
    for (tweet, label) in batch:
      tweets.append(torch.tensor(tweet).cuda())
      tweets_lengths.append(len(tweet))
      labels.append(label)

    tweets = pad_sequence(tweets, padding_value=0, batch_first=True).cuda()
    tweets_lengths = torch.tensor(tweets_lengths).cuda()
    labels = torch.tensor(labels).cuda()
    return tweets, tweets_lengths, labels

word_to_idx = {
  '<pad>': 0,
  '<start>': 1,
  '<stop>': 2
}
class SpellCheckingDataset(Dataset):

    def __init__(self, data_paths, labels, batch_size=32):
        self.dataset = []
        
        idx = 3
        for i in range(len(data_paths)):
          data_path = data_paths[i]
          with open(data_path, 'r', encoding='utf-8') as file:
              data = csv.reader(file)
              for item in data:
                tokenized_tweet = ['<start>'] + word_tokenize(item[0]) + ['<stop>']
                for word in tokenized_tweet:
                  if (word not in word_to_idx):
                    word_to_idx[word] = idx
                    idx += 1
                self.dataset.append((tokenized_tweet, labels[i]))
        self.batch_size = batch_size
        self.vocab_size = len(word_to_idx)

    def __getitem__(self, idx):
        return [word_to_idx[w] for w in self.dataset[idx][0]], self.dataset[idx][1]

    def __len__(self):
      return len(self.dataset)

class LSTMNet(nn.Module):
    
    def __init__(self,vocab_size,embedding_dim,hidden_dim,output_dim,n_layers,bidirectional,dropout):
        
        super(LSTMNet,self).__init__()
        
        # Embedding layer converts integer sequences to vector sequences
        self.embedding = nn.Embedding(vocab_size,embedding_dim)
        
        # LSTM layer process the vector sequences 
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers = n_layers,
                            bidirectional = bidirectional,
                            dropout = dropout,
                            batch_first = True
                           )
        
        # Dense layer to predict 
        self.fc = nn.Linear(hidden_dim * 2,output_dim)
        # Prediction activation function
        self.sigmoid = nn.Sigmoid()
        
    
    def forward(self,text,text_lengths):
        embedded = self.embedding(text)
        
        # Thanks to packing, LSTM don't see padding tokens 
        # and this makes our model better
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.cpu(),batch_first=True, enforce_sorted=False)
        
        packed_output,(hidden_state,cell_state) = self.lstm(packed_embedded)
        
        # Concatenating the final forward and backward hidden states
        hidden = torch.cat((hidden_state[-2,:,:], hidden_state[-1,:,:]), dim = 1)
        
        dense_outputs=self.fc(hidden)

        #Final activation function
        outputs=self.sigmoid(dense_outputs)
        
        return outputs
    

In [40]:

true_label = 1
false_label = 0

train_dataset = SpellCheckingDataset([true_train_path, false_train_path], 
                                          [true_label, false_label])

test_dataset = SpellCheckingDataset([true_test_path, false_test_path], 
                                          [true_label, false_label])

SIZE_OF_VOCAB = train_dataset.vocab_size
EMBEDDING_DIM = 100
NUM_HIDDEN_NODES = 64
NUM_OUTPUT_NODES = 1
NUM_LAYERS = 2
BIDIRECTION = True
DROPOUT = 0.2

In [41]:
model = LSTMNet(SIZE_OF_VOCAB,
                EMBEDDING_DIM,
                NUM_HIDDEN_NODES,
                NUM_OUTPUT_NODES,
                NUM_LAYERS,
                BIDIRECTION,
                DROPOUT
               )



In [42]:
import torch.optim as optim
model = model.cuda()
optimizer = optim.Adam(model.parameters(),lr=1e-4)
criterion = nn.BCELoss()
criterion = criterion.cuda()

In [43]:
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    rounded_preds = torch.round(preds)
    
    correct = (rounded_preds == y).float() 
    acc = correct.sum() / len(correct)
    return acc

In [44]:
def train(model,iterator,optimizer,criterion):
    
    epoch_loss = 0.0
    epoch_acc = 0.0
    
    model.train()
    
    for batch in iterator:
        
        # cleaning the cache of optimizer
        optimizer.zero_grad()
        
        text,text_lengths, labels = batch
        
        # forward propagation and squeezing
        predictions = model(text,text_lengths).squeeze()
        
        # computing loss / backward propagation
        loss = criterion(predictions.double(),labels.double())
        loss.backward()
        
        # accuracy
        acc = binary_accuracy(predictions,labels)
        
        # updating params
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    # It'll return the means of loss and accuracy
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
        

In [45]:
def evaluate(model,iterator,criterion):
    
    epoch_loss = 0.0
    epoch_acc = 0.0
    
    # deactivate the dropouts
    model.eval()
    
    # Sets require_grad flat False
    with torch.no_grad():
        for batch in iterator:
            text,text_lengths, labels = batch
            
            predictions = model(text,text_lengths).squeeze()
              
            #compute loss and accuracy
            loss = criterion(predictions.double(), labels.double())
            acc = binary_accuracy(predictions, labels)
            
            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [46]:
from torch.utils.data.dataloader import DataLoader

EPOCH_NUMBER = 15
train_loader = DataLoader(train_dataset, 
                          batch_size=32, 
                          shuffle=True,
                          drop_last=True, 
                          collate_fn=pad_batched_sequence)

test_loader = DataLoader(test_dataset, 
                          batch_size=32, 
                          shuffle=True,
                          drop_last=True, 
                          collate_fn=pad_batched_sequence)

for epoch in range(1,EPOCH_NUMBER+1):
    
    train_loss,train_acc = train(model,train_loader,optimizer,criterion)
    
    valid_loss,valid_acc = evaluate(model,test_loader,criterion)
    
    # Showing statistics
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    print()

	Train Loss: 0.692 | Train Acc: 52.11%
	 Val. Loss: 0.692 |  Val. Acc: 51.88%

	Train Loss: 0.669 | Train Acc: 58.39%
	 Val. Loss: 0.698 |  Val. Acc: 53.89%

	Train Loss: 0.632 | Train Acc: 63.48%
	 Val. Loss: 0.698 |  Val. Acc: 55.10%

	Train Loss: 0.589 | Train Acc: 68.44%
	 Val. Loss: 0.714 |  Val. Acc: 56.64%

	Train Loss: 0.542 | Train Acc: 72.23%
	 Val. Loss: 0.740 |  Val. Acc: 57.96%

	Train Loss: 0.497 | Train Acc: 75.29%
	 Val. Loss: 0.743 |  Val. Acc: 59.78%

	Train Loss: 0.457 | Train Acc: 78.12%
	 Val. Loss: 0.779 |  Val. Acc: 59.41%

	Train Loss: 0.414 | Train Acc: 80.82%
	 Val. Loss: 0.835 |  Val. Acc: 59.97%

	Train Loss: 0.380 | Train Acc: 82.61%
	 Val. Loss: 0.869 |  Val. Acc: 60.98%

	Train Loss: 0.344 | Train Acc: 84.58%
	 Val. Loss: 0.956 |  Val. Acc: 60.26%

	Train Loss: 0.310 | Train Acc: 86.54%
	 Val. Loss: 0.977 |  Val. Acc: 62.02%

	Train Loss: 0.278 | Train Acc: 88.12%
	 Val. Loss: 1.016 |  Val. Acc: 61.99%

	Train Loss: 0.249 | Train Acc: 89.44%
	 Val. Loss: 