In [None]:
"""
This file trains the model and saves the final output:
1) read the processed files and vocabulary
2) train model
3) model inference
4) generate and persist submission csv
"""

In [15]:
# import statements
import pandas as pd

import torch

import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

import torchtext

In [10]:
def get_conf(path=None):
    """Build the configuration dictionary used by the whole pipeline.

    Parameters
    ----------
    path : str, optional
        Base directory that every relative path in the configuration is
        appended to (plain string concatenation, so it should end with a
        path separator). When omitted, the original hard-coded project
        directory is used, so existing ``get_conf()`` callers are unaffected.

    Returns
    -------
    dict
        Nested configuration with the sections:
        ``path`` (base directory), ``data`` (persisted vocabulary and
        dataset files), ``model`` (checkpoint prefix, batch sizes, number of
        epochs and validation interval) and ``op`` (submission CSV location).
    """
    # Machine-specific default kept for backward compatibility; pass `path`
    # to run the notebook from another checkout location.
    default_path = "/Users/jaydeepchakraborty/JC/git-projects/model_util/"

    conf = {
            "path": default_path if path is None else path,
            "data":{
                    "vocab_path": "DataSets/NLPwithDisasterTweets/disaster_tweets.pt",
                    "train_dataset": "DataSets/NLPwithDisasterTweets/train_dataset.pt",
                    "validation_dataset": "DataSets/NLPwithDisasterTweets/validation_dataset.pt",
                    "test_dataset": "DataSets/NLPwithDisasterTweets/test_dataset.pt",
            },
            "model":{
                # checkpoint file prefix; "<epoch>.pt" / "final.pt" is appended
                "model_path": "Models/NLPwithDisasterTweets/disaster_tweet_",
                "train_batch_size": 5,
                "validation_batch_size": 5,
                "test_batch_size": 1,
                "n_epoch": 100,
                # validation (and an intermediate checkpoint) every N epochs
                "valid_epoch": 10
            },
            "op":{
                "op_path": "DataSets/NLPwithDisasterTweets/submission.csv"
            }
    }

    return conf

In [11]:
class DisasterTweetsDataSet(torch.utils.data.Dataset):
    """Map-style dataset backed by a tweets CSV loaded into a DataFrame.

    Parameters
    ----------
    conf : dict
        Configuration dictionary. The CSV location is read from
        ``conf['path'] + conf['data']['train_data_path']`` for the training
        split and ``conf['path'] + conf['data']['test_data_path']`` for the
        test split, so those keys must be present in ``conf['data']``.
    ind : str
        Which split to load: ``"train"`` (expects a ``target`` column) or
        ``"test"`` (no ``target`` column).

    Raises
    ------
    ValueError
        If ``ind`` is neither ``"train"`` nor ``"test"``. Previously such a
        value silently produced an object without ``data`` that only failed
        later, inside ``__len__`` / ``__getitem__``.
    """

    # split name -> (key inside conf['data'], column dtypes enforced on load)
    _SPLITS = {
        "train": ("train_data_path",
                  {"id": 'int64', "keyword": 'string', "location": 'string', "text": 'string', "target": 'int64'}),
        "test": ("test_data_path",
                 {"id": 'int64', "keyword": 'string', "location": 'string', "text": 'string'}),
    }

    def __init__(self, conf, ind="train"):
        self.conf = conf
        if ind not in self._SPLITS:
            raise ValueError(
                "ind must be one of %s, got %r" % (sorted(self._SPLITS), ind)
            )

        path_key, dtypes = self._SPLITS[ind]
        self.data = pd.read_csv(self.conf['path'] + self.conf['data'][path_key])
        self.data = self.data.astype(dtypes)

    def __len__(self):
        """Number of rows in the loaded CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return row ``idx`` (positional) as a pandas Series."""
        return self.data.iloc[idx]

In [19]:
class DisasterTweetModel(nn.Module):
    """Bidirectional LSTM classifier that maps token-id sequences to a probability.

    Pipeline: embedding -> bidirectional LSTM -> sum of the per-step LSTM
    outputs over the sequence axis -> linear layer -> sigmoid.

    Parameters
    ----------
    vocab_size : int
        Number of rows of the embedding table.
    **kwargs
        Forwarded unchanged to ``nn.Module.__init__``.
    """

    def __init__(self, vocab_size, **kwargs):
        super().__init__(**kwargs)

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Hyperparameters (fixed for this model).
        self.embedding_dim = 300
        self.hidden_dim = 100
        self.num_layers = 1
        self.bidirectional = True
        self.batch_first = True
        self.output_dim = 1

        # Token id -> dense vector lookup table.
        self.embedding = nn.Embedding(vocab_size, self.embedding_dim)

        # Sequence encoder; batch_first means inputs are [batch, seq, feature].
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_dim,
            num_layers=self.num_layers,
            bidirectional=self.bidirectional,
            batch_first=self.batch_first,
        )

        # Both LSTM directions are concatenated, hence 2 * hidden_dim inputs.
        self.fc = nn.Linear(2 * self.hidden_dim, self.output_dim)

        # Squashes the linear output into (0, 1).
        self.act = nn.Sigmoid()

    def forward(self, txt):
        """Score a batch of token-id sequences.

        ``txt`` is indexed as [batch_size, seq_len]; the result has shape
        [batch_size, output_dim] and holds sigmoid outputs.
        """
        # [batch_size, seq_len, embedding_dim]
        embedded = self.embedding(txt)

        # [batch_size, seq_len, 2 * hidden_dim]; final (h_n, c_n) states are unused.
        hidden_states, _ = self.lstm(embedded)

        # Pool over the sequence axis by summing every time step
        # (alternatives would be the last step or the mean).
        # [batch_size, 2 * hidden_dim]
        pooled = torch.sum(hidden_states, dim=1)

        # [batch_size, output_dim]
        logits = self.fc(pooled)

        return self.act(logits)

In [24]:
class DisasterTweetModelHelper:
    """Drives the pipeline: load data, build loaders, train, validate, predict.

    Parameters
    ----------
    conf : dict
        Configuration dictionary. Every file location is built as
        ``conf['path'] + conf[section][key]``. All methods read the
        configuration from ``self.conf`` (never from a notebook global), so
        the helper works with whatever dictionary it was constructed with.
    """

    # Value used to pad shorter sequences in a batch. The original code
    # labels index 3 as "<unk>" -- TODO confirm against the vocabulary and
    # consider a dedicated padding token.
    PAD_INDEX = 3

    def __init__(self, conf):
        self.conf = conf
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # populated by load_data()
        self.train_dataset = None
        self.validation_dataset = None
        self.test_dataset = None
        self.vocab = None
        self.pretrained_embedding = None

        # populated by gen_loader()
        self.train_dataloader = None
        self.validation_dataloader = None
        self.test_dataloader = None

    def _path(self, section, key):
        """Return the full path for ``conf[section][key]`` under the base dir."""
        return self.conf['path'] + self.conf[section][key]

    # loading the data
    def load_data(self):
        """Load the persisted datasets and vocabulary plus FastText vectors.

        Sets ``train_dataset``, ``validation_dataset``, ``test_dataset``,
        ``vocab`` and ``pretrained_embedding`` (one FastText vector per
        vocabulary token, in ``vocab.get_itos()`` order).
        """
        # NOTE(review): torch.load unpickles arbitrary Python objects; only
        # point these paths at files produced by a trusted preprocessing run.
        self.train_dataset = torch.load(self._path('data', 'train_dataset'))
        self.validation_dataset = torch.load(self._path('data', 'validation_dataset'))
        self.test_dataset = torch.load(self._path('data', 'test_dataset'))
        self.vocab = torch.load(self._path('data', 'vocab_path'))

        # pretrained FastText vector
        pretrained_vectors = torchtext.vocab.FastText()
        self.pretrained_embedding = pretrained_vectors.get_vecs_by_tokens(self.vocab.get_itos())

        return

    def _encode(self, value):
        """Turn one text field into a 1-D tensor of token ids.

        Non-string values (e.g. missing keyword/location) are treated as the
        empty string. The text is split on single spaces and wrapped in the
        ``<sos>`` / ``<eos>`` ids.
        """
        text = value if isinstance(value, str) else ''
        token_ids = [self.vocab['<sos>']]
        token_ids.extend(self.vocab[token] for token in text.split(" "))
        token_ids.append(self.vocab['<eos>'])
        return torch.tensor(token_ids)

    def _pad(self, sequences):
        """Right-pad a list of 1-D id tensors to a [batch, max_len] tensor."""
        return pad_sequence(sequences, padding_value=self.PAD_INDEX, batch_first=True)

    def _to_labels(self, predictions):
        """Convert sigmoid outputs to a flat list of 0/1 integer labels.

        Uses ``torch.round`` -- the same rule as ``binary_accuracy`` -- rather
        than ``int()`` truncation, which maps every probability below 1.0 to 0.
        """
        return torch.round(predictions).reshape(-1).to(torch.int64).tolist()

    # for training and validation
    def collate_batch(self, batch):
        """Collate labelled rows ``(id, keyword, location, text, target)``.

        Returns ``(keywords, locations, texts, targets)`` where the first
        three are padded id tensors and ``targets`` is a float tensor with
        1.0 for target == 1 and 0.0 otherwise.
        """
        keyword_lst, location_lst, text_lst, trgt_lst = [], [], [], []

        for _id, _keyword, _location, _text, _trgt in batch:
            keyword_lst.append(self._encode(_keyword))
            location_lst.append(self._encode(_location))
            text_lst.append(self._encode(_text))
            trgt_lst.append(1.0 if _trgt == 1 else 0.0)

        return (self._pad(keyword_lst), self._pad(location_lst),
                self._pad(text_lst), torch.tensor(trgt_lst))

    # for testing
    def collate_batch_test(self, batch):
        """Collate unlabelled rows ``(id, keyword, location, text)``.

        Returns ``(ids, keywords, locations, texts)``; the ids are kept
        because the submission file needs them.
        """
        id_lst, keyword_lst, location_lst, text_lst = [], [], [], []

        for _id, _keyword, _location, _text in batch:
            id_lst.append(_id) # needed for submission
            keyword_lst.append(self._encode(_keyword))
            location_lst.append(self._encode(_location))
            text_lst.append(self._encode(_text))

        return (torch.tensor(id_lst), self._pad(keyword_lst),
                self._pad(location_lst), self._pad(text_lst))

    def gen_loader(self):
        """Create the train / validation / test DataLoaders from the datasets."""
        self.train_dataloader = DataLoader(self.train_dataset,
                                           batch_size=self.conf['model']['train_batch_size'],
                                           collate_fn=self.collate_batch)

        self.validation_dataloader = DataLoader(self.validation_dataset,
                                                batch_size=self.conf['model']['validation_batch_size'],
                                                collate_fn=self.collate_batch)

        self.test_dataloader = DataLoader(self.test_dataset,
                                          batch_size=self.conf['model']['test_batch_size'],
                                          collate_fn=self.collate_batch_test)

        return

    #define metric
    def binary_accuracy(self, preds, y):
        """Fraction of predictions that equal ``y`` after rounding to 0/1."""
        #round predictions to the closest integer
        rounded_preds = torch.round(preds)
        correct = (rounded_preds == y).float()
        acc = correct.sum() / len(correct)
        return acc

    # training model
    def train(self):
        """Train the model, validating and checkpointing periodically.

        Every ``valid_epoch`` epochs the model is evaluated on the validation
        loader, metrics are printed and a checkpoint
        ``<model_path><epoch>.pt`` is written. After the last epoch the
        weights are saved as ``<model_path>final.pt``.
        """
        #instantiate the model
        train_model = DisasterTweetModel(len(self.vocab))
        # Copy (rather than alias) the pretrained vectors so training does not
        # modify self.pretrained_embedding in place.
        with torch.no_grad():
            train_model.embedding.weight.copy_(self.pretrained_embedding)
        train_model = train_model.to(self.device)

        #define the optimizer
        optimizer = optim.Adam(train_model.parameters())

        #define the loss
        criterion = nn.BCELoss().to(self.device)

        #set the model in training phase
        train_model.train()

        n_epochs = self.conf['model']['n_epoch']
        validation_epoch = self.conf['model']['valid_epoch']
        checkpoint_prefix = self._path('model', 'model_path')

        # Loop bounds are unchanged from the original: epochs 0..n_epoch inclusive.
        for epoch in range(n_epochs + 1):
            #initialize every epoch
            epoch_loss = 0
            epoch_acc = 0

            for _keyword, _location, _text, _trgt in self.train_dataloader:
                # the model lives on self.device, so the batch must too
                _text = _text.to(self.device)
                _trgt = _trgt.to(self.device)

                #resets the gradients after every batch
                optimizer.zero_grad()

                #get prediction, converted to a 1D tensor
                preds = train_model(_text).squeeze(-1)

                #compute the loss
                loss = criterion(preds, _trgt)

                #compute the binary accuracy
                acc = self.binary_accuracy(preds, _trgt)

                #backpropagate the loss and compute the gradients
                loss.backward()

                #update the weights
                optimizer.step()

                # accumulate loss and accuracy
                epoch_loss += loss.item()
                epoch_acc += acc.item()

            if epoch % validation_epoch == 0:
                train_model.eval() # set the model in eval phase
                valid_epoc_loss, valid_epoch_acc = self.test(train_model, criterion)
                train_model.train() # return back to training phase

                print("epoch:- ",epoch)
                print("training===> ","loss:- ", epoch_loss / len(self.train_dataloader), "  accuracy:- ", epoch_acc / len(self.train_dataloader))
                print("validation===> ","loss:- ", valid_epoc_loss, "  accuracy:- ", valid_epoch_acc)

                # saving intermediate model
                torch.save(train_model.state_dict(), checkpoint_prefix + str(epoch) + '.pt')

        # Save once training has really finished. The original wrote this file
        # at epoch n_epoch - 1 and then kept training, so "final" did not
        # contain the final weights.
        torch.save(train_model.state_dict(), checkpoint_prefix + 'final' + '.pt')

        return

    # validation
    def test(self, train_model, criterion):
        """Evaluate ``train_model`` on the validation loader.

        Returns ``(mean_loss, mean_accuracy)`` averaged over batches. The
        caller is responsible for switching the model to eval mode.
        """
        epoch_loss = 0
        epoch_acc = 0

        with torch.no_grad():
            for _keyword, _location, _text, _trgt in self.validation_dataloader:
                _text = _text.to(self.device)
                _trgt = _trgt.to(self.device)

                #get prediction, converted to a 1D tensor
                preds = train_model(_text).squeeze(-1)

                #compute the loss
                loss = criterion(preds, _trgt)

                #compute the binary accuracy
                acc = self.binary_accuracy(preds, _trgt)

                # accumulate loss and accuracy
                epoch_loss += loss.item()
                epoch_acc += acc.item()

        valid_epoc_loss = epoch_loss / len(self.validation_dataloader)
        valid_epoch_acc = epoch_acc / len(self.validation_dataloader)

        return valid_epoc_loss, valid_epoch_acc

    def inference(self):
        """Predict labels for the test loader and write the submission CSV.

        Loads ``<model_path>final.pt``, predicts a 0/1 label per test row and
        writes the columns ``id`` and ``target`` to ``conf['op']['op_path']``.
        Works for any test batch size.
        """
        #instantiate the model
        inf_model = DisasterTweetModel(len(self.vocab))
        inf_model = inf_model.to(self.device)

        #loading the model; map_location lets a checkpoint saved on another
        #device (e.g. GPU) be loaded on the current one
        model_path = self._path('model', 'model_path') + 'final' + '.pt'
        inf_model.load_state_dict(torch.load(model_path, map_location=self.device))

        inf_model.eval() # set the model in eval phase

        result_lst = []
        with torch.no_grad():
            for _id, _keyword, _location, _text in self.test_dataloader:
                predictions = inf_model(_text.to(self.device))
                labels = self._to_labels(predictions)
                result_lst.extend(
                    [int(row_id), label] for row_id, label in zip(_id.tolist(), labels)
                )

        df = pd.DataFrame(result_lst, columns=['id', 'target'], dtype='int64')

        df.to_csv(self._path('op', 'op_path'), index=False)

    def perform(self, run_training=False):
        """Run the pipeline end to end.

        Parameters
        ----------
        run_training : bool
            When True, train (and save checkpoints) before inference. The
            default False keeps the previous behavior: inference only, using
            an already saved ``final`` checkpoint.
        """
        self.load_data()
        self.gen_loader()
        if run_training:
            self.train()
        self.inference()

In [25]:
%%time
if __name__ == "__main__":
    
    # Build the configuration. This assignment happens at notebook (module)
    # scope, so `conf` becomes a global name visible to every other cell.
    conf = get_conf()
    # Create the pipeline helper with that configuration ...
    disaster_tweets_model_obj = DisasterTweetModelHelper(conf)
    # ... and run its top-level driver method.
    disaster_tweets_model_obj.perform()

CPU times: user 8.27 s, sys: 3.45 s, total: 11.7 s
Wall time: 12.9 s
