In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score
import transformers
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
#device = 'cpu'
print(device)
from torch import nn
import torch.nn.functional as F
#from tqdm import trange
from tqdm.notebook import tqdm, trange
import os
#from torch.optim import AdamW
from transformers import AdamW, get_linear_schedule_with_warmup
from model import TransTCN

from torchtext.legacy.data import Field,LabelField,BucketIterator,TabularDataset 
from torchtext import vocab
from nltk import word_tokenize 
import nltk
nltk.download('punkt')


cuda


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hardeep\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
rev = pd.read_json('yelp_review_training_dataset.jsonl',lines=True)
rev = rev.drop(columns={'review_id'})
rev.head()

Unnamed: 0,text,stars
0,Total bill for this horrible service? Over $8G...,1
1,I *adore* Travis at the Hard Rock's new Kelly ...,5
2,I have to say that this office really has it t...,5
3,Went in for a lunch. Steak sandwich was delici...,5
4,Today was my second out of three sessions I ha...,1


In [3]:
OUT_PATH = 'C:/Users/Hardeep/Desktop/nlp_proj/'
# X = rev['text']
# y = rev['stars']
# X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.1, random_state=123)
# X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.4, random_state=123)

df,test_df = train_test_split(rev[:1000],test_size=0.25,random_state=123)
# stratify tries to split in a manner that distribution of 'toxic' is same in both train and test

train_df,val_df = train_test_split(df,test_size=0.20,random_state=123)

train_df.reset_index(drop=True),val_df.reset_index(drop=True), test_df.reset_index(drop=True)
train_df.to_csv(OUT_PATH+'train.csv',index=False)
val_df.to_csv(OUT_PATH+'val.csv',index=False)
test_df.to_csv(OUT_PATH+'test.csv',index=False)

# X_train.to_csv(OUT_PATH+'training_data.csv', index=False)
# X_val.to_csv(OUT_PATH+'validation_data.csv', index=False)
# X_test.to_csv(OUT_PATH+'test_data.csv', index=False)

# y_train.to_csv(OUT_PATH+'training_labels.csv', index=False)
# y_val.to_csv(OUT_PATH+'validation_labels.csv', index=False)
# y_test.to_csv(OUT_PATH+'test_labels.csv', index=False)



In [4]:
glove = vocab.Vectors('glove.6B.300d.txt', OUT_PATH)

In [5]:
text_field = Field(tokenize=word_tokenize)
# tokenize text using word_tokenize and convert to numerical form using default parameters

label_field = LabelField(dtype=torch.float) 
# useful for label string to LabelEncoding. Not useful here but doesn't hurt either

fields = [('text',text_field),('stars',label_field)] 
# (column name,field object to use on that column) pair for the dictonary


train, val, test = TabularDataset.splits(path=OUT_PATH, train='train.csv',validation='val.csv',test='test.csv', 
                                         format='csv',skip_header=True,fields=fields)
print('done splitting')


text_field.build_vocab(train,max_size=100000,vectors=glove,unk_init=torch.Tensor.zero_) 

# unk_init = torch.tensor.normal_ set the initial vectors of vocab as the glove vectors and  
# initialize unknown words as normal distribution instead of zeros
print('done building vocav')

label_field.build_vocab(train) 


train_iter, val_iter, test_iter = BucketIterator.splits((train,val,test), batch_sizes=(32,128,128),
                                              sort_key=lambda x: len(x.comment_text),
                                              sort_within_batch=False,
                                              device=device) 

done splitting
done building vocav


In [54]:
in_neuron = len(text_field.vocab)
embedding_dim = 10000
rnn_kwargs = {'num_layers':4,'bidirectional':True}

In [55]:
class DeepNetwork(torch.nn.Module):
    '''
    Deep RNN Network which can have either one both of stacked and bi-directional properties
    '''
    
    def __init__(self,in_neuron,embedding_dim=100,hidden_size=256,out_neuron=1,m_type='rnn',drop=0.33,**kwargs):
        '''
        Constructor of the class which will instantiate the layers while initialisation.
        
        args:
            in_neuron: input dimensions of the first layer {int}
            embedding_dim: number of latent features you want to calculate from the input data {int} default=100
            hidden_size: neurons you want to have in your hidden RNN layer {int} default=256
            out_neuron: number of outputs you want to have at the end.{int} default=1
            model: whether to use 'rnn','lstm' or 'gru' {string} 
            drop: proportion of values to dropout from the previous values randomly {float 0-1} default=0.53
            **kwargs: any valid torch.nn.RNN, torch.nn.LSTM or torch.nn.GRU args with either 'bidirectional'=True 
                      or 'num_layers'>1
        out: 
            return a tensor of shape {batch,out_neuron} as output 
        '''
        super(DeepNetwork,self).__init__()
        
        self.m_type = m_type
        
        self.embedding = torch.nn.Embedding(in_neuron,embedding_dim)
        
        if self.m_type == 'lstm':
            self.lstm = torch.nn.LSTM(embedding_dim,hidden_size,**kwargs)
        elif self.m_type == 'gru':
            self.gru = torch.nn.GRU(embedding_dim,hidden_size,**kwargs)
        else:
            self.rnn = torch.nn.RNN(embedding_dim,hidden_size,**kwargs) 
        
        self.dropout = torch.nn.Dropout(drop) 
        
        self.dense = torch.nn.Linear(hidden_size*2,5)
        # Last output Linear Layer will have the two Hidden States from both the directions to have the result
        
    
    def forward(self,t):
        '''
        Activate the forward propagation
        args:
            t: tensors in the form of a batch {torch.tensor}
        '''
        t = self.dropout(self.embedding(t)) # get embeddings and dropout
    
        if self.m_type == 'lstm':
            out, (hidden,_) = self.lstm(t)
        elif self.m_type == 'gru':
            out, hidden = self.gru(t)
        else:
            out, hidden = self.rnn(t)
        # shape of rnn = (seq_len, batch, num_directions * hidden_size)
        
        # Concatenate the last and second last hidden. One is from backward and one is from forward
        t = self.dropout(torch.cat((hidden[-2,:,:],hidden[-1,:,:]),dim=1))
       
        return self.dense(t)

In [56]:
network = DeepNetwork(in_neuron,m_type='lstm',**rnn_kwargs)
network.to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = AdamW(network.parameters(), lr=5e-3, correct_bias=True)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,num_training_steps=len(train_iter) * 3)

In [57]:
def train_network(network,train_iter,optimizer,loss_fn,epoch_num):
    '''
    train the network using given parameters
    args:
        network: any Neural Network object 
        train_batch: iterator of training data
        optimizer: optimizer for gradients calculation and updation
        loss_fn: appropriate loss function
        epoch_num = Epoch number so that it can show which epoch number in tqdm Bar
    out:
        a tuple of (average_loss,average_accuracy) of floating values for a single epoch
    '''
    training_acc = 0 
    losses = []
    network.train() # set the model in training mode as it requires gradients calculation and updtion
    # turn off while testing using  model.eval() and torch.no_grad() block
    
    for batch in tqdm(train_iter,f"Epoch: {epoch_num}"): 
        # data will be shown to model in batches per epoch to calculate gradients per batch
        
        output = network(batch.text.to(device))
        #print(output)
        prediction = torch.max(output, 1)[1]
        #import pdb; pdb.set_trace()
        #print(prediction)
        #import pdb; pdb.set_trace()
        training_loss = criterion(output, batch.stars.long())
        if torch.sum(prediction == 0) == len(prediction):
            print('all zeroes')
        
        #print(training_loss)
        training_acc += torch.sum(prediction == batch.stars)
   
        losses.append(training_loss.item())
        training_loss.backward()
        nn.utils.clip_grad_norm_(network.parameters(), max_norm=1)
        
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        
    return training_acc / 1000, np.mean(losses)

In [58]:
for epoch in trange(3):
    print('Epoch: ' , str(epoch))
    print('==================================')
    training_accuracy, training_loss = train_network(network, train_iter,optimizer, criterion, epoch+1)
    print(training_accuracy)
    print(training_loss)
    print('==================================')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))

Epoch:  0


HBox(children=(HTML(value='Epoch: 1'), FloatProgress(value=0.0, max=19.0), HTML(value='')))

all zeroes
all zeroes
all zeroes
all zeroes
all zeroes
all zeroes
all zeroes
all zeroes

tensor(0.2030, device='cuda:0')
1.5368494234587018
Epoch:  1


HBox(children=(HTML(value='Epoch: 2'), FloatProgress(value=0.0, max=19.0), HTML(value='')))

all zeroes
all zeroes
all zeroes
all zeroes
all zeroes
all zeroes
all zeroes
all zeroes
all zeroes
all zeroes
all zeroes
all zeroes
all zeroes
all zeroes
all zeroes

tensor(0.2530, device='cuda:0')
1.4470274260169582
Epoch:  2


HBox(children=(HTML(value='Epoch: 3'), FloatProgress(value=0.0, max=19.0), HTML(value='')))

all zeroes
all zeroes
all zeroes
all zeroes
all zeroes
all zeroes

tensor(0.2580, device='cuda:0')
1.381242545027482

