In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from string import punctuation
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
from numba import cuda
device=torch.device("cuda:0")
train_on_gpu=True #To use GPU make this variable True
%matplotlib inline

# Data-preprocessing

In [2]:
with open('Data/reviews.txt') as f:
    reviews=f.read()
with open('Data/labels.txt') as t:
    labels=t.read()


In [3]:
reviews=reviews.lower()
all_text=''.join([c for c in reviews if c not in punctuation])
reviews_split=all_text.split('\n')
words=all_text.split()

In [4]:
unique_words=set(words)
wordsTOint={word:token+1 for token,word in enumerate(unique_words) }

In [5]:
reviews_int=[[]]*len(reviews_split)
for i in range(len(reviews_split)):
    reviews_int[i]=reviews_split[i].split()
    reviews_int[i]=[wordsTOint[j] for j in reviews_int[i]]

In [6]:
labels_split=labels.split('\n')

In [7]:
#1 for positive ,0 for negative
labels_int=[0]*len(labels_split)
j=0
for i in labels_split:
    if(i=='positive'):
        labels_int[j]=1
    j+=1

In [8]:
# removing 0 length text
count=0
idx=0
for i in range(len(reviews_int)):
    if(len(reviews_int[i])==0):
        count+=1
        idx=i  

del reviews_int[idx]
del labels_int[idx]

In [9]:
#Padding and truncation to 250 words
reviews_inp=[[]]*len(reviews_int)
for i in range(len(reviews_int)):
    if(len(reviews_int[i])>=250):
        reviews_inp[i]=[reviews_int[i][j] for j in range(250)]
    else:
        reviews_inp[i]=[reviews_int[i][j] for j in range(len(reviews_int[i]))]
        reviews_inp[i]+=[0]*(250-len(reviews_int[i]))

In [10]:
#Creating training testing and validation dataset of train size 80 percent
size_train=int(len(reviews_inp)*0.8)
train_x,remaining_x=reviews_inp[:size_train],reviews_inp[size_train:len(reviews_inp)]
train_y,remaining_y=labels_int[:size_train],labels_int[size_train:len(reviews_inp)]

size_test=int(len(remaining_x)*0.5)
test_x,val_x=reviews_inp[:size_test],reviews_inp[size_test:len(remaining_x)]
test_y,val_y=labels_int[:size_test],labels_int[size_test:len(remaining_x)]

In [11]:
#Batching using Dataloader
train_data=TensorDataset(torch.Tensor(train_x),torch.Tensor(train_y))
test_data=TensorDataset(torch.Tensor(test_x),torch.Tensor(test_y))
val_data=TensorDataset(torch.Tensor(val_x),torch.Tensor(val_y))

batch_size=50
train_loader=DataLoader(train_data,batch_size=batch_size,shuffle=True)
test_loader=DataLoader(test_data,batch_size=batch_size)
val_loader=DataLoader(val_data,batch_size=batch_size)


In [12]:
#output is [batch_size,seq len]
dataiter=iter(train_loader)
x,y=next(dataiter)
print(x.size(),y.size())
print(x)

torch.Size([50, 250]) torch.Size([50])
tensor([[20742.,  5747.,  3181.,  ...,     0.,     0.,     0.],
        [22608., 10591., 24425.,  ...,     0.,     0.,     0.],
        [60595., 24425., 32252.,  ...,     0.,     0.,     0.],
        ...,
        [20742., 27207., 23826.,  ...,     0.,     0.,     0.],
        [ 5796., 24368., 21501.,  ..., 24425.,  8500., 69416.],
        [26358., 69505., 18028.,  ...,     0.,     0.,     0.]])


# Model

In [5]:
class SentimentRnn(nn.Module):
    def __init__(self,vocab_size,emb_size,hidden_size,output_size,n_layers,drop_prob):
        super().__init__()
        self.vocab_size=vocab_size
        self.emb_size=emb_size
        self.hidden_size=hidden_size #hidden_size or memory_size
        self.output_size=output_size
        self.n_layers=n_layers
        self.drop_prob=drop_prob
        
        self.embedding=nn.Embedding(self.vocab_size,self.emb_size)
        self.lstm=nn.LSTM(self.emb_size,self.hidden_size,num_layers=self.n_layers,batch_first=True,dropout=self.drop_prob)
        
        self.L1=nn.Linear(self.hidden_size,self.output_size)
        self.sig=nn.Sigmoid()
        
    def forward(self,x,memory):
        batch_size=x.size(0)
        x=x.long()
        x=self.embedding(x)
        x,memory=self.lstm(x,memory)
        x=x.contiguous().view(-1,self.hidden_size)
        x=self.sig(self.L1(x))
        x=x.view(batch_size,-1)
        x=x[:,-1]
        return x,memory
    
    def init_hidden(self,batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        
        weight = next(self.parameters()).data
        
        if(train_on_gpu):
            memory=(weight.new(self.n_layers,batch_size,self.hidden_size).zero_().cuda(),
                    weight.new(self.n_layers,batch_size,self.hidden_size).zero_().cuda())
        else:
            memory=(weight.new(self.n_layers,batch_size,self.hidden_size).zero_(),
                    weight.new(self.n_layers,batch_size,self.hidden_size).zero_())
        
        
        

# Hyper parameters

In [14]:
vocab_size=len(unique_words)+1 # +1 for padding token
emb_size=500
hidden_size=250
output_size=1
n_layers=2
drop_prob=0.25
model=SentimentRnn(vocab_size,emb_size,hidden_size,output_size,n_layers,drop_prob)
print(model)

SentimentRnn(
  (embedding): Embedding(74073, 500)
  (lstm): LSTM(500, 250, num_layers=2, batch_first=True, dropout=0.25)
  (L1): Linear(in_features=250, out_features=1, bias=True)
  (sig): Sigmoid()
)


# Training

In [15]:
if(train_on_gpu):
    model.to(device)
epochs=20
clip=5 # gradient clipping
lr=0.001
criterion=nn.BCELoss()
optimizer=torch.optim.Adam(model.parameters(),lr=lr)
epoch=1
for i in range(epochs):
    model.train()
    torch.cuda.empty_cache()
    memory=model.init_hidden(batch_size)
    for inputs,labels in tqdm(train_loader):
        if(train_on_gpu):
            inputs,labels=inputs.to(device),labels.to(device)
        pred,memory=model(inputs,memory)
        memory=tuple([each.data for each in memory])
        loss=criterion(pred,labels.float())
        optimizer.zero_grad()
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
    
    model.eval()
    torch.cuda.empty_cache()
    val_h=model.init_hidden(batch_size)
    val_losses=[]
    for inputs,labels in val_loader:
        if(train_on_gpu):
            inputs,labels=inputs.to(device),labels.to(device)
        output,val_h=model(inputs,val_h)
        val_h=tuple([each.data for each in val_h])
        val_loss=criterion(output,labels.float())
        val_losses.append(val_loss.item())
    
    print(f'Epoch:{epoch}',f'loss:{loss.item()}',f'val_loss:{np.mean(val_losses)}')
    epoch+=1   
        

100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [01:43<00:00,  3.86it/s]


Epoch:20 loss:0.6972181797027588 val_loss:0.6889009487628937


100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [02:03<00:00,  3.23it/s]


Epoch:20 loss:0.6628994345664978 val_loss:0.682300478219986


100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [02:48<00:00,  2.38it/s]


Epoch:20 loss:0.42085981369018555 val_loss:0.3356804347038269


100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [02:46<00:00,  2.40it/s]


Epoch:20 loss:0.27438297867774963 val_loss:0.23062832683324813


100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [03:00<00:00,  2.21it/s]


Epoch:20 loss:0.2518778443336487 val_loss:0.15734880149364472


100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [03:07<00:00,  2.14it/s]


Epoch:20 loss:0.05890902504324913 val_loss:0.08432800490409136


100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [03:25<00:00,  1.94it/s]


Epoch:20 loss:0.13598604500293732 val_loss:0.05255762210115791


100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [03:18<00:00,  2.01it/s]


Epoch:20 loss:0.020424121990799904 val_loss:0.025153682632371784


100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [03:18<00:00,  2.02it/s]


Epoch:20 loss:0.17024913430213928 val_loss:0.04317868530750275


100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [03:45<00:00,  1.77it/s]


Epoch:20 loss:0.04330524429678917 val_loss:0.020837718909606336


100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [03:52<00:00,  1.72it/s]


Epoch:20 loss:0.006044161971658468 val_loss:0.013915030073840172


100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [04:35<00:00,  1.45it/s]


Epoch:20 loss:0.24662530422210693 val_loss:0.010781166146043688


100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [04:55<00:00,  1.35it/s]


Epoch:20 loss:0.02429526299238205 val_loss:0.015440317369066179


100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [05:29<00:00,  1.21it/s]


Epoch:20 loss:0.0032402127981185913 val_loss:0.011880716888699681


100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [05:44<00:00,  1.16it/s]


Epoch:20 loss:0.0028072355780750513 val_loss:0.011756487370003016


100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [05:46<00:00,  1.15it/s]


Epoch:20 loss:0.0052403113804757595 val_loss:0.03468785788398236


100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [05:51<00:00,  1.14it/s]


Epoch:20 loss:0.020875845104455948 val_loss:0.0069986439589411024


100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [05:15<00:00,  1.27it/s]


Epoch:20 loss:0.008807538077235222 val_loss:0.008414144580019638


100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [05:47<00:00,  1.15it/s]


Epoch:20 loss:0.009807484224438667 val_loss:0.005250590714276768


100%|████████████████████████████████████████████████████████████████████████████████| 400/400 [06:04<00:00,  1.10it/s]


Epoch:20 loss:0.002642920007929206 val_loss:0.004757403862895444


# Save Model


In [20]:
model_name = 'rnn_20_epoch.net'

checkpoint = {'vocab_size': vocab_size,
              'emb_size': emb_size,
              'hidden_size': hidden_size,
              'output_size': output_size,
              'n_layers': n_layers,
              'drop_prob': drop_prob,
              'state_dict': model.state_dict(),
               'wordTOint':wordsTOint}

with open(model_name, 'wb') as f:
    torch.save(checkpoint, f)

# Testing

In [23]:
if(train_on_gpu):
    model.to(device)
batch_size=50
model.eval()
torch.cuda.empty_cache()
test_h=model.init_hidden(batch_size)
correct_test=0
for inputs,labels in test_loader:
    if(train_on_gpu):
        inputs,labels=inputs.to(device),labels.to(device)
    output,test_h=model(inputs,test_h)
    test_h=tuple([each.data for each in test_h])
    output=torch.round(output).detach().cpu().numpy()
    labels=labels.cpu().numpy()
    for i in range(len(labels)):
        if(labels[i]==output[i]):
            correct_test+=1
print(f'test_accuracy: {correct_test/len(test_loader.dataset)}') 

test_accuracy: 0.9988


# Load Model

In [22]:
with open('rnn_20_epoch.net', 'rb') as f:
    checkpoint = torch.load(f)
vocab_size=checkpoint['vocab_size']
emb_size=checkpoint['emb_size']
hidden_size=checkpoint['hidden_size']
output_size=checkpoint['output_size']
n_layers=checkpoint['n_layers']
drop_prob=checkpoint['drop_prob']
wordTOint=checkpoint['wordTOint']
model= SentimentRnn(vocab_size,emb_size,hidden_size,output_size,n_layers,drop_prob)
model.load_state_dict(checkpoint['state_dict'])


<All keys matched successfully>

# Inference

In [12]:
class pred:
    def __init__(self):
        with open('rnn_20_epoch.net', 'rb') as f:
            checkpoint = torch.load(f)
        self.vocab_size=checkpoint['vocab_size']
        self.emb_size=checkpoint['emb_size']
        self.hidden_size=checkpoint['hidden_size']
        self.output_size=checkpoint['output_size']
        self.n_layers=checkpoint['n_layers']
        self.drop_prob=checkpoint['drop_prob']
        self.wordTOint=checkpoint['wordTOint']
        self.loaded = SentimentRnn(self.vocab_size,self.emb_size,self.hidden_size,self.output_size,self.n_layers,self.drop_prob)
        self.loaded.load_state_dict(checkpoint['state_dict'])

        
    def predict(self,text):
        batch_size=1;
        reviews=text.lower()
        all_text=''.join([c for c in reviews if c not in punctuation])
        reviews=all_text.split()
        reviews_int=[self.wordTOint[i] for i in reviews]
        if(len(reviews_int)>=250):
            reviews_inp=reviews_int[:250]
        else:
            reviews_inp=reviews_int+[0]*(250-len(reviews_int))
        inputs=torch.Tensor(reviews_inp)
        inputs=inputs.unsqueeze(0)
        if(train_on_gpu):
            inputs.to(device)
        self.loaded.eval()
        h=self.loaded.init_hidden(batch_size)
        output,h=self.loaded(inputs,h)
        output=torch.round(output)
        return output
    

In [18]:
a=pred()
b=a.predict('This movie had the best acting and the dialogue was so good. I loved it.')
if(b==0):
    print("NEGATIVE")
else:
    print("POSITIVE")

POSITIVE
