In [35]:
import string
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from google_drive_downloader import GoogleDriveDownloader as gdd
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import DataLoader, Dataset

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device

device(type='cuda')

In [36]:
# Local location of the reviews CSV; it is downloaded from Google Drive only
# when the file is not already present.
DATA_PATH = 'data/imdb_reviews.csv'
if not Path(DATA_PATH).is_file():
    gdd.download_file_from_google_drive(
        file_id='1zfM5E6HvKIe7f3rEt1V2gBpw5QOSSKQz',
        dest_path=DATA_PATH)

df = pd.read_csv(DATA_PATH)
# The first column is treated as the review text and the second as its label.
data = df.iloc[:, 0]
labels = df.iloc[:, 1].tolist()

def split_data(review, label, training_ratio):
    """Split reviews and labels into a leading training part and a trailing evaluation part.

    The first ``round(len(review) * training_ratio)`` items become the training
    set and the remaining items the evaluation set. Order is preserved; nothing
    is shuffled.

    Args:
        review: Sliceable collection of reviews whose slices expose
            ``.tolist()`` (e.g. a pandas Series).
        label: Sliceable collection of labels aligned with ``review``.
        training_ratio: Fraction of the items assigned to training.

    Returns:
        Tuple ``(training_data, training_labels, evaluation_data,
        evaluation_labels)``. The training data is the slice of ``review``
        as-is; the evaluation data is converted with ``.tolist()``.
    """
    cut = round(len(review) * training_ratio)
    train_reviews, eval_reviews = review[:cut], review[cut:].tolist()
    train_targets, eval_targets = label[:cut], label[cut:]
    return train_reviews, train_targets, eval_reviews, eval_targets
  
  
# Use the first 70% of the rows for training and the rest for evaluation.
train_data, train_labels, test_data, test_labels = split_data(data, labels, 0.7)
print(len(train_data), ' ', len(test_data))
# Display the first evaluation review as a quick sanity check (cell output).
test_data[0]

43508   18647


'This gripping tale of intergenerational love, jealousy and revenge was even more enjoyable to see on DVD years after its PBS broadcast, with a sharper picture and crisper sound. My only reservations are that the plot has a few improbable moments and that some of the stronger Manchester accents are difficult at times. Luckily even missing a word here and there won\'t spoil the fun: the primary actors are ideally cast. Robson Green brings an enigmatic smile, a go-for-broke temperament and an athletic physicality to his role as a young surgeon who falls hopelessly for the wife of his boss at the hospital where he\'s just begun to work. Francesca Annis is one of the most striking 50-ish women imaginable; her acting rivals her beauty. (The love scenes between these two demonstrate better than words how little the age difference matters to them!) Each of the supporting characters is sharply drawn and excellently portrayed as well. The mix of pithy dialog and passionate excess makes this a d

In [37]:
class Corpus(Dataset):
    """Dataset of reviews encoded as fixed-length lists of vocabulary indices.

    A ``CountVectorizer`` (English stop words, ``min_df=0.01``) is fitted on
    ``data`` to build the vocabulary ``token2idx``. A ``'<PAD>'`` entry is
    added with the next free index and is used to pad short reviews.

    Args:
        data: Iterable of review strings.
        labels: Indexable collection of labels aligned with ``data``.
        max_seq_len: Length every encoded review is padded or truncated to.
    """

    # Translation table that deletes every ASCII punctuation character.
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, data, labels, max_seq_len):
        self.max_seq_len = max_seq_len
        vectorizer = CountVectorizer(stop_words='english', min_df=0.01)
        vectorizer.fit(data)
        self.labels = labels
        self.token2idx = vectorizer.vocabulary_
        # Reserve one extra index, after all real tokens, for padding.
        self.token2idx['<PAD>'] = max(self.token2idx.values()) + 1

        # Stored for callers; encodeReview does its own whitespace tokenisation.
        self.tokenizer = vectorizer.build_analyzer()

        self.encoded_text = [self.encodeReview(review) for review in data]
        print(self.token2idx)
        print(self.encoded_text[0])

    def encodeReview(self, review):
        """Encode one review as exactly ``max_seq_len`` vocabulary indices.

        The review is lower-cased and split on whitespace; punctuation is
        stripped from each word and words missing from ``token2idx`` are
        dropped. The result is truncated to the first ``max_seq_len`` indices
        and right-padded with the ``'<PAD>'`` index.

        Args:
            review: Review text.

        Returns:
            List of ``max_seq_len`` integer indices.
        """
        pad_idx = self.token2idx['<PAD>']
        encoded = []
        for raw_word in review.lower().split():
            word = raw_word.translate(self._PUNCT_TABLE)
            idx = self.token2idx.get(word)
            if idx is not None:
                encoded.append(idx)
        # Truncate using the configured length (previously a hard-coded 100,
        # which produced ragged sequences whenever max_seq_len != 100).
        encoded = encoded[:self.max_seq_len]
        encoded.extend([pad_idx] * (self.max_seq_len - len(encoded)))
        return encoded

    def __getitem__(self, i):
        """Return the ``(encoded_review, label)`` pair at position ``i``."""
        return self.encoded_text[i], self.labels[i]

    def __len__(self):
        """Return the number of encoded reviews."""
        return len(self.encoded_text)
                      
        
# Sequence length handed to Corpus for every encoded review.
max_seq_len = 100
dataset = Corpus(data=train_data, labels=train_labels, max_seq_len=max_seq_len)
# Vocabulary size, including the '<PAD>' entry (cell output).
len(dataset.token2idx)

[907, 904, 483, 811, 926, 88, 1355, 1174, 1189, 731, 351, 190, 212, 211, 1098, 399, 543, 763, 1373, 351, 190, 211, 1099, 190, 1057, 252, 738, 1367, 135, 1214, 80, 1494, 518, 989, 607, 1475, 1029, 1350, 1378, 134, 831, 1409, 658, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535, 1535]


1536

In [0]:
## convert each sequence and label to LongTensor and FloatTensor. 
def collate(batch):
    """Stack a list of ``(encoded_review, label)`` samples into two tensors.

    Args:
        batch: List of samples; element 0 of each sample is a list of token
            indices and element 1 is its label.

    Returns:
        Tuple ``(inputs, target)``: an int64 tensor built from the token-index
        lists and a float32 tensor built from the labels.
    """
    sequences = [sample[0] for sample in batch]
    sample_labels = [sample[1] for sample in batch]
    # torch.tensor with an explicit dtype replaces the legacy
    # torch.LongTensor / torch.FloatTensor constructors.
    inputs = torch.tensor(sequences, dtype=torch.long)
    target = torch.tensor(sample_labels, dtype=torch.float32)
    return inputs, target

# Shuffled mini-batches of the training dataset, assembled by `collate`.
batch_size = 512
train_loader = DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    collate_fn=collate,
    shuffle=True,
)

# Long Short-Term Memory (LSTM) Neural Network

![alt text](https://drive.google.com/uc?id=1dcjUz2wYWW43c3y3GHvsCjjSjceIV33a)


Layer       | Operations           | Input Size          | Output Size
------------| ---------------------|---------------------|---------------
Layer 1     | Embedding            |  Sentence Size      | Embedding Size
Layer 2     | LSTM (NumOfLayer = 1)|  Embedding Size     | Hidden Size
Layer 3     | Fully connected      |  Hidden Size        | 1 


Embedding 

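`nn.Embedding` is a lookup table that maps each token index to a dense vector. Its constructor takes at least two arguments: the vocabulary size (`num_embeddings`) and the embedding size (`embedding_dim`).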


![alt text](https://drive.google.com/uc?id=1xq_Pzde-M-SPWcGOafY-aN9JMGTT4i-N)

In [0]:
class LSTM(nn.Module):
    """Embedding -> LSTM -> linear classifier over token-index sequences.

    Args:
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of outputs of the final linear layer.
        vocab_size: Number of rows in the embedding table.
        embedding_size: Dimension of each embedding vector.
    """

    def __init__(self, hidden_size, num_layers, num_classes, vocab_size, embedding_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.encoder = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.decoder = nn.Linear(in_features=hidden_size, out_features=num_classes)
        self.weight_init()

    def weight_init(self):
        """Apply Xavier-uniform initialisation to the embedding and linear weights."""
        for layer in (self.encoder, self.decoder):
            nn.init.xavier_uniform_(layer.weight)

    def forward(self, inputs, hidden):
        """Run a batch of index sequences through the network.

        Args:
            inputs: Integer tensor of token indices, batch dimension first.
            hidden: Initial ``(h0, c0)`` state passed straight to ``nn.LSTM``.

        Returns:
            The linear layer applied to the LSTM output at the last time step,
            with dimension 1 squeezed out (a no-op unless ``num_classes == 1``).
        """
        embedded = self.encoder(inputs)
        sequence_out, _ = self.lstm(embedded, hidden)
        # Only the output at the final time step feeds the classifier.
        last_step = sequence_out[:, -1, :]
        logits = self.decoder(last_step)
        return logits.squeeze(1)
        
        

In [0]:
num_layer = 1
# NOTE(review): the hidden size is set to the sequence length here; the two are
# independent quantities, so this looks like a convenience choice — confirm.
hidden_size = max_seq_len

num_classes = 1
vocab_size = len(dataset.token2idx)
embedding_size = 128
model = LSTM(hidden_size, num_layer, num_classes, vocab_size, embedding_size)
# Move the parameters to the selected device (once is enough).
model = model.to(device)

## binary_cross_entropy_with_logits (aka BCE with sigmoid)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)


In [41]:
num_epoch = 30

model.train()
for epoch in range(num_epoch):
    batch_losses = []
    for inputs, target in train_loader:
        inputs, target = inputs.to(device), target.to(device)

        # Fresh all-zero initial state for every batch:
        # num_layer x batch_size x hidden_size
        state_shape = (num_layer, inputs.size(0), hidden_size)
        hidden = (
            torch.zeros(state_shape, device=device),
            torch.zeros(state_shape, device=device),
        )

        model.zero_grad()
        loss = criterion(model(inputs, hidden), target)
        loss.backward()

        # Clip the total gradient norm to 1 before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()

        batch_losses.append(loss.item())

    # Mean of the per-batch losses for this epoch.
    epoch_loss = sum(batch_losses) / len(batch_losses)
    print(f'Epoch {epoch + 1}:\tLoss: {epoch_loss:.3f}')


Epoch 1:	Loss: 0.683
Epoch 2:	Loss: 0.513
Epoch 3:	Loss: 0.318
Epoch 4:	Loss: 0.278
Epoch 5:	Loss: 0.251
Epoch 6:	Loss: 0.231
Epoch 7:	Loss: 0.213
Epoch 8:	Loss: 0.195
Epoch 9:	Loss: 0.176
Epoch 10:	Loss: 0.164
Epoch 11:	Loss: 0.151
Epoch 12:	Loss: 0.142
Epoch 13:	Loss: 0.154
Epoch 14:	Loss: 0.130
Epoch 15:	Loss: 0.126
Epoch 16:	Loss: 0.120
Epoch 17:	Loss: 0.108
Epoch 18:	Loss: 0.106
Epoch 19:	Loss: 0.104
Epoch 20:	Loss: 0.098
Epoch 21:	Loss: 0.095
Epoch 22:	Loss: 0.097
Epoch 23:	Loss: 0.089
Epoch 24:	Loss: 0.085
Epoch 25:	Loss: 0.085
Epoch 26:	Loss: 0.092
Epoch 27:	Loss: 0.083
Epoch 28:	Loss: 0.088
Epoch 29:	Loss: 0.084
Epoch 30:	Loss: 0.094


In [0]:
def predict_sentiment(text):
    """Return the model's raw output for a single review.

    The text is encoded with ``dataset.encodeReview`` and passed through the
    global ``model`` in eval mode, as a batch of one, with gradient tracking
    disabled.

    Args:
        text: Review text to score.

    Returns:
        Tensor holding the model output for the one-review batch. No sigmoid
        is applied here.
    """
    model.eval()
    with torch.no_grad():  # do not save history
        token_ids = dataset.encodeReview(text)
        # Wrapping the list adds the batch dimension: shape (1, seq_len).
        inputs = torch.tensor([token_ids], dtype=torch.long, device=device)

        # num_layer x batch_size (now 1) x hidden_size
        state_shape = (num_layer, 1, hidden_size)
        hidden = (
            torch.zeros(state_shape, device=device),
            torch.zeros(state_shape, device=device),
        )
        return model(inputs, hidden)


In [43]:
# Confusion-matrix counters over the evaluation set.
TP = 0
TN = 0
FP = 0
FN = 0
# Cut-off applied to the raw model output; a sigmoid maps 0 to probability 0.5.
threshold = 0

for i, test_text in enumerate(test_data):
    prediction = predict_sentiment(test_text)
    predicted_positive = bool(prediction > threshold)
    if test_labels[i]:
        if predicted_positive:
            TP += 1
        else:
            FN += 1
    elif predicted_positive:
        FP += 1
    else:
        TN += 1


accuracy = (TP + TN) / (len(test_data))

print('Accuracy:',accuracy * 100)

Accuracy: 87.40279937791601
