In [2]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import string
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error
import re
import spacy
import jovian

<IPython.core.display.Javascript object>

In [3]:
#loading the data
os.chdir("/Users/rhyschua/Desktop/Capstone Project/Practice/Data")
reviews = pd.read_csv("IMDB Dataset.csv")
reviews.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
reviews['sentiment'] = reviews['sentiment'].replace(['positive'], "1")
reviews['sentiment'] = reviews['sentiment'].replace(['negative'], "0")

In [5]:
reviews['sentiment'] = pd.to_numeric(reviews['sentiment'], errors='coerce')

In [6]:
reviews['review_length'] = reviews['review'].apply(lambda x: len(x.split()))
np.mean(reviews['review_length'])

231.15694

In [7]:
#tokenization; removing special characters, punctuation and change characters to lower case
tok = spacy.load('en')
def tokenize (text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') 
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

In [8]:
from collections import Counter

#count number of occurences of each word
counts = Counter()
for index, row in reviews.iterrows():
    counts.update(tokenize(row['review']))

In [9]:
#deleting infrequent words
print("num_words before:",len(counts.keys()))
for word in list(counts):
    if counts[word] < 2:
        del counts[word]
print("num_words after:",len(counts.keys()))

num_words before: 99453
num_words after: 62136


In [10]:
#creating vocabulary to index mapping and encode our review text 
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [11]:
# chose length of sentence to be 240 because mean length of sentences is 231
def encode_sentence(text, vocab2index, N=240):
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length

In [12]:
reviews['encoded'] = reviews['review'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))
reviews.head()

Unnamed: 0,review,sentiment,review_length,encoded
0,One of the other reviewers has mentioned that ...,1,307,"[[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ..."
1,A wonderful little production. <br /><br />The...,1,162,"[[53, 194, 195, 196, 13, 32, 33, 32, 13, 4, 19..."
2,I thought this was a wonderful way to spend ti...,1,166,"[[121, 274, 25, 38, 53, 194, 275, 64, 276, 202..."
3,Basically there's a family where a little boy ...,0,138,"[[348, 349, 187, 53, 350, 88, 53, 195, 351, 20..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,230,"[[1, 395, 187, 20, 314, 47, 4, 202, 3, 396, 20..."


In [13]:
#split for trainign and testing
X = list(reviews['encoded'])
y = list(reviews['sentiment'])
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [14]:
#turn data into pytoch dataset

class ReviewsDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0].astype(np.int32)), self.y[idx], self.X[idx][1]

In [15]:
train_df = ReviewsDataset(X_train, y_train)
test_df = ReviewsDataset(X_test, y_test)

In [17]:
##set to loader format to feed data into lstm model
batch_size = 10000
vocab_size = len(words)
train_loader = DataLoader(train_df, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_df, batch_size=batch_size)

In [41]:
class LSTM_model(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 2)
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])

In [42]:
from torch.optim import Adam

lstm_model = LSTM_model(vocab_size, 50, 50)
parameters = filter(lambda p: p.requires_grad, lstm_model.parameters())
optimizer = torch.optim.Adam(parameters, lr=0.001)
loss_fn = nn.CrossEntropyLoss()

In [43]:
##train model and return accurcy
def train_model(model, epochs=10, lr=0.001):
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_loader:
            x = x.long()
            y = y.long()
            y_pred = model(x, l)
            optimizer.zero_grad()
            loss = loss_fn(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        test_loss, test_acc, test_rmse =test_metrics(model, test_loader)
        if i % 5 == 1:
            print("train loss %.3f, test loss %.3f, test accuracy %.3f, and test rmse %.3f" % (sum_loss/total, test_loss, test_acc, test_rmse))

def test_metrics (model, test_loader):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    for x, y, l in test_loader:
        x = x.long()
        y = y.long()
        y_hat = model(x, l)
        loss = F.cross_entropy(y_hat, y)
        pred = torch.max(y_hat, 1)[1]
        correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
        sum_rmse += np.sqrt(mean_squared_error(pred, y.unsqueeze(-1)))*y.shape[0]
    return sum_loss/total, correct/total, sum_rmse/total

In [44]:
train_model(lstm_model)

train loss 0.693, test loss 0.693, test accuracy 0.503, and test rmse 0.705
train loss 0.691, test loss 0.693, test accuracy 0.508, and test rmse 0.701
