In [1]:
import torch 
import torch.nn as nn 
import torch.nn.functional as F
import torchtext
import time

import pandas as pd 
import numpy as np 
import re
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, classification_report 
import matplotlib.pyplot  as plt

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
#from spellchecker import SpellChecker
from tqdm import tqdm
# allows to have a progress bar in pandas, useful for long processing operations
tqdm.pandas()
from collections import Counter
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

In [2]:
data = pd.read_csv("IMDB Dataset.csv")
data.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
def transform_label(label):
    return 1 if label == 'positive' else 0


data['label'] = data['sentiment'].progress_apply(transform_label)

100%|██████████| 50000/50000 [00:00<00:00, 1391524.06it/s]


In [4]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')
stopwords = set(stopwords.words('english'))

def rm_link(text):
    return re.sub(r'http\S+', '', text)


# handle case like "shut up okay?Im only 10 years old"
# become "shut up okay Im only 10 years old"
def rm_punct2(text):
    # return re.sub(r'[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~]', ' ', text)
    return re.sub(r'[\"\#\$\%\&\'\(\)\*\+\/\:\;\<\=\>\@\[\\\]\^\_\`\{\|\}\~]', ' ', text)


def rm_html(text):
    # remove html tags
    text = re.sub(r'<.*?>', '', text)
    # remove <br /> tags
    return re.sub(r'<br />', '', text)


def space_bt_punct(text):
    pattern = r'([.,!?-])'
    s = re.sub(pattern, r' \1 ', text)  # add whitespaces between punctuation
    s = re.sub(r'\s{2,}', ' ', s)  # remove double whitespaces
    return s


def rm_number(text):
    return re.sub(r'\d+', '', text)


def rm_whitespaces(text):
    return re.sub(r'\s+', ' ', text)


def rm_nonascii(text):
    return re.sub(r'[^\x00-\x7f]', r'', text)


def rm_emoji(text):
    emojis = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE
    )
    return emojis.sub(r'', text)


def spell_correction(text):
    # if too slow: return text
    return text
    # https://pypi.org/project/pyspellchecker/
    spell = SpellChecker()
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
            candidate = spell.correction(word)
            if candidate is not None:
                corrected_text.append(candidate)
            else:
                corrected_text.append(word)
        else:
            corrected_text.append(word)
    return ' '.join(corrected_text)

def clean_pipeline(text):
    text = text.lower()
    no_link = rm_link(text)
    no_html = rm_html(no_link)
    space_punct = space_bt_punct(no_html)
    no_punct = rm_punct2(space_punct)
    no_number = rm_number(no_punct)
    no_whitespaces = rm_whitespaces(no_number)
    no_nonasci = rm_nonascii(no_whitespaces)
    no_emoji = rm_emoji(no_nonasci)
    #spell_corrected = spell_correction(no_emoji)
    return no_emoji

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/harshayarravarapu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/harshayarravarapu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/harshayarravarapu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/harshayarravarapu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
data['review'] = data['review'].progress_apply(clean_pipeline)


100%|██████████| 50000/50000 [00:06<00:00, 7917.41it/s]


In [6]:
def tokenize(text):
    return word_tokenize(text)


def rm_stopwords(text):
    return [i for i in text if i not in stopwords]


def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(t) for t in text]
    # make sure lemmas does not contains stopwords
    return rm_stopwords(lemmas)


def preprocess_pipeline(text):
    tokens = tokenize(text)
    no_stopwords = rm_stopwords(tokens)
    lemmas = lemmatize(no_stopwords)
    return ' '.join(lemmas)

In [7]:
data['review'] = data['review'].progress_apply(preprocess_pipeline)

100%|██████████| 50000/50000 [00:51<00:00, 970.19it/s] 


In [8]:
reviews = data.review.values
# merge into single variable, separated by whitespaces
words = ' '.join(reviews)
# obtain list of words
words = words.split()
# build vocabulary
counter = Counter(words)
# only keep top 2000 words
vocab = sorted(counter, key=counter.get, reverse=True)[:2000]
int2word = dict(enumerate(vocab, 2))
int2word[0] = '<PAD>'
int2word[1] = '<UNK>'
word2int = {word: id for id, word in int2word.items()}

In [9]:
reviews_enc = [[word2int[word] if word in word2int else word2int['<UNK>'] for word in review.split()] for review in tqdm(reviews, desc='encoding')]


encoding: 100%|██████████| 50000/50000 [00:00<00:00, 63428.11it/s]


In [10]:
def pad_features(reviews, pad_id, seq_length=128):
    # features = np.zeros((len(reviews), seq_length), dtype=int)
    features = np.full((len(reviews), seq_length), pad_id, dtype=int)

    for i, row in enumerate(reviews):
        start_index = max(0, seq_length - len(row))
        # if seq_length < len(row) then review will be trimmed
        features[i, start_index:] = np.array(row)[:min(seq_length, len(row))]

    return features


seq_length = 128
features = pad_features(reviews_enc, pad_id=word2int['<PAD>'], seq_length=seq_length)


In [11]:
labels = data.label.to_numpy()

# train test split
train_size = .75  # we will use 75% of whole data as train set
val_size = .5  # and we will use 50% of test set as validation set

# stratify will make sure that train and test set have same distribution of labels
train_x, test_x, train_y, test_y = train_test_split(features, labels, test_size=1 - train_size, stratify=labels)

# split test set into validation and test set
val_x, test_x, val_y, test_y = train_test_split(test_x, test_y, test_size=val_size, stratify=test_y)

In [12]:
# define batch size
batch_size = 64

# create tensor datasets
train_dataset = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_dataset = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_dataset = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# create dataloaders
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

In [13]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class SentimentRNN(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, num_layers):
        super(SentimentRNN, self).__init__()
        self.hidden_dim = hidden_dim
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.dropout = nn.Dropout(0.7)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
        
    def forward(self, x, lengths):
        embedded = self.embedding(x)

        x = pack_padded_sequence(embedded, lengths.to('cpu'), batch_first=True, enforce_sorted=False)
        out, _ = self.rnn(x)

        out, _ = pad_packed_sequence(out, batch_first=True)

        out = out[:, -1, :]
        
        out = self.dropout(out)
        out = self.fc(out)
        out = self.sig(out).squeeze()
        return out


In [14]:
vocab_size = len(vocab) + 2
output_size = 1
embedding_dim = 100
hidden_dim = 256
num_layers = 2

model = SentimentRNN(vocab_size, output_size, embedding_dim, hidden_dim, num_layers)

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model.to(device)

In [16]:
lr = 0.0001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [17]:
def calculate_accuracy(output, labels):
    pred = (output > 0.5).float()
    
    correct = pred.eq(labels.view_as(pred)).sum().item()
    total = labels.size(0)
    return correct / total

epochs = 40

model.train()
best_val_loss = float("Inf")

for epoch in range(epochs):
    total_loss = 0.0
    total_acc = 0.0
    
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        lengths = torch.clamp(inputs.sum(dim=1), max=seq_length)
        optimizer.zero_grad()

        output = model(inputs, lengths)

        loss = criterion(output, labels.float())
        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()

        accuracy = calculate_accuracy(output, labels)
        total_loss += loss.item()
        total_acc += accuracy

    #wandb.log({'Training Loss': total_loss / len(train_loader), 'Training Accuracy': total_acc / len(train_loader)})

    total_val_loss = 0.0
    total_val_acc = 0.0
    model.eval()
    for inputs, labels in valid_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        lengths = torch.clamp(inputs.sum(dim=1), max=seq_length)
        output = model(inputs, lengths)
        val_loss = criterion(output, labels.float())

        accuracy = calculate_accuracy(output, labels)
        total_val_loss += val_loss
        total_val_acc += accuracy

    avg_val_loss = total_val_loss / len(valid_loader)
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "model_2.pth")
        print("Model saved at ", best_val_loss)
        
    #wandb.log({'Validation Loss': total_val_loss / len(valid_loader), 'Validation Accuracy': total_val_acc / len(valid_loader)})

    model.train()
    print("Epoch: {}/{}...".format(epoch+1, epochs),
          "Loss: {:.6f}...".format(total_loss / len(train_loader)),
          "Accuracy: {:.6f}...".format(total_acc / len(train_loader)),
          "Val Loss: {:.6f}".format(total_val_loss / len(valid_loader)),
         "val accuracy: {:.6f}".format(total_val_acc / len(valid_loader)))

Model saved at  tensor(0.6213, grad_fn=<DivBackward0>)
Epoch: 1/40... Loss: 0.676340... Accuracy: 0.565742... Val Loss: 0.621272 val accuracy: 0.662977
Model saved at  tensor(0.6146, grad_fn=<DivBackward0>)
Epoch: 2/40... Loss: 0.615677... Accuracy: 0.669721... Val Loss: 0.614592 val accuracy: 0.680834
Model saved at  tensor(0.5838, grad_fn=<DivBackward0>)
Epoch: 3/40... Loss: 0.587790... Accuracy: 0.699844... Val Loss: 0.583782 val accuracy: 0.703816
Model saved at  tensor(0.5488, grad_fn=<DivBackward0>)
Epoch: 4/40... Loss: 0.556519... Accuracy: 0.725818... Val Loss: 0.548774 val accuracy: 0.736493
Model saved at  tensor(0.5380, grad_fn=<DivBackward0>)
Epoch: 5/40... Loss: 0.532498... Accuracy: 0.742689... Val Loss: 0.537975 val accuracy: 0.738498
Model saved at  tensor(0.5056, grad_fn=<DivBackward0>)
Epoch: 6/40... Loss: 0.516622... Accuracy: 0.756787... Val Loss: 0.505560 val accuracy: 0.757296
Epoch: 7/40... Loss: 0.500474... Accuracy: 0.768391... Val Loss: 0.517035 val accuracy: 

In [18]:
model.eval() 

test_loss = 0.0
test_acc = 0.0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        lengths = torch.clamp(inputs.sum(dim=1), max=seq_length)

        output = model(inputs, lengths)
        loss = criterion(output, labels.float())
        test_loss += loss.item()

        accuracy = calculate_accuracy(output, labels)
        test_acc += accuracy

avg_test_loss = test_loss / len(test_loader)
avg_test_acc = test_acc / len(test_loader)

print(f'Test Loss: {avg_test_loss:.6f}')
print(f'Test Accuracy: {avg_test_acc:.6f}')

Test Loss: 0.460258
Test Accuracy: 0.822119
