# Description

- contest link [kaggle](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)

# Contents

# Getting data from Kaggle API

In [None]:
! pip install -q kaggle

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
from google.colab import files

files.upload()

In [None]:
! mkdir ~/.kaggle

In [None]:
! cp kaggle.json ~/.kaggle/

! chmod 600 ~/.kaggle/kaggle.json

In [None]:
import os

os.chdir('/content/drive/My Drive/Hackerearth')
os.getcwd()

'/content/drive/My Drive/Hackerearth'

In [None]:
! rm kaggle.json

In [None]:
! mkdir 50k-movie-review

% cd 50k-movie-review

/content/drive/MyDrive/Hackerearth/50k-movie-review


In [None]:
! kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Downloading imdb-dataset-of-50k-movie-reviews.zip to /content/drive/My Drive/Hackerearth/50k-movie-review
 86% 22.0M/25.7M [00:00<00:00, 27.1MB/s]
100% 25.7M/25.7M [00:00<00:00, 65.1MB/s]


In [None]:
! ls && pwd

imdb-dataset-of-50k-movie-reviews.zip
/content/drive/My Drive/Hackerearth/50k-movie-review


In [None]:
! unzip imdb-dataset-of-50k-movie-reviews.zip

Archive:  imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


# Play around with data

### Explore data

In [1]:
WORKING_DIR = '/content/drive/My Drive/Hackerearth/50k-movie-review'

import os
os.chdir(WORKING_DIR)

In [2]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import torch 
import torch.nn as nn

In [4]:
# df['sentiment'] = df.apply(lambda row: 1 if row.sentiment == 'positive' else 0, axis=1) #1 if df['sentiment'] == 'positive' else 0
# df.to_csv(WORKING_DIR + '/IMDB Dataset.csv', index=False)

In [5]:
df = pd.read_csv(WORKING_DIR + '/IMDB Dataset.csv') # /content/drive/MyDrive/Hackerearth/50k-movie-review/IMDB Dataset.csv
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [6]:
# Num of data points for both classes is same :)
# print(df[df['sentiment'] == 'positive'].count(), '\n\n', df[df['sentiment'] == 'negative'].count())
df.groupby('sentiment').count()

Unnamed: 0_level_0,review
sentiment,Unnamed: 1_level_1
0,25000
1,25000


### use torch text

In [9]:
import torch
from torchtext import data  

In [12]:
# Denotes the transformatioin to be applied

Text = data.Field(sequential=True, use_vocab=True, tokenize='spacy', batch_first=True)
Labels = data.LabelField(dtype=torch.float, batch_first=True)

In [13]:
# which cols to consider and how, in the dataset, and which transformation to apply

fields = {'review': ('text', Text), 'sentiment': ('label', Labels)}

In [14]:
#loading custom dataset
training_data=data.TabularDataset(path = 'IMDB Dataset.csv',format = 'csv',fields = fields)

In [15]:
import random
train_data, valid_data = training_data.split(split_ratio=0.7)

In [16]:
# build vocabulary

Text.build_vocab(train_data,min_freq=3)  
Labels.build_vocab(train_data)

In [17]:
#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(Text.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(Labels.vocab))

#Commonly used words
print(Text.vocab.freqs.most_common(10))  

#Word dictionary
# print(Text.vocab.stoi) 

# print(':=> Index, Labels: ', Text.vocab.stoi['of'], Text.vocab.itos[7])

Size of TEXT vocabulary: 58058
Size of LABEL vocabulary: 2
[('the', 402537), (',', 379954), ('.', 327313), ('a', 216917), ('and', 216649), ('of', 199774), ('to', 184767), ('is', 150305), ('in', 122074), ('I', 108713)]


In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

#set batch size
BATCH_SIZE = 64

#Load an iterator
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch=True,
    device = device)

In [21]:

for batch in train_iterator:
  print(batch.text.shape)
  # [batch_size, max_length_of_sentence_in_batch]
  break

torch.Size([64, 212])


# Training Starts

In [19]:
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import torch
import torch.nn as nn

### Dataset and dataloader

### Define Model

###### Model:
 - nn.Lstm followed by a linear layer

In [58]:
class Rnn(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim, batch_size, n_layers=1):
        super(Rnn, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.embed_dim = embed_dim

        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.rnn = nn.LSTM(embed_dim, hidden_dim, n_layers, dropout=0.2, batch_first=True)
        self.norm = nn.BatchNorm1d(hidden_dim)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_dim, 2)
        
    def forward(self, src):
        
        #src = [src len, batch size] or [batch size, seq len]
        embedded = self.dropout(self.embedding(src))
        
        #embedded = [src len, batch size, emb dim]
        outputs, (hidden, cell) = self.rnn(embedded)
        # print(f'outputs: {outputs.shape}, hidden: {hidden.shape}')
        
        #outputs = [src len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]

        hidden = torch.squeeze(hidden, 0)
        x = self.relu(self.norm(hidden))
        return self.fc(self.dropout(x))
        
        #outputs are always from the top hidden layer
        
        # return hidden, cell

In [59]:
model = Rnn(input_dim=len(Text.vocab), embed_dim=100, hidden_dim=50, batch_size=64)
model = model.to(device)

## Prep for training

In [60]:
optimizer = torch.optim.Adam(model.parameters())
criterian = nn.CrossEntropyLoss().to(device)



## Train

In [63]:
from tqdm import tqdm
for epoch in range(5):
    running_loss = 0

    for batch in tqdm(train_iterator):
        inputs, labels = batch.text, batch.label.long()
        out = model(batch.text)
        # print('Debug: ', out, labels)

        loss = criterian(out, labels)
        loss.backward()

        optimizer.step()
        running_loss += loss.item() * inputs.shape[0]

    print(f'\tEpoch: [{epoch+1}] Running Loss: {running_loss / len(train_iterator) :.2f}')
#   [batch_size, max_length_of_sentence_in_batch]

100%|██████████| 547/547 [09:37<00:00,  1.06s/it]
  0%|          | 0/547 [00:00<?, ?it/s]

Epoch: [ 1 ] Running Loss:  43.92558913257048


100%|██████████| 547/547 [09:13<00:00,  1.01s/it]
  0%|          | 0/547 [00:00<?, ?it/s]

Epoch: [ 2 ] Running Loss:  44.41492696218125


100%|██████████| 547/547 [07:05<00:00,  1.28it/s]
  0%|          | 0/547 [00:00<?, ?it/s]

Epoch: [ 3 ] Running Loss:  44.090254821114826


100%|██████████| 547/547 [05:06<00:00,  1.79it/s]
  0%|          | 0/547 [00:00<?, ?it/s]

Epoch: [ 4 ] Running Loss:  44.1910320182605


100%|██████████| 547/547 [04:22<00:00,  2.09it/s]

Epoch: [ 5 ] Running Loss:  43.932779648622166





# Queries ?
- how to decide embeding dim, hidden dim effectively