In [1]:
import torch
from torch.nn import BCEWithLogitsLoss
import torch.nn as nn
import torch.nn.functional as F

import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

import numpy as np
from pytictoc import TicToc
t = TicToc()

from preprocess import *
# from lstm import MyLSTM
from l import MyL

[nltk_data] Downloading package punkt to /home/ishikaa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ishikaa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv('IMDB Dataset.csv')
df.head()
X,y = df['review'].values,df['sentiment'].values
X_train,X_test,y_train,y_test = train_test_split(X,y,stratify=y)

In [3]:
y_train = np.asarray([0 if y=='positive' else 1 for y in y_train])
y_test = np.asarray([0 if y=='positive' else 1 for y in y_test])

In [4]:
t.tic()
#Clean
X_train = clean(X_train)
X_test = clean(X_test)

#Tokenize
word2idx, idx2word = count_tokenize(X_train)

#Vectorize
X_train = vectorize(X_train,word2idx=word2idx)
X_test = vectorize(X_test,word2idx=word2idx)

t.toc()

0% done
13% done
26% done
40% done
53% done
66% done
80% done
93% done
100%done
Elapsed time is 418.492036 seconds.


In [5]:
X_train.shape,y_train.shape

((37500, 200), (37500,))

In [6]:
train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
test_data = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))

batch_size = 400

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)


In [7]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [8]:
device

device(type='cuda')

In [37]:
vocab = len(word2idx)+1
lstm = MyL(vocab,embed=400,hidden=256,layers=2,device=device)
lstm = lstm.to(device)
lstm = lstm.float()

In [38]:
lr=0.005
criterion = BCEWithLogitsLoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr=lr)

In [39]:
from tqdm.notebook import tqdm

In [40]:
epochs = 2
counter = 0
print_every = 1000
clip = 5
valid_loss_min = np.Inf

lstm.train()
for i in range(epochs):
    t_loss = 0.0
    for inputs, labels in tqdm(train_loader):
        h = lstm.initiliaze(inputs.shape[0])
        h = tuple([e.data for e in h])
        # print(inputs.shape,h[0].shape)
        inputs, labels = inputs.to(device).long(), labels.to(device).long()
        lstm.zero_grad()
        output, h = lstm(inputs, h)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm_(lstm.parameters(), clip)
        optimizer.step()
        t_loss += loss.item()
        
    print("Epoch {0} loss: {1:.4f}".format(i,t_loss/batch_size))

  0%|          | 0/94 [00:00<?, ?it/s]

Epoch 0 loss: 0.1147


  0%|          | 0/94 [00:00<?, ?it/s]

Epoch 1 loss: 0.0750


In [42]:
test_losses = []
num_correct = 0

lstm.eval()
for inputs, labels in test_loader:
    h = lstm.initiliaze(inputs.shape[0])
    h = tuple([each.data for each in h])
    inputs, labels = inputs.to(device).long(), labels.to(device).long()
    output, h = lstm(inputs, h)
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    pred = torch.round(output.squeeze())  # Rounds the output to 0/1
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)

print("Test loss: {:.3f}".format(np.mean(test_losses)))
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}%".format(test_acc*100))

Test loss: 1.063
Test accuracy: 2.144%
