In [1]:
import pandas as pd
import os
import shutil
from wordcloud import WordCloud

from torch.utils.data import Dataset, DataLoader

import nltk
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from torchtext.vocab import build_vocab_from_iterator

STOPWORDS = set(stopwords.words("english"))
print('stopwords = ', STOPWORDS)

import matplotlib.pyplot as plt
import seaborn as sns
import json
import random
from collections import Counter
import numpy as np
from torch.nn.utils.rnn import pad_sequence
import torch
from torch import nn

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn import metrics

import sys
sys.path.append('../')

from utils import training

import logging 


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


stopwords =  {'hers', 'over', 'than', 'against', 'again', 'd', 'below', 'before', "wouldn't", 'on', "you've", 'whom', 'will', 'about', 'for', 'of', "couldn't", 'is', 'were', 'when', 'the', "wasn't", 'couldn', "mightn't", "shouldn't", 'does', 'down', 'any', "needn't", 'between', 'here', 'both', 'your', 'there', 'nor', 'are', 'm', "you'll", 'isn', 'himself', 'because', 'very', 'won', "didn't", 'most', 'only', 'her', 'just', 'did', 'haven', 'under', 'above', 'mustn', 'you', 'as', 'that', 'me', 'until', "shan't", 'into', 'shouldn', 'didn', 'we', 'their', 'all', 'having', "hasn't", 'those', "that'll", 'have', "don't", 'shan', 'while', 'then', 'to', 'so', 'where', 'now', 'further', 'and', 'being', 'after', 'doesn', 'been', "mustn't", 'wouldn', 'my', 'i', 'yourselves', 'such', 'a', 'themselves', 'o', 'our', 'herself', 'once', 'it', 'other', 'am', 're', 'needn', 'll', 'yours', 'she', 'don', 'this', "you'd", 'no', 've', 'with', 'had', 'hasn', 'ain', 'out', 'from', 'what', 'he', 'off', "hadn't",

In [2]:
print(training.Trainer())

INFO:root:created trainer


<utils.training.Trainer object at 0x000001AAA26C71C0>


In [3]:
root = os.getcwd()
data_path = os.path.join(root, 'dataset', 'processed_amazon_reviews.csv')
assert os.path.exists(data_path), f'data path does not exist {data_path}'

In [4]:
df = pd.read_csv(data_path)
print(df.shape)
df.head()


(2999812, 3)


Unnamed: 0,review,text,tokens
0,3,more like funchuck Gave this to my dad for a g...,like funchuck gave dad gag gift directing nuns...
1,5,Inspiring I hope a lot of people hear this cd....,inspiring hope lot people hear cd need strong ...
2,5,The best soundtrack ever to anything. I'm read...,best soundtrack ever anything im reading lot r...
3,4,Chrono Cross OST The music of Yasunori Misuda ...,chrono cross ost music yasunori misuda without...
4,5,Too good to be true Probably the greatest soun...,good true probably greatest soundtrack history...


In [5]:
df.drop('text', axis = 1, inplace = True)
# df.drop('Unnamed: 0', axis = 1, inplace = True)
df.head()

Unnamed: 0,review,tokens
0,3,like funchuck gave dad gag gift directing nuns...
1,5,inspiring hope lot people hear cd need strong ...
2,5,best soundtrack ever anything im reading lot r...
3,4,chrono cross ost music yasunori misuda without...
4,5,good true probably greatest soundtrack history...


In [6]:
print(df.shape)

(2999812, 2)


In [7]:
train_df = df[:10000].copy()
train_df.head()

Unnamed: 0,review,tokens
0,3,like funchuck gave dad gag gift directing nuns...
1,5,inspiring hope lot people hear cd need strong ...
2,5,best soundtrack ever anything im reading lot r...
3,4,chrono cross ost music yasunori misuda without...
4,5,good true probably greatest soundtrack history...


In [None]:
sns.countplot(x = train_df['review'])

In [9]:
tokens = train_df['tokens'].values
print(tokens[:10])
print(len(tokens))

['like funchuck gave dad gag gift directing nunsense got reall kick'
 'inspiring hope lot people hear cd need strong positive vibes like great vocals fresh tunes crosscultural happiness blues gut pop sounds catchy mature'
 'best soundtrack ever anything im reading lot reviews saying best game soundtrack figured id write review disagree bit opinino yasunori mitsudas ultimate masterpiece music timeless im listening years beauty simply refuses fadethe price tag pretty staggering must say going buy cd much money one feel would worth every penny'
 'chrono cross ost music yasunori misuda without question close second great nobuo uematsuchrono cross ost wonderful creation filled rich orchestra synthesized sounds ambiance one musics major factors yet times uplifting vigorous favourite tracks include scars left time girl stole stars another world'
 'good true probably greatest soundtrack history usually better played game first enjoyable anyway worked hard getting soundtrack spending money get 

### Build the vocabulary

In [85]:
class Vocab():
    def __init__(self, token_list, 
                 min_freq = 0) -> None:
        '''
        @params
            tokens_col: list, 2D list. each row of token is a list of tokens seperated by comma,
                        eg: a,bc,d,...
        '''
        special_tokens = ['<pad>', '<unk>']
        self.unk = '<unk>'
        self.pad = '<pad>'

        #buil corpus from tokens
        self.corpus = Counter()

        for row in token_list:
            tokens = row.split(' ')
            self.corpus.update(tokens)
        
        print('corpus length = ', len(self.corpus))

        # with open('corpus.txt', 'w') as file:
        #     json.dump(self.corpus.most_common(), file)

        #build vocab from tokens
        self._token_to_idx = {}
        self._idx_to_token = {}

        for i, t in enumerate(special_tokens):
            self._token_to_idx[t] = i
            self._idx_to_token[i] = t


        for k, freq in self.corpus.most_common():
            if  freq >= min_freq:
                idx =  len(self._token_to_idx)
                self._token_to_idx[k] = idx
                self._idx_to_token[idx] = k
            else:
                break
        
        print("built the vocab, len = ", len(self._token_to_idx))
    
    def __len__(self):
        return len(self._token_to_idx)
    
    
    def get_index(self, tokens):
        '''
        @params:
            tokens: list[str], list of tokens, eg: ('a', 'b', 'c')
        @return:
            res: list[int], list of int
        '''
        assert isinstance(tokens, list), 'tokens must be list of tokens, eg: ["a", "b", "c", ..]'

        res = []
        for token in tokens:
            if token in self._token_to_idx:
                res.append(self._token_to_idx[token])
            else:
                res.append(self._token_to_idx[self.unk])
        return res
    
    def get_tokens(self, indices):
        assert isinstance(indices, list), 'indices must be list of index, eg: [1,2,3, ..]'
        res = []
        for index in indices:
            if index in self._idx_to_token:
                res.append(self._idx_to_token[index])
            else:
                unk_idx = self._token_to_idx[self.unk]
                res.append(self._idx_to_token[unk_idx])
        return res

    
vocab = Vocab(train_df['tokens'])   

corpus length =  39914
built the vocab, len =  39916


In [86]:
print(f'tokens to idx, len = {len(vocab._token_to_idx)}')
print(f'idx to tokens, len = {len(vocab._idx_to_token)}')

tokens to idx, len = 39916
idx to tokens, len = 39916


In [87]:
tokens = train_df.iloc[0]['tokens'].split(' ')
print(tokens)

res = vocab.get_index(tokens)
print(res)

res = vocab.get_index('something here is good'.split(' '))
print(res)

res = vocab.get_tokens([5, 1, 238, 2105, 6483, 462, 3434, 1, 44, 1, 2890])
print(res)

['like', 'funchuck', 'gave', 'dad', 'gag', 'gift', 'directing', 'nunsense', 'got', 'reall', 'kick']
[5, 17002, 238, 2105, 6483, 462, 3434, 17003, 44, 12194, 2890]
[77, 1, 1, 4]
['like', '<unk>', 'gave', 'dad', 'gag', 'gift', 'directing', '<unk>', 'got', '<unk>', 'kick']


In [88]:
train_df.head()

Unnamed: 0,review,tokens
0,3,like funchuck gave dad gag gift directing nuns...
1,5,inspiring hope lot people hear cd need strong ...
2,5,best soundtrack ever anything im reading lot r...
3,4,chrono cross ost music yasunori misuda without...
4,5,good true probably greatest soundtrack history...


In [97]:
#Using vocab object to build the train dataset
X_train = []
y_train = []

for row in train_df.iterrows():
    review = row[1][0]
    tokens = row[1][1].split(" ")

    #map reviews to good (4,5) or bad (1,2,3)
    # review = 0 if review in (1,2,3) else 1

    X_train.append(torch.tensor(vocab.get_index(tokens)))
    y_train.append(torch.tensor(review))

X_train = pad_sequence(X_train, batch_first= True, padding_value = vocab._token_to_idx[vocab.pad])
print('X train shape = ',X_train.shape)
y_train = torch.tensor(y_train)
print('y train shape = ',y_train.shape)

X train shape =  torch.Size([10000, 140])
y train shape =  torch.Size([10000])


In [98]:
for x,y in zip(X_train, y_train):
    print('tokens = ', x)
    print('labels = ', y)
    break

tokens =  tensor([    5, 17002,   238,  2105,  6483,   462,  3434, 17003,    44, 12194,
         2890,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     

In [99]:
class AmazonReviewSentimentDataset(Dataset):
    def __init__(self, X, y):
        super().__init__()
        self.X =X 
        self.y =y 
    
    def __getitem__(self, index):
        return self.X[index], self.y[index]
    
    def __len__(self):
        return len(self.X)

In [92]:
BATCH_SIZE = 32

In [100]:
train_dataset = AmazonReviewSentimentDataset(X_train, y_train)
train_dataloader = DataLoader(train_dataset, BATCH_SIZE)

print('train dataset len = ', len(train_dataset))
print('train dataloader len = ', len(train_dataloader))
sampleX, sampleY = next(iter(train_dataloader))

print('X shape = ', sampleX.shape)
print('y shape = ', sampleY.shape)

train dataset len =  10000
train dataloader len =  313
X shape =  torch.Size([32, 140])
y shape =  torch.Size([32])


### Building LSTM model

In [94]:
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        nn.init.zeros_(m.bias)
    elif isinstance(m, nn.LSTM):
        for name, param in m.named_parameters():
            if "bias" in name:
                nn.init.zeros_(param)
            elif "weight" in name:
                nn.init.orthogonal_(param)

class SentimentRNN(nn.Module):

    def __init__(self, num_classes, vocab_size, hidden_size, embedding_size, num_layers, dropout = 0.3):
        super().__init__()
        # self.num_layers = num_layers
        # self.vocab_size = vocab_size
        # self.hidden_size = hidden_size
        # self.num_steps = num_steps
        # self.embedding_size = embedding_size
        # self.dropout = dropout
        self.num_layers = num_layers
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.rnn = nn.LSTM(input_size = embedding_size, hidden_size = hidden_size,\
                             num_layers = num_layers, batch_first = True) 

        self.dropout = nn.Dropout(dropout) 

        self.dense = nn.LazyLinear(num_classes)

        self.sigmoid = nn.Sigmoid()
    
    def init_state(self, batch):
        hidden = torch.rand(self.num_layers, batch, self.hidden_size, dtype = torch.float32, device = X.device)
        cell = torch.rand(self.num_layers, batch, self.hidden_size, dtype = torch.float32, device = X.device)
        return (hidden, cell)

    
    def forward(self, X, state):

        embeds = self.embedding(X)

        output, state = self.rnn(embeds, state)
        output = self.dropout(output)
        output = self.sigmoid(self.dense(output))

        return output[:, -1], state



In [95]:
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# print(device)
device = 'cpu'

test_model = SentimentRNN(num_classes= 1, vocab_size = len(vocab), hidden_size = 10, embedding_size=5, num_layers = 2)
test_model.to(device)

sampleX, sampleY = next(iter(train_dataloader))

sampleX = sampleX.to(device)
print("sample x shape = ", sampleX.shape)
sampleY = sampleY.to(device)
print("sample Y shape = ", sampleY.shape)

output,(hidden, cell) = test_model(sampleX)
print(output.shape)

loss_fn = nn.BCELoss()

loss = loss_fn(output.squeeze(-1), sampleY.float())
print(loss.item())

sample x shape =  torch.Size([32, 140])
sample Y shape =  torch.Size([32])




TypeError: forward() missing 1 required positional argument: 'state'

### Training loop

In [104]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('running on ', device)


model = SentimentRNN(num_classes= 5, vocab_size = len(vocab), hidden_size = 500, embedding_size= 400, num_layers = 2)
model.to(device)

optim = torch.optim.Adam(model.parameters(), lr = 0.001)
loss_fn = nn.CrossEntropyLoss()
grad_clip = 5

for e in range(10):
    state = None
    running_loss = 0.0
    loop = tqdm(train_dataloader)
    state = None
    for X,y in loop:
        X = X.to(device)
        y = y.to(device)

        batch, step = X.shape
        if state == None or state[0].shape[0] != batch:
            state = model.init_state(batch)
        else:
            state = (state[0].data, state[1].data)

        optim.zero_grad()
        outputs, state = model(X, state)
        outputs = outputs.squeeze(-1)

        loss = loss_fn(outputs, y)
        loss.backward()
        nn.utils.clip_grad_norm(model.parameters(), grad_clip)
        optim.step()
        running_loss += loss.item()

        loop.set_description(f'epoch = {e}, loss = {running_loss:.5f}')
        break
    break
        

running on  cuda




RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
