<a href="https://colab.research.google.com/github/rename-z/Deep-Learning/blob/master/LSTMs/Multiclass%20Text%20Classification%20using%20BiLSTMs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn

In [2]:
import pandas as pd
import numpy as np

In [3]:
# loading data from google drive
reviews = pd.read_csv('/content/drive/My Drive/LSTMs/review.csv')
print(reviews.shape)
reviews.head()

(23486, 11)


Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [4]:
# filling Null with empty string in Title and Review Text columns
reviews['Title'] = reviews['Title'].fillna('')
reviews['Review Text'] = reviews['Review Text'].fillna('')

reviews['review'] = reviews['Title'] + ' ' + reviews['Review Text']

In [5]:
reviews.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,review
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates,Absolutely wonderful - silky and sexy and com...
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses,Love this dress! it's sooo pretty. i happen...
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,Some major design flaws I had such high hopes ...
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,"My favorite buy! I love, love, love this jumps..."
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,Flattering shirt This shirt is very flattering...


In [6]:
# keeping only relevent columns and calculating sentence lengths

reviews = reviews[['review', 'Rating']]
reviews.columns = ['review', 'rating']
reviews['review_length'] = reviews['review'].apply(lambda x : len(x.split()))
reviews.head()

Unnamed: 0,review,rating,review_length
0,Absolutely wonderful - silky and sexy and com...,4,8
1,Love this dress! it's sooo pretty. i happen...,5,62
2,Some major design flaws I had such high hopes ...,3,102
3,"My favorite buy! I love, love, love this jumps...",5,25
4,Flattering shirt This shirt is very flattering...,5,38


In [7]:
#change rating to zero numbering
zero_numbering = {1:0, 2:1, 3:2, 4:3, 5:4}
reviews['rating'] = reviews['rating'].apply(lambda x : zero_numbering[x])

In [8]:
#mean sentence length
np.mean(reviews['review_length'])

60.832921740611425

In [9]:
# libraries for preprocessing the text
import spacy
import re
import string

In [10]:
#tokenization
tok = spacy.load('en')
def tokenize (text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

In [11]:
from collections import Counter

In [12]:
#count number of occurences of each word
counts = Counter()
for index, row in reviews.iterrows():
    counts.update(tokenize(row['review']))

In [13]:
#deleting infrequent words
print("num_words before:",len(counts.keys()))
for word in list(counts):
    if counts[word] < 2:
        del counts[word]
print("num_words after:",len(counts.keys()))

num_words before: 14138
num_words after: 8263


In [14]:
#creating vocabulary
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [15]:
def encode_sentence(text, vocab2index, N=70):
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length

In [16]:
reviews['encoded'] = reviews['review'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))
reviews.head()

Unnamed: 0,review,rating,review_length,encoded
0,Absolutely wonderful - silky and sexy and com...,3,8,"[[2, 3, 4, 5, 6, 7, 8, 7, 9, 0, 0, 0, 0, 0, 0,..."
1,Love this dress! it's sooo pretty. i happen...,4,62,"[[2, 10, 11, 12, 5, 13, 14, 15, 16, 5, 17, 18,..."
2,Some major design flaws I had such high hopes ...,2,102,"[[54, 55, 56, 57, 17, 58, 59, 60, 61, 62, 11, ..."
3,"My favorite buy! I love, love, love this jumps...",4,25,"[[68, 109, 110, 2, 17, 10, 2, 10, 2, 10, 11, 1..."
4,Flattering shirt This shirt is very flattering...,4,38,"[[122, 123, 11, 123, 52, 92, 122, 19, 124, 125..."


In [17]:
#check how balanced the dataset is
Counter(reviews['rating'])

Counter({0: 842, 1: 1565, 2: 2871, 3: 5077, 4: 13131})

In [18]:
X = list(reviews['encoded'])
y = list(reviews['rating'])

In [19]:
from torch.utils.data import Dataset, DataLoader, random_split

In [20]:
class ReviewsDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0].astype(np.int32)), self.y[idx], self.X[idx][1]

In [21]:
dataset = ReviewsDataset(X, y)

In [22]:
val_size = 4500
train_size = len(dataset) - val_size

In [23]:
train_ds, val_ds = random_split(dataset, [train_size, val_size])
len(train_ds), len(val_ds)

(18986, 4500)

In [24]:
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error

In [25]:
def train_model(model, epoch=10, lr=0.001) :
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr)
    for i in range(epoch) :
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl :
            x = x.long()
            y = y.long()
            y_pred = model(x, l)
            optimizer.zero_grad()
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item() * y.shape[0]
            total += y.shape[0]
        val_loss, val_acc, val_rmse = validation_metrics(model, val_ds)
        if i % 5 == 1 :
            print("train loss %.3f, val loss %.3f, val accuracy %.3f and val rmse %.3f" %(sum_loss / total, val_loss, val_acc, val_rmse))

In [26]:
def validation_metrics(model, val_ds) :
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    for x, y, l in val_dl:
        x = x.long()
        y = y.long()
        y_hat = model(x, l)
        loss = F.cross_entropy(y_hat, y)
        pred = torch.max(y_hat, 1)[1]
        correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item() * y.shape[0]
        sum_rmse += np.sqrt(mean_squared_error(pred, y.unsqueeze(-1)))*y.shape[0]
    return sum_loss/total, correct/total, sum_rmse/total

In [27]:
batch_size = 5000
vocab_size = len(words)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=batch_size)

In [None]:
class LSTM_fixed_len(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(hidden_dim, 5)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])

In [None]:
model_fixed =  LSTM_fixed_len(vocab_size, 50, 50)

In [None]:
train_model(model_fixed, epoch=30, lr=0.01)

train loss 1.176, val loss 1.123, val accuracy 0.569 and val rmse 1.328
train loss 0.930, val loss 0.934, val accuracy 0.618 and val rmse 0.984
train loss 0.812, val loss 0.854, val accuracy 0.646 and val rmse 0.824
train loss 0.732, val loss 0.835, val accuracy 0.650 and val rmse 0.773
train loss 0.667, val loss 0.868, val accuracy 0.658 and val rmse 0.796
train loss 0.608, val loss 0.874, val accuracy 0.660 and val rmse 0.776


In [None]:
train_model(model_fixed, epoch=30, lr=0.01)

train loss 0.652, val loss 0.920, val accuracy 0.649 and val rmse 0.801
train loss 0.552, val loss 0.918, val accuracy 0.655 and val rmse 0.777
train loss 0.507, val loss 0.970, val accuracy 0.649 and val rmse 0.784
train loss 0.467, val loss 1.021, val accuracy 0.644 and val rmse 0.797
train loss 0.432, val loss 1.098, val accuracy 0.641 and val rmse 0.800
train loss 0.400, val loss 1.174, val accuracy 0.642 and val rmse 0.805


In [28]:
class LSTM_variable_input(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(0.3)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(hidden_dim, 5)
        
    def forward(self, x, s):
        x = self.embeddings(x)
        x = self.dropout(x)
        x_pack = pack_padded_sequence(x, s, batch_first=True, enforce_sorted=False)
        out_pack, (ht, ct) = self.lstm(x_pack)
        out = self.linear(ht[-1])
        return out

In [29]:
model = LSTM_variable_input(vocab_size, 50, 50)

In [30]:
train_model(model, epoch=30, lr=0.1)

train loss 1.207, val loss 1.108, val accuracy 0.586 and val rmse 1.226
train loss 0.865, val loss 0.915, val accuracy 0.629 and val rmse 0.921
train loss 0.765, val loss 0.895, val accuracy 0.636 and val rmse 0.865
train loss 0.720, val loss 0.912, val accuracy 0.633 and val rmse 0.842
train loss 0.696, val loss 0.925, val accuracy 0.631 and val rmse 0.841
train loss 0.675, val loss 0.938, val accuracy 0.634 and val rmse 0.844


In [31]:
train_model(model, epoch=30, lr=0.05)

train loss 0.689, val loss 0.937, val accuracy 0.622 and val rmse 0.835
train loss 0.646, val loss 0.944, val accuracy 0.631 and val rmse 0.831
train loss 0.632, val loss 0.949, val accuracy 0.631 and val rmse 0.837
train loss 0.618, val loss 0.955, val accuracy 0.637 and val rmse 0.836
train loss 0.617, val loss 0.963, val accuracy 0.640 and val rmse 0.824
train loss 0.607, val loss 0.964, val accuracy 0.639 and val rmse 0.828


In [32]:
train_model(model, epoch=30, lr=0.01)

train loss 0.606, val loss 0.965, val accuracy 0.635 and val rmse 0.844
train loss 0.587, val loss 0.970, val accuracy 0.635 and val rmse 0.833
train loss 0.588, val loss 0.973, val accuracy 0.640 and val rmse 0.839
train loss 0.583, val loss 0.977, val accuracy 0.636 and val rmse 0.837
train loss 0.576, val loss 0.977, val accuracy 0.635 and val rmse 0.831
train loss 0.571, val loss 0.980, val accuracy 0.638 and val rmse 0.830
