In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import logging
import gzip
import gensim 
import re
import spacy
import math
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import string
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [4]:
print(torch.cuda.is_available())
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device

True


'cuda:0'

# Preprocessing & Loading data

## Preprocess 

### Clothing Review Dataset

In [5]:
# Load dataset for clothing reviews


spam = pd.read_csv("/content/drive/MyDrive/data /spam.csv",encoding="latin1")
spam.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],axis=1,inplace=True)
spam = spam[['v2', 'v1']]
spam.columns = ['content', 'detection']
spam['content_length'] = spam['content'].apply(lambda x: len(x.split()))
#reviews = pd.read_csv("/content/drive/MyDrive/data /Womens Clothing E-Commerce Reviews.csv")
#reviews = reviews.dropna()
#print(reviews.shape)
spam.head()

Unnamed: 0,content,detection,content_length
0,"Go until jurong point, crazy.. Available only ...",ham,20
1,Ok lar... Joking wif u oni...,ham,6
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam,28
3,U dun say so early hor... U c already then say...,ham,11
4,"Nah I don't think he goes to usf, he lives aro...",ham,13


In [6]:
tracker = []
for aDetection in spam['detection']: 
  if not(aDetection in tracker): 
    tracker.append(aDetection)
print(len(tracker))

number_Detection = {}
index = 0
for aDetection in tracker: 
  number_Detection[aDetection] = index 
  index = index + 1
spam['detection'] = spam['detection'].apply(lambda x: number_Detection[x])
# reviews['Title'] = reviews['Title'].fillna('')
# reviews['Review Text'] = reviews['Review Text'].fillna('')
# reviews['review'] = reviews['Title'] + ' ' + reviews['Review Text']

2


In [7]:
spam.head()

Unnamed: 0,content,detection,content_length
0,"Go until jurong point, crazy.. Available only ...",0,20
1,Ok lar... Joking wif u oni...,0,6
2,Free entry in 2 a wkly comp to win FA Cup fina...,1,28
3,U dun say so early hor... U c already then say...,0,11
4,"Nah I don't think he goes to usf, he lives aro...",0,13


In [8]:
# reviews = reviews[['review', 'Rating']]
# reviews.columns = ['review', 'rating']
# reviews.head()
spam = spam[['content', 'detection']]
spam.columns = ['content', 'detection']
spam.head()

Unnamed: 0,content,detection
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


#### Tokenize each sentence

In [9]:
#take advantage of nltk to tokenize all sentences
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
en_stop_words = set(stopwords.words('english'))
tokenizer = nltk.RegexpTokenizer(r'\w+|\$[\d\.]+')

def tokenize_sent(sent):
    
    tokenized = tokenizer.tokenize(sent)
    filtered = [w.lower() for w in tokenized if w.lower() not in en_stop_words]
    return filtered

spam['tokenized'] = spam['content'].apply(lambda x: tokenize_sent(x))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
# Sanity cehck for tokenizers
print(spam.head(3))

                                             content  ...                                          tokenized
0  Go until jurong point, crazy.. Available only ...  ...  [go, jurong, point, crazy, available, bugis, n...
1                      Ok lar... Joking wif u oni...  ...                     [ok, lar, joking, wif, u, oni]
2  Free entry in 2 a wkly comp to win FA Cup fina...  ...  [free, entry, 2, wkly, comp, win, fa, cup, fin...

[3 rows x 3 columns]


#### Encode and truncate sentence

In [11]:
# get unique words in the corpus
all_words = []
for x in spam['tokenized']:
    all_words.extend(x)

word_set = list(set(all_words))
word_count = Counter(all_words)

# filter out words with low frequency
for word_list in spam.tokenized:
    new_list = []
    for word in word_list:
        if word_count[word] > 2:
            new_list.append(word)
    word_list = new_list

# update set of words after removing the ones with low frequency
new_word_list = []
for x in spam['tokenized']:
    new_word_list.extend(x)
word_set = list(set(new_word_list))

# map each unique words & unknown token in reviews.encoded to an index
word2index = {}
word2index['<UNK>'] = 0
word2index['<PAD>'] = 1

for i, word in enumerate(word_set, 2):
    word2index[word] = i

# encode the original sequence
def encode(sent_list):
    result = []
    for x in sent_list:
        index = word2index[x]
        result.append(index)
    return result

spam['encoded'] = spam.tokenized.apply(lambda x: encode(x))

print(spam.head())

# get sequence average length
total_len = 0
for x in spam.encoded:
    total_len += len(x)

ave_len = math.floor(total_len/spam.shape[0])

# filter out long sequences --> encode all sequence to length = ave_len
# pad short sequence

for i, row in spam.iterrows():
    size = min(len(row.encoded), ave_len)
    new_encoded = row.encoded[:size]
    if size < ave_len:
        for j in range(0, ave_len - len(row.encoded)):
            new_encoded.append(1)
    spam.at[i, 'encoded'] = new_encoded

spam['review_length'] = spam.encoded.apply(lambda x: len(x))

                                             content  ...                                            encoded
0  Go until jurong point, crazy.. Available only ...  ...  [3194, 1875, 4828, 7044, 5523, 6292, 5095, 334...
1                      Ok lar... Joking wif u oni...  ...                [4382, 5733, 8411, 8301, 1827, 593]
2  Free entry in 2 a wkly comp to win FA Cup fina...  ...  [2784, 4004, 4322, 2887, 6307, 2658, 2107, 647...
3  U dun say so early hor... U c already then say...  ...  [1827, 7942, 2349, 401, 5292, 1827, 6794, 8354...
4  Nah I don't think he goes to usf, he lives aro...  ...          [7668, 3948, 147, 4748, 6178, 3142, 1458]

[5 rows x 4 columns]


In [12]:
# sanity check for encoding:
print(len(all_words))
print(ave_len)
lengths = [len(x) for x in spam.encoded]
print(max(lengths))
print(min(lengths))
print(set(spam.detection))
review_length = spam.review_length
spam.head()
print(len(word_set))

53413
9
9
9
{0, 1}
8577


In [13]:
# Train test split from skearln
data_size = len(spam['encoded'])
assert data_size == len(spam['detection']) 
X, y = list(zip(list(spam['encoded']),(list(spam['review_length'])))), list(spam['detection'])
# X(data, length)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)

In [14]:
X_train_new=[]
for tup in X_train:
  arr = np.asarray(tup)
  X_train_new.append(arr)

X_test_new=[]
for tup in X_test:
  arr = np.asarray(tup)
  X_test_new.append(arr)

  return array(a, dtype, copy=False, order=order)


In [15]:
# Sanity check for length match
print(X_train_new[0])

[list([3567, 1240, 3734, 5492, 7159, 1240, 459, 3360, 4444]) 9]


## Dataset and Dataloader

### Clothing Review Dataset

In [16]:
# class Dataset_Glove(Dataset):
#     def __init__(self, X, y): 
#         self.X = X
#         self.y = y

#     def __len__(self):
#         return len(self.X)

#     def __getitem__(self, idx):
#         return  self.X[idx], self.y[idx]


In [17]:
class ReviewsDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(np.array(self.X[idx][0])), self.y[idx], self.X[idx][1]

In [18]:
# trainSet_glv = Dataset_Glove(X_train, y_train)
# testSet_glv = Dataset_Glove(X_test, y_test)
train_ds = ReviewsDataset(X_train_new, y_train)
valid_ds = ReviewsDataset(X_test_new, y_test)

In [19]:
def train_model(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    epoch = 0
    best_val_acc = 0.0  
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        batch_count=0
        total = 0
        train_correct = 0.0
        epoch += 1
        print(f"At epoch {i}")
        for x, y, l in train_dl:
            x = x.long()
            y = y.long()
            batch_count+=1
            optimizer.zero_grad()
            y_pred = model(x, l)
            pred = torch.argmax(y_pred, 1)
            train_correct += sum((pred == y))
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc, precision, recall, f1  = validation_metrics(model, val_dl)
        print("train loss %.3f, train accuracy %.3f, val loss %.3f, val accuracy %.3f, precision %.3f, recall %.3f, F1 %.3f" 
              % (sum_loss/total, train_correct/total, val_loss, val_acc, precision, recall, f1))
        if val_acc > best_val_acc and i>=1:
            best_val_acc = val_acc

            #torch.save(model.state_dict(), NEW_PATH)
            print(f"\t=> Best model saved at {i}th epoch with valication accuracy of {val_acc}")
def validation_metrics (model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    y_total = []
    y_pred_total = []
    for x, y, l in valid_dl:
        x = x.long()
        y = y.long()
        y_hat = model(x, l)
        loss = F.cross_entropy(y_hat, y)
        pred = torch.max(y_hat, 1)[1]
        y_total.extend(y.tolist())
        y_pred_total.extend(pred.tolist())
        correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
        #sum_rmse += np.sqrt(mean_squared_error(pred, y.unsqueeze(-1)))*y.shape[0]

    f1 = (f1_score(y_total, y_pred_total, average='weighted'))
    precision = (precision_score(y_total, y_pred_total, average='weighted'))
    recall =(recall_score(y_total, y_pred_total, average='weighted'))
   
    return sum_loss/total, correct/total,  precision, recall, f1


In [20]:
batch_size = 5000
vocab_size = len(word2index)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)

In [21]:
class LSTM_fixed_len(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 5)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])

In [22]:
model_fixed =  LSTM_fixed_len(vocab_size, 50, 50)
train_model(model_fixed, epochs=30, lr=0.01)

At epoch 0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


train loss 1.544, train accuracy 0.506, val loss 1.073, val accuracy 0.694, precision 0.862, recall 0.694, F1 0.768
At epoch 1
train loss 1.094, train accuracy 0.690, val loss 0.802, val accuracy 0.776, precision 0.845, recall 0.776, F1 0.809
	=> Best model saved at 1th epoch with valication accuracy of 0.7757847309112549
At epoch 2


  _warn_prf(average, modifier, msg_start, len(result))


train loss 0.815, train accuracy 0.776, val loss 0.654, val accuracy 0.829, precision 0.829, recall 0.829, F1 0.826
	=> Best model saved at 2th epoch with valication accuracy of 0.82869952917099
At epoch 3


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


train loss 0.662, train accuracy 0.824, val loss 0.542, val accuracy 0.860, precision 0.835, recall 0.860, F1 0.834
	=> Best model saved at 3th epoch with valication accuracy of 0.8600896596908569
At epoch 4
train loss 0.544, train accuracy 0.856, val loss 0.457, val accuracy 0.861, precision 0.820, recall 0.861, F1 0.812
	=> Best model saved at 4th epoch with valication accuracy of 0.8609865307807922
At epoch 5
train loss 0.444, train accuracy 0.874, val loss 0.462, val accuracy 0.864, precision 0.855, recall 0.864, F1 0.805
	=> Best model saved at 5th epoch with valication accuracy of 0.8636771440505981
At epoch 6
train loss 0.424, train accuracy 0.873, val loss 0.485, val accuracy 0.864, precision 0.882, recall 0.864, F1 0.803
At epoch 7
train loss 0.438, train accuracy 0.876, val loss 0.450, val accuracy 0.866, precision 0.884, recall 0.866, F1 0.809
	=> Best model saved at 7th epoch with valication accuracy of 0.8663676977157593
At epoch 8
train loss 0.392, train accuracy 0.880, v

In [23]:
train_model(model_fixed, epochs=30, lr=0.01)

At epoch 0
train loss 0.056, train accuracy 0.987, val loss 0.172, val accuracy 0.951, precision 0.950, recall 0.951, F1 0.948
At epoch 1
train loss 0.056, train accuracy 0.986, val loss 0.141, val accuracy 0.959, precision 0.958, recall 0.959, F1 0.957
	=> Best model saved at 1th epoch with valication accuracy of 0.9587444067001343
At epoch 2
train loss 0.046, train accuracy 0.990, val loss 0.124, val accuracy 0.961, precision 0.960, recall 0.961, F1 0.960
	=> Best model saved at 2th epoch with valication accuracy of 0.9605380892753601
At epoch 3
train loss 0.043, train accuracy 0.991, val loss 0.123, val accuracy 0.964, precision 0.964, recall 0.964, F1 0.964
	=> Best model saved at 3th epoch with valication accuracy of 0.9641255736351013
At epoch 4
train loss 0.041, train accuracy 0.991, val loss 0.121, val accuracy 0.967, precision 0.967, recall 0.967, F1 0.967
	=> Best model saved at 4th epoch with valication accuracy of 0.9668161273002625
At epoch 5
train loss 0.036, train accura

In [24]:
train_model(model_fixed, epochs=30, lr=0.01)

At epoch 0
train loss 0.002, train accuracy 0.999, val loss 0.205, val accuracy 0.966, precision 0.965, recall 0.966, F1 0.965
At epoch 1
train loss 0.002, train accuracy 1.000, val loss 0.194, val accuracy 0.967, precision 0.966, recall 0.967, F1 0.966
	=> Best model saved at 1th epoch with valication accuracy of 0.9668161273002625
At epoch 2
train loss 0.002, train accuracy 1.000, val loss 0.184, val accuracy 0.968, precision 0.967, recall 0.968, F1 0.967
	=> Best model saved at 2th epoch with valication accuracy of 0.9677129983901978
At epoch 3
train loss 0.002, train accuracy 1.000, val loss 0.166, val accuracy 0.970, precision 0.970, recall 0.970, F1 0.970
	=> Best model saved at 3th epoch with valication accuracy of 0.9704036116600037
At epoch 4
train loss 0.001, train accuracy 1.000, val loss 0.158, val accuracy 0.969, precision 0.968, recall 0.969, F1 0.968
At epoch 5
train loss 0.001, train accuracy 1.000, val loss 0.154, val accuracy 0.970, precision 0.969, recall 0.970, F1 0