In [None]:
# Mount Google Drive (Colab only) so that files under /content/drive can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import logging
import gzip
import gensim 
import re
import spacy
import math
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import string
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [None]:
# Report whether a CUDA GPU is visible and choose the matching device string.
# NOTE(review): `device` is not referenced by any later cell visible here, so the model and
# the batches appear to stay on the CPU — confirm whether `.to(device)` calls are missing.
print(torch.cuda.is_available())
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device

True


'cuda:0'

# Preprocessing & Loading data

## Preprocess 

### Spam Detection Dataset

In [None]:
# Load the spam dataset: column `v1` holds the ham/spam label and `v2` the message text.
# Only these two columns are kept (the remaining columns of the CSV are discarded), and the
# frame is built in one chained expression so that re-running the cell always starts from the file.
spam = (
    pd.read_csv("/content/drive/MyDrive/data /spam.csv", encoding="latin1")
    .loc[:, ['v2', 'v1']]
    .rename(columns={'v2': 'content', 'v1': 'detection'})
)
# Number of whitespace-separated tokens in each raw message.
spam['content_length'] = spam['content'].apply(lambda x: len(x.split()))
spam.head()

Unnamed: 0,content,detection,content_length
0,"Go until jurong point, crazy.. Available only ...",ham,20
1,Ok lar... Joking wif u oni...,ham,6
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam,28
3,U dun say so early hor... U c already then say...,ham,11
4,"Nah I don't think he goes to usf, he lives aro...",ham,13


In [None]:
# Distinct labels in first-seen order (dict keys keep insertion order).
tracker = list(dict.fromkeys(spam['detection']))
print(len(tracker))

# Give every label a consecutive integer id and replace the labels by those ids.
number_Detection = {label: position for position, label in enumerate(tracker)}
spam['detection'] = spam['detection'].apply(lambda x: number_Detection[x])
# reviews['Title'] = reviews['Title'].fillna('')
# reviews['Review Text'] = reviews['Review Text'].fillna('')
# reviews['review'] = reviews['Title'] + ' ' + reviews['Review Text']

2


In [None]:
spam.head()

Unnamed: 0,content,detection,content_length
0,"Go until jurong point, crazy.. Available only ...",0,20
1,Ok lar... Joking wif u oni...,0,6
2,Free entry in 2 a wkly comp to win FA Cup fina...,1,28
3,U dun say so early hor... U c already then say...,0,11
4,"Nah I don't think he goes to usf, he lives aro...",0,13


In [None]:
# Keep only the message text and its integer label; the column names are already final.
spam = spam[['content', 'detection']]
spam.head()

Unnamed: 0,content,detection
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


#### Tokenize each sentence

In [None]:
# Tokenisation setup with NLTK: English stop-word set plus a regular-expression tokenizer.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
en_stop_words = set(stopwords.words('english'))
# A token is either a run of word characters or a dollar amount such as "$5.00".
tokenizer = nltk.RegexpTokenizer(r'\w+|\$[\d\.]+')

def tokenize_sent(sent):
    """Return the lower-cased tokens of ``sent`` with English stop words removed."""
    lowered = (token.lower() for token in tokenizer.tokenize(sent))
    return [token for token in lowered if token not in en_stop_words]

# Tokenise every message once and keep the result next to the raw text.
spam['tokenized'] = spam['content'].apply(tokenize_sent)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Sanity check for the tokenizer: show the first three rows together with their token lists
print(spam.head(3))

                                             content  ...                                          tokenized
0  Go until jurong point, crazy.. Available only ...  ...  [go, jurong, point, crazy, available, bugis, n...
1                      Ok lar... Joking wif u oni...  ...                     [ok, lar, joking, wif, u, oni]
2  Free entry in 2 a wkly comp to win FA Cup fina...  ...  [free, entry, 2, wkly, comp, win, fa, cup, fin...

[3 rows x 3 columns]


#### Encode and truncate sentence

In [None]:
# Build the vocabulary, integer-encode every message and pad/truncate it to a fixed length.

# Count every token occurrence in the corpus.
all_words = []
for x in spam['tokenized']:
    all_words.extend(x)
word_count = Counter(all_words)

# Filter out words with low frequency (seen at most twice). The filtered lists have to be
# written back to the column: rebinding a loop variable leaves spam['tokenized'] untouched.
# Counts of the surviving words are unaffected, so re-running this cell is idempotent.
spam['tokenized'] = spam['tokenized'].apply(
    lambda words: [word for word in words if word_count[word] > 2]
)

# Vocabulary after removing the low-frequency words. It is sorted so that the
# word -> index mapping is identical from one kernel session to the next.
new_word_list = []
for x in spam['tokenized']:
    new_word_list.extend(x)
word_set = sorted(set(new_word_list))

# Index 0 is reserved for unknown words and index 1 for padding; real words start at 2.
word2index = {'<UNK>': 0, '<PAD>': 1}
for i, word in enumerate(word_set, 2):
    word2index[word] = i

def encode(sent_list):
    """Map each token to its vocabulary index; tokens outside the vocabulary map to <UNK>."""
    unknown_index = word2index['<UNK>']
    return [word2index.get(token, unknown_index) for token in sent_list]

spam['encoded'] = spam.tokenized.apply(encode)

print(spam.head())

# Average sequence length (rounded down) becomes the fixed length of every sequence.
total_len = sum(len(x) for x in spam.encoded)
ave_len = math.floor(total_len/spam.shape[0])

def _fit_to_length(encoded, length, pad_index):
    """Truncate ``encoded`` to ``length`` items, or right-pad it with ``pad_index``."""
    fitted = list(encoded[:length])
    fitted.extend([pad_index] * (length - len(fitted)))
    return fitted

spam['encoded'] = spam.encoded.apply(
    lambda x: _fit_to_length(x, ave_len, word2index['<PAD>'])
)

# NOTE: measured after padding, so this equals ave_len for every row; it is not the
# number of real (non-pad) tokens in the message.
spam['review_length'] = spam.encoded.apply(lambda x: len(x))

                                             content  ...                                            encoded
0  Go until jurong point, crazy.. Available only ...  ...  [7168, 3515, 1023, 8227, 2507, 5881, 5867, 211...
1                      Ok lar... Joking wif u oni...  ...               [2299, 8466, 5707, 4338, 6370, 7100]
2  Free entry in 2 a wkly comp to win FA Cup fina...  ...  [3032, 5436, 4819, 8152, 7439, 6932, 7180, 456...
3  U dun say so early hor... U c already then say...  ...  [6370, 421, 7744, 2811, 1797, 6370, 707, 3377,...
4  Nah I don't think he goes to usf, he lives aro...  ...         [1534, 3517, 1537, 3578, 6912, 5234, 6301]

[5 rows x 4 columns]


In [None]:
# Sanity check for the encoding: corpus token count, target length, longest and shortest
# encoded sequence, the set of labels and finally the vocabulary size.
lengths = list(map(len, spam.encoded))
for value in (len(all_words), ave_len, max(lengths), min(lengths), set(spam.detection)):
    print(value)
review_length = spam.review_length
print(len(word_set))

53413
9
9
9
{0, 1}
8577


In [None]:
# Train/test split with scikit-learn
data_size = len(spam['encoded'])
assert data_size == len(spam['detection'])
# Every sample is an (encoded_sequence, length) pair; labels are the integer class ids.
encoded_sequences = list(spam['encoded'])
sequence_lengths = list(spam['review_length'])
X = list(zip(encoded_sequences, sequence_lengths))
y = list(spam['detection'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)

In [None]:
# Sanity check for length match
print(X_train[0])

([5845, 3147, 6436, 1340, 1277, 3147, 4879, 3348, 3729], 9)


## Dataset and Dataloader

### Spam Detection Dataset

In [None]:
class Dataset_Glove(Dataset):
    """Map-style dataset that pairs ``X[i]`` with the label ``y[i]``."""

    def __init__(self, X, y):
        # Both containers are stored by reference; nothing is copied or validated.
        self.X, self.y = X, y

    def __len__(self):
        """Number of samples, taken from ``X``."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at position ``idx``."""
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target

In [None]:
# Wrap the train and test splits so that a DataLoader can index them.
trainSet_glv = Dataset_Glove(X_train, y_train)
testSet_glv = Dataset_Glove(X_test, y_test)

In [None]:
print(trainSet_glv[0])

(([5845, 3147, 6436, 1340, 1277, 3147, 4879, 3348, 3729], 9), 0)


In [None]:
def myCollate(batch):
    """Collate ``((sequence, length), label)`` samples into one batch.

    Returns a tuple ``(review, label, seq_len)``: ``review`` is a long tensor built from
    the sequences (they are stacked as-is, in batch order, without sorting), ``label`` is
    a long tensor of the labels, and ``seq_len`` is a plain list of the stored lengths.
    """
    sequences, seq_len, targets = [], [], []
    for item in batch:
        sample = item[0]
        sequences.append(sample[0])
        seq_len.append(sample[1])
        targets.append(item[1])

    review = torch.tensor(sequences, dtype=torch.long)
    label = torch.tensor(targets, dtype=torch.long)
    return review, label, seq_len

In [None]:
# NOTE(review): these two statements repeat an earlier cell verbatim and rebuild the same
# dataset objects from the same splits — this cell looks redundant; confirm before removing.
trainSet_glv = Dataset_Glove(X_train, y_train)
testSet_glv = Dataset_Glove(X_test, y_test)

In [None]:
# Training batches are reshuffled on every pass and the incomplete final batch is dropped.
trainLoader_glv = DataLoader(dataset = trainSet_glv, batch_size = 16, collate_fn = myCollate, shuffle = True, drop_last = True)
# Evaluation has to cover every held-out sample exactly once, so the test loader neither
# shuffles nor drops the last (possibly smaller) batch.
testLoader_glv = DataLoader(dataset = testSet_glv, batch_size = 16, collate_fn = myCollate, shuffle = False, drop_last = False)
# Peek at one training batch. The built-in next() is used because DataLoader iterators
# implement __next__ only; the Python-2 style .next() method is not available in current PyTorch.
# NOTE: this rebinds `y`, which previously held the full list of labels.
it = iter(trainLoader_glv)
x, y, seq_len = next(it)

In [None]:
x.data

tensor([[1070, 7874, 5365, 1802,    1,    1,    1,    1,    1],
        [ 558, 8203,    1,    1,    1,    1,    1,    1,    1],
        [7545, 6588, 6612,  799, 1277, 7312, 2241,    1,    1],
        [4795, 8203, 3194, 3110, 2489, 2779, 5287, 4468, 2629],
        [8506,  948, 5555,    1,    1,    1,    1,    1,    1],
        [ 421, 8055, 1297, 1448, 6370,  421, 7312, 6133, 3138],
        [1802, 3929, 5920, 3272, 2425, 1685, 4819, 6475, 4523],
        [4104, 8131, 4198, 3173,    1,    1,    1,    1,    1],
        [ 337, 2114, 6988, 8510, 5834, 4732, 1801, 6441, 4732],
        [1271, 7168, 4819, 5300, 6851, 4236, 7777, 8494, 8335],
        [1318, 3502,    4, 1387, 5820, 7801, 6971, 4236, 6093],
        [ 304, 3118, 4871,    1,    1,    1,    1,    1,    1],
        [2477, 6133, 1318, 8305,  421, 2097, 1318, 3916, 4236],
        [1780, 4498,    1,    1,    1,    1,    1,    1,    1],
        [ 163, 8434, 5350, 1662, 5435,    1,    1,    1,    1],
        [6947, 2099, 1144,    1,    1,  

In [None]:
y

tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
seq_len

[9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]

## Embedding

In [None]:
def load_glove_vectors(glove_file="/content/drive/MyDrive/data /glove.6B.300d.txt"):
    """Load GloVe word vectors from a whitespace-separated text file.

    Each non-empty line is expected to hold a word followed by its vector components.

    Args:
        glove_file: path of the GloVe text file.

    Returns:
        dict mapping each word to a 1-D float numpy array built from the remaining fields.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a vector component cannot be parsed as a float.
    """
    word_vectors = {}
    # Decode explicitly as UTF-8 so the result does not depend on the platform's default encoding.
    with open(glove_file, encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line (e.g. a trailing newline at the end of the file): nothing to parse.
                continue
            word_vectors[fields[0]] = np.array([float(value) for value in fields[1:]])
    return word_vectors


In [None]:
# create embedding matrix
def emb_matrix(model, all_words, word2index, emb_dim=300):
    """Build the embedding matrix whose row ``word2index[w]`` holds the vector of word ``w``.

    Row 0 (unknown token) is drawn uniformly from [-0.25, 0.25) and row 1 (padding) is
    all zeros. Words missing from ``model`` receive a copy of the row-0 vector.

    Args:
        model: mapping-like object returning a vector for ``model[word]`` and raising
            ``KeyError`` for words it does not contain.
        all_words: iterable of the vocabulary words to place in the matrix.
        word2index: dict mapping every word in ``all_words`` to its row index.
        emb_dim: dimensionality of the vectors (defaults to 300).

    Returns:
        numpy array of shape ``(max(word2index.values()) + 1, emb_dim)``.
    """
    # Size the table from the index mapping itself instead of a notebook-level global,
    # so the function only depends on its arguments. At least the two special rows exist.
    matrix_size = max(word2index.values(), default=1) + 1
    matrix = np.zeros((matrix_size, emb_dim))
    matrix[0] = np.random.uniform(-0.25, 0.25, emb_dim)  # vector for UNK
    # matrix[1] stays all zeros: vector for padding, has no weight

    # vector for every other word in the dictionary
    for word in all_words:
        try:
            vector = model[word]
        except KeyError:
            # word does not exist in the pretrained embedding
            vector = matrix[0]
        matrix[word2index[word]] = vector
    return matrix

In [None]:
# Load the pretrained GloVe vectors into a dict (word -> vector).
word2vec = load_glove_vectors()
# Stack every pretrained vector into one dense array.
# NOTE(review): this materialises the complete GloVe table a second time in memory; in the
# cells visible here it only feeds `word2vec_embedding` — confirm it is actually needed.
data = list(word2vec.values())
an_array=np.array(data)

In [None]:
# Embedding table aligned with word2index: the row at a word's index holds that word's vector.
embedding = emb_matrix(word2vec, word_set, word2index)

In [None]:
print(embedding.shape)

(8579, 300)


In [None]:
print(embedding)

[[ 0.09517959  0.08528715  0.22070221 ...  0.24855686 -0.00454442
  -0.03334954]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.050635    0.47353     0.42023    ...  0.081678    0.07216
  -0.41351   ]
 ...
 [ 0.09517959  0.08528715  0.22070221 ...  0.24855686 -0.00454442
  -0.03334954]
 [ 0.28183    -0.44428    -0.53486    ...  0.32397     0.2253
   0.45062   ]
 [ 0.0024875  -0.36866    -0.095369   ... -0.013226   -0.87491
  -0.10378   ]]


In [None]:
def validation_metrics(model, valid_dl):
    """Evaluate ``model`` on every batch produced by ``valid_dl``.

    Args:
        model: module called as ``model(x, seq_len)`` and returning class scores per sample.
        valid_dl: iterable yielding ``(x, y, seq_len)`` batches.

    Returns:
        Tuple ``(mean_loss, accuracy, precision, recall, f1)``. Loss and accuracy are
        averaged over samples and returned as plain Python floats; precision, recall and
        F1 are the support-weighted averages computed by scikit-learn.

    Raises:
        ValueError: if ``valid_dl`` yields no samples.
    """
    print("current in vlaidaiton")
    model.eval()
    criterion = nn.CrossEntropyLoss()
    correct = 0
    total = 0
    sum_loss = 0.0
    y_total = []
    y_pred_total = []
    # No gradients are needed for evaluation; skipping the autograd graph saves memory and time.
    with torch.no_grad():
        for x, y, seq_len in valid_dl:
            x = x.long()
            y = y.long()
            y_hat = model(x, seq_len)
            loss = criterion(y_hat, y)

            pred = torch.argmax(y_hat, 1)
            y_total.extend(y.tolist())
            y_pred_total.extend(pred.tolist())

            # .sum().item() yields a Python int, so the accuracy below is a float, not a tensor.
            correct += (pred == y).sum().item()
            total += y.shape[0]
            # The criterion returns the batch mean; weight it by batch size for a per-sample average.
            sum_loss += loss.item()*y.shape[0]

    if total == 0:
        raise ValueError("valid_dl produced no samples; cannot compute validation metrics")

    # calculate precision, recall, and f1
    f1 = f1_score(y_total, y_pred_total, average='weighted')
    precision = precision_score(y_total, y_pred_total, average='weighted')
    recall = recall_score(y_total, y_pred_total, average='weighted')
    return sum_loss/total, correct/total, precision, recall, f1

In [None]:
def train_model(model, fname, epochs = 50, lr = 0.0001, train_dl = None, valid_dl = None, save_best = False):
    """Train ``model`` with Adam and cross-entropy, validating after every epoch.

    Args:
        model: module called as ``model(x, seq_len)`` and returning class scores per sample.
        fname: file-name stem used to build the checkpoint path.
        epochs: number of passes over the training loader.
        lr: Adam learning rate.
        train_dl: loader yielding ``(x, y, seq_len)`` training batches; defaults to the
            notebook-level ``trainLoader_glv``.
        valid_dl: loader passed to ``validation_metrics``; defaults to the notebook-level
            ``testLoader_glv``.
        save_best: when True, the state dict is written to disk each time the validation
            accuracy improves. Defaults to False, i.e. nothing is written.
    """
    if train_dl is None:
        train_dl = trainLoader_glv
    if valid_dl is None:
        valid_dl = testLoader_glv

    # Only parameters that require gradients are handed to the optimizer.
    parameters = filter(lambda p:p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr = lr)
    criterion = nn.CrossEntropyLoss()
    PATH = '/content/drive/MyDrive/McGill/Comp 550/' + fname
    best_val_acc = 0.0
    for i in range(epochs):
        print(f"At epoch {i}")
        model.train()
        sum_loss = 0.0
        total = 0.0
        train_correct = 0.0
        for x, y, seq_len in tqdm(train_dl):
            x = x.long()
            y = y.long()
            optimizer.zero_grad()
            y_pred = model(x, seq_len)
            pred = torch.argmax(y_pred, 1)
            # .sum().item() keeps the running count a Python number instead of a tensor.
            train_correct += (pred == y).sum().item()
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            # The criterion returns the batch mean; weight it by batch size for a per-sample average.
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc, precision, recall, f1 = validation_metrics(model, valid_dl)
        print("train loss %.3f, train accuracy %.3f, val loss %.3f, val accuracy %.3f, precision %.3f, recall %.3f, F1 %.3f" 
              % (sum_loss/total, train_correct/total, val_loss, val_acc, precision, recall, f1))
        # The first epoch (i == 0) is never recorded as the best one.
        if val_acc > best_val_acc and i>=1:
            best_val_acc = val_acc
            if save_best:
                NEW_PATH = PATH+f'_{i}_{val_acc}.pth'
                torch.save(model.state_dict(), NEW_PATH)
                print(f"\t=> Best model saved at {i}th epoch with valication accuracy of {val_acc}")
            else:
                # Do not claim a checkpoint was written when saving is disabled.
                print(f"\t=> New best validation accuracy of {val_acc} at {i}th epoch (checkpoint not saved)")

In [None]:
# Embedding layer built directly from the complete pretrained GloVe table.
# NOTE(review): the LSTM_word2vec class creates its own nn.Embedding from the matrix it is
# given, so `word2vec_embedding` looks unused in the cells visible here — confirm before deleting.
weight = torch.FloatTensor(an_array)
word2vec_embedding = nn.Embedding.from_pretrained(weight)

class LSTM_word2vec(torch.nn.Module):
    """Single-layer LSTM classifier on top of a pretrained (still trainable) embedding table.

    Args:
        emb_matrix: numpy array of shape (vocab_size, embedding_dim) used to initialise
            the embedding weights.
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the LSTM hidden state.
        num_classes: number of output scores per sample. The default of 5 preserves the
            previous hard-coded value.
            NOTE(review): the labels in this notebook take only two values, so 2 looks
            like the intended setting — confirm and pass it explicitly.
        padding_idx: vocabulary index of the padding token. The notebook's vocabulary
            assigns '<PAD>' to index 1 (index 0 is '<UNK>'), hence the default of 1.
    """

    def __init__(self, emb_matrix, embedding_dim, hidden_dim, num_classes=5, padding_idx=1):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        # padding_idx must point at the PAD row: that row receives no gradient updates,
        # while the UNK row (index 0) remains trainable.
        self.embeddings = nn.Embedding(emb_matrix.shape[0], embedding_dim, padding_idx = padding_idx)
        # Initialise from the pretrained matrix; copy_ converts it to the weight's dtype.
        self.embeddings.weight.data.copy_(torch.from_numpy(emb_matrix))
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x, seq_len):
        """Return class scores of shape (batch, num_classes) for a batch of index sequences.

        ``seq_len`` is accepted for interface compatibility but is not used: sequences are
        not packed, so padded positions are fed through the LSTM as well.
        """
        x = self.embeddings(x)
        x = self.dropout(x)
        output, (h_n, c_n) = self.lstm(x)
        # h_n[-1] is the final hidden state of the last LSTM layer: (batch, hidden_dim).
        return self.linear(h_n[-1])

In [None]:
# Build the classifier with 300-dimensional embeddings and a 100-unit LSTM, then train it
# for 20 epochs at the default learning rate.
model = LSTM_word2vec(embedding, 300, 100)
train_model(model, fname='word2vec_lstm', epochs=20)

At epoch 0


100%|██████████| 278/278 [00:06<00:00, 40.41it/s]
  _warn_prf(average, modifier, msg_start, len(result))


current in vlaidaiton
train loss 0.818, train accuracy 0.848, val loss 0.346, val accuracy 0.861, precision 0.742, recall 0.861, F1 0.797
At epoch 1


100%|██████████| 278/278 [00:06<00:00, 42.61it/s]


current in vlaidaiton
train loss 0.215, train accuracy 0.920, val loss 0.163, val accuracy 0.950, precision 0.949, recall 0.950, F1 0.947
	=> Best model saved at 1th epoch with valication accuracy of 0.9501811861991882
At epoch 2


100%|██████████| 278/278 [00:06<00:00, 40.92it/s]


current in vlaidaiton
train loss 0.134, train accuracy 0.960, val loss 0.127, val accuracy 0.956, precision 0.954, recall 0.956, F1 0.955
	=> Best model saved at 2th epoch with valication accuracy of 0.9556159377098083
At epoch 3


100%|██████████| 278/278 [00:07<00:00, 36.12it/s]


current in vlaidaiton
train loss 0.108, train accuracy 0.964, val loss 0.110, val accuracy 0.961, precision 0.960, recall 0.961, F1 0.960
	=> Best model saved at 3th epoch with valication accuracy of 0.9610507488250732
At epoch 4


100%|██████████| 278/278 [00:07<00:00, 35.27it/s]


current in vlaidaiton
train loss 0.091, train accuracy 0.970, val loss 0.110, val accuracy 0.965, precision 0.965, recall 0.965, F1 0.963
	=> Best model saved at 4th epoch with valication accuracy of 0.9646739363670349
At epoch 5


100%|██████████| 278/278 [00:07<00:00, 35.40it/s]


current in vlaidaiton
train loss 0.077, train accuracy 0.976, val loss 0.103, val accuracy 0.966, precision 0.966, recall 0.966, F1 0.965
	=> Best model saved at 5th epoch with valication accuracy of 0.9664855003356934
At epoch 6


100%|██████████| 278/278 [00:07<00:00, 35.32it/s]


current in vlaidaiton
train loss 0.070, train accuracy 0.978, val loss 0.091, val accuracy 0.969, precision 0.969, recall 0.969, F1 0.968
	=> Best model saved at 6th epoch with valication accuracy of 0.9692028760910034
At epoch 7


100%|██████████| 278/278 [00:07<00:00, 35.01it/s]


current in vlaidaiton
train loss 0.064, train accuracy 0.981, val loss 0.086, val accuracy 0.969, precision 0.969, recall 0.969, F1 0.969
At epoch 8


100%|██████████| 278/278 [00:08<00:00, 34.20it/s]


current in vlaidaiton
train loss 0.050, train accuracy 0.984, val loss 0.087, val accuracy 0.971, precision 0.971, recall 0.971, F1 0.970
	=> Best model saved at 8th epoch with valication accuracy of 0.9710144996643066
At epoch 9


100%|██████████| 278/278 [00:07<00:00, 35.43it/s]


current in vlaidaiton
train loss 0.046, train accuracy 0.984, val loss 0.094, val accuracy 0.972, precision 0.972, recall 0.972, F1 0.971
	=> Best model saved at 9th epoch with valication accuracy of 0.9719203114509583
At epoch 10


100%|██████████| 278/278 [00:07<00:00, 34.93it/s]


current in vlaidaiton
train loss 0.044, train accuracy 0.987, val loss 0.082, val accuracy 0.974, precision 0.973, recall 0.974, F1 0.973
	=> Best model saved at 10th epoch with valication accuracy of 0.9737318754196167
At epoch 11


100%|██████████| 278/278 [00:08<00:00, 32.47it/s]


current in vlaidaiton
train loss 0.042, train accuracy 0.987, val loss 0.082, val accuracy 0.975, precision 0.975, recall 0.975, F1 0.974
	=> Best model saved at 11th epoch with valication accuracy of 0.9746376872062683
At epoch 12


100%|██████████| 278/278 [00:07<00:00, 35.13it/s]


current in vlaidaiton
train loss 0.036, train accuracy 0.989, val loss 0.080, val accuracy 0.976, precision 0.976, recall 0.976, F1 0.976
	=> Best model saved at 12th epoch with valication accuracy of 0.9764492511749268
At epoch 13


100%|██████████| 278/278 [00:07<00:00, 35.31it/s]


current in vlaidaiton
train loss 0.031, train accuracy 0.992, val loss 0.081, val accuracy 0.976, precision 0.975, recall 0.976, F1 0.975
At epoch 14


100%|██████████| 278/278 [00:07<00:00, 35.12it/s]


current in vlaidaiton
train loss 0.028, train accuracy 0.991, val loss 0.078, val accuracy 0.976, precision 0.975, recall 0.976, F1 0.975
At epoch 15


100%|██████████| 278/278 [00:07<00:00, 35.04it/s]


current in vlaidaiton
train loss 0.030, train accuracy 0.989, val loss 0.074, val accuracy 0.976, precision 0.976, recall 0.976, F1 0.976
At epoch 16


100%|██████████| 278/278 [00:07<00:00, 34.95it/s]


current in vlaidaiton
train loss 0.024, train accuracy 0.993, val loss 0.080, val accuracy 0.976, precision 0.976, recall 0.976, F1 0.976
At epoch 17


100%|██████████| 278/278 [00:07<00:00, 35.43it/s]


current in vlaidaiton
train loss 0.024, train accuracy 0.993, val loss 0.079, val accuracy 0.977, precision 0.977, recall 0.977, F1 0.977
	=> Best model saved at 17th epoch with valication accuracy of 0.9773550629615784
At epoch 18


100%|██████████| 278/278 [00:07<00:00, 35.27it/s]


current in vlaidaiton
train loss 0.018, train accuracy 0.995, val loss 0.087, val accuracy 0.976, precision 0.976, recall 0.976, F1 0.976
At epoch 19


100%|██████████| 278/278 [00:07<00:00, 35.24it/s]


current in vlaidaiton
train loss 0.021, train accuracy 0.993, val loss 0.073, val accuracy 0.979, precision 0.979, recall 0.979, F1 0.979
	=> Best model saved at 19th epoch with valication accuracy of 0.9791666865348816
