In [1]:
import os
from google.colab import drive

# Mount Google Drive inside the Colab VM and move into the project folder.
drive.mount('/content/drive', force_remount=True)
# Absolute path (under the mount point above) so the cell is idempotent:
# a relative path only resolves on the first run, before the working
# directory has been changed, and raises FileNotFoundError on a re-run.
os.chdir("/content/drive/MyDrive/Colab Notebooks/CS7650/final")
os.listdir()

Mounted at /content/drive


['data', 'Preprocessing', 'Models', 'resources.gdoc', 'BiLSTM.pt']

## Libraries

In [2]:
import pandas as pd
import json
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import tqdm
from ast import literal_eval

## GPU check

In [3]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

print(f'GPU available: {torch.cuda.is_available()}')

Wed Apr 28 17:51:38 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Vocab

In [4]:
class Vocab:
  """Word-to-index vocabulary with random replacement of infrequent words.

  Args:
    w2i_file: path to a JSON file mapping each word to its integer id.
    wc_file: path to a JSON file mapping each word to its count.
    min_count: words whose count is <= min_count are treated as infrequent.

  Attributes:
    word2id: dict loaded from w2i_file.
    word_counts: dict loaded from wc_file.
    num_words: number of entries in word2id.
    min_count: the threshold passed to the constructor.
    infrequent: list of words whose count is <= min_count.
  """
  def __init__(self, w2i_file, wc_file, min_count=1):
    with open(w2i_file, 'r') as file:
      self.word2id = json.load(file)
    with open(wc_file, 'r') as file:
      self.word_counts = json.load(file)
    self.num_words = len(self.word2id)
    self.min_count = min_count
    self.infrequent = [k for k, v in self.word_counts.items() if v <= min_count]
    # Membership is tested once per token, so keep a hashed copy for O(1)
    # lookups instead of scanning the list each time. It is a snapshot taken
    # here; later edits to `infrequent` are not reflected.
    self._infrequent_set = frozenset(self.infrequent)

  def _word2id(self, word, train):
    """Return the id of `word`; index 0 is the fallback for unknown words.

    When `train` is true, an infrequent word is replaced by index 0 with
    probability 0.5. The random draw only happens for infrequent words.
    """
    if train and word in self._infrequent_set and random.random() > 0.5:
      return 0
    return self.word2id.get(word, 0)

  def sentence2indices(self, sentence, train):
    """Split `sentence` on whitespace and map every token to its id."""
    return [self._word2id(word, train) for word in sentence.split()]

In [5]:
# JSON artefacts consumed by Vocab: word -> id and word -> count.
w2i_file = 'data/vocab/word2id.json'
wc_file = 'data/vocab/word_counts.json'

# min_count is left at the Vocab default.
vocab = Vocab(w2i_file=w2i_file, wc_file=wc_file)

In [6]:
# Load the GloVe vectors as a dict (word -> list of floats); BiLSTM copies
# them into its embedding table. The `with` block closes the file itself,
# so no explicit close() is needed.
with open('data/gloVe/filtered_glove.json', 'r') as file:
  gloVe = json.load(file)

## Data

In [7]:
# The indexed train/test CSVs are stored as numbered chunks; read the first
# `num_chunks` of each and stack them into one frame per split.
num_chunks = 4 #start with 1/4 of data for now
df_trains, df_tests = [], []
for i in range(num_chunks):
  temp_train = pd.read_csv('data/clean_indexed/train_clean_id{0}.csv'.format(i))
  temp_test = pd.read_csv('data/clean_indexed/test_clean_id{0}.csv'.format(i))
  # comment_text is stored as the string repr of a list of token ids;
  # turn it back into a real Python list.
  for frame in (temp_train, temp_test):
    frame['comment_text'] = frame['comment_text'].apply(literal_eval)
  df_trains.append(temp_train)
  df_tests.append(temp_test)

df_train = pd.concat(df_trains, axis=0, ignore_index=True)
df_test = pd.concat(df_tests, axis=0, ignore_index=True)
df_train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",0,0,0,0,0,0
1,000103f0d9cfb60f,"[0, 43, 44, 45, 46, 47, 39, 48, 49, 50, 51, 36...",0,0,0,0,0,0
2,000113f07ec002fd,"[57, 58, 39, 59, 60, 61, 62, 63, 64, 65, 17, 6...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"[80, 23, 86, 87, 88, 89, 90, 19, 91, 23, 92, 9...",0,0,0,0,0,0
4,0001d958c54c6e35,"[125, 149, 110, 7, 150, 88, 151, 125, 152, 153...",0,0,0,0,0,0


In [8]:
# Label columns, in the order used for every Y matrix below.
classes = [
  'toxic', 'severe_toxic', 'obscene',
  'threat', 'insult', 'identity_hate',
]

# Hold out 25% of the training frame for validation (fixed seed).
X_all = df_train['comment_text'].tolist()
Y_all = df_train[classes].to_numpy().tolist()
X_train, X_val, Y_train, Y_val = train_test_split(
  X_all, Y_all, test_size=0.25, random_state=1
)

# The test frame is used as-is.
X_test = df_test['comment_text'].tolist()
Y_test = df_test[classes].to_numpy().tolist()

In [9]:
# Sanity check: report how many rows ended up in each split.
print(f'{len(X_train)} rows in train')
print(f'{len(X_val)} rows in validation')
print(f'{len(X_test)} rows in test')

119670 rows in train
39890 rows in validation
63568 rows in test


In [10]:
# Compare the per-label value distribution of the train and validation
# splits. The label lists are converted to arrays once, outside the loop.
train_labels = np.array(Y_train)
val_labels = np.array(Y_val)
for i in range(len(classes)):
  print(f'class {classes[i]}')
  # Unique label values and their relative frequencies in the train split.
  u_train, c_train = np.unique(train_labels[:, i], return_counts=True)
  print(u_train, c_train / c_train.sum())
  # Same for the validation split.
  u_val, c_val = np.unique(val_labels[:, i], return_counts=True)
  print(u_val, c_val / c_val.sum())
  print('----')

class toxic
[0 1] [0.90420323 0.09579677]
[0 1] [0.90398596 0.09601404]
----
class severe_toxic
[0 1] [0.99025654 0.00974346]
[0 1] [0.98924542 0.01075458]
----
class obscene
[0 1] [0.94697919 0.05302081]
[0 1] [0.94725495 0.05274505]
----
class threat
[0 1] [0.99699173 0.00300827]
[0 1] [0.99704187 0.00295813]
----
class insult
[0 1] [0.95058076 0.04941924]
[0 1] [0.95078967 0.04921033]
----
class identity_hate
[0 1] [0.99118409 0.00881591]
[0 1] [0.99122587 0.00877413]
----


## Just BiLSTM
- https://www.aclweb.org/anthology/P16-2034.pdf

In [11]:
class BiLSTM(nn.Module):
  """Bidirectional LSTM classifier that emits one logit per class.

  The final hidden states of every layer and direction are concatenated and
  passed through dropout and a linear layer. No sigmoid is applied: the
  outputs are raw logits.

  Args:
    VOCAB_SIZE: number of real vocabulary entries. One extra embedding row is
      appended and used as padding (index VOCAB_SIZE).
    DIM_EMB: embedding dimension.
    DIM_HID: LSTM hidden size per direction.
    NUM_LAYERS: number of stacked LSTM layers.
    NUM_CLASSES: number of output logits.
    gloVe: optional dict word -> vector (length DIM_EMB) used to initialise
      the embedding rows of the words it contains; all other rows are drawn
      from N(0, 1).
    dropout: dropout probability applied to the concatenated hidden state.
    word2id: optional dict word -> embedding row used to place the gloVe
      vectors. Defaults to the notebook-level `vocab.word2id`.
  """
  def __init__(self, VOCAB_SIZE, DIM_EMB=300, DIM_HID=300, NUM_LAYERS=1, NUM_CLASSES=6, gloVe=None, dropout=0.2, word2id=None):
    super(BiLSTM, self).__init__()
    self.NUM_CLASSES = NUM_CLASSES
    self.num_words = VOCAB_SIZE + 1  # +1 row for the padding index
    self.pad_idx = VOCAB_SIZE
    self.DIM_HID = DIM_HID
    self.num_layers = NUM_LAYERS

    #Embedding
    self.embed = nn.Embedding(num_embeddings=self.num_words, embedding_dim=DIM_EMB, padding_idx=self.pad_idx)
    if gloVe:
      if word2id is None:
        # Backward-compatible fallback to the global vocabulary object.
        word2id = vocab.word2id
      weights = torch.normal(0, 1, size=self.embed.weight.shape)
      for w, e in gloVe.items():
        weights[word2id[w]] = torch.FloatTensor(e)
      # Copying a full weight matrix would overwrite the zero vector that
      # nn.Embedding keeps at padding_idx; restore it.
      weights[self.pad_idx] = 0
      self.embed.weight.data.copy_(weights)
    #BiLSTM
    self.lstm = nn.LSTM(input_size=DIM_EMB
                        ,hidden_size=self.DIM_HID
                        ,num_layers=self.num_layers
                        ,batch_first=True
                        ,bidirectional=True)
    #Final
    self.dropout = nn.Dropout(dropout)
    self.linear = nn.Linear(in_features=self.DIM_HID * self.num_layers * 2
                            ,out_features=self.NUM_CLASSES)

  def forward(self, X, mask=None, train=True):
    """Compute class logits for a padded batch of token ids.

    Args:
      X: LongTensor (batch size, max len), padded with self.pad_idx.
      mask: unused; accepted for interface compatibility.
      train: when true, dropout is applied to the hidden representation.

    Returns:
      Tensor (batch size, NUM_CLASSES) of logits, on the model's device.
    """
    # Run on whatever device the parameters live on instead of assuming CUDA.
    X = X.to(self.embed.weight.device)
    embed = self.embed(X) #(batch size, max len, DIM_EMB)

    # True sequence lengths = number of non-padding tokens per row.
    # pack_padded_sequence wants them on the CPU and rejects zero lengths,
    # so an all-padding row is treated as a single padding token.
    source_lengths = torch.sum(X != self.pad_idx, dim=1).clamp(min=1).cpu()
    embed = nn.utils.rnn.pack_padded_sequence(embed, lengths=source_lengths, batch_first=True, enforce_sorted=False)

    # h_n: (num_layers * 2, batch size, hidden_size)
    _, (h_n, _) = self.lstm(embed)
    # Concatenate the final state of every layer/direction per example.
    hidden = torch.cat(h_n.unbind(0), dim=1) #(batch size, hidden_size * num_layers * 2)

    if train:
      hidden = self.dropout(hidden)
    return self.linear(hidden)

## Train

In [12]:
def EvalNet(net, X, Y, threshold=0.5, verbose=True, min_len=3):
  """Evaluate `net` one example at a time and return the micro-averaged F1.

  Relies on the notebook-level `classes` list for the number and names of
  the labels.

  Args:
    net: model whose call returns logits of shape (1, number of classes).
    X: list of token-id sequences.
    Y: list (or array) of multi-hot label rows, one per sequence.
    threshold: sigmoid probability above which a label is predicted.
    verbose: when true, print per-class and overall metrics.
    min_len: sequences shorter than this are not fed to the model and get an
      all-zero prediction. Empty sequences are always skipped.

  Returns:
    Micro-averaged F1 over all labels.
  """
  net.eval()
  Y = np.array(Y)
  # Starts as "no label predicted"; rows of skipped sequences stay that way.
  pred = np.zeros_like(Y)
  shortest = max(min_len, 1)
  # Inference only: do not build autograd graphs.
  with torch.no_grad():
    for i in tqdm.notebook.tqdm(range(len(X)), leave=False):
      x = torch.LongTensor(X[i]).unsqueeze(0)
      if x.size(-1) < shortest:
        continue
      probs = torch.sigmoid(net(x, None, train=False)).cpu()
      pred[i] = (probs > threshold).squeeze(0).numpy()

  if verbose:
    for i in range(len(classes)):
      # zero_division=0 returns the same 0.0 as the default but without the
      # "ill-defined" warning when a class has no predicted/true positives.
      acc = accuracy_score(Y[:,i], pred[:,i])
      rec = recall_score(Y[:,i], pred[:,i], zero_division=0)
      prec = precision_score(Y[:,i], pred[:,i], zero_division=0)
      f1 = f1_score(Y[:,i], pred[:,i], zero_division=0)
      print(f'{classes[i]} label')
      print(f'Accuracy: {acc} Recall {rec} Precision {prec} F1 {f1}')
      print('-----------------------')

  total_f1 = f1_score(Y, pred, average='micro', zero_division=0)
  if verbose:
    # accuracy_score on the full matrices is exact-match (subset) accuracy.
    total_acc = accuracy_score(Y, pred)
    total_rec = recall_score(Y, pred, average='micro', zero_division=0)
    total_prec = precision_score(Y, pred, average='micro', zero_division=0)
    print('Total')
    print(f'Accuracy: {total_acc} Recall {total_rec} Precision {total_prec} F1 {total_f1}')
  return total_f1

def shuffle_sentences(sentences, tags):
  """Return copies of `sentences` and `tags` under one shared random order.

  The inputs are not modified; pairs that shared an index still share one.
  """
  order = list(range(len(sentences)))
  random.shuffle(order)
  shuffled_sentences = [sentences[k] for k in order]
  shuffled_tags = [tags[k] for k in order]
  return (shuffled_sentences, shuffled_tags)

#Pad inputs to max sequence length (for batching)
def pad_input(X_list, pad_val):
  """Right-pad the sequences in `X_list` to a common length.

  Returns a pair (X_padded, X_mask): a LongTensor (batch, max len) filled
  with `pad_val` past the end of each sequence, and a FloatTensor of the same
  shape holding 1.0 on real tokens and 0.0 on padding.
  """
  pad_sequence = torch.nn.utils.rnn.pad_sequence
  token_rows = [torch.as_tensor(seq) for seq in X_list]
  mask_rows = [torch.ones(len(seq)) for seq in X_list]
  X_padded = pad_sequence(token_rows, batch_first=True, padding_value=pad_val)
  X_mask = pad_sequence(mask_rows, batch_first=True)
  return X_padded.type(torch.LongTensor), X_mask.type(torch.FloatTensor)

def Train(net, X, Y, max_iter, lr, PATH, batch_size=50, patience=5):
  """Train `net` with Adam and BCE-with-logits, early-stopping on validation F1.

  After every epoch the model is scored with EvalNet on the notebook-level
  `X_val` / `Y_val`; the weights are written to PATH whenever the score
  improves.

  Args:
    net: model exposing `pad_idx` and returning logits for a padded batch.
    X: list of token-id sequences.
    Y: list of multi-hot label rows aligned with X.
    max_iter: maximum number of epochs.
    lr: Adam learning rate.
    PATH: file the best state_dict is saved to.
    batch_size: number of examples per optimisation step.
    patience: stop after this many consecutive epochs without improvement.
  """
  print("Start Training!")
  optimizer = optim.Adam(net.parameters(), lr=lr)
  # Built once; the loss takes raw logits.
  crit = nn.BCEWithLogitsLoss()
  # Put the targets wherever the model lives instead of assuming CUDA.
  device = next(net.parameters()).device

  # -inf guarantees a checkpoint after the first epoch even if its F1 is 0.
  best_f1 = float('-inf')
  f1_dec_ct = 0

  for epoch in range(max_iter):
    total_loss = 0.0
    net.train()   #Put the network into training mode
    (X_shuffled, Y_shuffled) = shuffle_sentences(X, Y)
    for batch in tqdm.notebook.tqdm(range(0, len(X), batch_size), leave=False):
      x, x_mask = pad_input(X_shuffled[batch:batch+batch_size], net.pad_idx)
      y = torch.FloatTensor(Y_shuffled[batch:batch+batch_size]).to(device)

      optimizer.zero_grad()
      logits = net(x, x_mask, train=True)
      loss = crit(logits, y)
      # .item() detaches the value; summing the tensors themselves would keep
      # autograd history alive for the whole epoch.
      total_loss += loss.item()

      loss.backward()
      optimizer.step()
    print(f'loss on epoch {epoch} = {total_loss}')
    #validation
    f1_val = EvalNet(net, X_val, Y_val, verbose=False)
    print(f'validation f1-score: {f1_val}')
    if f1_val > best_f1:
      best_f1 = f1_val
      torch.save(net.state_dict(), PATH)
      f1_dec_ct = 0
    #early stopping
    else:
      f1_dec_ct += 1
      if f1_dec_ct == patience:
        break

## Model

In [13]:
# Seed every RNG involved (batch shuffling uses `random`, weight init and
# dropout use torch) so a Restart & Run All starts from the same state.
# Same value as the train/validation split seed.
# NOTE(review): GPU kernels may still be non-deterministic - confirm if
# bit-exact reproducibility matters.
SEED = 1
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

PATH = 'Models/BiLSTM_2layers.pt'
net = BiLSTM(VOCAB_SIZE=vocab.num_words, NUM_LAYERS=2, gloVe=gloVe).cuda()
Train(net, X_train, Y_train, max_iter=50, lr=0.001, PATH=PATH)

Start Training!


HBox(children=(FloatProgress(value=0.0, max=2394.0), HTML(value='')))

loss on epoch 0 = 137.8453369140625


HBox(children=(FloatProgress(value=0.0, max=39890.0), HTML(value='')))



  _warn_prf(average, modifier, msg_start, len(result))


validation f1-score: 0.7388631899119195


HBox(children=(FloatProgress(value=0.0, max=2394.0), HTML(value='')))

loss on epoch 1 = 91.17369079589844


HBox(children=(FloatProgress(value=0.0, max=39890.0), HTML(value='')))

validation f1-score: 0.7489537135361359


HBox(children=(FloatProgress(value=0.0, max=2394.0), HTML(value='')))

loss on epoch 2 = 66.7081069946289


HBox(children=(FloatProgress(value=0.0, max=39890.0), HTML(value='')))

validation f1-score: 0.7238984976644365


HBox(children=(FloatProgress(value=0.0, max=2394.0), HTML(value='')))

loss on epoch 3 = 46.396419525146484


HBox(children=(FloatProgress(value=0.0, max=39890.0), HTML(value='')))

validation f1-score: 0.7302391329956414


HBox(children=(FloatProgress(value=0.0, max=2394.0), HTML(value='')))

loss on epoch 4 = 29.27111053466797


HBox(children=(FloatProgress(value=0.0, max=39890.0), HTML(value='')))

validation f1-score: 0.7129909365558913


HBox(children=(FloatProgress(value=0.0, max=2394.0), HTML(value='')))

loss on epoch 5 = 18.47577667236328


HBox(children=(FloatProgress(value=0.0, max=39890.0), HTML(value='')))

validation f1-score: 0.7260273972602741


HBox(children=(FloatProgress(value=0.0, max=2394.0), HTML(value='')))

loss on epoch 6 = 12.5684175491333


HBox(children=(FloatProgress(value=0.0, max=39890.0), HTML(value='')))

validation f1-score: 0.7147830122998374


In [14]:
# Rebuild the architecture, restore the weights saved at PATH during
# training, and score them on the test split.
net = BiLSTM(VOCAB_SIZE=vocab.num_words, NUM_LAYERS=2, gloVe=gloVe).cuda()
best_state = torch.load(PATH)
net.load_state_dict(best_state)
EvalNet(net, X_test, Y_test)

HBox(children=(FloatProgress(value=0.0, max=63568.0), HTML(value='')))

toxic label
Accuracy: 0.9226497608859804 Recall 0.8476190476190476 Precision 0.5640913561359414 F1 0.6773833737943704
-----------------------
severe_toxic label
Accuracy: 0.9926378051849988 Recall 0.35149863760217986 Precision 0.3593314763231198 F1 0.35537190082644626
-----------------------
obscene label
Accuracy: 0.9627170903599295 Recall 0.7353020861555134 Precision 0.6608229851473094 F1 0.6960759169017696
-----------------------
threat label
Accuracy: 0.9964919456330229 Recall 0.22748815165876776 Precision 0.4444444444444444 F1 0.3009404388714733
-----------------------
insult label
Accuracy: 0.9655644349358168 Recall 0.6051940472716661 Precision 0.7127147766323024 F1 0.6545684077639261
-----------------------
identity_hate label
Accuracy: 0.9910489554492826 Recall 0.26825842696629215 Precision 0.799163179916318 F1 0.4016824395373292
-----------------------
Total
Accuracy: 0.8785395167379814 Recall 0.7116843702579666 Precision 0.6114732724902217 F1 0.6577840112201964


0.6577840112201964