In [None]:
import os
from google.colab import drive

# Mount Google Drive into the Colab VM; force_remount refreshes a stale mount.
drive.mount('/content/drive', force_remount=True)
# Use the absolute mount path: a relative path only resolves on the first run,
# because this very call changes the working directory, so re-executing the
# cell would otherwise raise FileNotFoundError.
os.chdir("/content/drive/MyDrive/Colab Notebooks/CS7650/final")
os.listdir()

In [None]:
import pandas as pd
import json
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tqdm
from ast import literal_eval

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

print(f'GPU available: {torch.cuda.is_available()}')

## Prepare Data

### Vocab

In [None]:
class Vocab:
  """Word-to-index vocabulary with random UNK replacement of rare words.

  Args:
    w2i_file: path to a JSON file mapping word -> integer id.
    wc_file: path to a JSON file mapping word -> occurrence count.
    min_count: words whose count is <= this value are treated as infrequent.

  Attributes:
    word2id: dict loaded from ``w2i_file``.
    word_counts: dict loaded from ``wc_file``.
    num_words: number of entries in ``word2id``.
    min_count: the threshold passed to the constructor.
    infrequent: list of words whose count is <= ``min_count``.
  """

  # Index returned for unknown words and for infrequent words that get dropped.
  UNK_ID = 0
  # An infrequent word is replaced by UNK_ID when random.random() exceeds this.
  UNK_THRESHOLD = 0.5

  def __init__(self, w2i_file, wc_file, min_count=1):
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(w2i_file, 'r', encoding='utf-8') as file:
      self.word2id = json.load(file)
    with open(wc_file, 'r', encoding='utf-8') as file:
      self.word_counts = json.load(file)
    self.num_words = len(self.word2id)
    self.min_count = min_count
    self.infrequent = [k for k, v in self.word_counts.items() if v <= min_count]
    # Hash-based copy used for membership tests: scanning the list for every
    # token of every sentence is linear in the number of infrequent words.
    self._infrequent_set = frozenset(self.infrequent)

  def _word2id(self, word, train):
    """Return the id of ``word``; ``UNK_ID`` if it is unknown.

    When ``train`` is truthy, an infrequent word is additionally replaced by
    ``UNK_ID`` whenever a fresh ``random.random()`` draw exceeds
    ``UNK_THRESHOLD``. The random draw only happens for infrequent words.
    """
    if train and word in self._infrequent_set and random.random() > self.UNK_THRESHOLD:
      return self.UNK_ID
    return self.word2id.get(word, self.UNK_ID)

  def sentence2indices(self, sentence, train):
    """Split ``sentence`` on whitespace and map every token to its id."""
    return [self._word2id(word, train) for word in sentence.split()]

In [None]:
# Locations of the vocabulary JSON files, relative to the working directory.
w2i_file, wc_file = 'data/vocab/word2id.json', 'data/vocab/word_counts.json'
# Build the vocabulary with the default infrequent-word threshold.
vocab = Vocab(w2i_file=w2i_file, wc_file=wc_file)

In [None]:
# Load the GloVe vectors from JSON; NBOW later iterates this as a
# word -> vector mapping. The `with` block closes the file on exit, so no
# explicit close() is needed.
with open('data/gloVe/filtered_glove.json', 'r') as file:
  gloVe = json.load(file)

### Data

In [None]:
num_chunks = 4 #start with 1/4 of data for now
df_trains, df_tests = [], []
# Each split is stored as numbered CSV chunks; read the train chunk and then
# the test chunk for every chunk id.
chunk_sources = (
  ('data/clean_indexed/train_clean_id{0}.csv', df_trains),
  ('data/clean_indexed/test_clean_id{0}.csv', df_tests),
)
for chunk_id in range(num_chunks):
  for path_template, frames in chunk_sources:
    chunk = pd.read_csv(path_template.format(chunk_id))
    # comment_text is stored as the string form of a Python literal; parse it back.
    chunk['comment_text'] = chunk['comment_text'].apply(literal_eval)
    frames.append(chunk)
# Stitch the chunks into one frame per split with a fresh 0..n-1 index.
df_train = pd.concat(df_trains, axis=0, ignore_index=True)
df_test = pd.concat(df_tests, axis=0, ignore_index=True)
df_train.head()

In [None]:
# Report the size of each split after concatenating the chunks.
print(f'{df_train.shape[0]} rows in train')
print(f'{df_test.shape[0]} rows in test')

In [None]:
# Label columns; each example carries one value per class (multi-label).
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Convert to plain Python lists: X holds the parsed comment_text entries
# (presumably token-id sequences -- confirm against the CSV contents),
# Y holds one row of label values per example.
X_train, X_test = (split['comment_text'].tolist() for split in (df_train, df_test))
Y_train, Y_test = (split[classes].values.tolist() for split in (df_train, df_test))

## Neural Bag of Words
https://www.aclweb.org/anthology/P15-1162.pdf

In [None]:
class NBOW(nn.Module):
    """Neural bag-of-words multi-label classifier.

    Token embeddings are averaged, passed through a ReLU and a single linear
    layer, and squashed with a sigmoid so every class gets an independent
    probability.

    Args:
      VOCAB_SIZE: number of real vocabulary entries; one extra embedding row
        (index ``VOCAB_SIZE``) is reserved for padding.
      DIM_EMB: embedding dimensionality.
      NUM_CLASSES: number of output labels.
      gloVe: optional mapping word -> vector used to initialise embeddings;
        rows without a pretrained vector are drawn from N(0, 1).
      word2id: optional mapping word -> embedding row used to place the
        ``gloVe`` vectors. When omitted, the notebook-level ``vocab.word2id``
        is used.

    Raises:
      KeyError: if a ``gloVe`` word is missing from the word -> id mapping.
    """

    def __init__(self, VOCAB_SIZE, DIM_EMB=300, NUM_CLASSES=6, gloVe=None, word2id=None):
      super(NBOW, self).__init__()
      self.NUM_CLASSES = NUM_CLASSES
      self.num_words = VOCAB_SIZE + 1  # +1 row for the padding token
      self.pad_idx = VOCAB_SIZE        # padding uses the last embedding row
      self.V = nn.Embedding(num_embeddings=self.num_words, embedding_dim=DIM_EMB, padding_idx=self.pad_idx)
      self.g = nn.ReLU()
      self.W = nn.Linear(DIM_EMB, NUM_CLASSES)  # output projection
      self.final = nn.Sigmoid()

      if gloVe:
        if word2id is None:
          # Backward-compatible fallback to the notebook-level vocabulary.
          word2id = vocab.word2id
        weights = torch.normal(0, 1, size=self.V.weight.shape)
        for w, e in gloVe.items():
          weights[word2id[w]] = torch.tensor(e, dtype=weights.dtype)
        # The random initialisation above also filled the padding row;
        # restore it to zeros so padding never contributes to a sum.
        weights[self.pad_idx] = 0
        with torch.no_grad():
          self.V.weight.copy_(weights)

    def forward(self, X, mask=None, train=True):
      """Return per-class probabilities.

      Args:
        X: LongTensor of token ids. With ``train=True`` it is a padded batch
          of shape (batch, max_len); with ``train=False`` it is a single
          unpadded sequence of shape (seq_len,).
        mask: (batch, max_len) tensor with 1 for real tokens and 0 for
          padding. Only used when ``train`` is true; if omitted it is derived
          from ``X != pad_idx``.
        train: selects the batched/masked path (True) or the single-sequence
          path (False).

      Returns:
        Tensor of shape (batch, NUM_CLASSES) or (NUM_CLASSES,) with values in
        [0, 1], on the same device as the model parameters.
      """
      # Follow the model's own device instead of assuming CUDA is present.
      device = self.V.weight.device
      X = X.to(device)
      embed = self.V(X)  # (batch, max_len, DIM_EMB) or (seq_len, DIM_EMB)

      if train:
        if mask is None:
          mask = X != self.pad_idx
        mask = mask.to(device=device, dtype=embed.dtype)
        embed = embed * mask.unsqueeze(-1)
        # Average over real tokens only. Divide by the token count (not by a
        # per-dimension count of non-zero values, which is wrong whenever an
        # embedding component is exactly 0) and clamp so an all-padding row
        # yields a zero vector instead of 0/0 = NaN.
        token_count = mask.sum(dim=1, keepdim=True).clamp(min=1)
        agg = embed.sum(dim=1) / token_count  # (batch, DIM_EMB)
      elif embed.size(0) == 0:
        # Empty sequence: the mean of nothing is NaN, use a zero vector.
        agg = embed.new_zeros(embed.shape[1:])
      else:
        agg = embed.mean(dim=0)  # (DIM_EMB,)

      act = self.g(agg)
      val = self.W(act)
      return self.final(val)

def EvalNet(net, X, Y, threshold=0.5):
  """Print per-class and micro-averaged metrics for ``net`` on (X, Y).

  Args:
    net: model whose call ``net(x, train=False)`` returns one probability per
      class for a single sequence.
    X: list of token-id sequences, evaluated one at a time.
    Y: list of label rows, one value per entry of the global ``classes``.
    threshold: a probability strictly above this counts as a positive label.

  Returns:
    None; accuracy, recall, precision and F1 are printed.
  """
  net.eval()
  Y = np.array(Y)
  pred = np.zeros_like(Y)
  # Inference only: skip autograd bookkeeping so no graph is built per sample.
  with torch.no_grad():
    for i in range(len(X)):
      x = torch.LongTensor(X[i])
      probs = net(x, train=False).cpu()
      pred[i] = np.array(probs > threshold, dtype=float)

  # One-vs-rest metrics for every label column.
  for i in range(len(classes)):
    acc = accuracy_score(Y[:,i], pred[:,i])
    rec = recall_score(Y[:,i], pred[:,i])
    prec = precision_score(Y[:,i], pred[:,i])
    f1 = f1_score(Y[:,i], pred[:,i])
    print(f'{classes[i]} label')
    print(f'Accuracy: {acc} Recall {rec} Precision {prec} F1 {f1}')
    print('-----------------------')

  # On the full label matrix accuracy_score is exact-match (every label of an
  # example must be right); the other metrics are micro-averaged.
  total_acc = accuracy_score(Y, pred)
  total_rec = recall_score(Y, pred, average='micro')
  total_prec = precision_score(Y, pred, average='micro')
  total_f1 = f1_score(Y, pred, average='micro')
  print('Total')
  print(f'Accuracy: {total_acc} Recall {total_rec} Precision {total_prec} F1 {total_f1}')

def shuffle_sentences(sentences, tags):
  """Return new lists holding ``sentences`` and ``tags`` in one shared random order.

  The inputs are left untouched; the permutation is drawn with
  ``random.shuffle`` so it follows the global ``random`` seed.
  """
  order = list(range(len(sentences)))
  random.shuffle(order)
  # Apply the same permutation to both sequences to keep pairs aligned.
  shuffled_sentences = [sentences[k] for k in order]
  shuffled_tags = [tags[k] for k in order]
  return (shuffled_sentences, shuffled_tags)

#Pad inputs to max sequence length (for batching)
def pad_input(X_list, pad_val):
  """Right-pad a list of id sequences to a common length.

  Returns a pair ``(X_padded, X_mask)``: a LongTensor of shape
  (batch, max_len) filled with ``pad_val`` past each sequence's end, and a
  FloatTensor of the same shape holding 1.0 on real tokens and 0.0 on padding.
  """
  pad = torch.nn.utils.rnn.pad_sequence
  id_seqs = [torch.as_tensor(seq) for seq in X_list]
  one_seqs = [torch.ones(len(seq)) for seq in X_list]
  X_padded = pad(id_seqs, batch_first=True, padding_value=pad_val).type(torch.LongTensor)
  # The default padding value of 0 marks the padded positions in the mask.
  X_mask = pad(one_seqs, batch_first=True).type(torch.FloatTensor)
  return X_padded, X_mask

def Train(net, X, Y, n_iter, lr, batch_size=50):
  """Train ``net`` with Adam and binary cross-entropy on mini-batches.

  Args:
    net: model called as ``net(x, x_mask)`` that returns probabilities of
      shape (batch, num_labels) and exposes ``pad_idx``.
    X: list of token-id sequences.
    Y: list of label rows aligned with ``X``.
    n_iter: number of passes over the data.
    lr: Adam learning rate.
    batch_size: number of examples per optimisation step.

  Returns:
    None; the summed batch loss of every epoch is printed.
  """
  # `import tqdm` alone does not guarantee the `notebook` submodule is loaded.
  import tqdm.notebook

  print("Start Training!")
  optimizer = optim.Adam(net.parameters(), lr=lr)
  crit = nn.BCELoss()  # stateless, so build it once rather than per batch
  # Put the targets wherever the model lives instead of assuming CUDA.
  device = next(net.parameters()).device

  for epoch in range(n_iter):
    total_loss = 0.0
    net.train()   #Put the network into training mode
    (X_shuffled, Y_shuffled) = shuffle_sentences(X, Y)
    for batch in tqdm.notebook.tqdm(range(0, len(X), batch_size), leave=False):
      x, x_mask = pad_input(X_shuffled[batch:batch+batch_size], net.pad_idx)
      y = torch.FloatTensor(Y_shuffled[batch:batch+batch_size]).to(device)

      optimizer.zero_grad()
      probs = net(x, x_mask)
      loss = crit(probs, y)
      # .item() detaches the value; adding the tensor itself would chain the
      # autograd history of every batch into total_loss.
      total_loss += loss.item()

      loss.backward()
      optimizer.step()
    print("loss on epoch {0} = {1}".format(epoch, total_loss))

In [None]:
nbow = NBOW(VOCAB_SIZE=vocab.num_words, gloVe=gloVe).cuda()

In [None]:
Train(nbow, X_train, Y_train, n_iter=4, lr=0.01)

In [None]:
EvalNet(nbow, X_test, Y_test)