In [None]:
# Mount Google Drive at /content/drive; every data path used later in this
# notebook lives underneath that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import sys

# Project folder on the mounted Google Drive.
data_loc = "/content/drive/MyDrive/Deep_Learning"

# Make modules stored in the project folder importable. The membership check
# keeps this cell idempotent: re-running it does not append duplicate entries
# to sys.path.
project_path = os.path.abspath(data_loc)
if project_path not in sys.path:
    sys.path.append(project_path)

In [None]:
import logging
import time
from platform import python_version
import gc

import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import roc_auc_score
import matplotlib
import matplotlib.pyplot as plt
import re

import tensorflow as tf

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

In [None]:
# Build the working frame `df`: the rows of the full training set whose id is
# listed in the sample file, sorted by id, plus a combined binary target.
# Paths are derived from `data_loc` (defined in the path-setup cell) instead of
# repeating the absolute Drive path.
data_dir = os.path.join(data_loc, "data")
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

data = pd.read_excel(os.path.join(data_dir, "train_sample_30K.xlsx"))
data_og = pd.read_csv(os.path.join(data_dir, "train.csv"))

# Only the `id` column of the sample is used; the merge (inner join by default)
# pulls the remaining columns from the full training set.
df_v2 = pd.merge(data[['id']], data_og, on='id')
df = df_v2.sort_values(by=['id']).reset_index(drop=True)

# toxic_binary is 1 when at least one of the six labels equals 1, else 0.
df['toxic_binary'] = (df[label_cols] == 1).any(axis=1).astype(int)
df.head()

In [None]:
class Feedforward(torch.nn.Module):
    """Feed-forward classifier with one hidden layer and sigmoid outputs.

    Dropout is applied twice in ``forward``: once to the raw input and once
    to the hidden activations. Both use the same ``nn.Dropout`` module, i.e.
    the same probability.

    Args:
        input_size: Number of features in each (already flattened) input row.
        hidden_size: Width of the hidden layer.
        dropout: Dropout probability.
        output_size: Number of output units. Defaults to 6, the value that
            used to be hard-coded, so existing callers are unaffected.
    """

    def __init__(self, input_size, hidden_size, dropout, output_size=6):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout = nn.Dropout(dropout)
        # Attribute names (fc1, fc2, ...) are kept so state_dict keys do not change.
        self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(self.hidden_size, self.output_size)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid activations of shape ``(..., output_size)``.

        Args:
            x: Tensor whose last dimension equals ``input_size``.
        """
        x = self.dropout(x)
        hidden = self.relu(self.fc1(x))
        hidden = self.dropout(hidden)
        return self.sigmoid(self.fc2(hidden))



class KimCNN(nn.Module):
    """Convolutional classifier: parallel convolutions, max-over-time pooling.

    ``forward`` works on inputs that are already embedded: it adds a channel
    axis and convolves with kernels of shape ``(K, embed_dim)``, so ``x`` has
    to be ``(batch, seq_len, embed_dim)``.

    Args:
        embed_num: Vocabulary size of the ``nn.Embedding`` layer.
        embed_dim: Embedding width; also the width of every convolution kernel.
        class_num: Number of output units.
        kernel_num: Number of filters per kernel size.
        kernel_sizes: Iterable of kernel heights (tokens covered by a filter).
        dropout: Dropout probability applied to the pooled features.
        static: When true, the input is detached from the autograd graph so
            no gradient flows back into whatever produced the embeddings.
    """

    def __init__(self, embed_num, embed_dim, class_num, kernel_num, kernel_sizes, dropout, static):
        super().__init__()
        self.static = static
        # NOTE: `embed` is never called in forward(). It is kept so the
        # module's parameters and state_dict keys stay exactly as before.
        self.embed = nn.Embedding(embed_num, embed_dim)
        self.convs1 = nn.ModuleList(
            [nn.Conv2d(1, kernel_num, (size, embed_dim)) for size in kernel_sizes]
        )
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(len(kernel_sizes) * kernel_num, class_num)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid activations of shape ``(batch, class_num)``."""
        if self.static:
            # Explicit replacement for the deprecated `Variable(x)` wrapper.
            x = x.detach()
        x = x.unsqueeze(1)  # add the single input channel expected by Conv2d
        pooled = []
        for conv in self.convs1:
            # The kernel spans the full embedding width, so the last axis has
            # size 1 after the convolution and can be squeezed away.
            feature_map = F.relu(conv(x)).squeeze(3)
            # Max over the whole remaining sequence axis.
            pooled.append(F.max_pool1d(feature_map, feature_map.size(2)).squeeze(2))
        x = torch.cat(pooled, 1)
        x = self.dropout(x)
        return self.sigmoid(self.fc1(x))


class GRUNet(nn.Module):
    """Bidirectional GRU whose last time step feeds a sigmoid output layer.

    Args:
        input_dim: Feature size of every time step.
        hidden_dim: Hidden size of the GRU (per direction).
        output_dim: Number of output units.
        n_layers: Number of stacked GRU layers.
        drop_prob: Dropout probability handed to ``nn.GRU``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, drop_prob):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.gru = nn.GRU(
            input_dim,
            hidden_dim,
            n_layers,
            batch_first=True,
            bidirectional=True,
            dropout=drop_prob,
        )
        # Forward and backward features are concatenated: 2 * hidden_dim inputs.
        self.fc = nn.Linear(2 * hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x, h):
        """Run the GRU and classify from the output at the last time step.

        Returns:
            Tuple ``(probabilities, hidden)`` where ``hidden`` is the state
            returned by the GRU.
        """
        sequence_out, h = self.gru(x, h)
        last_step = sequence_out[:, -1]
        activated = self.relu(last_step)
        probabilities = self.sigmoid(self.fc(activated))
        return probabilities, h

    def init_hidden(self, batch_size):
        """Return an all-zero initial hidden state for ``batch_size`` sequences.

        The tensor takes dtype and device from the module's first parameter
        and has shape ``(2 * n_layers, batch_size, hidden_dim)``.
        """
        reference = next(self.parameters()).data
        state_shape = (self.n_layers * 2, batch_size, self.hidden_dim)
        return reference.new_zeros(state_shape)

In [None]:
# Choose the model to train by setting MODEL_NAME; only that model is built.
# Valid values: "feedforward", "kimcnn", "gru".
# The default is "gru" because the GRU was the last model assigned to `model`
# when all three were constructed one after another.
MODEL_NAME = "gru"

if MODEL_NAME == "feedforward":
    model = Feedforward(
        # presumably 100 positions x 768 features, flattened - confirm against
        # the stored embedding tensors
        input_size=76800,
        hidden_size=1000,
        dropout=0.7,
    )
elif MODEL_NAME == "kimcnn":
    # The keyword arguments were previously written without separating commas,
    # which made the whole cell a SyntaxError.
    model = KimCNN(
        embed_num=400,
        embed_dim=768,
        class_num=6,
        kernel_num=3,
        kernel_sizes=[2, 3, 4],
        dropout=0.7,
        static=True,
    )
elif MODEL_NAME == "gru":
    # With n_layers=1, nn.GRU applies no dropout (it only acts between stacked
    # layers) and PyTorch emits a warning about the non-zero drop_prob.
    model = GRUNet(
        input_dim=100,
        hidden_dim=768,
        output_dim=6,
        n_layers=1,
        drop_prob=0.7,
    )
else:
    raise ValueError(
        "Unknown MODEL_NAME %r; expected 'feedforward', 'kimcnn' or 'gru'." % MODEL_NAME
    )

In [None]:
# Training configuration.
# n_epochs: number of passes the training loop makes over the training subsets.
n_epochs = 5
# batch_size: number of rows per mini-batch produced by generate_batch_data.
batch_size = 50
# lr: learning rate handed to Adam.
lr = 0.0001
# Adam over every parameter of the currently selected `model`.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# Binary cross-entropy on probabilities; all three models end in a sigmoid.
loss_fn = nn.BCELoss()

In [None]:
def generate_batch_data(x, y, batch_size):
    """Yield consecutive mini-batches of ``x`` and ``y`` with a 1-based index.

    Args:
        x: Sliceable sequence of inputs (list, array, tensor, ...).
        y: Sliceable sequence of targets, sliced with the same bounds as ``x``.
        batch_size: Number of items per batch.

    Yields:
        Tuples ``(x_batch, y_batch, batch_number)``. Every batch except
        possibly the last holds ``batch_size`` items; the last one holds the
        remainder. When ``len(x) <= batch_size`` the whole input is yielded
        once as batch number 1.
    """
    total = len(x)
    start = 0
    batch_number = 0
    # Full batches; the final chunk is deliberately left for the tail step below.
    for start in range(0, total - batch_size, batch_size):
        batch_number += 1
        stop = start + batch_size
        yield x[start:stop], y[start:stop], batch_number
    # Whatever follows the last full batch.
    tail_start = start + batch_size
    if tail_start < total:
        yield x[tail_start:], y[tail_start:], batch_number + 1
    # The loop never ran: the input fits into a single batch.
    if batch_number == 0:
        yield x, y, 1

MODEL TRAINING
- Choose one model and comment out the lines that belong to the other models (the comments in the code mark which lines are specific to each model).

In [None]:
# Train `model` on data subsets 0-24.
# The model-specific steps (input reshaping and the forward call) are selected
# from the type of `model`, so no lines need to be commented in or out here.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
tensor_dir = os.path.join(data_loc, "data_tensors")
n_train_subsets = 25   # embedding files embed_0 ... embed_24
subset_size = 1000     # each embedding file is paired with 1000 consecutive rows of df
is_feedforward = isinstance(model, Feedforward)
is_gru = isinstance(model, GRUNet)

train_losses, val_losses = [], []

# Re-enable dropout in case model.eval() was called by an earlier run of the
# validation cell.
model.train()

for epoch in range(n_epochs):
    start_time = time.time()
    train_loss = 0.0
    n_batches = 0

    for i in range(n_train_subsets):
        if (i + 1) % 5 == 0:
            print("Epoch %d - data subset %d" % (epoch + 1, i + 1))
        file_to_read = os.path.join(tensor_dir, "embed_" + str(i) + "_12.pt")
        x_train = torch.load(file_to_read)

        if is_feedforward:
            # Method 1 - FeedForward Neural Network: flatten every sample to one vector.
            x_train = x_train.view(x_train.size(0), -1)
        elif is_gru:
            # Method 3 - Bi-GRU: swap the last two axes.
            x_train = x_train.transpose(-2, -1)
        # Method 2 - Kim-CNN: the tensor is used as stored.

        y_train = torch.tensor(
            df.iloc[i * subset_size:(i + 1) * subset_size][label_cols].values.astype(np.float32)
        )

        for x_batch, y_batch, batch in generate_batch_data(x_train, y_train, batch_size):
            if is_gru:
                # Fresh zero state for every batch: the samples are independent,
                # so no state is carried over from the previous batch. This
                # matches validation and also works when the last batch is
                # smaller than batch_size.
                h = model.init_hidden(x_batch.shape[0])
                y_pred, h = model(x_batch, h)
            else:
                y_pred = model(x_batch)

            optimizer.zero_grad()
            loss = loss_fn(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            n_batches += 1

    # Mean loss per batch, using an explicit count instead of the values the
    # loop variables happen to hold after the loops finish.
    train_loss /= n_batches
    train_losses.append(train_loss)
    elapsed = time.time() - start_time

    print(
        "Epoch %d Train loss: %.2f. Elapsed time: %.2fs."
        % (epoch + 1, train_losses[-1], elapsed)
    )

MODEL VALIDATION
- Choose one model and comment out the lines that belong to the other models (the comments in the code mark which lines are specific to each model).

In [None]:
# Score all 30 data subsets with the trained model and collect the predictions
# in `y_preds_np` (one row per sample, one column per label).
# The model-specific steps are selected from the type of `model`, so no lines
# need to be commented in or out here.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
tensor_dir = os.path.join(data_loc, "data_tensors")
is_feedforward = isinstance(model, Feedforward)
is_gru = isinstance(model, GRUNet)

model.eval() # disable dropout for deterministic output

with torch.no_grad(): # deactivate autograd engine to reduce memory usage and speed up computations
    y_preds = []
    for i in range(0, 30):
        if (i + 1) % 5 == 0:
            # No epoch number here: this cell must not depend on a loop
            # variable left behind by the training cell.
            print("Validation - data subset %d" % (i + 1))

        # Only the "_12" embedding file of each subset is used. An earlier
        # variant concatenated the "_9" ... "_12" files along dim=1.
        file_to_read = os.path.join(tensor_dir, "embed_" + str(i) + "_12.pt")
        x_test = torch.load(file_to_read)

        if is_feedforward:
            # Method 1 - FeedForward Neural Network: flatten every sample to one vector.
            x_test = x_test.view(x_test.size(0), -1)
        elif is_gru:
            # Method 3 - Bi-GRU: swap the last two axes.
            x_test = x_test.transpose(-2, -1)
        # Method 2 - Kim-CNN: the tensor is used as stored.

        # The targets are only needed to drive the batching helper here.
        y_test = torch.tensor(
            df.iloc[i * 1000:(i + 1) * 1000][label_cols].values.astype(np.float32)
        )

        for x_batch, y_batch, batch in generate_batch_data(x_test, y_test, batch_size):
            if is_gru:
                # Zero initial state sized to the actual batch.
                h = model.init_hidden(x_batch.shape[0])
                y_pred, h = model(x_batch, h)
            else:
                y_pred = model(x_batch)

            y_preds.extend(y_pred.cpu().numpy().tolist())

    y_preds_np = np.array(y_preds)

In [None]:
# Ground-truth labels for every row of df as a 2-D array, with the six label
# columns in the same order used to build the training targets.
y_test_np = df[['toxic','severe_toxic','obscene','threat','insult','identity_hate']].values

In [None]:
# Per-label ROC AUC on the first 25,000 rows, i.e. the rows of the 25 data
# subsets the training loop iterates over.
train_rows = slice(0, 25000)
label_names = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
auc_scores = roc_auc_score(y_test_np[train_rows], y_preds_np[train_rows], average=None)
df_accuracy = pd.DataFrame({"label": label_names, "auc": auc_scores})
# Ascending sort followed by a row reversal: best label first.
df_accuracy.sort_values('auc').iloc[::-1]

In [None]:
# Per-label ROC AUC on the rows from index 25,000 onwards, i.e. the rows that
# the training loop never reads.
test_rows = slice(25000, None)
label_names = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
auc_scores = roc_auc_score(y_test_np[test_rows], y_preds_np[test_rows], average=None)
df_accuracy = pd.DataFrame({"label": label_names, "auc": auc_scores})
# Ascending sort followed by a row reversal: best label first.
df_accuracy.sort_values('auc').iloc[::-1]