# MSDS631 - Deep Learning Final Project 2019 
## Quora Insincere Questions Classification
Detect toxic content to improve online conversations

----
### _Team: Jenny Kong & Joy Qi_

# Using BiLSTM

In [1]:
import numpy as np 
import pandas as pd 
import os
import spacy
import string
import re
import numpy as np
from spacy.symbols import ORTH
from collections import Counter
from pathlib import Path
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [2]:
PATH = Path("../final-project/input/")
list(PATH.iterdir())

[PosixPath('../final-project/input/test.csv'),
 PosixPath('../final-project/input/train.csv'),
 PosixPath('../final-project/input/sample_submission.csv')]

# Processing Data

In [4]:
df_train = pd.read_csv(PATH/"train.csv")
df_test = pd.read_csv(PATH/"test.csv")

In [5]:
print("Train shape: ", df_train.shape)
print("Test shape: ", df_test.shape)

Train shape:  (1306122, 3)
Test shape:  (375806, 2)


In [6]:
df_train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [7]:
df_test.head()

Unnamed: 0,qid,question_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arroga...
1,00002bd4fb5d505b9161,When should I apply for RV college of engineer...
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitio...
3,000086e4b7e1c7146103,Who are entrepreneurs?
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?


In [8]:
df_train.target.value_counts()

0    1225312
1      80810
Name: target, dtype: int64

In [9]:
df_train.dtypes

qid              object
question_text    object
target            int64
dtype: object

In [10]:
df_train['question_text'] = df_train['question_text'].astype(str)
df_train['qid'] = df_train['qid'].astype(str)

# Tokenization

In [11]:
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
def sub_br(x): return re_br.sub("\n", x)

my_tok = spacy.load('en')
def spacy_tok(x): return [tok.text for tok in my_tok.tokenizer(sub_br(x))]

In [12]:
text = df_train['question_text'][0]
spacy_tok(text)

['How',
 'did',
 'Quebec',
 'nationalists',
 'see',
 'their',
 'province',
 'as',
 'a',
 'nation',
 'in',
 'the',
 '1960s',
 '?']

# Computing vocab2index

In [13]:
counts = Counter()
for q in df_train['question_text']:
    counts.update(spacy_tok(q))

In [14]:
len(counts.keys())

262118

In [15]:
for word in list(counts):
    if counts[word] < 5:
        del counts[word]

In [16]:
len(counts.keys())

57655

In [17]:
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

# Dataset

In [18]:
def encode_sentence(row, vocab2index, N=100, padding_start=True):
    words = spacy_tok(row)
    enc = np.zeros(N, dtype=np.int32)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in words])
    l = min(N, len(enc1))
    if padding_start:
        enc[:l] = enc1[:l]
    else:
        enc[N-l:] = enc1[:l]
    return enc, l

In [19]:
encode_sentence(df_train['question_text'][0], vocab2index, N=100, padding_start=False)

(array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
       dtype=int32), 14)

### Select max sequece length N according to the 98th percentile length

In [20]:
x_train_len = np.array([len(q.split()) for q in df_train['question_text']])
x_test_len = np.array([len(q.split()) for q in df_test['question_text']])

In [21]:
x_train_len.max(), x_test_len.max()

(134, 87)

In [22]:
# 99 percentile
N_train = np.percentile(x_train_len, 99)
N_test = np.percentile(x_test_len, 99)

N_train, N_test

(39.0, 39.0)

### Hence we chose 50 to be the N

In [23]:
N = 40

# Split train and valid dataset

In [24]:
train, valid = train_test_split(df_train, test_size=0.2)

In [25]:
train.shape, valid.shape

((1044897, 3), (261225, 3))

In [67]:
class QuoraDataset(Dataset):
    def __init__(self, df, vocab2index, is_test=False, N=40, padding_start=True):
        self.question = [encode_sentence(q, vocab2index, N, padding_start) for q in df['question_text']]
        self.is_test = is_test
        if self.is_test:
            self.y = None
        else:
            self.y = list(df["target"])

    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x, s = self.question[idx]
        
        if self.is_test:
            return x, s
        else:
            y = self.y[idx]
            return x, s, y

In [42]:
train_ds = QuoraDataset(train, vocab2index, N=40, padding_start=False)
valid_ds = QuoraDataset(valid, vocab2index, N=40, padding_start=False)

In [43]:
batch_size = 1000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

# Bidirectional LSTM

In [45]:
batch_size = 7
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

x,s,y = next(iter(train_dl)) # here s is the length of the sent

In [46]:
vocab_size = len(words)
embedding_dim = 10
hidden_dim = 9
embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
lstm1 = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
lstm2 = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)

In [47]:
s, index = s.sort(0, descending=True)
x = x[index]
x = embed(x.long())
x_pack = pack_padded_sequence(x, list(s), batch_first=True)

In [49]:
class LSTMBiModel(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super(LSTMBiModel,self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, batch_first=True,
                            dropout=0.3, bidirectional=True)
        self.linear = nn.Linear(2*hidden_dim, 1)
        
    def forward(self, x, s):
        s, sort_index = torch.sort(s, 0,descending=True)
        s = s.numpy().tolist()
        x = x[sort_index]
        x = self.embeddings(x)
        x_pack = pack_padded_sequence(x, s, batch_first=True)
        out_pack, (ht, ct) = self.lstm(x_pack)
        h = torch.cat((ht[-2,:,:], ht[-1,:,:]), dim = 1)
        h = self.linear(h)
        return torch.zeros_like(h).scatter_(0, sort_index.unsqueeze(1), h)

In [50]:
def train_epocs(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, s, y in train_dl:
            x = x.long()
            y = y.float()
            y_pred = model(x, s)
            optimizer.zero_grad()
            loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc = val_metrics(model, valid_dl)
        if i % 5 == 1:
            print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

In [51]:
def val_metrics(model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for x, s, y in valid_dl:
        x = x.long()
        y = y.float().unsqueeze(1)
        y_hat = model(x, s)
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        y_pred = y_hat > 0
        correct += (y_pred.float() == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

In [52]:
vocab_size = len(words)
model2 = LSTMBiModel(vocab_size, 50, 50)

In [57]:
train_epocs(model2, epochs=30, lr=0.01)

train loss 0.197 val loss 0.198 and val accuracy 0.944
train loss 0.191 val loss 0.197 and val accuracy 0.944
train loss 0.190 val loss 0.200 and val accuracy 0.944
train loss 0.190 val loss 0.200 and val accuracy 0.944
train loss 0.189 val loss 0.200 and val accuracy 0.944
train loss 0.190 val loss 0.200 and val accuracy 0.943


In [62]:
val_metrics(model2, valid_dl)

(0.2003046195636424, tensor(0.9438))