# MSDS631 - Deep Learning Final Project 2019 
## Quora Insincere Questions Classification
Detect toxic content to improve online conversations

----
### _Team: Jenny Kong & Joy Qi_

# Using LSTM 

In [1]:
import numpy as np 
import pandas as pd 
import os
import spacy
import string
import re
import numpy as np
from spacy.symbols import ORTH
from collections import Counter
from pathlib import Path
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [2]:
PATH = Path("../input/")
list(PATH.iterdir())

[PosixPath('../input/.DS_Store'),
 PosixPath('../input/test.csv'),
 PosixPath('../input/train.csv'),
 PosixPath('../input/glove.6B'),
 PosixPath('../input/sample_submission.csv')]

# Processing Data

In [3]:
df_train = pd.read_csv(PATH/"train.csv")
df_test = pd.read_csv(PATH/"test.csv")

In [4]:
print("Train shape: ", df_train.shape)
print("Test shape: ", df_test.shape)

Train shape:  (1306122, 3)
Test shape:  (375806, 2)


In [5]:
df_train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [6]:
df_test.head()

Unnamed: 0,qid,question_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arroga...
1,00002bd4fb5d505b9161,When should I apply for RV college of engineer...
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitio...
3,000086e4b7e1c7146103,Who are entrepreneurs?
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?


In [7]:
df_train.target.value_counts()

0    1225312
1      80810
Name: target, dtype: int64

In [8]:
df_train.dtypes

qid              object
question_text    object
target            int64
dtype: object

In [9]:
df_train['question_text'] = df_train['question_text'].astype(str)
df_train['qid'] = df_train['qid'].astype(str)

# Tokenization

In [10]:
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
def sub_br(x): return re_br.sub("\n", x)

my_tok = spacy.load('en')
def spacy_tok(x): return [tok.text for tok in my_tok.tokenizer(sub_br(x))]

In [11]:
text = df_train['question_text'][0]
spacy_tok(text)

['How',
 'did',
 'Quebec',
 'nationalists',
 'see',
 'their',
 'province',
 'as',
 'a',
 'nation',
 'in',
 'the',
 '1960s',
 '?']

# Computing vocab2index

In [12]:
counts = Counter()
for q in df_train['question_text']:
    counts.update(spacy_tok(q))

In [13]:


len(counts.keys())

262118

In [14]:
for word in list(counts):
    if counts[word] < 5:
        del counts[word]

In [15]:
len(counts.keys())

57655

In [16]:
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

# Dataset

In [17]:
def encode_sentence(row, vocab2index, N=100, padding_start=True):
    words = spacy_tok(row)
    enc = np.zeros(N, dtype=np.int32)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in words])
    l = min(N, len(enc1))
    if padding_start:
        enc[:l] = enc1[:l]
    else:
        enc[N-l:] = enc1[:l]
    return enc, l

In [18]:
encode_sentence(df_train['question_text'][0], vocab2index, N=100, padding_start=False)

(array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
       dtype=int32), 14)

### Select max sequece length N according to the 98th percentile length

In [19]:
x_train_len = np.array([len(q.split()) for q in df_train['question_text']])
x_test_len = np.array([len(q.split()) for q in df_test['question_text']])

In [20]:
x_train_len.max(), x_test_len.max()

(134, 87)

In [21]:
# 99 percentile
N_train = np.percentile(x_train_len, 99)
N_test = np.percentile(x_test_len, 99)

N_train, N_test

(39.0, 39.0)

### Hence we chose 50 to be the N

In [22]:
N = 40

# Split train and valid dataset

In [23]:
train, valid = train_test_split(df_train, test_size=0.2)

In [24]:
train.shape, valid.shape

((1044897, 3), (261225, 3))

In [25]:
# class QuoraDataset(Dataset):
#     def __init__(self, df, vocab2index, is_test=False, N=40, padding_start=True):
#         self.question = [encode_sentence(q, vocab2index, N, padding_start) for q in df['question_text']]
#         self.is_test = is_test
#         if self.is_test:
#             self.y = None
#         else:
#             self.y = list(df["target"])

#     def __len__(self):
#         return len(self.question)
    
#     def __getitem__(self, idx):
#         x = self.question[idx]
#         if self.is_test:
#             return x
#         else:
#             y = self.y[idx]
#             return x, y

class QuoraDataset(Dataset):
    def __init__(self, df, vocab2index, is_test=False, N=40, padding_start=True):
        self.question = [encode_sentence(q, vocab2index, N, padding_start) for q in df['question_text']]
        self.is_test = is_test
        if self.is_test:
            self.y = None
        else:
            self.y = list(df["target"])

    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x, s = self.question[idx]
        
        if self.is_test:
            return x
        else:
            y = self.y[idx]
            return x, s, y

In [26]:
train_ds = QuoraDataset(train, vocab2index, N=40, padding_start=False)
valid_ds = QuoraDataset(valid, vocab2index, N=40, padding_start=False)

In [27]:
batch_size = 1000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [28]:
# x, y = next(iter(train_dl))
x,s,y = next(iter(train_dl))

In [29]:
# x, y = next(iter(train_dl))
len(x)

1000

In [30]:
x.shape

torch.Size([1000, 40])

In [31]:
train_ds[0]

(array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
          123,    92,    13,   927,    29,  1763,   175, 24124,  5966,
        35388,     1,     1,    15], dtype=int32), 13, 0)

## Embedding Matrix


In [32]:
def loadGloveModel(gloveFile="../input/glove.6B/glove.6B.300d.txt"):
    """ Loads word vectors into a dictionary."""
    f = open(gloveFile,'r')
    word_vecs = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        word_vecs[word] = np.array([float(val) for val in splitLine[1:]])
    return word_vecs
word_vecs = loadGloveModel()

In [33]:
def random_word_vector(D=300):
    """Create arandom word vector
    
    0.25 is chosen so the unknown vectors have (approximately) same variance 
    as pre-trained ones
    """
    return np.random.uniform(-0.25,0.25,D)

def create_embedding_matrix(word_vecs, vocab2index, words, D=300):
    """Creates embedding matrix from word vectors. """
    V = len(words)
    W = np.zeros((V, D), dtype="float32")
    W[0] = np.zeros(D, dtype='float32')
    i = 1
    for i in range(1, V):
        if words[i] in word_vecs:
            W[i] = word_vecs[words[i]]
        else:
            W[i] = random_word_vector()
    return W

In [34]:
embedding_matrix = create_embedding_matrix(word_vecs, vocab2index, words)
embedding_matrix.shape

(57657, 300)

# Model

In [35]:
# class LSTMModel(torch.nn.Module) :
#     def __init__(self, vocab_size, embedding_dim, hidden_dim) :
#         super(LSTMModel,self).__init__()
#         self.hidden_dim = hidden_dim
#         self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
#         self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)  # (batch, seq, feature)
#         self.linear = nn.Linear(hidden_dim, 1)
#         self.dropout = nn.Dropout(0.5)
        
#     def forward(self, x):
#         x_emb = self.embeddings(x)
#         x_drop = self.dropout(x_emb)
#         out_pack, (ht, ct) = self.lstm(x_drop)
#         return self.linear(ht[-1])
class GRUModel(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim, embedding_matrix):
        super(GRUModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))
        self.embedding.weight.requires_grad = False ## freeze embeddings
        self.dropout = nn.Dropout(0.5)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        
    def forward(self, x, s):
        s, sort_index = torch.sort(s, 0,descending=True)
        s = s.numpy().tolist()
        x = x[sort_index]
        x = self.embedding(x)
        x = self.dropout(x)
        x_pack = pack_padded_sequence(x, list(s), batch_first=True)
        out_pack, ht= self.gru(x_pack)
        out = self.linear(ht[-1])
        return torch.zeros_like(out).scatter_(0, sort_index.unsqueeze(1), out)

In [36]:
# def train_epocs(model, epochs=10, lr=0.001):
#     parameters = filter(lambda p: p.requires_grad, model.parameters())
#     optimizer = torch.optim.Adam(parameters, lr=lr)
#     for i in range(epochs):
#         model.train()
#         sum_loss = 0.0
#         total = 0
#         for x, s, y in train_dl:
#             x = x.long()
#             y = y.float()
#             y_pred = model(x, s)
#             optimizer.zero_grad()
#             loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
#             loss.backward()
#             optimizer.step()
#             sum_loss += loss.item()*y.shape[0]
#             total += y.shape[0]
#         val_loss, val_acc = val_metrics(model, val_dl)
#         if i % 5 == 1:
#             print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

In [37]:
# def val_metrics(model, valid_dl):
#     model.eval()
#     correct = 0
#     total = 0
#     sum_loss = 0.0
#     for x, s, y in valid_dl:
#         x = x.long().cuda()
#         y = y.float().cuda().unsqueeze(1)
#         y_hat = model(x, s)
#         loss = F.binary_cross_entropy_with_logits(y_hat, y)
#         y_pred = y_hat > 0
#         correct += (y_pred.float() == y).float().sum()
#         total += y.shape[0]
#         sum_loss += loss.item()*y.shape[0]
#     return sum_loss/total, correct/total

In [38]:
def train_epocs(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, s, y in train_dl:
            x = x.long()
            y = y.float()
            y_pred = model(x, s)
            optimizer.zero_grad()
            loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc = val_metrics(model, valid_dl)
        if i % 5 == 1:
            print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

def val_metrics(model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for x, s, y in valid_dl:
        x = x.long()
        y = y.float().unsqueeze(1)
        y_hat = model(x, s)
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        y_pred = y_hat > 0
        correct += (y_pred.float() == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

# def train_epocs(model, epochs=10, lr=0.001):
#     parameters = filter(lambda p: p.requires_grad, model.parameters())
#     optimizer = torch.optim.Adam(parameters, lr=lr)
#     for i in range(epochs):
#         model.train()
#         sum_loss = 0.0
#         total = 0
#         for x, y in train_dl:
#             x = torch.tensor(x[0], dtype=torch.long)
#             y = y.float()
#             y_pred = model(x)
#             optimizer.zero_grad()
#             loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
#             loss.backward()
#             optimizer.step()
#             sum_loss += loss.item()*y.shape[0]
#             total += y.shape[0]
#         val_loss, val_acc = val_metrics(model, valid_dl)
#         if i % 5 == 1:
#             print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

In [39]:
# def val_metrics(model, valid_dl):
#     model.eval()
#     correct = 0
#     total = 0
#     sum_loss = 0.0
#     for x, y in valid_dl:
#         x = torch.tensor(x[0], dtype=torch.long)
#         y = y.float().unsqueeze(1)
#         y_hat = model(x)
#         loss = F.binary_cross_entropy_with_logits(y_hat, y)
#         y_pred = y_hat > 0
#         correct += (y_pred.float() == y).float().sum()
#         total += y.shape[0]
#         sum_loss += loss.item()*y.shape[0]
#     return sum_loss/total, correct/total

In [40]:
vocab_size = len(words)
print(vocab_size)

57657


In [41]:
model = GRUModel(vocab_size, 300, 50, embedding_matrix)

## GRU

In [43]:
%%time
train_epocs(model, epochs=15, lr=0.01)

train loss 0.199 val loss 0.196 and val accuracy 0.944
train loss 0.198 val loss 0.196 and val accuracy 0.944
train loss 0.198 val loss 0.197 and val accuracy 0.944
CPU times: user 2h 52min 52s, sys: 23min 29s, total: 3h 16min 21s
Wall time: 1h 8min 41s


# Save the models

In [69]:
def save_model(m, p): 
    torch.save(m.state_dict(), p)
    
def load_model(m, p): 
    m.load_state_dict(torch.load(p))

In [71]:
p1 = "./models/model-96.pth"
save_model(model, p1)

# Load the model

In [73]:
model1 = LSTMModel(vocab_size, 50, 50)
model_path = "./models/model-96.pth"
load_model(model1, model_path)

# Making predictions

In [74]:
test_ds = QuoraDataset(df_test, vocab2index, is_test=True, N=40, padding_start=False)
test_dl = DataLoader(test_ds, batch_size=1000)

In [92]:
df_submission = pd.read_csv(PATH/"sample_submission.csv")

In [100]:
preds=[]
for x in test_dl:
    x = x[0].long()
    out = model(x)
    pred = (out > 0.0).long()
    preds.append(pred.numpy())

In [102]:
df_submission['prediction'] = np.vstack(preds)

In [103]:
df_submission.head()

Unnamed: 0,qid,prediction
0,0000163e3ea7c7a74cd7,1
1,00002bd4fb5d505b9161,0
2,00007756b4a147d2b0b3,0
3,000086e4b7e1c7146103,0
4,0000c4c3fbe8785a3090,0


In [104]:
df_submission.to_csv('submission.csv', index=False)