In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
import time
from nltk.corpus import stopwords
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Hyper parameters
n_sample = 1000          # validation rows per class (same value as get_dev_val's default)
batch_size = 16
learning_rate = 0.001

lstm_in_dim = 300        # passed to the model as the per-token input width
lstm_hidden_dim = 2048
lstm_out_dim = 1         # one output value per question

num_epoch = 100

# Use the GPU only when one is actually available, so the notebook also runs
# on CPU-only machines instead of failing at the first .cuda() call.
use_cuda = torch.cuda.is_available()

In [3]:
# Train - Dev Val set
# Read the labelled questions from the working directory; the split below
# uses the "question_text" and "target" columns of this frame.
train_df = pd.read_csv("train.csv")

def get_dev_val(df, n_sample = 1000):
    """Split ``df`` into a development (training) part and a validation part.

    Rows are separated by the ``target`` column (0 = sincere, 1 = insincere).
    The first ``n_sample`` rows of each class form the validation part and all
    remaining rows form the development part. Inside each part the sincere
    rows come first, followed by the insincere rows. The shape of each class
    frame is printed.

    Returns:
        (dev, val): two dicts with keys ``"x"`` (list of question texts) and
        ``"y"`` (list of targets).
    """
    insincere = df[df["target"] == 1]
    sincere = df[df["target"] == 0]

    print("Insincere:", insincere.shape)
    print("Sincere:", sincere.shape)

    def _gather(rows):
        # Collect texts/targets for the given row slice, sincere class first.
        texts, targets = [], []
        for part in (sincere, insincere):
            texts += list(part["question_text"][rows].values)
            targets += list(part["target"][rows])
        return {"x": texts, "y": targets}

    val = _gather(slice(None, n_sample))
    dev = _gather(slice(n_sample, None))
    return dev, val
        
# Pass the configured hold-out size explicitly so changing the n_sample
# hyper-parameter actually changes the split.
dev, val = get_dev_val(train_df, n_sample)
print("Sample on validation set")
# The validation lists hold the sincere rows first and the insincere rows
# after them, so (when at least n_sample sincere rows exist) the class
# boundary sits at index n_sample; show two rows from each side of it.
lo, hi = max(0, n_sample - 2), n_sample + 2
for x, y in zip(val["x"][lo:hi], val["y"][lo:hi]):
    print(y, "--", x)

Insincere: (80810, 3)
Sincere: (1225312, 3)
Sample on validation set
0 -- Why did Facebook place on my page that I deleted an emoticon on a friends page?
0 -- In the Italian version of the novel The Name of the Rose, why is Jorge's name not in its Italian version as it is with other characters' names?
1 -- Has the United States become the largest dictatorship in the world?
1 -- Which babies are more sweeter to their parents? Dark skin babies or light skin babies?


In [4]:
# Load the pickled word -> vector table into `glove` and print the elapsed seconds.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load a pickle file that you generated yourself.
print("LOAD GLOVE...")
start = time.time()
with open('glove.840B.300d.pickle', 'rb') as handle:
    glove = pickle.load(handle)
print(time.time() - start)


LOAD GLOVE...
24.24756669998169


In [5]:
# ===== Data Loader =====
import os
import random
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
class DBLoader(object):
    """Map-style dataset that turns raw question strings into fixed-length
    sequences of word vectors.

    Args:
        x_data: indexable collection of question strings.
        y_data: indexable collection of labels, aligned with ``x_data``.
        sequence_length: number of word vectors returned per question.
        embeddings: optional mapping ``word -> vector``. When ``None`` the
            module-level ``glove`` table is looked up at access time.
        stop_words: optional iterable of words to drop. When ``None`` the NLTK
            English stop-word list is loaded once, on first use.
        embedding_dim: length of each word vector (used for padding and for
            words missing from the embedding table).
    """

    def __init__(self, x_data, y_data, sequence_length = 7,
                 embeddings = None, stop_words = None, embedding_dim = 300):
        self.x_data = x_data
        self.y_data = y_data
        self.N = len(x_data)
        self.sequence_length = sequence_length
        self.embeddings = embeddings
        self.embedding_dim = embedding_dim
        self._stops = None if stop_words is None else set(stop_words)

    def __len__(self):
        # Number of questions in the dataset.
        return self.N

    def _stop_words(self):
        """Return the stop-word set, building it from NLTK only once."""
        stops = getattr(self, "_stops", None)
        if stops is None:
            # Built lazily and cached: re-reading the NLTK list for every
            # sample is loop-invariant work.
            stops = set(stopwords.words("english"))
            self._stops = stops
        return stops

    def clean_text(self, raw_text):
        """Lower-case ``raw_text``, keep ASCII letters only and drop stop words.

        Accepts ``str`` or UTF-8 encoded ``bytes``. Returns the surviving
        words joined by single spaces.
        """
        raw_text = raw_text.strip()
        if isinstance(raw_text, bytes):
            # Undecodable bytes become U+FFFD, which the regex below turns
            # into a separator like every other non-letter.
            raw_text = raw_text.decode("utf-8-sig", errors="replace")
        letters_only = re.sub("[^a-zA-Z]", " ", raw_text)
        stops = self._stop_words()
        return " ".join(w for w in letters_only.lower().split() if w not in stops)

    def preprocess(self, x_data):
        """Convert one question into a ``(sequence_length, embedding_dim)``
        float32 array.

        Words missing from the embedding table map to a zero vector. Short
        questions are zero-padded at the end; long questions are cropped to a
        randomly positioned window of ``sequence_length`` consecutive words.
        """
        sequence_length = self.sequence_length
        dim = getattr(self, "embedding_dim", 300)
        table = getattr(self, "embeddings", None)
        if table is None:
            table = glove

        unknown = np.zeros(dim, dtype="float32")
        vectors = []
        for word in self.clean_text(x_data).split():
            try:
                vectors.append(table[word])
            except KeyError:
                # Word is not found in the dictionary list
                vectors.append(unknown)

        if not vectors:
            # Nothing survived cleaning: return an all-padding sequence.
            return np.zeros((sequence_length, dim), dtype="float32")

        res = np.array(vectors)
        n_words = res.shape[0]
        if n_words < sequence_length:
            padding = np.zeros((sequence_length - n_words, dim), dtype="float32")
            res = np.concatenate([res, padding], 0)
        elif n_words > sequence_length:
            # randint's upper bound is exclusive, so add 1 to make the last
            # possible window (the one ending on the final word) reachable.
            start = np.random.randint(0, n_words - sequence_length + 1)
            res = res[start:start + sequence_length, :]
        return res

    def __getitem__(self, index):
        """Return ``(embedded question, label as np.float32)`` for ``index``."""
        return self.preprocess(self.x_data[index]), np.float32(self.y_data[index])

# Training batches: reshuffled on every pass, trailing partial batch dropped.
dev_set = DataLoader(
    DBLoader(dev["x"], dev["y"]),
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)

# Validation batches: original order is kept so that predictions line up with
# val["x"] / val["y"]; a trailing partial batch is dropped here as well.
val_set = DataLoader(
    DBLoader(val["x"], val["y"]),
    batch_size=batch_size,
    shuffle=False,
    drop_last=True,
)

In [6]:
# Model
import torch.nn as nn
class LSTMTagger(nn.Module):
    """LSTM followed by a linear layer applied to the last time step.

    Args:
        in_dim: size of each input vector.
        hidden_dim: size of the LSTM hidden state.
        out_dim: size of the linear output.
        n_layers: number of stacked LSTM layers.
        model: selects the hidden-state layout produced by ``init_hidden``
            (``"lstm"`` -> ``(h, c)`` tuple, anything else -> single tensor).
            The recurrent layer itself is always ``nn.LSTM``.
        use_cuda: when true, ``init_hidden`` creates its tensors on the GPU.

    The recurrent state lives in ``self.hidden`` and is carried from one
    ``forward`` call to the next; assign ``init_hidden(batch_size)`` to it
    before every independent batch.
    """

    def __init__(self, in_dim, hidden_dim, out_dim, n_layers=1, model="lstm", use_cuda=False):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.model = model
        self.use_cuda = use_cuda

        # num_layers must match n_layers, otherwise the state built by
        # init_hidden has the wrong leading dimension for n_layers > 1.
        self.lstm = nn.LSTM(in_dim, hidden_dim, num_layers=n_layers, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim, out_dim)
        # No state yet; forward() creates one on demand.
        self.hidden = None

    def _zero_state(self, batch_size):
        """Return one zero tensor of shape (n_layers, batch_size, hidden_dim)."""
        state = torch.zeros(self.n_layers, batch_size, self.hidden_dim)
        return state.cuda() if self.use_cuda else state

    def init_hidden(self, batch_size):
        """Return a fresh all-zero recurrent state for ``batch_size`` sequences."""
        if self.model == "lstm":
            return (self._zero_state(batch_size), self._zero_state(batch_size))
        return self._zero_state(batch_size)

    def forward(self, sentence):
        """Run the network on ``sentence`` of shape (batch, seq_len, in_dim).

        Returns a tensor of shape (batch,) when ``out_dim`` is 1 and
        (batch, out_dim) otherwise.
        """
        batch_size = sentence.size(0)

        # Create the state when it is missing or was built for another batch
        # size, instead of failing inside nn.LSTM.
        hidden = self.hidden
        if hidden is None:
            state_batch = None
        else:
            first = hidden[0] if isinstance(hidden, tuple) else hidden
            state_batch = first.size(1)
        if state_batch != batch_size:
            self.hidden = self.init_hidden(batch_size)

        lstm_out, self.hidden = self.lstm(sentence, self.hidden)
        out = self.hidden2tag(lstm_out[:, -1, :])
        # Squeeze only the feature axis: a bare squeeze() would also remove
        # the batch axis for a batch of one.
        return out.squeeze(-1)


In [7]:
def to_numpy(x):
    """Return the values of tensor ``x`` as a NumPy array in host memory.

    The tensor is detached from the autograd graph and copied to the CPU when
    it lives on another device.
    """
    # .cpu() is a no-op for CPU tensors, so no fallback path (and no bare
    # except hiding real errors) is needed.
    return x.detach().cpu().numpy()
    
# Build the classifier from the hyper-parameters and move it to the GPU when requested.
lstm = LSTMTagger(
    in_dim=lstm_in_dim,
    hidden_dim=lstm_hidden_dim,
    out_dim=lstm_out_dim,
    use_cuda=use_cuda,
)
if use_cuda:
    lstm = lstm.cuda()

In [8]:
# Binary cross-entropy on probabilities: the training loop applies
# torch.sigmoid to the model output before calling this criterion.
criterion = nn.BCELoss()
# Adam over all model parameters, with betas set explicitly to (0.5, 0.999).
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate, betas=(0.5, 0.999))    

In [9]:
import collections
%matplotlib inline

In [None]:
# Training model
from tqdm import tqdm_notebook as tqdm
training_loss = []
for epoch in range(num_epoch):
    # Sum of the per-batch losses seen during this epoch.
    epoch_loss = 0
    for question, label in dev_set:
        if use_cuda:
            question, label = question.cuda(), label.cuda()

        # Every batch starts from a fresh all-zero recurrent state.
        lstm.hidden = lstm.init_hidden(question.size(0))
        pred = torch.sigmoid(lstm(question))
        loss = criterion(pred, label)

        # One optimisation step.
        lstm.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += to_numpy(loss)

    training_loss.append(epoch_loss)
    print("EPOCH {}, Loss {}".format(epoch, epoch_loss))

# Loss curve: one point per epoch.
plt.plot(training_loss)
plt.show()

EPOCH 0, Loss 10472.419045117767
EPOCH 1, Loss 10063.320281201391
EPOCH 2, Loss 10736.521523992938
EPOCH 3, Loss 11992.14393237306
EPOCH 4, Loss 13739.790995373362
EPOCH 5, Loss 15446.772883243611
EPOCH 6, Loss 15826.383423540246
EPOCH 7, Loss 16620.906493912793
EPOCH 8, Loss 17137.667690888993
EPOCH 9, Loss 16574.286727563725
EPOCH 10, Loss 16783.551617517864
EPOCH 11, Loss 16384.910255003764
EPOCH 12, Loss 16023.908267524956
EPOCH 13, Loss 15973.42700965744
EPOCH 14, Loss 15976.878835206238
EPOCH 15, Loss 16120.062114497872
EPOCH 16, Loss 15816.609785826633
EPOCH 17, Loss 16157.3376090783
EPOCH 18, Loss 16253.50247084877
EPOCH 19, Loss 16844.05516258443
EPOCH 20, Loss 16466.287081751092


In [None]:
# Evaluation
predictions = []
labels = []
# Inference only: without no_grad every forward pass records an autograd
# graph that is never used.
with torch.no_grad():
    for question, label in val_set:
        if use_cuda:
            question = question.cuda()

        # Reset the recurrent state for each batch. forward() keeps its final
        # state in lstm.hidden, so without this reset every batch would start
        # from the state left behind by the previous, unrelated batch.
        lstm.hidden = lstm.init_hidden(question.size(0))

        # Threshold the sigmoid output at 0.5 to get the predicted class.
        pred = to_numpy(torch.sigmoid(lstm(question)) > 0.5)
        predictions += list(pred)
        labels += list(to_numpy(label))
    

In [None]:
from sklearn.metrics import classification_report
# classification_report takes (y_true, y_pred); passing the predictions first
# swaps precision and recall in the printed table.
print(classification_report(labels, predictions))


In [None]:
# Append every validation question to the file matching its
# (prediction, label) pair: pred0_label0 / pred0_label1 / pred1_label0 /
# pred1_label1. Deriving the file name from the pair keeps true positives out
# of the false-negative file.
rows_by_file = {}
for itr, (p, l) in enumerate(zip(predictions, labels)):
    fname = "pred{}_label{}.txt".format(int(p), int(l))
    row = str(val["y"][itr]) + "," + val["x"][itr] + "\n"
    rows_by_file.setdefault(fname, []).append(row)

# One open() per file instead of one per row; rows keep their original order.
for fname, rows in rows_by_file.items():
    # Explicit UTF-8 so questions containing non-ASCII characters are written
    # the same way on every platform.
    with open(fname, "a", encoding="utf-8") as f:
        f.writelines(rows)

In [None]:
# # import gensim
# # model = gensim.models.KeyedVectors.load_word2vec_format('../glove.840B.300d/glove.840B.300d.txt')
# # # weights = torch.FloatTensor(model.syn0)
# # # Load embedding
# import time
# start_time = time.time()
# to_emb = {}
# with open("../glove.840B.300d/glove.840B.300d.txt","rb") as f:
#     for item in f:
#         line = item.decode().split(" ")
#         if len(line) != 301:
#             print(line)
#             continue
            
#         to_emb[line[0]] = np.array(line[1:]).astype("float32")
        
        
# print(time.time() - start_time, len(to_emb))

# import pickle


# start = time.time()
# with open('glove.840B.300d.pickle', 'wb') as handle:
#     pickle.dump(to_emb, handle, protocol=pickle.HIGHEST_PROTOCOL)
# print(time.time()-start)