In [1]:
import os
import pandas as pd
import numpy as np

import torch
from  torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext import vocab, data
from torchtext.data import TabularDataset, BucketIterator
from torch.utils.data import DataLoader
import gensim
from nltk.tokenize import TreebankWordTokenizer
from spacy.tokenizer import Tokenizer
from scipy.stats import spearmanr
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

In [3]:
DATA_PTH = "data/"
TRAIN = "train_processed.csv"
TEST = "test_processed.csv"
EMB_FILE = "embeddings_processed.txt"
BATCH_SIZE = 128
EPOCHS = 15
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# word2vec Params
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 150
W2V_MIN_COUNT = 2

In [4]:
target_question_columns = [
       'question_asker_intent_understanding',
       'question_body_critical', 'question_conversational',
       'question_expect_short_answer', 'question_fact_seeking',
       'question_has_commonly_accepted_answer',
       'question_interestingness_others', 'question_interestingness_self',
       'question_multi_intent', 'question_not_really_a_question',
       'question_opinion_seeking', 'question_type_choice',
       'question_type_compare', 'question_type_consequence',
       'question_type_definition', 'question_type_entity',
       'question_type_instructions', 'question_type_procedure',
       'question_type_reason_explanation', 'question_type_spelling',
       'question_well_written'
]
    
target_answer_columns = [
       'answer_helpful',
       'answer_level_of_information', 'answer_plausible', 'answer_relevance',
       'answer_satisfaction', 'answer_type_instructions',
       'answer_type_procedure', 'answer_type_reason_explanation',
       'answer_well_written'
]

In [5]:
def define_torchtext_fields():
    tokenizer=TreebankWordTokenizer().tokenize
    text_field = data.Field(sequential=True, tokenize=tokenizer, use_vocab=True,
                           batch_first=True, include_lengths=False)
    num_field = data.Field(sequential=False, dtype=torch.float, use_vocab=False)
    index_field = data.Field(sequential=False, dtype=torch.int, use_vocab=False)
#     raw_field = data.RawField()
    
    return text_field, num_field, index_field #, raw_field

In [6]:
def build_w2v_documents(pth_to_train, columns, tokenizer):
    train = pd.read_csv(pth_to_train, usecols=columns)
    documents = []
    for col in columns:
        documents = documents + [tokenizer.tokenize(text) for text in train[col]]

    return documents

def build_w2v_model(pth_to_train, columns, save_embeddings=False):
    docs = build_w2v_documents(pth_to_train, columns, TreebankWordTokenizer())

    # Train Word Embeddings and save
    w2v = gensim.models.word2vec.Word2Vec(size=W2V_SIZE, 
                                          window=W2V_WINDOW,                                                                                  
                                          min_count=W2V_MIN_COUNT)
    w2v.build_vocab(docs)
    words = w2v.wv.vocab.keys()
    vocab_size = len(words)
    print("Vocab size", vocab_size)

    # Train Word Embeddings
    w2v.train(docs, total_examples=len(docs), epochs=W2V_EPOCH)
    
    if save_embeddings:
        w2v.save(DATA_PTH + EMB_FILE)

    return w2v

In [7]:
def prepare_dataset():
    if os.path.exists(DATA_PTH + EMB_FILE):
        w2v_model = gensim.models.Word2Vec.load(DATA_PTH + EMB_FILE)
    else:
        w2v_model = build_w2v_model(DATA_PTH + TRAIN, 
                                   ["text"], 
                                   save_embeddings=True)
    
    txt, num, idx = define_torchtext_fields()

    predictor_columns = [
        ('qa_id', idx),
        ('text', txt),
#         ('question_body', txt),
#         ('question_user_name', raw),
#         ('question_user_page', raw),
#         ('answer', txt),
#         ('answer_user_name', raw),
#         ('answer_user_page', raw),
#         ('url', raw),
#         ('category', raw),
#         ('host', raw)
    ]
  
    
    targets = pd.read_csv(DATA_PTH + TRAIN).columns[2:]
    targets =  list(zip(targets, [num]*len(targets)))
    print(targets)
    train_val_cols = predictor_columns + targets
    test_cols = predictor_columns
    
    train_ds, valid_ds = TabularDataset(DATA_PTH + TRAIN, 
                                        format="csv", 
                                        fields=train_val_cols, skip_header=True).split(split_ratio=0.7)
    test_ds = TabularDataset(DATA_PTH + TEST, format="csv", skip_header=True, fields=test_cols)
    
    txt.build_vocab(train_ds, valid_ds, min_freq=W2V_MIN_COUNT)
    word2vec_vectors = []
    for token, idx in txt.vocab.stoi.items():
        if token in w2v_model.wv.vocab.keys():
            word2vec_vectors.append(torch.FloatTensor(w2v_model[token]))
        else:
            word2vec_vectors.append(torch.zeros(W2V_SIZE))
    txt.vocab.set_vectors(txt.vocab.stoi, word2vec_vectors, W2V_SIZE)
    
    return train_ds, valid_ds, test_ds, txt
    

In [8]:
train_ds, valid_ds, test_ds, vocab = prepare_dataset()

[('question_asker_intent_understanding', <torchtext.data.field.Field object at 0x7fce37b2ef98>), ('question_body_critical', <torchtext.data.field.Field object at 0x7fce37b2ef98>), ('question_conversational', <torchtext.data.field.Field object at 0x7fce37b2ef98>), ('question_expect_short_answer', <torchtext.data.field.Field object at 0x7fce37b2ef98>), ('question_fact_seeking', <torchtext.data.field.Field object at 0x7fce37b2ef98>), ('question_has_commonly_accepted_answer', <torchtext.data.field.Field object at 0x7fce37b2ef98>), ('question_interestingness_others', <torchtext.data.field.Field object at 0x7fce37b2ef98>), ('question_interestingness_self', <torchtext.data.field.Field object at 0x7fce37b2ef98>), ('question_multi_intent', <torchtext.data.field.Field object at 0x7fce37b2ef98>), ('question_not_really_a_question', <torchtext.data.field.Field object at 0x7fce37b2ef98>), ('question_opinion_seeking', <torchtext.data.field.Field object at 0x7fce37b2ef98>), ('question_type_choice', <t

In [9]:
len(train_ds), len(valid_ds), len(test_ds), len(vocab.vocab.stoi)

(4255, 1824, 476, 40072)

In [10]:
vocab.vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7fce37b2ed68>>,
            {'<unk>': 0,
             '<pad>': 1,
             'the': 2,
             'to': 3,
             'a': 4,
             'i': 5,
             'is': 6,
             'and': 7,
             'of': 8,
             '&': 9,
             'in': 10,
             'it': 11,
             'that': 12,
             'you': 13,
             'not': 14,
             'for': 15,
             'this': 16,
             'have': 17,
             'on': 18,
             'be': 19,
             'with': 20,
             'can': 21,
             'are': 22,
             'if': 23,
             'gt': 24,
             'as': 25,
             'but': 26,
             'or': 27,
             's': 28,
             'do': 29,
             'my': 30,
             'from': 31,
             'an': 32,
             'will': 33,
             'so': 34,
             'would': 35,
             'at': 36,
             'your': 37,
    

In [11]:
train_dl = BucketIterator(train_ds, 
                          batch_size=4, shuffle=True,
                          sort_key=lambda x: len(x.text))

In [12]:
valid_dl = BucketIterator(valid_ds, 
                          batch_size=4, shuffle=True,
                          sort_key=lambda x: len(x.text))

In [13]:
test_dl = BucketIterator(test_ds, batch_size=4, shuffle=False, sort_key=lambda x: len(x.text))

In [14]:
batch = next(iter(train_dl))
len(batch.text[0])

906

In [15]:
class LSTMEmbedding(nn.Module):
    def __init__(self, embedding, output_size, hidden_size=128, num_layers=3, dropout=0.3, bidir=False):
        super(LSTMEmbedding, self).__init__()
        self.bidirectional = bidir
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.emb = nn.Embedding.from_pretrained(torch.FloatTensor(embedding), padding_idx=1, freeze=True)
#         self.qtitle_lstm = nn.LSTM(embedding.size(1), hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidir,
#                             dropout=dropout, batch_first=True)
        self.lstm = nn.LSTM(embedding.size(1), hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidir,
                            dropout=dropout, batch_first=True)

        self.lstm2 = nn.LSTM(self.hidden_size*2, hidden_size=self.hidden_size, num_layers=num_layers, bidirectional=bidir,
                            dropout=dropout, batch_first=True)
       
        self.dropout = nn.Dropout(dropout)
        if bidir:
            self.linear_in = hidden_size*2
        else:
            self.linear_in = hidden_size
        self.classifier = nn.Sequential(nn.Linear(self.linear_in, 64),
                                        nn.ReLU(),
                                        nn.Linear(64, output_size))
    
    def forward(self, x):
        emb = self.emb(x)
        
        out, hid = self.lstm(emb)
        out, hid = self.lstm2(out, hid)        
        out = self.dropout(out)
        out = self.classifier(out)
        return torch.sigmoid(out)[:,-1,:]

In [16]:
model = LSTMEmbedding(vocab.vocab.vectors, 30, num_layers=1
                      , hidden_size=1024, bidir=True)

In [17]:
def print_epoch_results(epoch, tl, vl, spear_t, spear_v):
    if epoch == 1:
        print("\tTrain\t\tValidation")
        print("Epoch | Loss | Spear. | Loss | Spear.")
    
    raw_line = '{:6d}' + '\u2502{:6.3f}' * 4
    print(raw_line.format(epoch, tl, spear_t, vl, spear_v))

In [18]:
def spearman_correlation_metric(predicted, true):
    score = 0.
    for i in range(predicted.shape[1]):
        score += np.nan_to_num(spearmanr(predicted[:, i], true[:,i])[0])
    
    return score / 30

In [19]:
def get_target_tensor(batch):
    return torch.stack([getattr(batch, label) for label in target_question_columns +  target_answer_columns], dim=-1).cuda()

In [20]:
def train(model, train_loader, valid_loader, learning_rate, epochs):
    optimizer = torch.optim.Adam(model.parameters(), learning_rate)
    criterion = torch.nn.BCELoss()
    print("Starting training of model...")
    for epoch in range(1, epochs+1):
        model.train()
        train_loss = []
        valid_loss = []
        train_spearman = []
        valid_spearman = []
        
        for batch in train_loader:
            optimizer.zero_grad()
            targets = get_target_tensor(batch)
            
            out = model(batch.text.to(DEVICE))
            loss = criterion(out, targets)
            train_loss.append(loss.item())
            train_spearman.append(spearman_correlation_metric(out.detach().cpu().numpy(),
                                                             targets.detach().cpu().numpy()))
            loss.backward()
            optimizer.step()
    
        model.eval()
        for batch in valid_loader:
            targets = get_target_tensor(batch)
            out = model(batch.text.to(DEVICE))
            loss = criterion(out, targets)
            valid_spearman.append(spearman_correlation_metric(out.detach().cpu().numpy(),
                                                             targets.detach().cpu().numpy()))
            valid_loss.append(loss.item())
            
        print_epoch_results(epoch, np.array(train_loss).mean(), np.array(valid_loss).mean(), np.array(train_spearman).mean(),
                            np.array(valid_spearman).mean())
    

In [None]:
train(model.cuda(), train_dl, valid_dl, 0.00025, 15)

Starting training of model...
	Train		Validation
Epoch | Loss | Spear. | Loss | Spear.
     1│ 0.415│ 0.142│ 0.398│ 0.194
     2│ 0.391│ 0.218│ 0.389│ 0.210
     3│ 0.376│ 0.259│ 0.387│ 0.225
     4│ 0.362│ 0.308│ 0.387│ 0.228
     5│ 0.349│ 0.343│ 0.392│ 0.228
     6│ 0.336│ 0.376│ 0.396│ 0.227


In [None]:
torch.save(model, "models/lstm-15ep-lr0-01-hl-1024-bidir")

In [None]:
test_ids = []
test_preds = []
for batch in test_dl:
    out = model(batch.text.to(DEVICE))
    test_ids.extend(batch.qa_id.tolist())
    test_preds.extend(torch.sigmoid(out).detach().cpu().numpy())



In [None]:
ids = pd.DataFrame({"qa_id": test_ids})

In [None]:
preds = pd.DataFrame(np.round(test_preds, 5))
preds.columns = target_question_columns + target_answer_columns

In [None]:
preds

In [None]:
submission = ids.join(preds)

In [None]:
submission.to_csv("submission.csv")