In [112]:
import os
import pandas as pd
import numpy as np

import torch
from  torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext import vocab, data
from torchtext.data import TabularDataset, BucketIterator
from torch.utils.data import DataLoader
import gensim
from nltk.tokenize import TreebankWordTokenizer, WhitespaceTokenizer
from spacy.tokenizer import Tokenizer
from scipy.stats import spearmanr
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [173]:
train = pd.read_csv("data/train_processed.csv")
test = pd.read_csv("data/test_processed.csv")

In [174]:
DATA_PTH = "data/"
TRAIN = "train_processed.csv"
TEST = "test_processed.csv"
EMB_FILE = "embeddings_processed.txt"
BATCH_SIZE = 128
EPOCHS = 50
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# word2vec Params
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 150
W2V_MIN_COUNT = 2

LABEL_THRESH = 0.5

In [175]:
target_question_columns = [
       'question_asker_intent_understanding',
       'question_body_critical', 'question_conversational',
       'question_expect_short_answer', 'question_fact_seeking',
       'question_has_commonly_accepted_answer',
       'question_interestingness_others', 'question_interestingness_self',
       'question_multi_intent', 'question_not_really_a_question',
       'question_opinion_seeking', 'question_type_choice',
       'question_type_compare', 'question_type_consequence',
       'question_type_definition', 'question_type_entity',
       'question_type_instructions', 'question_type_procedure',
       'question_type_reason_explanation', 'question_type_spelling',
       'question_well_written'
]
    
target_answer_columns = [
       'answer_helpful',
       'answer_level_of_information', 'answer_plausible', 'answer_relevance',
       'answer_satisfaction', 'answer_type_instructions',
       'answer_type_procedure', 'answer_type_reason_explanation',
       'answer_well_written'
]

In [203]:
def define_torchtext_fields():
    text_tokenizer=TreebankWordTokenizer().tokenize
    label_tokenizer = WhitespaceTokenizer().tokenize
    
    text_field = data.Field(sequential=True, tokenize=text_tokenizer, use_vocab=True,
                           batch_first=True, include_lengths=False)
    label_field = data.LabelField()
    index_field = data.Field(sequential=False, dtype=torch.int, use_vocab=False)
    
    return text_field, label_field, index_field


In [204]:
def build_w2v_documents(pth_to_train, columns, tokenizer):
    train = pd.read_csv(pth_to_train, usecols=columns)
    documents = []
    for col in columns:
        documents = documents + [tokenizer.tokenize(text) for text in train[col]]

    return documents

def build_w2v_model(pth_to_train, columns, save_embeddings=False):
    docs = build_w2v_documents(pth_to_train, columns, TreebankWordTokenizer())

    # Train Word Embeddings and save
    w2v = gensim.models.word2vec.Word2Vec(size=W2V_SIZE, 
                                          window=W2V_WINDOW,                                                                                  
                                          min_count=W2V_MIN_COUNT)
    w2v.build_vocab(docs)
    words = w2v.wv.vocab.keys()
    vocab_size = len(words)
    print("Vocab size", vocab_size)

    # Train Word Embeddings
    w2v.train(docs, total_examples=len(docs), epochs=W2V_EPOCH)
    
    if save_embeddings:
        w2v.save(DATA_PTH + EMB_FILE)

    return w2v

In [205]:
def prepare_dataset():
    if os.path.exists(DATA_PTH + EMB_FILE):
        w2v_model = gensim.models.Word2Vec.load(DATA_PTH + EMB_FILE)
    else:
        w2v_model = build_w2v_model(DATA_PTH + TRAIN, 
                                   ["TEXT"], 
                                   save_embeddings=True)
    
    txt, label, idx = define_torchtext_fields()

    predictor_columns = [
        ('qa_id', idx),
        ('TEXT', txt)
    ]
  
    labels = [('labels', label)]

 
    train_val_cols = predictor_columns + labels
    test_cols = predictor_columns
    
    train_ds, valid_ds = TabularDataset(DATA_PTH + TRAIN, 
                                        format="csv", 
                                        fields=train_val_cols, skip_header=True).split(split_ratio=0.7)
    test_ds = TabularDataset(DATA_PTH + TEST, format="csv", skip_header=True, fields=test_cols)
    
    txt.build_vocab(train_ds, valid_ds, min_freq=W2V_MIN_COUNT)
    label.build_vocab(train_ds)
    print(label.vocab.stoi)
    word2vec_vectors = []
    for token, idx in txt.vocab.stoi.items():
        if token in w2v_model.wv.vocab.keys():
            word2vec_vectors.append(torch.FloatTensor(w2v_model[token]))
        else:
            word2vec_vectors.append(torch.zeros(W2V_SIZE))
    txt.vocab.set_vectors(txt.vocab.stoi, word2vec_vectors, W2V_SIZE)
    
    return train_ds, valid_ds, test_ds, txt
    

In [206]:
train_ds, valid_ds, test_ds, vocab = prepare_dataset()

defaultdict(None, {'answer_relevance': 0, 'answer_plausible': 1, 'answer_well_written': 2, 'answer_helpful': 3, 'question_asker_intent_understanding': 4, 'answer_satisfaction': 5, 'question_well_written': 6, 'answer_level_of_information': 7, 'question_has_commonly_accepted_answer': 8, 'question_fact_seeking': 9, 'question_expect_short_answer': 10, 'question_interestingness_others': 11, 'question_body_critical': 12, 'question_type_instructions': 13, 'answer_type_reason_explanation': 14, 'answer_type_instructions': 15, 'question_opinion_seeking': 16, 'question_interestingness_self': 17, 'question_type_reason_explanation': 18, 'question_type_choice': 19, 'question_multi_intent': 20, 'question_type_procedure': 21, 'answer_type_procedure': 22, 'question_type_entity': 23, 'question_conversational': 24, 'question_type_compare': 25, 'question_type_definition': 26, 'question_type_consequence': 27, 'question_not_really_a_question': 28, 'question_type_spelling': 29})


In [207]:
len(train_ds), len(valid_ds), len(test_ds), len(vocab.vocab.stoi)

(4255, 1824, 476, 40072)

In [208]:
train_ds.examples[0].labels

array(['question_asker_intent_understanding', 'question_body_critical',
       'question_expect_short_answer', 'question_fact_seeking',
       'question_has_commonly_accepted_answer',
       'question_interestingness_others', 'question_interestingness_self',
       'question_type_instructions', 'question_well_written',
       'answer_helpful', 'answer_level_of_information',
       'answer_plausible', 'answer_relevance', 'answer_satisfaction',
       'answer_type_reason_explanation', 'answer_well_written'],
      dtype='<U37')

In [209]:
train_dl = BucketIterator(train_ds, 
                          batch_size=64, shuffle=True,
                          sort_key=lambda x: len(x))

In [210]:
valid_dl = BucketIterator(valid_ds, 
                          batch_size=64, shuffle=True,
                          sort_key=lambda x: len(x))

In [211]:
test_dl = BucketIterator(test_ds, batch_size=32, shuffle=False, sort_key=lambda x: len(x.TEXT))

In [212]:
next(iter(train_dl))

TypeError: unhashable type: 'numpy.ndarray'

In [134]:
batch = next(iter(train_dl))
batch.labels

tensor([[ 6, 14, 12,  ...,  1,  1,  1],
        [ 6, 11, 13,  ..., 16,  4,  1],
        [ 6, 14, 12,  ...,  1,  1,  1],
        ...,
        [ 6, 14, 11,  ...,  4,  1,  1],
        [ 6, 14, 12,  ...,  7,  4,  1],
        [ 6, 12, 11,  ...,  1,  1,  1]])

In [15]:
class LSTMEmbedding(nn.Module):
    def __init__(self, embedding, output_size, hidden_size=128, num_layers=3, dropout=0.3, bidir=False):
        super(LSTMEmbedding, self).__init__()
        self.bidirectional = bidir
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.emb = nn.Embedding.from_pretrained(torch.FloatTensor(embedding), padding_idx=1, freeze=True)

        self.lstm = nn.LSTM(embedding.size(1), hidden_size=hidden_size, num_layers=num_layers, bidirectional=bidir,
                            dropout=dropout, batch_first=True)
      
       
        if bidir:
            self.linear_in = hidden_size*2
        else:
            self.linear_in = hidden_size
            
        self.classifier = nn.Sequential(nn.Linear(self.linear_in, self.linear_in//2),
                                        nn.ReLU(),
                                        nn.Linear(64, output_size))
    
    def forward(self, x)
        out = self.emb(x)
        
        out, hidden = self.lstm(x)

        out = self.classifier(hidden)
        
        return out 

In [16]:
model = LSTMEmbedding(vocab.vocab.vectors, 30, num_layers=3, hidden_size=256)

In [17]:
def print_epoch_results(epoch, tl, vl, spear_t, spear_v):
    if epoch == 1:
        print("\tTrain\t\tValidation")
        print("Epoch | Loss | Spear. | Loss | Spear.")
    
    raw_line = '{:6d}' + '\u2502{:6.3f}' * 4
    print(raw_line.format(epoch, tl, spear_t, vl, spear_v))

In [18]:
def spearman_correlation_metric(predicted, true):
    score = 0.
    for i in range(predicted.shape[1]):
        score += np.nan_to_num(spearmanr(predicted[:, i], true[:,i])[0])
    
    return score / 30

In [19]:
def get_target_tensor(batch):
    return torch.stack([getattr(batch, label) for label in target_question_columns +  target_answer_columns], dim=-1).cuda()

In [20]:
def train(model, train_loader, valid_loader, learning_rate, epochs):
    optimizer = torch.optim.Adam(model.parameters(), learning_rate)
    criterion = torch.nn.BCEWithLogitsLoss()
    
    model.train()
    print("Starting training of model...")
    for epoch in range(1, epochs+1):
        train_loss = []
        valid_loss = []
        train_spearman = []
        valid_spearman = []
        
        for batch in train_loader:
            optimizer.zero_grad()
            qt = batch.question_title.cuda()
            qb = batch.question_body.cuda()
            qa = batch.answer.cuda()
            targets = get_target_tensor(batch)
            
            out = model(qb, qa) #, qt_h, qb_h, qa_h)
            print(out)
            loss = criterion(out, targets)
            train_loss.append(loss.item())
            train_spearman.append(spearman_correlation_metric(out.detach().cpu().numpy(),
                                                             targets.detach().cpu().numpy()))
            loss.backward()
            optimizer.step()
    
            
        for batch in valid_loader:
            qt = batch.question_title.cuda()
            qb = batch.question_body.cuda()
            qa = batch.answer.cuda()
            targets = get_target_tensor(batch)
            out = model(qb, qa)
            loss = criterion(out, targets)
            valid_spearman.append(spearman_correlation_metric(out.detach().cpu().numpy(),
                                                             targets.detach().cpu().numpy()))
            valid_loss.append(loss.item())
            
        print_epoch_results(epoch, np.array(train_loss).mean(), np.array(valid_loss).mean(), np.array(train_spearman).mean(),
                            np.array(valid_spearman).mean())
    

In [21]:
train(model.cuda(), train_dl, valid_dl, 0.001, 20)

Starting training of model...
	Train		Validation
Epoch | Loss | Spear. | Loss | Spear.
     1│ 0.463│ 0.024│ 0.425│ 0.048
     2│ 0.422│ 0.054│ 0.424│ 0.077
     3│ 0.421│ 0.086│ 0.423│ 0.140
     4│ 0.410│ 0.166│ 0.406│ 0.195
     5│ 0.402│ 0.202│ 0.405│ 0.193
     6│ 0.398│ 0.220│ 0.402│ 0.211
     7│ 0.394│ 0.230│ 0.401│ 0.211
     8│ 0.392│ 0.241│ 0.402│ 0.225
     9│ 0.388│ 0.261│ 0.400│ 0.233
    10│ 0.385│ 0.283│ 0.398│ 0.251
    11│ 0.379│ 0.307│ 0.396│ 0.265
    12│ 0.374│ 0.329│ 0.397│ 0.272
    13│ 0.368│ 0.349│ 0.396│ 0.275
    14│ 0.363│ 0.368│ 0.398│ 0.277
    15│ 0.358│ 0.382│ 0.398│ 0.271
    16│ 0.353│ 0.396│ 0.399│ 0.277
    17│ 0.349│ 0.409│ 0.399│ 0.283
    18│ 0.345│ 0.422│ 0.399│ 0.285
    19│ 0.342│ 0.435│ 0.402│ 0.285
    20│ 0.338│ 0.448│ 0.403│ 0.286


In [22]:
torch.save(model, "models/lstm-20ep-lr0-00025-hl-256")

In [23]:
test_ids = []
test_preds = []
for batch in test_dl:
    qt = batch.question_title.cuda()
    qb = batch.question_body.cuda()
    qa = batch.answer.cuda()
    out = model(qb, qa)
    test_ids.extend(batch.qa_id.tolist())
    test_preds.extend(torch.softmax(out, dim=-1).detach().cpu().numpy())



In [24]:
ids = pd.DataFrame({"qa_id": test_ids})

In [25]:
preds = pd.DataFrame(np.round(test_preds, 5))
preds.columns = target_question_columns + target_answer_columns

In [26]:
preds

Unnamed: 0,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0.15940,0.01491,0.00164,0.01185,0.01050,0.00794,0.01272,0.01277,0.00233,0.00001,...,0.05208,0.07831,0.01271,0.21257,0.20628,0.03241,0.00005,0.00017,0.03679,0.08760
1,0.02190,0.00346,0.00000,0.01320,0.02100,0.11312,0.00374,0.00230,0.00008,0.00001,...,0.00686,0.06679,0.00594,0.12953,0.28269,0.02586,0.19842,0.00025,0.00014,0.02528
2,0.14270,0.02209,0.00017,0.10350,0.04988,0.03544,0.01153,0.00953,0.00031,0.00001,...,0.04170,0.07435,0.01334,0.17393,0.17752,0.03579,0.00015,0.00017,0.00257,0.09247
3,0.02162,0.00298,0.00000,0.00948,0.04448,0.27829,0.00391,0.00193,0.00023,0.00001,...,0.00761,0.08846,0.00668,0.17759,0.23778,0.03009,0.00317,0.00059,0.03885,0.03001
4,0.06288,0.00943,0.00022,0.01479,0.00933,0.00424,0.01029,0.00682,0.00088,0.00002,...,0.02716,0.11956,0.01106,0.24306,0.29325,0.04080,0.02486,0.00106,0.00103,0.08099
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.04100,0.00573,0.00004,0.01113,0.00618,0.00568,0.00572,0.00464,0.00032,0.00001,...,0.01304,0.07967,0.00704,0.16359,0.29443,0.02722,0.20703,0.00026,0.00004,0.04636
472,0.03933,0.01274,0.00004,0.03159,0.20728,0.13287,0.01324,0.00807,0.00068,0.00025,...,0.02898,0.08374,0.01515,0.11571,0.16219,0.04775,0.01022,0.00261,0.00647,0.05908
473,0.04835,0.00998,0.00005,0.01896,0.11194,0.10350,0.01076,0.00665,0.01004,0.00003,...,0.02905,0.09084,0.01396,0.17282,0.17055,0.04554,0.01234,0.00613,0.01064,0.06333
474,0.03072,0.00500,0.00001,0.02320,0.11297,0.34078,0.00433,0.00335,0.00049,0.00000,...,0.01029,0.07005,0.00614,0.11889,0.17499,0.02756,0.00016,0.00023,0.03431,0.02699


In [27]:
submission = ids.join(preds)

In [28]:
submission.to_csv("submission.csv")