In [1]:
import pandas as pd
import torch
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
from custom_model.model import SimpleNet, SAttendedSimpleNet, SAttendedNet, CrossAttentionNet
import seaborn as sns
from sklearn.metrics import roc_auc_score


# Use the GPU when torch reports CUDA support, otherwise fall back to the CPU.
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")
# Echo the selected device so the run records which hardware was used.
print('Device: ', device)

def read_pickle(fname):
    """Load and return the object stored in the pickle file at ``fname``.

    NOTE: unpickling can execute arbitrary code, so only call this on files
    that come from a trusted source.
    """
    with open(fname, mode='rb') as handle:
        payload = pickle.load(handle)
    return payload

# Load the preprocessed WikiQA frames.
df_train = pd.read_pickle('./data/processed/wikiqa_df_train.pickle')
df_test = pd.read_pickle('./data/processed/wikiqa_df_test.pickle')

# Shuffle the held-out frame with a fixed seed, then cut it in half by position:
# the first half stays the test set, the second half becomes validation.
# Positional slicing (instead of np.split) also works when the row count is
# odd; the validation half then simply receives the extra row.
df_holdout = df_test.sample(frac=1., random_state=42)
n_test_rows = len(df_holdout) // 2
df_test, df_val = df_holdout.iloc[:n_test_rows], df_holdout.iloc[n_test_rows:]

# Pretrained embedding matrix saved as a NumPy array.
emb_weights = np.load('./data/processed/index2vector.npy')

# The row count is used as the vocabulary size, the column count as the
# embedding width.
vocab_size = emb_weights.shape[0]
embed_dim = emb_weights.shape[1]

# df_train = df_train.iloc[:100]

print('Train shape: {} \n\
Test shape: {} \n\
Val shape: {}'.format(df_train.shape, df_test.shape, df_val.shape))

Device:  cuda
Train shape: (20347, 9) 
Test shape: (3058, 9) 
Val shape (3058, 9): 


In [2]:
# Reduce each raw WikiQA split to the three columns that the torchtext loader
# reads, and write the result next to the source file as `*_clear.tsv`.
# The splits are processed in the order train, test, dev, so `df` is left
# holding the dev frame once the loop finishes.
WIKIQA_COLUMNS = ['Question', 'Sentence', 'Label']
for split_name in ('train', 'test', 'dev'):
    df = pd.read_csv(f'./data/WikiQACorpus/WikiQA-{split_name}.tsv', sep='\t')
    df.loc[:, WIKIQA_COLUMNS].to_csv(
        f'./data/WikiQACorpus/WikiQA-{split_name}_clear.tsv',
        sep='\t', index=False)

In [3]:
import spacy
from torchtext import data
# spaCy English pipeline; only its tokenizer component is used below.
spacy_en = spacy.load('en')

def tokenizer(text):
    """Split ``text`` into a list of token strings with the spaCy tokenizer."""
    spacy_tokens = spacy_en.tokenizer(text)
    return [token.text for token in spacy_tokens]


# Both text columns get the same preprocessing: spaCy tokenisation followed by
# lower-casing.
_text_field_options = dict(sequential=True, tokenize=tokenizer, lower=True)
QUESTION = data.Field(**_text_field_options)
SENTENCE = data.Field(**_text_field_options)
# Labels are single values that are used as-is, without a vocabulary lookup.
LABEL = data.Field(sequential=False, use_vocab=False)

# Column order must match the column order of the `*_clear.tsv` files.
_column_fields = [
    ('QUESTION', QUESTION),
    ('SENTENCE', SENTENCE),
    ('LABEL', LABEL),
]

train, val, test = data.TabularDataset.splits(
    path='./data/WikiQACorpus/',
    format='tsv',
    train='WikiQA-train_clear.tsv',
    validation='WikiQA-dev_clear.tsv',
    test='WikiQA-test_clear.tsv',
    fields=_column_fields,
    skip_header=True)

In [4]:
# Show one parsed training example to check the tokenisation and the raw label.
print(vars(train.examples[4]))

{'QUESTION': ['how', 'are', 'glacier', 'caves', 'formed', '?'], 'SENTENCE': ['glacier', 'caves', 'are', 'often', 'called', 'ice', 'caves', ',', 'but', 'this', 'term', 'is', 'properly', 'used', 'to', 'describe', 'bedrock', 'caves', 'that', 'contain', 'year', '-', 'round', 'ice', '.'], 'LABEL': '0'}


In [5]:
# Build ONE vocabulary over both text columns and share it between the fields.
# Building a vocabulary per field and merging them afterwards with
# `Vocab.extend` is not enough: `extend` only appends words to `itos`/`stoi`,
# so `VOCAB.vectors` would keep the question-only row count, and SENTENCE would
# still convert tokens to ids with its own, differently ordered vocabulary.
# With a single shared vocabulary the token ids of both columns and the rows of
# the pretrained vector table line up.
QUESTION.build_vocab(train.QUESTION, train.SENTENCE, vectors="glove.6B.100d")
SENTENCE.vocab = QUESTION.vocab
# LABEL is declared with use_vocab=False; this call is kept only so that
# `LABEL.vocab` still exists for any cell that inspects it.
LABEL.build_vocab(train)

VOCAB = QUESTION.vocab

In [6]:
BATCH_SIZE = 16

# `device` was already selected in the first cell; echo it for the record.
print(device)
# sort_key tells the iterators how to measure an example's length. The dataset
# built by TabularDataset.splits was given no sort key of its own, and the
# validation/test iterators sort their examples by default, so a key has to be
# supplied here. Sentence length is used because it is the longer of the two
# text columns in the inspected example.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, val, test),
    batch_size=BATCH_SIZE,
    sort_key=lambda example: len(example.SENTENCE),
    device=device)

cuda


In [7]:
import math 

In [8]:
class SelfAttention():
    """Scaled dot-product attention helper.

    The constructor only validates its options: dropout and positional
    encoding are not implemented yet, and requesting either one raises
    ``NotImplementedError``.
    """

    def __init__(self, dropout=None, positional_encoding=False):
        if dropout:
            # TODO: dropout
            raise NotImplementedError()
        if positional_encoding:
            # TODO: positional encoding
            raise NotImplementedError()

    @staticmethod
    def self_attention(query, key, value):
        """Compute ``softmax(Q K^T / sqrt(d_k)) V`` for batched inputs.

        Args:
            query: tensor of shape (batch, n_queries, d_k).
            key: tensor of shape (batch, n_keys, d_k).
            value: tensor of shape (batch, n_keys, d_v).

        Returns:
            A tuple ``(attended, p_att)``: ``attended`` has shape
            (batch, n_queries, d_v) and ``p_att`` holds the attention
            probabilities with shape (batch, n_queries, n_keys).
        """
        # Scale by the key/query feature size: that is the axis the dot
        # products are summed over. It can differ from the value feature size.
        d_k = key.size(-1)
        score = torch.bmm(query, key.permute(0, 2, 1)) / math.sqrt(d_k)
        # dim=-1 normalises over the key positions, so every query row of
        # p_att sums to 1.
        p_att = F.softmax(score, dim=-1)
        attended = torch.bmm(p_att, value)
        return attended, p_att
    
import torch.nn as nn
from torch.nn import functional as F
class MultiheadAttention(nn.Module):
    """Multi-head attention with one set of linear projections per head.

    Each head projects query, key and value from ``emb_size`` to ``att_size``,
    applies scaled dot-product attention, and the concatenated head outputs
    are projected back to ``emb_size``.

    Args:
        n_heads: number of attention heads.
        emb_size: feature size of the inputs and of the returned tensor.
        att_size: feature size of each head's projections.
        dropout: not implemented; any truthy value raises NotImplementedError.
    """

    def __init__(self, n_heads, emb_size, att_size=64, dropout=None):
        super(MultiheadAttention, self).__init__()
        if dropout:
            # TODO: dropout
            raise NotImplementedError
        self.n_heads = n_heads
        self.emb_size = emb_size
        self.att_size = att_size
        self.attention = SelfAttention().self_attention
        # (W_q) n_heads times:
        self.linear_query = nn.ModuleList([nn.Linear(self.emb_size, self.att_size) for _ in range(self.n_heads)])
        # (W_k) n_heads times:
        self.linear_key = nn.ModuleList([nn.Linear(self.emb_size, self.att_size) for _ in range(self.n_heads)])
        # (W_v) n_heads times:
        self.linear_value = nn.ModuleList([nn.Linear(self.emb_size, self.att_size) for _ in range(self.n_heads)])
        # Kept only for backward compatibility with code that reads these
        # attributes; forward() no longer writes to them, so they stay empty.
        self.att_probas = []
        self.scores = []
        # Linear layer to transform concatenated heads
        self.output_linear = nn.Linear(n_heads*att_size, emb_size)

    def forward(self, query, key, value):
        """Apply every head and merge the results.

        Returns:
            A tuple ``(scores, att_probas)``: ``scores`` is the output
            projection of the concatenated head outputs, ``att_probas`` is a
            list with one attention-probability tensor per head.
        """
        # Per-call accumulators are local on purpose: if a head raises, nothing
        # is left behind on the module to leak into the next forward pass.
        head_outputs = []
        att_probas = []
        for head in range(self.n_heads):
            q = self.linear_query[head](query)
            k = self.linear_key[head](key)
            v = self.linear_value[head](value)
            # Scaled dot-product attention:
            score, p_att = self.attention(q, k, v)
            head_outputs.append(score)
            att_probas.append(p_att)
        # Concatenate the head outputs along the feature axis:
        # concat(z_0, z_1, ..., z_{n_heads-1})
        scores = torch.cat(head_outputs, -1)
        # Project the concatenation back to emb_size
        scores = self.output_linear(scores)
        return scores, att_probas
    
class AttentionFlattener(nn.Module):
    """Reduce the last axis of the input to one score and softmax over axis 0."""

    def __init__(self, seq_len):
        super().__init__()
        # Most recent input given to forward(); None until the first call.
        self.attention_matrix = None
        # Maps the last axis (size seq_len) to a single value.
        self.linear = nn.Linear(in_features=seq_len, out_features=1)
        # NOTE(review): normalises over dim 0 of whatever is passed in;
        # confirm this is the intended axis for the expected input layout.
        self.softmax = nn.Softmax(dim=0)

    def forward(self, x):
        """Remember ``x`` on the module, then return softmax(linear(x))."""
        self.attention_matrix = x
        return self.softmax(self.linear(x))

In [9]:
# Model hyper-parameters.
# INPUT_DIM is the number of rows in the pretrained vector table of VOCAB.
INPUT_DIM = VOCAB.vectors.shape[0]
# 100 matches the "glove.6B.100d" vectors requested when the vocabulary was built.
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
# NOTE(review): OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL and DROPOUT are not
# referenced by any visible cell; the constructor call below passes
# HIDDEN_DIM twice and the literal 1 instead.
OUTPUT_DIM = 1
N_LAYERS = 1
BIDIRECTIONAL = False
DROPOUT = 0.5

# Pretrained vectors as a NumPy array, handed to the model constructor.
pretrained_embeddings = VOCAB.vectors.detach().numpy()
# NOTE(review): SAttendedSimpleNet comes from custom_model.model, so the meaning
# of each positional argument cannot be checked from this notebook — confirm
# the order against the class definition.
model = SAttendedSimpleNet(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, HIDDEN_DIM, 1, pretrained_embeddings)

In [10]:
import torch.optim as optim

# Adam with its default settings over every model parameter.
# NOTE(review): the model is moved to `device` only in the following cell;
# consider moving it before the optimizer is constructed.
optimizer = optim.Adam(model.parameters())

In [11]:
# Binary cross-entropy that takes raw logits (the sigmoid is applied inside the loss).
criterion = nn.BCEWithLogitsLoss()

# Place the model and the loss module on the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [12]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Args:
        preds: raw logits (before the sigmoid).
        y: 0/1 labels with the same shape as ``preds``; any numeric dtype.

    Returns:
        A 0-dim tensor holding the fraction of correct predictions.
    """

    # Squash logits to probabilities, then threshold at 0.5 by rounding.
    rounded_preds = torch.round(torch.sigmoid(preds))
    # Cast the labels to the prediction dtype first, so integer labels can be
    # passed without the caller having to call .float() on them.
    correct = (rounded_preds == y.to(rounded_preds.dtype)).float()
    acc = correct.sum() / len(correct)
    return acc

In [13]:
# The label of a parsed example is still the raw string read from the TSV.
# NOTE(review): `train` must still be the dataset here; the next cell rebinds
# the name `train` to the training function, so re-running this cell after
# that one fails.
train.examples[4].LABEL

'0'

In [14]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    Every batch is transposed before it is fed to the model, the loss is
    back-propagated and the optimizer takes one step.

    NOTE(review): defining this function rebinds the module-level name
    ``train``, which earlier cells use for the training dataset.

    Returns:
        A tuple ``(mean_loss, mean_accuracy)`` averaged over the batches.
    """
    model.train()

    running_loss, running_acc = 0, 0
    for batch in iterator:
        optimizer.zero_grad()

        # Transpose both text fields before handing them to the model.
        questions, sentences = batch.QUESTION.t(), batch.SENTENCE.t()
        targets = batch.LABEL.float()

        logits = model(questions, sentences).squeeze(1)
        loss = criterion(logits, targets)
        acc = binary_accuracy(logits, targets)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc += acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [15]:
def evaluate(model, iterator, criterion):
    """Compute mean loss and accuracy over ``iterator`` without updating the model.

    The model is switched to eval mode and gradients are disabled. Inputs are
    transposed exactly as in the training function, so the model receives the
    same layout in both phases.

    Returns:
        A tuple ``(mean_loss, mean_accuracy)`` averaged over the batches.
    """

    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():

        for batch in iterator:

            # Same transposition as in training; without it the model would
            # get its inputs with the two axes swapped.
            predictions = model(batch.QUESTION.t(), batch.SENTENCE.t()).squeeze(1)

            loss = criterion(predictions, batch.LABEL.float())

            acc = binary_accuracy(predictions, batch.LABEL.float())

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [16]:
# Number of full passes over the training iterator.
N_EPOCHS = 5

# Each epoch: one training pass, then one evaluation pass on the validation iterator.
for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    # The epoch is reported 1-based; accuracies are shown as percentages.
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

RuntimeError: Expected object of type torch.cuda.FloatTensor but found type torch.cuda.LongTensor for argument #2 'other'