In [1]:
import pandas as pd
import torch
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
from custom_model.model import SimpleNet, SAttendedSimpleNet, SAttendedNet, CrossAttentionNet
import seaborn as sns
from sklearn.metrics import roc_auc_score


# Pick the compute device once: CUDA when torch reports it as available, CPU otherwise.
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print('Device: ', device)

def read_pickle(fname):
    """Return the object deserialized from the pickle file at `fname`.

    The file is opened in binary mode and closed again before returning.
    Unpickling can execute arbitrary code, so only pass trusted files.
    """
    with open(fname, mode='rb') as handle:
        payload = pickle.load(handle)
    return payload

# Load the preprocessed WikiQA frames and the embedding matrix from ./data/processed.
df_train = pd.read_pickle('./data/processed/wikiqa_df_train.pickle')
df_test = pd.read_pickle('./data/processed/wikiqa_df_test.pickle')

# Shuffle the test frame reproducibly, then cut it positionally into test / validation halves.
# np.split(frame, 2) raises ValueError when the row count is odd; positional slicing yields the
# same two halves for an even count and still works for an odd one (validation gets the extra row).
_shuffled_test = df_test.sample(frac=1., random_state=42)
_half = len(_shuffled_test) // 2
df_test, df_val = _shuffled_test.iloc[:_half], _shuffled_test.iloc[_half:]
emb_weights = np.load('./data/processed/index2vector.npy')

# The embedding matrix has one row per vocabulary entry and one column per embedding dimension.
vocab_size = emb_weights.shape[0]
embed_dim = emb_weights.shape[1]

# Debug toggle: uncomment to train on the first 100 rows only.
# df_train = df_train.iloc[:100]

# NOTE(review): the 'Val shape {}: ' label has its colon after the placeholder; the string is
# left unchanged here so it keeps matching the recorded output.
print('Train shape: {} \n\
Test shape: {} \n\
Val shape {}: '.format(df_train.shape, df_test.shape, df_val.shape))

Device:  cuda
Train shape: (20347, 9) 
Test shape: (3058, 9) 
Val shape (3058, 9): 


In [13]:
import spacy
from torchtext import data
# Load the spaCy English pipeline; its tokenizer is used by `tokenizer` below.
# NOTE(review): the 'en' shortcut name is presumably unavailable in spaCy 3+, where a full
# package name such as 'en_core_web_sm' is expected — confirm against the pinned spaCy version.
spacy_en = spacy.load('en')

def tokenizer(text):
    """Split `text` with the spaCy English tokenizer and return the token strings as a list."""
    tokens = []
    for tok in spacy_en.tokenizer(text):
        tokens.append(tok.text)
    return tokens

# Free-text columns: tokenized with the spaCy-based `tokenizer` and lower-cased.
QUESTION = data.Field(sequential=True, tokenize=tokenizer, lower=True)
SENTENCE = data.Field(sequential=True, tokenize=tokenizer, lower=True)

# Remaining columns: declared as non-sequential fields without a vocabulary.
# NOTE(review): with use_vocab=False torchtext presumably expects values that are already
# numeric; the ID / title columns look like strings — confirm they can be batched as declared.
QuestionID = data.Field(sequential=False, use_vocab=False)
DocumentID = data.Field(sequential=False, use_vocab=False)
DocumentTitle = data.Field(sequential=False, use_vocab=False)
SentenceID = data.Field(sequential=False, use_vocab=False)
Label = data.Field(sequential=False, use_vocab=False)

# Read the three WikiQA TSV splits; the header row of each file is skipped.
# The (name, field) pairs are listed in the same order as they were originally declared.
train, val, test = data.TabularDataset.splits(
    path='./data/WikiQACorpus/',
    train='WikiQA-train.tsv',
    validation='WikiQA-dev.tsv',
    test='WikiQA-test.tsv',
    format='tsv',
    fields=[
        ('QUESTIONID', QuestionID),
        ('QUESTION', QUESTION),
        ('DOCUMENTID', DocumentID),
        ('DOCUMENTIDTITLE', DocumentTitle),
        ('SENTENCEID', SentenceID),
        ('SENTENCE', SENTENCE),
        ('LABEL', Label),
    ],
    skip_header=True,
)

In [15]:
# Build a vocabulary per text field from the training split, requesting the
# "glove.6B.100d" pretrained vectors for each.
QUESTION.build_vocab(train, vectors="glove.6B.100d")
SENTENCE.build_vocab(train, vectors="glove.6B.100d")
# NOTE(review): Label is declared with use_vocab=False, so this vocabulary is presumably
# unused — confirm whether the call is needed.
Label.build_vocab(train)

# VOCAB is the same object as QUESTION.vocab, so extend() below mutates QUESTION.vocab in place.
# SENTENCE.vocab is not reassigned here and remains a separate object.
# NOTE(review): if questions and sentences share one embedding sized by len(VOCAB), SENTENCE
# tokens would presumably still be indexed through SENTENCE.vocab rather than the merged
# VOCAB — verify the indices line up.
VOCAB = QUESTION.vocab
VOCAB.extend(SENTENCE.vocab)

In [16]:
# Number of examples per batch, shared by all three iterators below.
BATCH_SIZE = 16

# `device` is recomputed here; the first cell already defines it from the same CUDA check.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
# One iterator per split (train / validation / test), each using BATCH_SIZE and `device`.
# NOTE(review): no sort_key is passed; BucketIterator presumably needs one (here or on the
# dataset) to group examples of similar length — confirm iteration works as configured.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train, val, test), 
    batch_size=BATCH_SIZE, 
    device=device)

cuda


In [20]:
import math 

In [22]:
class SelfAttention():
    """Scaled dot-product attention helper.

    Instances carry no state; the class exposes :meth:`self_attention` as a static method.
    """

    def __init__(self, dropout=None, positional_encoding=False):
        """Reject options that are not implemented yet.

        Args:
            dropout: must be falsy; attention dropout is not implemented.
            positional_encoding: must be falsy; positional encoding is not implemented.

        Raises:
            NotImplementedError: if `dropout` or `positional_encoding` is truthy.
        """
        if dropout:
            # TODO: dropout
            raise NotImplementedError()
        if positional_encoding:
            # TODO: positional encoding
            raise NotImplementedError()

    @staticmethod
    def self_attention(query, key, value):
        """Compute scaled dot-product attention for batched 3-D tensors.

        Args:
            query: tensor of shape (batch, len_q, d_k).
            key: tensor of shape (batch, len_k, d_k).
            value: tensor of shape (batch, len_k, d_v).

        Returns:
            A tuple ``(attended, p_att)`` where ``attended`` has shape
            (batch, len_q, d_v) and ``p_att`` has shape (batch, len_q, len_k);
            every row of ``p_att`` sums to 1 over the key axis.
        """
        # Scale by the query/key feature size (the axis the dot product sums over),
        # not by the value size: the two differ whenever values have another width.
        d_k = key.size(-1)
        score = torch.bmm(query, key.transpose(1, 2)) / math.sqrt(d_k)
        # TODO: double-check the softmax direction; it currently normalises over the key axis.
        p_att = torch.softmax(score, dim=-1)
        attended = torch.bmm(p_att, value)
        return attended, p_att
    
import torch.nn as nn
from torch.nn import functional as F
class MultiheadAttention(nn.Module):
    """Multi-head attention with an independent set of linear projections per head.

    Each head projects query, key and value from `emb_size` to `att_size`, applies the
    attention function, and the concatenated head outputs are mapped back to `emb_size`.
    """

    def __init__(self, n_heads, emb_size, att_size=64, dropout=None):
        """Create the per-head projections and the output projection.

        Args:
            n_heads: number of attention heads.
            emb_size: feature size of the inputs and of the returned tensor.
            att_size: feature size of each head's projected query/key/value.
            dropout: must be falsy; dropout is not implemented.

        Raises:
            NotImplementedError: if `dropout` is truthy.
        """
        super(MultiheadAttention, self).__init__()
        if dropout:
            # TODO: dropout
            raise NotImplementedError
        self.n_heads = n_heads
        self.emb_size = emb_size
        self.att_size = att_size
        self.attention = SelfAttention().self_attention
        # (W_q) n_heads times:
        self.linear_query = nn.ModuleList([nn.Linear(self.emb_size, self.att_size) for _ in range(self.n_heads)])
        # (W_k) n_heads times:
        self.linear_key = nn.ModuleList([nn.Linear(self.emb_size, self.att_size) for _ in range(self.n_heads)])
        # (W_v) n_heads times:
        self.linear_value = nn.ModuleList([nn.Linear(self.emb_size, self.att_size) for _ in range(self.n_heads)])
        # Retained for backward compatibility only: forward() keeps its per-call results in
        # locals, so these stay empty and a failed call cannot leak entries into the next one.
        self.att_probas = []
        self.scores = []
        # Linear layer to transform concatenated heads
        self.output_linear = nn.Linear(n_heads*att_size, emb_size)

    def forward(self, query, key, value):
        """Run every head and merge the results.

        Args:
            query, key, value: tensors whose last dimension is `emb_size`.

        Returns:
            A tuple ``(scores, att_probas)``: ``scores`` is the output projection of the
            concatenated head outputs (last dimension `emb_size`), and ``att_probas`` is a
            list with one attention-probability tensor per head.
        """
        head_outputs = []
        att_probas = []
        for w_q, w_k, w_v in zip(self.linear_query, self.linear_key, self.linear_value):
            # Scaled dot-product attention on this head's projections:
            head_out, p_att = self.attention(w_q(query), w_k(key), w_v(value))
            head_outputs.append(head_out)
            att_probas.append(p_att)
        # Concatenate the head outputs along the feature axis, then project back to emb_size.
        scores = self.output_linear(torch.cat(head_outputs, -1))
        return scores, att_probas
    
class AttentionFlattener(nn.Module):
    """Collapse the last axis of an attention matrix to one weight per position.

    A linear layer maps the last dimension (`seq_len`) to a single value and a softmax
    over the first axis of the result normalises those values.
    NOTE(review): the softmax runs over dim 0 — confirm that is the intended axis for the
    tensors passed in.
    """

    def __init__(self, seq_len):
        """Create the `seq_len` -> 1 linear layer and the dim-0 softmax."""
        super().__init__()
        # Last input seen by forward(); None until the first call.
        self.attention_matrix = None
        self.linear = nn.Linear(seq_len, 1)
        self.softmax = nn.Softmax(0)

    def forward(self, x):
        """Remember `x` on the module and return softmax(linear(x)) taken over dim 0."""
        self.attention_matrix = x
        flattened = self.linear(x)
        return self.softmax(flattened)

In [24]:
# Hyperparameters passed positionally to SAttendedNet below.
# INPUT_DIM is the size of the merged question + sentence vocabulary.
INPUT_DIM = len(VOCAB)
# NOTE(review): presumably chosen to match the 100-d "glove.6B.100d" vectors requested in
# build_vocab — confirm.
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 1
BIDIRECTIONAL = False
# NOTE(review): the truncated warning in the recorded output looks like PyTorch's RNN warning
# about non-zero dropout with num_layers=1 — confirm whether DROPOUT should apply when
# N_LAYERS is 1.
DROPOUT = 0.5

# SAttendedNet is imported from custom_model.model; its signature is not visible in this notebook.
model = SAttendedNet(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

  "num_layers={}".format(dropout, num_layers))
