In [1]:
import pandas as pd
import torch
import numpy as np
import os
import pickle
from custom_model.model import SimpleNet
from custom_model.train import Train

In [2]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda") if USE_CUDA else torch.device("cpu")
print('Device: ', device)

Device:  cuda


In [3]:
def read_pickle(fname):
    """Return the object stored in the pickle file at ``fname``.

    The file is opened in binary mode and is closed again before the
    function returns.

    Note: unpickling can execute arbitrary code, so only call this on
    files that come from a trusted source.
    """
    with open(fname, mode='rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [4]:
# Load the processed train/test DataFrames and the vocabulary from disk.
voc = read_pickle('./data/processed/vocabulary.pickle')
df_train = pd.read_pickle('./data/processed/wikiqa_df_train.pickle')
df_test = pd.read_pickle('./data/processed/wikiqa_df_test.pickle')

# Report how many rows/columns were loaded for each split.
print('Train shape: {} \nTest shape: {}'.format(df_train.shape, df_test.shape))

Train shape: (20347, 9) 
Test shape: (6116, 9)


In [5]:
# Show the first rows of the training frame to inspect its columns.
df_train.head()

Unnamed: 0,QuestionID,Question,DocumentID,DocumentTitle,SentenceID,Sentence,Label,Question_encoded,Sentence_encoded
0,Q1,how are glacier caves formed,D1,Glacier cave,D1-0,a partly submerged glacier cave on perito more...,0,"[2622, 3473, 2381, 524, 3311, 0, 0, 0, 0, 0, 0...","[1878, 4448, 10810, 2381, 21552, 1328, 24841, ..."
1,Q1,how are glacier caves formed,D1,Glacier cave,D1-1,the ice facade is approximately m high,0,"[2622, 3473, 2381, 524, 3311, 0, 0, 0, 0, 0, 0...","[474, 40, 24270, 383, 13999, 2461, 2387, 0, 0,..."
2,Q1,how are glacier caves formed,D1,Glacier cave,D1-2,ice formations in the titlis glacier cave,0,"[2622, 3473, 2381, 524, 3311, 0, 0, 0, 0, 0, 0...","[40, 11084, 1433, 474, 25610, 2381, 21552, 0, ..."
3,Q1,how are glacier caves formed,D1,Glacier cave,D1-3,a glacier cave is a cave formed within the ice...,1,"[2622, 3473, 2381, 524, 3311, 0, 0, 0, 0, 0, 0...","[1878, 2381, 21552, 383, 1878, 21552, 3311, 18..."
4,Q1,how are glacier caves formed,D1,Glacier cave,D1-4,glacier caves are often called ice caves but t...,0,"[2622, 3473, 2381, 524, 3311, 0, 0, 0, 0, 0, 0...","[2381, 524, 3473, 988, 354, 40, 524, 21447, 38..."


In [6]:
# Instantiate the model; the first argument is the `voc_len` entry of the
# loaded vocabulary.
# NOTE(review): 256 and 128 are presumably layer sizes (e.g. embedding /
# hidden dimensions) — confirm against SimpleNet's constructor.
net = SimpleNet(voc['voc_len'], 256, 128)
# net.to(device)

In [7]:
# Debugging aid: uncomment to keep only the first 20 training rows.
# df_train = df_train.iloc[:20]

In [8]:
# Turn the encoded question / sentence columns and the labels into NumPy
# arrays (one array per column, in that order).
Xq, Xa, t = (
    np.array(df_train[column].values.tolist())
    for column in ('Question_encoded', 'Sentence_encoded', 'Label')
)

In [9]:
# Convert the training arrays to tensors.
# `torch.as_tensor` shares memory with a NumPy array just like
# `torch.from_numpy`, but it also accepts an existing tensor, so re-running
# this cell no longer fails with a TypeError once the names hold tensors.
Xq = torch.as_tensor(Xq)
Xa = torch.as_tensor(Xa)
t = torch.as_tensor(t)

In [10]:
# Training settings shared by the cells below.
batch_size = 50
epochs = 100

# The optimizer *class* is stored here (it is not instantiated in this cell).
optimizer = torch.optim.Adam
# Per-class loss weights: class 0 counts 0.05, class 1 counts 1.0.
# NOTE(review): presumably chosen to offset class imbalance — confirm.
loss_func = torch.nn.CrossEntropyLoss(
    weight=torch.tensor([0.05, 1.], device=device)
)

In [11]:
# net.to(device)

In [12]:
# Short 10-epoch training run directly on the network.
# The batch size now comes from the shared `batch_size` setting instead of a
# duplicated literal, so the two cannot drift apart. The epoch count is kept
# at 10 on purpose (the recorded output below shows 10 epochs); it is
# intentionally not `epochs`.
# NOTE(review): a later cell trains through `Train().fit(..., net, ...)`
# instead — confirm that `SimpleNet.fit` still exists.
net.fit(Xq, Xa, t, batch_size, 10, loss_func, optimizer, device)

Epoch: 0, loss: 0.7022308111190796
Epoch: 1, loss: 0.6250463128089905
Epoch: 2, loss: 0.5366259217262268
Epoch: 3, loss: 0.5190755724906921
Epoch: 4, loss: 0.48851391673088074
Epoch: 5, loss: 0.48703405261039734
Epoch: 6, loss: 0.48426684737205505
Epoch: 7, loss: 0.48432716727256775
Epoch: 8, loss: 0.4838498830795288
Epoch: 9, loss: 0.4771558940410614


In [None]:
# Train the network once and cache it on disk; later runs reload the cached
# model instead of training again.
net_fname = 'net.torch'
if not os.path.exists(net_fname):
    trainer = Train()
    trainer.fit(Xq, Xa, t, net, batch_size, epochs, loss_func, optimizer, device)
    torch.save(net, net_fname)
else:
    print('File already exists. Name: ', net_fname)
    # `map_location` remaps the stored tensors onto the current device, so a
    # model saved on a GPU machine can still be loaded on a CPU-only one.
    # NOTE: the file holds a fully pickled model; only load files you trust.
    net = torch.load(net_fname, map_location=device)

# Make sure the model lives on the target device in both branches, since the
# evaluation cell moves its inputs there (no-op if it is already on it).
net.to(device)

### Test:

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Build the test tensors from the encoded columns, mirroring the training data.
Xq_test = np.array(df_test.Question_encoded.values.tolist())
Xa_test = np.array(df_test.Sentence_encoded.values.tolist())
t_test = np.array(df_test.Label.values.tolist())

Xq_test = torch.from_numpy(Xq_test)
Xa_test = torch.from_numpy(Xa_test)
t_test = torch.from_numpy(t_test)

# Evaluate in inference mode: `eval()` switches layers such as dropout or
# batch-norm (if the model has any) to their evaluation behaviour, and
# `no_grad()` avoids building an autograd graph for the whole test set.
# NOTE(review): assumes `net` is a torch.nn.Module — confirm.
net.eval()
with torch.no_grad():
    t_pred = net(Xq_test.to(device), Xa_test.to(device))
t_pred[:10]

In [None]:
# Column 1 of the predictions is used as the score for label 1.
# NOTE(review): if the model returns raw logits, a softmax probability (or the
# logit difference) would be a more faithful ranking score — confirm.
# The score is now substituted into the placeholder; previously it was passed
# as a second print argument, so the literal "{:}" was printed.
print('ROC-AUC score: {:}'.format(
    roc_auc_score(t_test.tolist(), np.array(t_pred.tolist()).T[1])
))