In [1]:
import sys

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer

import torch
from torch import optim
from torch.autograd import Variable,gradcheck
from torch.utils.data import DataLoader, TensorDataset

sys.path.append('../scripts')

In [2]:
class DatasetLoader:
    """Load a tab-separated triple file and derive encodings from it.

    Each non-blank line of the file is split on tab characters; field 0 is
    used as ``start``, field 1 as ``relation`` and field 2 as ``dest``.

    Attributes set by ``__init__``:
        data: list with one list of string fields per line.
        num_data: number of lines kept.
        arr: ``np.ndarray`` built from ``data``.
        df: ``pd.DataFrame`` with columns ``start``, ``relation``, ``dest``.
        enc: ``OneHotEncoder`` fitted on the relation column.
        one_hot_relation: dense one-hot matrix produced by ``enc``.
        y_index: column index of the hot entry for every row.
    """

    def __init__(self, path, relations=None):
        """Read ``path`` and build the array, frame and relation encodings.

        Args:
            path: location of the tab-separated text file.
            relations: optional sequence of relation names that fixes the
                one-hot column order. Pass the same sequence to every split
                so that ``y_index`` refers to the same class everywhere.
                When omitted the encoder infers the categories from this
                file alone (the original behaviour).
        """
        with open(path, 'r') as f:
            lines = f.read().splitlines()
        # Blank lines would yield rows of a different length and make the
        # array below ragged, so they are dropped up front.
        self.data = [line.split('\t') for line in lines if line.strip()]
        self.num_data = len(self.data)
        self.arr = np.array(self.data)
        self.df = pd.DataFrame({
            'start': self.arr[:, 0],
            'relation': self.arr[:, 1],
            'dest': self.arr[:, 2]
        })

        # get one_hot relation
        one_hot_relation = [[item[1],] for item in self.data]
        categories = 'auto' if relations is None else [list(relations)]
        self.enc = OneHotEncoder(categories=categories).fit(one_hot_relation)
        self.one_hot_relation = self.enc.transform(one_hot_relation).toarray()
        self.y_index = np.argmax(self.one_hot_relation, axis=1)

    def encode_index(self, num_bits=None):
        """Encode the start and dest ids as fixed-width binary vectors.

        Columns 0 and 2 of ``arr`` are parsed as unsigned 32-bit integers and
        expanded most-significant-bit first.

        Args:
            num_bits: width used for each id. When omitted, the smallest
                width that fits the largest id in this loader is used (at
                least 1, so an all-zero id column no longer crashes). Pass
                one shared value to every split to guarantee equal feature
                widths.

        Returns:
            ``np.uint8`` array of shape ``(rows, 2 * num_bits)``: the bits of
            ``start`` followed by the bits of ``dest``.

        Raises:
            ValueError: if ``num_bits`` is too small for the largest id.
        """
        start = self.arr[:, 0].astype(np.uint32)
        end = self.arr[:, 2].astype(np.uint32)

        largest = max(int(start.max(initial=0)), int(end.max(initial=0)))
        # int.bit_length() is exact and returns 0 for 0, hence the floor of 1.
        required = max(largest.bit_length(), 1)
        if num_bits is None:
            num_bits = required
        elif num_bits < required:
            raise ValueError(
                'num_bits={} cannot represent id {} (needs {} bits)'.format(
                    num_bits, largest, required
                )
            )

        # Shift amounts from the most significant bit down to bit 0.
        shifts = np.arange(required - 1, -1, -1, dtype=np.uint32)

        def to_bits(ids):
            bits = np.zeros((len(ids), num_bits), dtype=np.uint8)
            # Right-align the significant bits; the left padding stays zero.
            bits[:, num_bits - required:] = (ids[:, np.newaxis] >> shifts) & 1
            return bits

        return np.concatenate((to_bits(start), to_bits(end)), axis=1)

In [3]:
verbose = False
data_path = '../datasets/WN18RR/'

# datasets
train = DatasetLoader(data_path + 'train.txt')
test = DatasetLoader(data_path + 'test.txt')
val = DatasetLoader(data_path + 'valid.txt')

# get uniques: relation names live in column 1, node ids in columns 0 and 2
relations_all = np.concatenate((train.arr[:, 1], val.arr[:, 1], test.arr[:, 1]))
relations = np.unique(relations_all)
nodes_all = np.concatenate((train.arr[:, (0, 2)], val.arr[:, (0, 2)], test.arr[:, (0, 2)]))
nodes = np.unique(nodes_all)

# Order in which the per-split statistics are reported below.
report_splits = (('Training', train), ('Validation', val), ('Test', test))

print('Number of Training:', len(train.arr))
print('Number of Test:', len(test.arr))
print('Number of Validation:', len(val.arr))
print()
print('Number of Total Relations:', len(relations))
for split_name, split_data in report_splits:
    print(split_name + ' Relations:', len(np.unique(split_data.arr[:, 1])))
print()
print('Number of Total Nodes:', len(nodes))
for split_name, split_data in report_splits:
    print(split_name + ' Nodes:', len(np.unique(split_data.arr[:, (0, 2)])))
if verbose:
    print('Unique Relations:', relations)

Number of Training: 86835
Number of Test: 3134
Number of Validation: 3034

Number of Total Relations: 11
Training Relations: 11
Valiation Relations: 11
Test Relations: 11

Number of Total Nodes: 40943
Training Nodes: 40559
Valiation Nodes: 5173
Test Nodes: 5323


In [4]:
X_train = train.encode_index()
X_val = val.encode_index()
X_test = test.encode_index()

# Each loader derives its own bit width, so confirm the three feature
# matrices are compatible with a single input layer.
assert X_train.shape[1] == X_val.shape[1] == X_test.shape[1], (
    'feature width differs between splits: {} / {} / {}'.format(
        X_train.shape[1], X_val.shape[1], X_test.shape[1]
    )
)

y_train = train.y_index
y_val = val.y_index
y_test = test.y_index

# Each loader fits its own encoder; a class index only means the same
# relation in every split if the fitted categories are identical.
assert all(
    np.array_equal(train.enc.categories_[0], other.enc.categories_[0])
    for other in (val, test)
), 'relation categories differ between splits; label indices are not comparable'

In [5]:
%%time

from DNN import DNN

# CUDA support 
if torch.cuda.is_available():
    device = torch.device('cuda:1')
else:
    device = torch.device('cpu')

# model settings
D_in  = X_train.shape[1]
D_out = len(relations)
H = 200
Depth = 5
NUMEPOCHS = 1000
Batch_size = 2000

model = DNN(
    input_size=D_in,
    output_size=D_out,
    hidden_size=H,
    depth=Depth
).to(device)

# Loss Function
criterion = torch.nn.CrossEntropyLoss()

# Optimizer
optimizer = optim.Adamax(model.parameters())

CPU times: user 2.01 s, sys: 540 ms, total: 2.55 s
Wall time: 2.56 s


In [6]:
# Input Data
trainX = Variable(torch.from_numpy(X_train).float())
trainY = Variable(torch.from_numpy(y_train).long())
valX = Variable(torch.from_numpy(X_val).float())
valY = Variable(torch.from_numpy(y_val).long())
testX = Variable(torch.from_numpy(X_test).float())
testY = Variable(torch.from_numpy(y_test).long())

print(trainX.shape)
print(trainY.shape)
print(valX.shape)
print(valY.shape)
print(testX.shape)
print(testY.shape)

if torch.cuda.is_available():
    trainX = trainX.to(device)
    trainY = trainY.to(device)
    testX = testX.to(device)
    testY = testY.to(device)
    valX = valX.to(device)
    valY = valY.to(device)

torch.Size([86835, 48])
torch.Size([86835])
torch.Size([3034, 48])
torch.Size([3034])
torch.Size([3134, 48])
torch.Size([3134])


In [7]:
# Train the model
print("Epochs")
for epoch in range(NUMEPOCHS):
    data_train_loader = DataLoader(
        list(zip(trainX,trainY)), 
        batch_size=Batch_size, 
        shuffle=True
    )
    for batchX, batchY in data_train_loader: 
        # Forward pass
        outputs = model(batchX)
        loss = criterion(outputs, batchY)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()     
        optimizer.step()
        
    # Print Epochs and Losses to Monitor Convergence
    if epoch % 100 == 0:
        print("{}".format(epoch),end = ", ")
        print(loss)
        
print('\nTraining Complete')

Epochs
0, tensor(1.4281, device='cuda:1', grad_fn=<NllLossBackward>)
100, tensor(0.3836, device='cuda:1', grad_fn=<NllLossBackward>)
200, tensor(0.0602, device='cuda:1', grad_fn=<NllLossBackward>)
300, tensor(0.0209, device='cuda:1', grad_fn=<NllLossBackward>)
400, tensor(0.0088, device='cuda:1', grad_fn=<NllLossBackward>)
500, tensor(0.0144, device='cuda:1', grad_fn=<NllLossBackward>)
600, tensor(0.0074, device='cuda:1', grad_fn=<NllLossBackward>)
700, tensor(0.0050, device='cuda:1', grad_fn=<NllLossBackward>)
800, tensor(0.0017, device='cuda:1', grad_fn=<NllLossBackward>)
900, tensor(0.0028, device='cuda:1', grad_fn=<NllLossBackward>)

Training Complete


In [85]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.metrics import label_ranking_loss, label_ranking_average_precision_score, coverage_error
def _hits_at_k(order, true, k):
    """Fraction of rows whose true label is among the last ``k`` entries of ``order``."""
    top_k = order[:, -k:]
    return np.sum(np.max(true[:, np.newaxis] == top_k, axis=1)) / len(order)


def evaluate(X, Y, name):
    """Score the global ``model`` on one split and print its metrics.

    Args:
        X: input tensor passed to ``model``.
        Y: tensor of integer class labels, one per row of ``X``.
        name: prefix printed in front of every metric.

    Prints accuracy, micro/macro F1, the label-ranking metrics and Hit@1/3/10.
    Returns nothing.
    """
    # Inference only: no autograd graph is needed, and layers that behave
    # differently while training are switched to eval mode for the forward
    # pass. NOTE(review): assumes `model` is a torch.nn.Module -- confirm.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            score = model(X).detach().cpu().numpy()
    finally:
        model.train(was_training)

    pred = np.argmax(score, axis=1)

    # Min-max scale every row to [0, 1]. A row with identical scores has zero
    # range; dividing by 1 there yields zeros instead of NaN.
    row_min = np.min(score, axis=1)[:, np.newaxis]
    row_range = np.max(score, axis=1)[:, np.newaxis] - row_min
    row_range[row_range == 0] = 1
    score = (score - row_min) / row_range

    true = Y.cpu().numpy()
    # One-hot indicator matrix of the true labels for the ranking metrics.
    true_score = np.zeros_like(score)
    true_score[np.arange(len(true_score)), true] += 1

    # Class indices sorted by ascending score; the best classes come last.
    order = score.argsort(axis=1)
    hit_3 = _hits_at_k(order, true, 3)
    hit_10 = _hits_at_k(order, true, 10)

    print(name + ' Accuracy:\t', accuracy_score(true, pred))
    print(name + ' F1-micro:\t', f1_score(true, pred, average='micro'))
    print(name + ' F1-macro:\t', f1_score(true, pred, average='macro'))
    # With exactly one relevant label per row, scikit-learn documents label
    # ranking average precision as equivalent to mean reciprocal rank.
    print(name + ' MRR:\t', label_ranking_average_precision_score(true_score, score))
    print(name + ' Ranking Loss:\t', label_ranking_loss(true_score, score))
    print(name + ' Coverage Error:\t', coverage_error(true_score, score))
    print(name + ' Hit@1:\t', accuracy_score(true, pred))
    print(name + ' Hit@3:\t', hit_3)
    print(name + ' Hit@10:\t', hit_10)
    print()

In [86]:
# Show the raw test labels next to the model's arg-max predictions.
test_labels = testY.cpu().numpy()
print('Test Labels :\n', test_labels)
test_predictions = model(testX).detach().cpu().numpy().argmax(axis=1)
print('Test Predictions :\n', test_predictions)
print()

# Full metric report for every split, in the order train -> validation -> test.
for split_X, split_Y, split_label in (
    (trainX, trainY, 'Training'),
    (valX, valY, 'Validation'),
    (testX, testY, 'Test'),
):
    evaluate(split_X, split_Y, split_label)


Test Labels :
 [ 7 10  6 ...  1  2  3]
Test Predictions :
 [7 1 6 ... 1 3 3]

Training Accuracy:	 0.9980192318765475
Training F1-micro:	 0.9980192318765475
Training F1-macro:	 0.9938338234599562
Training MRR:	 0.9990096159382738
Training Ranking Loss:	 0.00019807681234525247
Training Coverage Error:	 1.0019807681234525
Training Hit@1:	 0.9980192318765475
Training Hit@3:	 1.0
Training Hit@10:	 1.0

Validation Accuracy:	 0.8592617007251153
Validation F1-micro:	 0.8592617007251154
Validation F1-macro:	 0.653897068489025
Validation MRR:	 0.9160120486340003
Validation Ranking Loss:	 0.02633487145682268
Validation Coverage Error:	 1.2633487145682267
Validation Hit@1:	 0.8592617007251153
Validation Hit@3:	 0.9706657877389585
Validation Hit@10:	 1.0

Test Accuracy:	 0.8611997447351627
Test F1-micro:	 0.8611997447351627
Test F1-macro:	 0.6670316113069182
Test MRR:	 0.9168517843214722
Test Ranking Loss:	 0.02597319719208679
Test Coverage Error:	 1.259731971920868
Test Hit@1:	 0.8611997447351627
