In [1]:
import json
import random
import numpy as np
import os

# Set before the imports below so the variable is already in the environment
# when those libraries load.
# Presumably works around the "duplicate OpenMP runtime" abort seen with some
# numpy/torch builds — TODO confirm it is still required.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

import data_helper
from model import ZSBert
import torch
from torch.utils.data import DataLoader
from evaluation import extract_relation_emb, evaluate
from transformers import BertModel, BertConfig, BertPreTrainedModel, BertTokenizer
from tqdm import tqdm



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dataset directory (relative to the notebook) and the three files read from it.
wiki_zsl_data = "../Wiki-ZSL"
train_data_file, test_data_file, idx2property_file = (
    os.path.join(wiki_zsl_data, filename)
    for filename in ("train.json", "test.json", "idx2property.json")
)

In [3]:
# Load the raw train/test splits. JSON text is UTF-8, so the encoding is pinned
# rather than left to the platform's locale default.
with open(train_data_file, encoding="utf-8") as f:
    training_data = json.load(f)
with open(test_data_file, encoding="utf-8") as f:
    test_data = json.load(f)

# Relation id (kbID) of the first edge of every sample, in dataset order.
train_label = [sample['edgeSet'][0]['kbID'] for sample in training_data]
test_label = [sample['edgeSet'][0]['kbID'] for sample in test_data]

In [4]:
# Build the lookups used by the rest of the notebook:
#   property2idx - kbID -> label index
#   idx2property - presumably the inverse mapping (confirm in data_helper)
#   pid2vec      - kbID -> embedding vector
# The first argument is presumably the sentence-encoder checkpoint used to
# embed the property descriptions — TODO confirm.
property2idx, idx2property, pid2vec = data_helper.get_pid2vec(
    "bert-base-nli-mean-tokens",
    idx2property_file,
    prop_list_path="../resources/property_list.html",
)

In [5]:
# Inspect the kbID -> embedding-vector mapping returned by get_pid2vec.
# NOTE(review): this echoes every vector in the dict; a bounded preview
# (e.g. the key list or a single entry) would keep the output readable.
pid2vec

{'P17': array([ 7.22222626e-01,  7.57713974e-01,  1.23370051e+00,  9.94259194e-02,
         2.91103035e-01, -3.11170053e-02,  1.64269543e+00,  7.42031157e-01,
         6.65463060e-02,  2.12694645e-01, -1.12508070e+00,  2.94415981e-01,
         3.09791416e-02,  5.75223088e-01,  6.38105929e-01,  5.67683578e-01,
        -1.07704937e+00, -3.21253270e-01,  8.81626680e-02, -1.98970929e-01,
        -4.62135792e-01, -1.37115866e-01, -8.46041813e-02, -7.83924937e-01,
        -4.16538194e-02, -7.03685224e-01,  4.87179607e-01, -2.54537165e-01,
        -1.27528203e+00,  1.78782091e-01,  4.14860249e-01,  5.18724740e-01,
         2.17795536e-01,  8.84599313e-02,  1.23911011e+00,  2.47767106e-01,
         5.67845404e-01, -3.44364971e-01, -1.72932222e-01, -7.87971377e-01,
         2.29798555e-01,  4.67279911e-01,  3.26472819e-01, -3.72977438e-03,
         1.53317705e-01, -5.23135483e-01,  1.94101882e+00,  2.13896677e-01,
        -1.31592557e-01, -2.50735790e-01, -2.54464656e-01, -3.81814510e-01,
     

In [6]:
# Summarise the zero-shot split: distinct relations per side and their overlap.
print('there are {} kinds of relation in train.'.format(len(set(train_label))))
print('there are {} kinds of relation in test.'.format(len(set(test_label))))
# `&` is set intersection: relations present in BOTH splits (not the union),
# so the message names it accordingly.
print('number of relations shared by train and test: {}'.format(len(set(train_label) & set(test_label))))

# Number of sentences in each split.
print(len(training_data))
print(len(test_data))

there are 103 kinds of relation in train.
there are 5 kinds of relation in test.
number of union of train and test: 0
84634
4297


In [10]:
# Name of the pretrained checkpoint shared by the config, model and datasets.
t = "bert-base-multilingual-cased"

In [12]:
# Configure the encoder with one classification label per relation that
# occurs in the training split.
# NOTE(review): num_labels counts only relations present in train_label, while
# label indices come from property2idx. If those indices are not confined to
# [0, num_labels), a classification loss over num_labels classes would fail —
# confirm against ZSBert's loss and WikiDataset's label output.
bertconfig = BertConfig.from_pretrained(t,
                                        num_labels=len(set(train_label)),
                                        finetuning_task='wiki-zero-shot')
# Extra hyper-parameters attached to the config object
# (presumably read inside ZSBert — confirm in model.py).
bertconfig.margin = 7.5
bertconfig.alpha = 0.4
bertconfig.dist_func = "inner"
# Take the relation embedding size from the first precomputed vector;
# next(iter(...)) reads one value without copying them all into a list.
bertconfig.relation_emb_dim = next(iter(pid2vec.values())).shape[0]
model = ZSBert.from_pretrained(t, config=bertconfig)

Some weights of ZSBert were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'fclayer.bias', 'fclayer.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Pick the compute device in priority order: Apple MPS, then CUDA, then CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [14]:
# Move the model onto the selected device.
model = model.to(device)

In [15]:
# Wrap the training samples in the project's dataset class; `t` is the
# checkpoint name (presumably used for tokenisation — confirm in data_helper).
trainset = data_helper.WikiDataset(
    'train',
    training_data,
    pid2vec,
    property2idx,
    t,
)


In [25]:
# Shape of element 4 of the first training item
# (presumably the relation embedding — confirm in WikiDataset).
trainset[0][4].shape

(768,)

In [27]:
# Element 5 of the first training item
# (presumably the relation label index — confirm in WikiDataset).
trainset[0][5]

tensor(109)

In [28]:
# Label index of every test sentence, looked up from the kbID of its first edge.
test_y = [int(property2idx[sample['edgeSet'][0]['kbID']]) for sample in test_data]

# Position in test_data -> label index.
test_idxmap = dict(enumerate(test_y))

# One embedding row per distinct test relation.
# NOTE(review): rows follow set iteration order, which is not tied to the label
# indices and may differ between interpreter runs for string keys — confirm
# that downstream evaluation does not assume a fixed row order.
test_y_attr = np.array([pid2vec[pid] for pid in set(test_label)])
test_y = np.array(test_y)

print(test_y_attr.shape)
print(test_y.shape)

(5, 768)
(4297,)


In [36]:
# Re-check the shape of the test attribute matrix:
# one row per distinct test relation, one column per embedding dimension.
test_y_attr.shape

(5, 768)

In [30]:
# Peek at the per-sentence test label indices.
test_y

array([66, 66, 66, ..., 35, 35, 35])

In [32]:
# Wrap the test samples in the project's dataset class, using the same
# lookups and checkpoint name as the training set.
testset = data_helper.WikiDataset(
    'test',
    test_data,
    pid2vec,
    property2idx,
    t,
)

In [34]:
# Show the first test item exactly as WikiDataset produces it.
testset[0]

(tensor([  101, 10134, 10105, 14107, 17309, 10108, 10485, 13596, 10108, 87038,
         10106, 10105, 19643, 13847, 14780,   119,   102]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0]),
 tensor([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([ 1.03849530e+00, -2.61959434e-01,  1.64209473e+00,  8.92132521e-01,
         4.26776201e-01,  3.66336554e-01, -4.42709237e-01,  5.95278025e-01,
        -2.78065801e-02, -4.93847877e-02, -1.72438234e-01,  5.23893118e-01,
         6.65421903e-01,  9.79206622e-01, -4.03084636e-01,  3.24904174e-01,
        -3.89915198e-01, -8.66574228e-01,  7.45270133e-01, -4.15656954e-01,
        -1.71721637e-01,  6.17363453e-01,  6.14662886e-01, -9.88482893e-01,
        -8.17617238e-01, -1.18837094e+00, -3.44504923e-01, -8.80705118e-01,
        -1.37667194e-01,  4.15627239e-03,  8.33911151e-02, -5.50206780e-01,
         4.13720906e-01, -3.14869285e-01, -3.20940971e-01,  3.86314

In [35]:
# Peek at the per-sentence test label indices.
# NOTE(review): same inspection as an earlier cell; one of the two can be dropped.
test_y

array([66, 66, 66, ..., 35, 35, 35])