In [1]:
import numpy as np
import pandas as pd
import random as rn
import os
import cne
import maxent
import utils
import joblib
import explaiNE
import pickle

In [None]:
# One shared seed for every random number generator this notebook touches.
SEED = 123
rn.seed(SEED)
np.random.seed(SEED)
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# this assignment most likely only reaches processes spawned afterwards (for
# example joblib workers), not the already-running kernel - confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)

# Report how many CPUs joblib detects before any parallel work is started.
print(f'CPU count: {joblib.cpu_count()}')

# Open the spouse dataset archive; later cells read its arrays by key.
data = np.load(os.path.join('.', 'data', 'royalty_spouse.npz'))

In [None]:
# --- Raw triples and their explanations, read from the loaded archive ---
train, test = data['X_train'], data['X_test']
train_exp, test_exp = data['train_exp'], data['test_exp']

# Training triples followed by every training-explanation triple, with the
# explanations flattened to rows of three columns.
full_train = np.concatenate((train, train_exp.reshape(-1, 3)), axis=0)

# --- Vocabularies: a name's position in the stored list is its integer id ---
entities = data['entities'].tolist()
relations = data['relations'].tolist()

NUM_ENTITIES = len(entities)
NUM_RELATIONS = len(relations)

ent2idx = {entity: idx for idx, entity in enumerate(entities)}
rel2idx = {relation: idx for idx, relation in enumerate(relations)}

# --- Index-encoded copies (the encoding itself is delegated to utils.array2idx) ---
train2idx = utils.array2idx(train, ent2idx, rel2idx)
test2idx = utils.array2idx(test, ent2idx, rel2idx)

# Encoded train and test triples stacked into one array.
data2idx = np.concatenate([train2idx, test2idx], axis=0)

trainexp2idx = utils.array2idx(train_exp, ent2idx, rel2idx)
testexp2idx = utils.array2idx(test_exp, ent2idx, rel2idx)

# --- Adjacency matrix built from train + training explanations + test ---
# NOTE(review): the test triples are part of the adjacency input; confirm that
# this is intended for the downstream explanation step.
adjacency_data = np.concatenate(
    (train, train_exp.reshape(-1, 3), test), axis=0)

A = utils.get_adjacency_matrix(adjacency_data, entities, NUM_ENTITIES)

In [None]:
# Reduce every encoded explanation triple to a pair made of its first and last
# column, shaped (-1, 1, 2).
# Assumes the encoded columns are (head, relation, tail), mirroring the raw
# triples - TODO confirm against utils.array2idx.
#
# Columns 0 and 2 are selected together so the two values of the *same* triple
# stay adjacent. Concatenating the two column slices along axis 1 and reshaping
# afterwards gives the same result only when each explanation holds exactly one
# triple; with more triples it would pair heads with heads and tails with tails.
#
# NOTE: this cell overwrites its own inputs, so it must run exactly once after
# the cell that builds trainexp2idx/testexp2idx. A second run fails because the
# last axis then has only two columns and index 2 is out of bounds.
trainexp2idx = trainexp2idx[:, :, [0, 2]].reshape(-1, 1, 2)

testexp2idx = testexp2idx[:, :, [0, 2]].reshape(-1, 1, 2)

In [None]:
from explaiNE import get_explanations

In [None]:
# Run get_explanations in parallel, one task per triple of the selected slice.
# Each triple contributes its first and last column (head and tail index); the
# middle column is ignored.
#
# NOTE(review): S1, S2, EMBEDDING_DIM, GAMMA, X, TOP_K, HESSIANS and PROBS are
# not assigned in any cell visible here; they must already exist in the kernel
# before this cell runs - TODO confirm where they are defined.
explain_pair = joblib.delayed(get_explanations)

# n_jobs=-2: per the joblib documentation this uses all CPUs but one.
explanations = joblib.Parallel(n_jobs=-2, verbose=0)(
    explain_pair(
        head, tail, S1, S2, EMBEDDING_DIM, GAMMA, X, A, TOP_K, train2idx,
        HESSIANS, PROBS, seed=SEED
    )
    # Only the first ten training triples are processed; the author's original
    # trailing note names test2idx as the alternative input.
    for head, _, tail in train2idx[0:10]
)

In [None]:
# Turn the list of per-triple results returned by the parallel run into one array.
explanations = np.asarray(explanations)

In [None]:
# Parse the recorded spouse traces file. utils.parse_ttl hands back two values,
# kept here as `trip` and `trace` (presumably the triples and their traces -
# TODO confirm against utils.parse_ttl).
spouse_ttl = os.path.join('.', 'data', 'traces', 'spouse.ttl')
trip, trace = utils.parse_ttl(file_name=spouse_ttl, max_padding=1)

In [8]:
# Switch to the uncle dataset.
# NOTE: this rebinds data/train/test/train_exp/test_exp, which earlier cells
# filled from the spouse archive. Values derived earlier (train2idx, A, ...)
# are not recomputed here and still describe the spouse data.
data = np.load(os.path.join('.', 'data', 'royalty_uncle.npz'))

train, test = data['X_train'], data['X_test']
train_exp, test_exp = data['train_exp'], data['test_exp']

In [10]:
# Show the explanation stored for the first test triple of the loaded dataset.
print(test_exp[0])

[['Pōmare_III' 'parent' 'Pōmare_II']
 ['Pōmare_III' 'gender' '"male"@en']
 ['Pōmare_IV' 'parent' 'Pōmare_II']
 ['Teriitapunui_Pōmare' 'parent' 'Pōmare_IV']]


In [6]:
# Hand-written explanation: the same four triples that the recorded output
# shows for test_exp[0], listed in a different order.
a = np.array([
    ['Pōmare_IV', 'parent', 'Pōmare_II'],
    ['Pōmare_III', 'parent', 'Pōmare_II'],
    ['Teriitapunui_Pōmare', 'parent', 'Pōmare_IV'],
    ['Pōmare_III', 'gender', '"male"@en'],
])
# Two identical copies stacked into a batch of shape (2, 4, 3).
a = np.stack((a, a), axis=0)
a

array([[['Pōmare_IV', 'parent', 'Pōmare_II'],
        ['Pōmare_III', 'parent', 'Pōmare_II'],
        ['Teriitapunui_Pōmare', 'parent', 'Pōmare_IV'],
        ['Pōmare_III', 'gender', '"male"@en']],

       [['Pōmare_IV', 'parent', 'Pōmare_II'],
        ['Pōmare_III', 'parent', 'Pōmare_II'],
        ['Teriitapunui_Pōmare', 'parent', 'Pōmare_IV'],
        ['Pōmare_III', 'gender', '"male"@en']]], dtype='<U19')

In [7]:
# Score the first two test explanations against the hand-built batch `a`.
# The scoring itself is delegated to utils.jaccard_score, whose exact
# semantics are not shown in this notebook.
utils.jaccard_score(test_exp[0:2],a)

0.5

In [None]:
# def train_test_split_no_unseen(X, exp,test_size=100, seed=0, allow_duplication=False, filtered_test_predicates=None):
#     """Split into train and test sets.
#      This function carves out a test set that contains only entities
#      and relations which also occur in the training set.
#     Parameters
#     ----------
#     X : ndarray, size[n, 3]
#         The dataset to split.
#     exp : ndarray, first axis of size n
#         Array that is indexed with the same row indices as X, so it is split
#         in exactly the same way (presumably the per-triple explanations).
#     test_size : int, float
#         If int, the number of triples in the test set.
#         If float, the percentage of total triples.
#     seed : int
#         A random seed used to split the dataset.
#     allow_duplication: boolean
#         Flag to indicate if the test set can contain duplicated triples.
#     filtered_test_predicates: None, list
#         If None, all predicate types will be considered for the test set.
#         If list, only the predicate types in the list will be considered for
#         the test set.
#     Returns
#     -------
#     X_train : ndarray, size[n, 3]
#         The training set.
#     X_test : ndarray, size[n, 3]
#         The test set.
#     exp_train : ndarray
#         The rows of exp selected by the training indices.
#     exp_test : ndarray
#         The rows of exp selected by the test indices.
#     Examples
#     --------
#     >>> import numpy as np
#     >>> from ampligraph.evaluation import train_test_split_no_unseen
#     >>> # load your dataset to X
#     >>> X = np.array([['a', 'y', 'b'],
#     >>>               ['f', 'y', 'e'],
#     >>>               ['b', 'y', 'a'],
#     >>>               ['a', 'y', 'c'],
#     >>>               ['c', 'y', 'a'],
#     >>>               ['a', 'y', 'd'],
#     >>>               ['c', 'y', 'd'],
#     >>>               ['b', 'y', 'c'],
#     >>>               ['f', 'y', 'e']])
#     >>> # if you want to split into train/test datasets
#     >>> X_train, X_test = train_test_split_no_unseen(X, test_size=2)
#     >>> X_train
#     array([['a', 'y', 'b'],
#         ['f', 'y', 'e'],
#         ['b', 'y', 'a'],
#         ['c', 'y', 'a'],
#         ['c', 'y', 'd'],
#         ['b', 'y', 'c'],
#         ['f', 'y', 'e']], dtype='<U1')
#     >>> X_test
#     array([['a', 'y', 'c'],
#         ['a', 'y', 'd']], dtype='<U1')
#     >>> # if you want to split into train/valid/test datasets, call it 2 times
#     >>> X_train_valid, X_test = train_test_split_no_unseen(X, test_size=2)
#     >>> X_train, X_valid = train_test_split_no_unseen(X_train_valid, test_size=2)
#     >>> X_train
#     array([['a', 'y', 'b'],
#         ['b', 'y', 'a'],
#         ['c', 'y', 'd'],
#         ['b', 'y', 'c'],
#         ['f', 'y', 'e']], dtype='<U1')
#     >>> X_valid
#     array([['f', 'y', 'e'],
#         ['c', 'y', 'a']], dtype='<U1')
#     >>> X_test
#     array([['a', 'y', 'c'],
#         ['a', 'y', 'd']], dtype='<U1')
#     """

#     if type(test_size) is float:
#         test_size = int(len(X) * test_size)

#     rnd = np.random.RandomState(seed)

#     subs, subs_cnt = np.unique(X[:, 0], return_counts=True)
#     objs, objs_cnt = np.unique(X[:, 2], return_counts=True)
#     rels, rels_cnt = np.unique(X[:, 1], return_counts=True)
#     dict_subs = dict(zip(subs, subs_cnt))
#     dict_objs = dict(zip(objs, objs_cnt))
#     dict_rels = dict(zip(rels, rels_cnt))

#     idx_test = np.array([], dtype=int)

#     loop_count = 0
#     tolerance = len(X) * 10
#     # Set the indices of test set triples. If filtered, reduce candidate triples to certain predicate types.
#     if filtered_test_predicates:
#         test_triples_idx = np.where(np.isin(X[:, 1], filtered_test_predicates))[0]
#     else:
#         test_triples_idx = np.arange(len(X))

#     while idx_test.shape[0] < test_size:
#         i = rnd.choice(test_triples_idx)
#         if dict_subs[X[i, 0]] > 1 and dict_objs[X[i, 2]] > 1 and dict_rels[X[i, 1]] > 1:
#             dict_subs[X[i, 0]] -= 1
#             dict_objs[X[i, 2]] -= 1
#             dict_rels[X[i, 1]] -= 1
#             if allow_duplication:
#                 idx_test = np.append(idx_test, i)
#             else:
#                 idx_test = np.unique(np.append(idx_test, i))

#         loop_count += 1

#         # in case can't find solution
#         if loop_count == tolerance:
#             if allow_duplication:
#                 raise Exception("Cannot create a test split of the desired size. "
#                                 "Some entities will not occur in both training and test set. "
#                                 "Change seed values, remove filter on test predicates or set "
#                                 "test_size to a smaller value.")
#             else:
#                 raise Exception("Cannot create a test split of the desired size. "
#                                 "Some entities will not occur in both training and test set. "
#                                 "Set allow_duplication=True,"
#                                 "change seed values, remove filter on test predicates or "
#                                 "set test_size to a smaller value.")


#     idx = np.arange(len(X))
#     idx_train = np.setdiff1d(idx, idx_test)

#     return X[idx_train, :], X[idx_test, :], exp[idx_train,:], exp[idx_test,:]