In [1]:
import numpy as np
import pandas as pd
import random as rn
import os
import cne
import maxent
from scipy.stats import halfnorm
import utils
import joblib

In [2]:
SEED = 123
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
rn.seed(SEED)

print(f'CPU count: {joblib.cpu_count()}')

data = np.load(os.path.join('.','data','royalty_spouse.npz'))

CPU count: 4


In [3]:
train = data['X_train']
test = data['X_test']

train_exp = data['train_exp']
test_exp = data['test_exp']


full_train = np.concatenate((train,train_exp.reshape(-1,3)), axis=0)
full_test = np.concatenate((test,test_exp.reshape(-1,3)), axis=0)
full_data = np.concatenate((full_train,full_test), axis=0)

entities = data['entities'].tolist()
relations = data['relations'].tolist()

num_entities = len(entities)
num_relations = len(relations)

ent2idx = dict(zip(entities, range(num_entities)))
rel2idx = dict(zip(relations, range(num_relations)))

In [4]:
train2idx = utils.array2idx(full_train,ent2idx,rel2idx)
test2idx = utils.array2idx(test,ent2idx,rel2idx)

In [5]:
A = utils.get_adjacency_matrix(full_train,entities,num_entities)

In [6]:
embedding_dim = 50
s1 = 1
s2 = 1.5
learning_rate = .001
max_iter = 2
gamma = (1/(s1**2)) - (1/(s2**2))
top_k = 1

In [7]:
prior = maxent.BGDistr(A) 
prior.fit()

CNE = cne.ConditionalNetworkEmbedding(
    A=A,
    d=embedding_dim,
    s1=s1,
    s2=s2,
    prior_dist=prior
    )
CNE.fit(lr=learning_rate, max_iter=max_iter)

X = CNE._ConditionalNetworkEmbedding__emb

Epoch: 0, grad norm: 645.6384, obj: 112601.1830, obj smoothness: 112601.1830
Epoch: 1, grad norm: 644.9050, obj: 112375.4409, obj smoothness: 225.7422


In [8]:
def get_pij(i,j,s1,s2,prior):
    
    p_prior = prior.get_row_probability([i], [j])
    
    normal_s1 = halfnorm.rvs(loc=0,scale=s1,size=1,random_state=SEED)
    normal_s2 = halfnorm.rvs(loc=0,scale=s2,size=1,random_state=SEED)
    
    numerator = p_prior * normal_s1
    denom = numerator + (1-p_prior)*normal_s2
    
    return numerator/denom

def get_hessian(i,s1,s2,gamma,X,A,embedding_dim):
    
    hessian = np.zeros(shape=(embedding_dim,embedding_dim))

    for j in range(A.shape[0]):

        if i != j:

            x_i = X[i,:]
            x_j = X[j,:]

            x_diff = (x_i - x_j).reshape(-1,1)  
            
            prob = get_pij(i,j,s1,s2,prior)

            h = (gamma**2) * np.dot(x_diff,x_diff.T) * (prob * (1-prob))

            p_diff_mat = gamma * (prob - A[i,j])[0] * np.identity(h.shape[0])

            hessian += (p_diff_mat - h)
            
    return hessian

def explaiNE(i,j,k,l,s1,s2,embedding_dim,gamma,X,A):

    hessian = get_hessian(i=i,s1=s1,s2=s2,gamma=gamma,X=X,A=A,embedding_dim=embedding_dim)
    pij = get_pij(i=i,j=j,s1=s1,s2=s2,prior=prior)

    invert = (-hessian) / ((gamma**2 * (pij) * (1-pij)))

    hess_inv = np.linalg.inv(invert)

    x_i = X[i,:]
    x_j = X[j,:]
    x_k = X[k,:]
    x_l = X[l,:]

    xij_diff = (x_i - x_j).T

    xlk_diff = (x_l - x_k)

    return np.dot(np.dot(xij_diff, hess_inv), xlk_diff).squeeze()

In [9]:
def get_explanations(i,j,s1,s2,embedding_dim,gamma,X,A,top_k,train2idx):

    row,col = A.nonzero()

    neighbors = []

    for idx, l in enumerate(col):
        if l == i:

            if (i,j) != (row[idx],col[idx]):

                neighbors.append([row[idx],col[idx]])

    if len(neighbors) > top_k:

        temp = []

        for k,l in neighbors:

            score = explaiNE(i,j,k,l,s1,s2,embedding_dim,gamma,X,A)

            temp.append(((k,l),score))

        # temp = joblib.Parallel(n_jobs=-2,verbose=20)(joblib.delayed(loop)(i,j,
        #     k,l,s1,s2,embedding_dim,gamma,X,A, train2idx) for k,l in neighbors)

    else:

        temp = []

        for k,_,l in train2idx[0:5]:

            if (i,j) != (k,l):

                score = explaiNE(i,j,k,l,s1,s2,embedding_dim,gamma,X,A)

                temp.append(((k,l),score))

        # temp = joblib.Parallel(n_jobs=-2,verbose=20)(joblib.delayed(loop)(i,j,
        #     k,l,s1,s2,embedding_dim,gamma,X,A, train2idx) for k,_,l in train2idx if (i,j) != (k,l))

    sorted_scores = sorted(temp,key=lambda x:x[1], reverse=True)[0:top_k]

    explanation = [np.array(tup) for tup,_ in sorted_scores]

    return np.array(explanation)

In [10]:
#sorted_scores = sorted(temp,key=lambda x:x[1], reverse=True)[0:top_k]

In [11]:
explanations = joblib.Parallel(n_jobs=-2, verbose=20)(
    joblib.delayed(get_explanations)(i,j,s1,s2,embedding_dim,gamma,X,A,top_k,train2idx) for i,_,j in test2idx[0:2]
    )

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done   1 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-2)]: Done   2 out of   2 | elapsed:   15.5s remaining:    0.0s
[Parallel(n_jobs=-2)]: Done   2 out of   2 | elapsed:   15.5s finished


In [12]:
np.array(explanations).shape

(2, 1, 2)

In [13]:
#utils.jaccard_score(explanation,explanation)

In [14]:
utils.jaccard_score(np.array(explanations), np.array(explanations))

1.0

In [15]:
testexp2idx = utils.array2idx(test_exp,ent2idx,rel2idx)

a = np.concatenate([testexp2idx[:,:,0],testexp2idx[:,:,2]],axis=1).reshape(-1,1,2)

In [16]:
utils.jaccard_score(a[0:2],np.array(explanations))

0.0