In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import random as rn
import os
import utils
import explaiNE
import joblib
import maxent
import cne

In [2]:
# --- Reproducibility -------------------------------------------------------
# Seed every RNG this cell controls so repeated runs are comparable.
SEED = 123
# NOTE: PYTHONHASHSEED is read once at interpreter start-up, so assigning it
# here does not change string hashing inside this kernel. The environment
# variable is still inherited by child processes started afterwards.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
rn.seed(SEED)

print(f'CPU count: {joblib.cpu_count()}')

# --- Load the dataset ------------------------------------------------------
# np.load on an .npz archive returns a lazily-read NpzFile that keeps the file
# open. Every array needed later is pulled out inside the `with` block so the
# handle is released as soon as loading is done.
with np.load(os.path.join('.','data','royalty_uncle.npz')) as data:
    # Triples to embed / to explain (rows are unpacked as 3 columns later on).
    train = data['X_train']
    test = data['X_test']

    # Ground-truth explanations that accompany the train / test triples.
    train_exp = data['train_exp']
    test_exp = data['test_exp']

    # Vocabulary of entity and relation labels, as plain Python lists.
    entities = data['entities'].tolist()
    relations = data['relations'].tolist()

NUM_ENTITIES = len(entities)
NUM_RELATIONS = len(relations)

# --- Label -> integer index lookups ----------------------------------------
ent2idx = dict(zip(entities, range(NUM_ENTITIES)))
rel2idx = dict(zip(relations, range(NUM_RELATIONS)))

# Index-encoded versions of the triples and of their explanations.
train2idx = utils.array2idx(train,ent2idx,rel2idx)
test2idx = utils.array2idx(test,ent2idx,rel2idx)

trainexp2idx = utils.array2idx(train_exp,ent2idx,rel2idx)
testexp2idx = utils.array2idx(test_exp,ent2idx,rel2idx)

# --- Adjacency matrix ------------------------------------------------------
# The graph is built from the training triples plus every training explanation
# triple; explanations are flattened to rows of 3 columns before stacking.
adjacency_data = np.concatenate((train,train_exp.reshape(-1,3)), axis=0)

A = utils.get_adjacency_matrix(adjacency_data,entities,NUM_ENTITIES)

CPU count: 4


In [3]:
# --- Hyper-parameters ------------------------------------------------------
EMBEDDING_DIM = 50      # passed as `d` to the embedding model
S1 = 1                  # passed as `s1` to the embedding model
S2 = 1.5                # passed as `s2` to the embedding model
LEARNING_RATE = .001
MAX_ITER = 100
# Difference of the inverse squares of S1 and S2; consumed by the explaiNE helpers.
GAMMA = (1/(S1**2)) - (1/(S2**2))
TOP_K = 4               # forwarded to explaiNE.get_explanations

# --- Reduce explanation triples to column pairs ----------------------------
# Keep only columns 0 and 2 of the last axis (presumably the two entity
# indices, dropping the relation column -- confirm against utils.array2idx).
#
# The arrays are overwritten in place, so without a guard a second execution
# of this cell would index column 2 of an axis that now has only 2 columns and
# raise IndexError. The length check makes re-running the cell a no-op while
# leaving the first run unchanged.
if trainexp2idx.shape[-1] > 2:
    trainexp2idx = trainexp2idx[:,:,[0,2]]

if testexp2idx.shape[-1] > 2:
    testexp2idx = testexp2idx[:,:,[0,2]]

In [4]:
# Build and fit the prior object on the adjacency matrix; it is handed to the
# embedding model as `prior_dist` and reused by the explanation step.
prior = maxent.BGDistr(A)
prior.fit()

# Configure the embedding model, then train it with the settings chosen above.
CNE = cne.ConditionalNetworkEmbedding(A=A, d=EMBEDDING_DIM, s1=S1, s2=S2, prior_dist=prior)
CNE.fit(lr=LEARNING_RATE, max_iter=MAX_ITER)

# The fitted embedding is read from the class-private `__emb` attribute, which
# Python exposes under its name-mangled form.
# NOTE(review): this depends on cne internals -- prefer a public accessor if
# the library offers one.
X = getattr(CNE, '_ConditionalNetworkEmbedding__emb')

Epoch: 0, grad norm: 357.1649, obj: 12319.4804, obj smoothness: 12319.4804
Epoch: 1, grad norm: 356.8296, obj: 12302.2487, obj smoothness: 17.2317
Epoch: 2, grad norm: 356.4944, obj: 12294.9881, obj smoothness: 7.2605
Epoch: 3, grad norm: 356.1594, obj: 12286.2510, obj smoothness: 8.7372
Epoch: 4, grad norm: 355.8245, obj: 12279.0789, obj smoothness: 7.1721
Epoch: 5, grad norm: 355.4897, obj: 12272.1230, obj smoothness: 6.9559
Epoch: 6, grad norm: 355.1552, obj: 12265.6537, obj smoothness: 6.4693
Epoch: 7, grad norm: 354.8210, obj: 12237.1648, obj smoothness: 28.4889
Epoch: 8, grad norm: 354.4869, obj: 12229.7210, obj smoothness: 7.4438
Epoch: 9, grad norm: 354.1532, obj: 12222.6829, obj smoothness: 7.0381
Epoch: 10, grad norm: 353.8197, obj: 12214.8670, obj smoothness: 7.8160
Epoch: 11, grad norm: 353.4864, obj: 12196.0775, obj smoothness: 18.7895
Epoch: 12, grad norm: 353.1533, obj: 12187.3025, obj smoothness: 8.7750
Epoch: 13, grad norm: 352.8205, obj: 12180.3506, obj smoothness: 6.

In [5]:
# (A variant that rebuilt `A` from the `test` triples used to sit here; it is
#  disabled, so the adjacency matrix from the data-loading cell is used.)

# One task per entity index, run in parallel. n_jobs=-2 means "all CPUs but one".
PROBS = np.array(
    joblib.Parallel(n_jobs=-2, verbose=0)(
        joblib.delayed(explaiNE.compute_prob)(i, S1, S2, X, NUM_ENTITIES, prior, SEED)
        for i in range(NUM_ENTITIES)
    )
)

# Per-entity Hessians; these need the PROBS array computed just above.
# verbose=20 makes joblib print progress lines.
HESSIANS = np.array(
    joblib.Parallel(n_jobs=-2, verbose=20)(
        joblib.delayed(explaiNE.get_hessian)(i, S1, S2, GAMMA, X, A, EMBEDDING_DIM, PROBS, SEED)
        for i in range(NUM_ENTITIES)
    )
)

# Flatten the test explanations to rows of 2 columns and drop duplicate rows.
ITER_DATA = np.unique(
    testexp2idx.reshape(-1, 2),
    axis=0,
)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-2)]: Batch computation too fast (0.0245s.) Setting batch_size=16.
[Parallel(n_jobs=-2)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-2)]: Done   3 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-2)]: Done   4 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-2)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-2)]: Done   6 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-2)]: Done  22 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-2)]: Done  38 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-2)]: Done  54 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-2)]: Done  70 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-2)]: Done  86 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-2)]: Done 102 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-2)]: Done 118 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-2)]: Done 134 ta

In [6]:
# For every test row, take columns 0 and 2 (the middle column is ignored) and
# ask explaiNE for its explanations, passing TOP_K and the ITER_DATA rows.
# Tasks run in parallel on all CPUs but one.
explanations = np.array(
    joblib.Parallel(n_jobs=-2, verbose=0)(
        joblib.delayed(explaiNE.get_explanations)(
            first_idx, third_idx,
            S1, S2, EMBEDDING_DIM, GAMMA,
            X, TOP_K, ITER_DATA, HESSIANS, PROBS, SEED,
        )
        for first_idx, _, third_idx in test2idx
    )
)

In [7]:
# Compare the ground-truth test explanations with the predicted ones using the
# project's Jaccard helper. How per-sample scores are aggregated is defined in
# utils.jaccard_score, which is not visible here.
jaccard = utils.jaccard_score(testexp2idx,explanations)

In [8]:
# Bare expression so the notebook displays the score computed in the previous cell.
jaccard

0.18558558558558563

In [9]:
# scores = []

# for i in range(len(testexp2idx)):
    
#     true_i = testexp2idx[i][np.argsort(testexp2idx[i][:, 0])]
#     pred_i = explanations[i][np.argsort(explanations[i][:, 0])]

#     num_true_traces = true_i.shape[0]
#     num_pred_traces = pred_i.shape[0]

#     bool_array = (pred_i == true_i)

#     count = 0
#     for row in bool_array:
#         if row.all():
#             count += 1

#     score = count / (num_true_traces + num_pred_traces-count)
    
#     scores.append(score)

In [10]:
#testexp2idx[0]
#'Zaitao' in np.unique(np.concatenate((train[:,0], train[:,2]),axis=0))

In [11]:
# for i in explanations[0][np.argsort(testexp2idx[0][:, 0])]:
#     if i in testexp2idx[0][np.argsort(testexp2idx[0][:, 0])]:
#         print(i)

In [12]:
#testexp2idx[0][np.argsort(testexp2idx[0][:, 0])]

In [13]:
# indices = [idx for idx,i in enumerate(scores) if i]

# for i in indices:
    
#     true_i = np.sort(testexp2idx[i], axis=0)
#     pred_i = np.sort(explanations[i], axis=0)
    
#     print(true_i)

In [14]:
# count = 0
# for i,_,j in test2idx:
#     temp = []

#     for k,l in testexp2idx.reshape(-1,2):
        
#         score = explaiNE.explaiNE(i,j,k,l,S1,S2,EMBEDDING_DIM,GAMMA,X,HESSIANS,PROBS,SEED)
        
#         temp.append(((k,l),score))

#     sorted_scores = sorted(list(set(temp)),key=lambda x:x[1], reverse=True)#[0:4]
#     count +=1
#     if count == 3:
#         break
# #print(sorted_scores)