In [1]:
import os, sys, csv
import numpy as np
import pandas as pd
import pickle as pkl
from scipy import spatial
from sklearn.neighbors import NearestNeighbors
sys.path.insert(1, '../src')


In [2]:
# Load the experiment inputs: the preprocessed dataset, the per-fold train/test
# split, the entity-id mapping table and the learned embeddings.
# Each pickle is read inside a `with` block so its file handle is closed right
# after loading instead of being left for the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# NOTE(review): the paths are absolute and machine-specific; consider a
# configurable data directory.
with open("/media/HSGNN/dataset/dblp_preprocessed_dataset_V2.2.pkl", "rb") as dataset_file:
    dataset = pkl.load(dataset_file)
with open("/media/HSGNN/dataset/Train_Test_indices_V2.2.pkl", "rb") as split_file:
    train_test_idx = pkl.load(split_file)
# Every field is read as a string; the cells that consume this table convert
# the id columns to int themselves.
entityID_map = np.genfromtxt ('/media/HSGNN/dataset/V2_2/entity_id_mapping.csv', delimiter=",", dtype=str)
with open("/media/HSGNN/output/M=singlepaper_D=dblp_v86_S=128_F=epoch=9-val_micro_f1=0.34-val_acc=0.34-val_auroc=0.87.pkl", "rb") as embeddings_file:
    embeddings = pkl.load(embeddings_file)

In [3]:
def _id_map_for(entity_type):
    """Build {int(column 2): int(column 3)} from the mapping rows of one entity type.

    Row 0 of ``entityID_map`` is skipped (presumably a header row - confirm
    against the CSV). A row is kept when its column 1 equals ``entity_type``.
    """
    data_rows = entityID_map[1:]
    return {int(row[2]): int(row[3]) for row in data_rows if row[1] == entity_type}


# One lookup table per entity type found in column 1 of the mapping file.
entityID_map_paper = _id_map_for("paper")
entityID_map_author = _id_map_for("author")
entityID_map_skill = _id_map_for("term")
entityID_map_venue = _id_map_for("conf")

print("Number of paper mappings: ", len(entityID_map_paper))

Number of paper mappings:  10674


In [4]:
# Per-paper lookup tables keyed by the paper id stored in element 0 of each
# dataset record. Values are the column indices of the non-zero entries of
# element 1 (skills) and element 2 (authors).
# Assumes elements 1 and 2 are sparse matrices exposing .todense() - confirm.
paper_author, paper_skill = {}, {}
for entry in dataset:
    pid = entry[0]
    # nonzero() returns (row indices, column indices); only the columns are kept.
    skill_cols = entry[1].todense().nonzero()[1]
    author_cols = entry[2].todense().nonzero()[1]
    paper_author[pid] = author_cols
    paper_skill[pid] = skill_cols

In [6]:
# Experiment configuration shared by the evaluation cells below.

# Label written in the first CSV column and used to build the output file name.
method_name = "sgnn_128_attention"
# Upper bound on the number of predicted ids written per test paper
# (predictions are sliced with [:k_max]).
k_max = 100
# Number of folds; the evaluation cells iterate fold numbers 1..k_fold.
k_fold = 10
# Number of nearest training papers whose authors are pooled as predictions
# in the paper/paper similarity and KNN cells.
num_neighbor_papers = 50
# Number of nearest authors taken as predictions in the author-similarity cells.
num_neighbor_authors = 100

## Subgraph Embedding mode

In [7]:
## Paper embedding mode - paper/paper similarity mode
# For every test paper of every fold: rank the fold's training papers by cosine
# distance between paper embeddings, pool the authors of the closest
# `num_neighbor_papers` training papers (closest paper first), and write the
# first `k_max` pooled ids next to the paper's true author ids.
#
# CSV row layout: method name, total folds, fold number, number of predictions,
# number of true ids, computation time (always written as 0), the prediction
# ids, then the true ids.
#
# NOTE(review): every evaluation cell in this notebook writes to this same
# path, so running another mode overwrites these results.

result_output_name = "./output/{}_output.csv".format(method_name)
# newline='' is what the csv module asks for; without it some platforms emit
# blank lines between rows. The `with` block closes the file on exit.
with open(result_output_name, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(
        ['Method Name', '# Total Folds', '# Fold Number', '# Predictions', '# Truth', 'Computation Time (ms)',
         'Prediction Indices', 'True Indices'])

    for fold_counter in range(1, k_fold + 1):
        train_idx = train_test_idx[fold_counter]['Train']
        test_idx = train_test_idx[fold_counter]['Test']

        for target in test_idx:
            true_index = paper_author[target]
            # Looked up once per target instead of once per training paper.
            target_embedding = embeddings[entityID_map_paper[target]]

            distances = [
                (train_paper,
                 spatial.distance.cosine(target_embedding, embeddings[entityID_map_paper[train_paper]]))
                for train_paper in train_idx
            ]
            # Stable sort: equally distant papers keep their train_idx order.
            distances.sort(key=lambda tup: tup[1])

            # Slicing (rather than indexing range(num_neighbor_papers)) avoids an
            # IndexError when a fold has fewer training papers than requested.
            pred_index = []
            for neighbor in distances[:num_neighbor_papers]:
                pred_index.extend(paper_author[neighbor[0]])

            writer.writerow([method_name, k_fold, fold_counter, len(pred_index[:k_max]), len(true_index), 0] + pred_index[:k_max] + list(true_index))
        print("Fold {} processed.".format(fold_counter))


## Node Embedding mode

In [None]:
## Paper embedding mode - paper/author similarity mode
# For every test paper of every fold: rank all authors in entityID_map_author
# by cosine distance between the paper embedding and the author embedding, and
# write the ids of the closest `num_neighbor_authors` authors (cut to `k_max`)
# next to the paper's true author ids.
#
# CSV row layout: method name, total folds, fold number, number of predictions,
# number of true ids, computation time (always written as 0), the prediction
# ids, then the true ids.
#
# NOTE(review): predictions are keys of entityID_map_author while the true ids
# come from paper_author - confirm both use the same author id space.
# NOTE(review): every evaluation cell in this notebook writes to this same
# path, so running another mode overwrites these results.

result_output_name = "./output/{}_output.csv".format(method_name)
# newline='' is what the csv module asks for; without it some platforms emit
# blank lines between rows. The `with` block closes the file on exit.
with open(result_output_name, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(
        ['Method Name', '# Total Folds', '# Fold Number', '# Predictions', '# Truth', 'Computation Time (ms)',
         'Prediction Indices', 'True Indices'])

    for fold_counter in range(1, k_fold + 1):
        # Read for parity with the other modes; this mode ranks all authors and
        # does not use the training split.
        train_idx = train_test_idx[fold_counter]['Train']
        test_idx = train_test_idx[fold_counter]['Test']

        for target in test_idx:
            true_index = paper_author[target]
            # Looked up once per target instead of once per author.
            target_embedding = embeddings[entityID_map_paper[target]]

            distances = [
                (author_id, spatial.distance.cosine(target_embedding, embeddings[author_row]))
                for author_id, author_row in entityID_map_author.items()
            ]
            # Stable sort: equally distant authors keep the mapping's order.
            distances.sort(key=lambda tup: tup[1])

            # Slicing avoids an IndexError when fewer authors exist than requested.
            pred_index = [ranked[0] for ranked in distances[:num_neighbor_authors]]

            writer.writerow([method_name, k_fold, fold_counter, len(pred_index[:k_max]), len(true_index), 0] + pred_index[:k_max] + list(true_index))
        print("Fold {} processed.".format(fold_counter))

In [None]:
## Paper embedding mode - skills/author similarity mode
# For every test paper of every fold: represent the paper by the element-wise
# mean of the embeddings of its skills, rank all authors in entityID_map_author
# by cosine distance to that mean vector, and write the ids of the closest
# `num_neighbor_authors` authors (cut to `k_max`) next to the true author ids.
#
# CSV row layout: method name, total folds, fold number, number of predictions,
# number of true ids, computation time (always written as 0), the prediction
# ids, then the true ids.
#
# NOTE(review): every evaluation cell in this notebook writes to this same
# path, so running another mode overwrites these results.

result_output_name = "./output/{}_output.csv".format(method_name)
# newline='' is what the csv module asks for; without it some platforms emit
# blank lines between rows. The `with` block closes the file on exit.
with open(result_output_name, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(
        ['Method Name', '# Total Folds', '# Fold Number', '# Predictions', '# Truth', 'Computation Time (ms)',
         'Prediction Indices', 'True Indices'])

    for fold_counter in range(1, k_fold + 1):
        # Read for parity with the other modes; this mode ranks all authors and
        # does not use the training split.
        train_idx = train_test_idx[fold_counter]['Train']
        test_idx = train_test_idx[fold_counter]['Test']

        for target in test_idx:
            true_index = paper_author[target]

            # The mean skill embedding depends only on the target paper, so it is
            # computed once here rather than once per author.
            # NOTE(review): a paper with no skills leaves this mean undefined -
            # confirm the dataset guarantees at least one skill per paper.
            skill_centroid = np.mean(
                [embeddings[entityID_map_skill[skill]] for skill in paper_skill[target]], axis=0)

            distances = [
                (author_id, spatial.distance.cosine(skill_centroid, embeddings[author_row]))
                for author_id, author_row in entityID_map_author.items()
            ]
            # Stable sort: equally distant authors keep the mapping's order.
            distances.sort(key=lambda tup: tup[1])

            # Slicing avoids an IndexError when fewer authors exist than requested.
            pred_index = [ranked[0] for ranked in distances[:num_neighbor_authors]]

            writer.writerow([method_name, k_fold, fold_counter, len(pred_index[:k_max]), len(true_index), 0] + pred_index[:k_max] + list(true_index))
        print("Fold {} processed.".format(fold_counter))

## KNN

In [118]:
## Paper embedding mode
# For every test paper of every fold: find its nearest training papers with
# scikit-learn's NearestNeighbors over the paper embeddings, pool the authors
# of those neighbours in the order returned, and write the first `k_max`
# pooled ids next to the paper's true author ids.
# No metric is passed, so NearestNeighbors uses its default metric here, unlike
# the cosine-distance cells above.
#
# CSV row layout: method name, total folds, fold number, number of predictions,
# number of true ids, computation time (always written as 0), the prediction
# ids, then the true ids.
#
# NOTE(review): every evaluation cell in this notebook writes to this same
# path, so running another mode overwrites these results.

result_output_name = "./output/{}_output.csv".format(method_name)
# newline='' is what the csv module asks for; without it some platforms emit
# blank lines between rows. The `with` block closes the file on exit.
with open(result_output_name, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(
        ['Method Name', '# Total Folds', '# Fold Number', '# Predictions', '# Truth', 'Computation Time (ms)',
         'Prediction Indices', 'True Indices'])

    for fold_counter in range(1, k_fold + 1):
        train_idx = train_test_idx[fold_counter]['Train']
        test_idx = train_test_idx[fold_counter]['Test']

        # The training set is fixed for the whole fold, so the index is built
        # once per fold instead of being re-fit for every test paper.
        train = [embeddings[entityID_map_paper[train_paper]] for train_paper in train_idx]
        # kneighbors rejects n_neighbors larger than the number of fitted samples.
        neigh = NearestNeighbors(n_neighbors=min(num_neighbor_papers, len(train)))
        neigh.fit(train)

        for target in test_idx:
            true_index = paper_author[target]
            pred_index = []

            # Query with the *target* paper. The previous code indexed with `i`,
            # a leftover global from an earlier cell, so every test paper was
            # matched against the same query point (or the cell failed on a
            # fresh kernel).
            query = embeddings[entityID_map_paper[target]]
            # One query row in, one row of neighbour positions out; positions
            # index into `train`, which is aligned with train_idx.
            neighbor_rows = neigh.kneighbors([query], return_distance=False)[0]
            for neighbor_id in neighbor_rows:
                pred_index.extend(paper_author[train_idx[neighbor_id]])

            writer.writerow([method_name, k_fold, fold_counter, len(pred_index[:k_max]), len(true_index), 0] + pred_index[:k_max] + list(true_index))
        print("Fold {} processed.".format(fold_counter))

991005
221434
2064056
707628
2525642
3181175
1733050
2304689
493482
371060
991005
221434
2064056
707628
2525642
3181175
1733050
2304689
493482
371060
991005
221434
2064056
707628
2525642
3181175
1733050
2304689
493482
371060
991005
221434
2064056
707628
2525642
3181175
1733050
2304689
493482
371060
991005
221434
2064056
707628
2525642
3181175
1733050
2304689
493482
371060
991005
221434
2064056
707628
2525642
3181175
1733050
2304689
493482
371060
991005
221434
2064056
707628
2525642
3181175
1733050
2304689
493482
371060
991005
221434
2064056
707628
2525642
3181175
1733050
2304689
493482
371060
991005
221434
2064056
707628
2525642
3181175
1733050
2304689
493482
371060
991005
221434
2064056
707628
2525642
3181175
1733050
2304689
493482
371060
991005
221434
2064056
707628
2525642
3181175
1733050
2304689
493482
371060
991005
221434
2064056
707628
2525642
3181175
1733050
2304689
493482
371060
991005
221434
2064056
707628
2525642
3181175
1733050
2304689
493482
371060
991005
221434
2064056
707

KeyboardInterrupt: 