In [None]:
import csv
from datetime import datetime
import time
import random
import pickle
import pandas as pd
import zipfile

import numpy as np
import matplotlib.pyplot as plt
from scipy import sparse
from scipy.stats import weightedtau, kendalltau
from scipy.stats import norm
from scipy.linalg import null_space

import matplotlib.pyplot as plt

from pprint import pprint
from copy import deepcopy

import trueskill
import io

In [None]:
# Download the DBLP citation dataset (v10) from:
#
# https://lfs.aminer.cn/lab-datasets/citation/dblp.v10.zip
# and place it at the path configured in `zip_path` below.

zip_path = './data/dblp.v10.zip'


def _read_json_lines(archive, member):
    """Parse one JSON-lines member of an already-open zip archive into a DataFrame."""
    with archive.open(member) as f:
        # The member is read fully into memory and handed to pandas as a byte buffer.
        return pd.read_json(io.BytesIO(f.read()), lines=True)


with zipfile.ZipFile(zip_path, 'r') as z:
    # Only the .json members of the archive are of interest.
    json_files = [name for name in z.namelist() if name.endswith('.json')]

    # The unpacking below expects exactly four parts; fail loudly otherwise.
    if len(json_files) != 4:
        raise ValueError("There are not exactly four JSON files in the zip.")

    dataframes = [_read_json_lines(z, json_file) for json_file in json_files]

df, df2, df3, df4 = dataframes

In [None]:
# Keep only papers published at these venues.
# Names are lower-case because `filter_venues` compares against the
# lower-cased `venue` column.
conferences = [
    "neural information processing systems",
    "international conference on machine learning",
    "knowledge discovery and data mining",
    "international joint conference on artificial intelligence",
    "uncertainty in artificial intelligence",
    # "conference on uncertainty in artificial intelligence",
    "international conference on learning representations",
    "computational learning theory",
]


def filter_venues(df, venues=None):
    """Return the rows of ``df`` whose lower-cased venue is in ``venues``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string-valued ``venue`` column.
    venues : iterable of str, optional
        Lower-case venue names to keep. Defaults to the module-level
        ``conferences`` list.

    Returns
    -------
    pandas.DataFrame
        A new frame (not a view of ``df``) holding the matching rows, with
        ``venue`` lower-cased. The input frame is left untouched, so the
        call is idempotent and does not trigger SettingWithCopy warnings
        downstream.
    """
    if venues is None:
        venues = conferences
    venue_lower = df['venue'].str.lower()
    keep = venue_lower.isin(venues)
    # `assign` returns a copy; the lower-cased values replace `venue` in place
    # (same column position) only in the returned frame.
    return df.loc[keep].assign(venue=venue_lower[keep])


# Restrict every part of the dataset to the selected venues and stack them.
filtered_dfs = [filter_venues(frame) for frame in (df, df2, df3, df4)]
combined_filtered_df = pd.concat(filtered_dfs, ignore_index=True)

# Drop papers without an author list. The index is reset so that row labels
# stay 0..len-1: later cells look rows up by the running hyperedge number, and
# gaps left by the dropped rows would make those lookups hit the wrong paper
# (or raise KeyError).
combined_filtered_df = (
    combined_filtered_df[combined_filtered_df['authors'].notna()]
    .reset_index(drop=True)
)


In [None]:
def is_alphabetically_sorted(authors_list):
    """Tell whether ``authors_list`` is non-empty and already in sorted order.

    An empty (or otherwise falsy) input yields ``False``. The comparison is
    against ``sorted(authors_list)``, i.e. plain case-sensitive string order.
    """
    return bool(authors_list) and authors_list == sorted(authors_list)


def assign_weights(authors):
    """Return one positional weight per author of a paper.

    * Empty / missing author list -> ``[]`` (previously this raised
      ``IndexError`` when writing ``weights[0]``).
    * Alphabetically ordered list -> weight 1 for everybody, since the order
      then carries no contribution signal. A single-author list is always
      "sorted", so it also gets ``[1]``; the former dedicated single-author
      branch could never be reached and has been removed.
    * Otherwise -> first and last author get weight 2, everyone else 1.

    Parameters
    ----------
    authors : list of str
        Author names in byline order.

    Returns
    -------
    list of int
        Weights aligned with ``authors``.
    """
    if not authors:
        return []

    weights = [1] * len(authors)
    if is_alphabetically_sorted(authors):
        return weights

    weights[0] = 2   # first author
    weights[-1] = 2  # last author
    return weights

        
def assign_uni_weights(authors):
    """Return a uniform weight of 1 for every author in ``authors``."""
    return [1 for _ in range(len(authors))]



# One hyperedge per paper, stored as (authors, weights). Both lists keep
# exactly one entry per row of `combined_filtered_df`, in row order, because
# later cells pair entry i with row i.
citation_list = []      # vertex-dependent weights (assign_weights)
citation_list_ind = []  # vertex-independent weights (assign_uni_weights)
for authors in combined_filtered_df['authors']:
    if isinstance(authors, list):
        weights = assign_weights(authors)
        uni_weights = assign_uni_weights(authors)
        citation_list.append((authors, weights))
        citation_list_ind.append((authors, uni_weights))
    else:
        # Not a list: keep an *empty* placeholder so positions stay aligned
        # with the DataFrame rows. Storing the raw value would pollute the
        # universe below (e.g. a str would contribute its characters) and
        # later index into an empty weight list.
        citation_list.append(([], []))
        citation_list_ind.append(([], []))


# Universe = set of all distinct author names appearing in any hyperedge.
universe = set()
for authors, _ in citation_list:
    universe.update(authors)

In [None]:
pi_list = citation_list
universe = np.array(list(universe))
# Build the two incidence-style matrices:
#   R = |E| x |V|, R(e, v) = lambda_e(v)   (author weight inside paper e, row-normalised)
#   W = |V| x |E|, W(v, e) = w(e) 1(v in e) (paper weight for each of its authors)

m = len(pi_list)  # number of hyperedges
n = len(universe)  # number of items to be ranked
R = np.zeros([m, n])
W = np.zeros([n, m])

# Name -> column lookup built once, instead of scanning `universe` with
# np.where for every author of every paper.
author_index = {name: idx for idx, name in enumerate(universe.tolist())}

# Per-paper value used as hyperedge weight, read *by position* so that it
# stays aligned with pi_list even if the DataFrame index has gaps.
# NOTE(review): column 2 is assumed to be the citation count - confirm
# against the loaded schema.
citations = combined_filtered_df.iloc[:, 2].to_numpy()
assert len(citations) == m, "pi_list and combined_filtered_df are out of sync"

for i, (pi, scores) in enumerate(pi_list):
    # Papers with fewer than two authors do not form a hyperedge.
    if len(pi) > 1:
        edge_weight = citations[i] + 1  # citation + 1
        for j, author in enumerate(pi):
            v = author_index[author]
            R[i, v] = scores[j]
            W[v, i] = edge_weight

        # Normalise the author weights of this paper to sum to 1.
        R[i, :] /= R[i, :].sum()

# Missing citation counts propagate as NaN; treat them as weight 0.
W = np.nan_to_num(W, nan=0.0)

# Row sums of W (total hyperedge weight per author).
sum_W = W.sum(axis=1)
# Sanity check: rows whose sum is 0 or NaN; zero sums are replaced by 1 below
# so the division leaves those rows at 0.
zero_sum_rows = np.where(sum_W == 0)[0]
nan_sum_rows = np.where(np.isnan(sum_W))[0]

sum_W_corrected = sum_W.copy()
sum_W_corrected[sum_W_corrected == 0] = 1

# Row-normalise W.
Wnorm = W / sum_W_corrected[:, None]

Ws = sparse.csr_matrix(Wnorm)
Rs = sparse.csr_matrix(R)

# Probability transition matrix, transposed so that P[v, w] = Prob(w -> v).
P = Ws.dot(Rs).T

# Restart probability used for PageRank.
r = 0.40


# COMPUTE PAGERANK
##################################################

# given probability transition matrix P
# where P_{v,w} = Prob(w -> v)
# find pagerank scores with restart probability r
def compute_pr(P, r, n, eps=1e-8, min_iter=100, max_iter=10_000):
    """PageRank by power iteration with a uniform restart vector.

    Iterates ``x <- (1 - r) * P @ x + r / n`` starting from the uniform
    distribution.

    Parameters
    ----------
    P : scipy sparse matrix or 2-D ndarray, shape (n, n)
        Transition matrix with ``P[v, w] = Prob(w -> v)``.
    r : float
        Restart probability.
    n : int
        Number of nodes.
    eps : float, optional
        L1 tolerance between successive iterates.
    min_iter : int, optional
        The iteration counter must exceed this value before the loop may
        stop (the default of 100 keeps the historical behaviour).
    max_iter : int, optional
        Hard cap on the number of iterations.

    Returns
    -------
    numpy.ndarray
        The score vector after convergence.

    Raises
    ------
    RuntimeError
        If the iterate becomes non-finite (e.g. NaN in ``P``) or no
        convergence is reached within ``max_iter`` iterations. Previously
        both situations resulted in an endless loop.
    """
    # Loop-invariant pieces: damped matrix and restart mass per node.
    damped = (1 - r) * P
    restart = np.ones(n) * r / n

    x = np.ones(n) / n * 1.0
    for t in range(max_iter):
        # `@` is a matrix-vector product for sparse and dense inputs alike.
        x_new = damped @ x + restart
        delta = np.linalg.norm(x_new - x, ord=1)
        if not np.isfinite(delta):
            raise RuntimeError("PageRank iterate became non-finite; check P for NaN/inf entries")
        if delta < eps and t > min_iter:
            return x_new
        x = x_new
    raise RuntimeError(f"PageRank did not converge within {max_iter} iterations")





# PageRank scores for the vertex-dependent hypergraph (pi_list = citation_list),
# flattened to a 1-D vector aligned with `universe`.
rankings_hg = compute_pr(P=P, r=r, n=n, eps=1e-8).flatten()

In [None]:
# NOTE(review): scores are not re-normalised to sum to 1 here (the
# normalisation line was commented out in the original cell).
assert len(rankings_hg) == len(universe), "rankings_hg and universe have diff length！"

# One row per author with the corresponding PageRank score.
ranking_df = pd.DataFrame({
    'Name': universe,
    'PageRank_Score': rankings_hg
})

# Highest score first; after reset_index the index is the zero-based rank.
ranking_df_sorted = ranking_df.sort_values(by='PageRank_Score', ascending=False).reset_index(drop=True)

# names in the literature
bignames = ['Richard Socher', 'Zhongzhi Shi', 'Daniel Rueckert',
            'Lars Schmidt-Thieme', 'Tat-Seng Chua', 'Ian J. Goodfellow']

# Work on an explicit copy: adding a column to a filtered slice triggers
# SettingWithCopyWarning and may not write where intended.
bignames_only_hd = ranking_df_sorted[ranking_df_sorted['Name'].isin(bignames)].copy()

# Zero-based position in the sorted ranking.
bignames_only_hd["H_D Rank"] = bignames_only_hd.index

bignames_only_hd = bignames_only_hd.set_index("Name")

bignames_only_hd

In [None]:
# Vertex-independent variant: same construction as for `citation_list`, but
# using the uniform per-author weights stored in `citation_list_ind`.
pi_list = citation_list_ind
universe = np.array(list(universe))
# Build the two incidence-style matrices:
#   R = |E| x |V|, R(e, v) = lambda_e(v)   (author weight inside paper e, row-normalised)
#   W = |V| x |E|, W(v, e) = w(e) 1(v in e) (paper weight for each of its authors)

m = len(pi_list)  # number of hyperedges
n = len(universe)  # number of items to be ranked
R = np.zeros([m, n])
W = np.zeros([n, m])

# Name -> column lookup built once, instead of scanning `universe` with
# np.where for every author of every paper.
author_index = {name: idx for idx, name in enumerate(universe.tolist())}

# Per-paper value used as hyperedge weight, read *by position* so that it
# stays aligned with pi_list even if the DataFrame index has gaps.
# NOTE(review): column 2 is assumed to be the citation count - confirm
# against the loaded schema.
citations = combined_filtered_df.iloc[:, 2].to_numpy()
assert len(citations) == m, "pi_list and combined_filtered_df are out of sync"

for i, (pi, scores) in enumerate(pi_list):
    # Papers with fewer than two authors do not form a hyperedge.
    if len(pi) > 1:
        edge_weight = citations[i] + 1  # citation + 1
        for j, author in enumerate(pi):
            v = author_index[author]
            R[i, v] = scores[j]
            W[v, i] = edge_weight

        # Normalise the author weights of this paper to sum to 1.
        R[i, :] /= R[i, :].sum()

# Missing citation counts propagate as NaN; treat them as weight 0.
W = np.nan_to_num(W, nan=0.0)

# Row sums of W (total hyperedge weight per author).
sum_W = W.sum(axis=1)
# Sanity check: rows whose sum is 0 or NaN; zero sums are replaced by 1 below
# so the division leaves those rows at 0.
zero_sum_rows = np.where(sum_W == 0)[0]
nan_sum_rows = np.where(np.isnan(sum_W))[0]

sum_W_corrected = sum_W.copy()
sum_W_corrected[sum_W_corrected == 0] = 1

# Row-normalise W.
Wnorm = W / sum_W_corrected[:, None]

Ws = sparse.csr_matrix(Wnorm)
Rs = sparse.csr_matrix(R)

# Probability transition matrix, transposed so that P[v, w] = Prob(w -> v).
P = Ws.dot(Rs).T

# Restart probability used for PageRank.
r = 0.40


# COMPUTE PAGERANK
##################################################

# given probability transition matrix P
# where P_{v,w} = Prob(w -> v)
# find pagerank scores with restart probability r
# NOTE: compute_pr is also defined in an earlier cell of this notebook; this
# later definition shadows it. Consider keeping a single definition.
def compute_pr(P, r, n, eps=1e-8, min_iter=100, max_iter=10_000):
    """PageRank by power iteration with a uniform restart vector.

    Iterates ``x <- (1 - r) * P @ x + r / n`` starting from the uniform
    distribution.

    Parameters
    ----------
    P : scipy sparse matrix or 2-D ndarray, shape (n, n)
        Transition matrix with ``P[v, w] = Prob(w -> v)``.
    r : float
        Restart probability.
    n : int
        Number of nodes.
    eps : float, optional
        L1 tolerance between successive iterates.
    min_iter : int, optional
        The iteration counter must exceed this value before the loop may
        stop (the default of 100 keeps the historical behaviour).
    max_iter : int, optional
        Hard cap on the number of iterations.

    Returns
    -------
    numpy.ndarray
        The score vector after convergence.

    Raises
    ------
    RuntimeError
        If the iterate becomes non-finite (e.g. NaN in ``P``) or no
        convergence is reached within ``max_iter`` iterations. Previously
        both situations resulted in an endless loop.
    """
    # Loop-invariant pieces: damped matrix and restart mass per node.
    damped = (1 - r) * P
    restart = np.ones(n) * r / n

    x = np.ones(n) / n * 1.0
    for t in range(max_iter):
        # `@` is a matrix-vector product for sparse and dense inputs alike.
        x_new = damped @ x + restart
        delta = np.linalg.norm(x_new - x, ord=1)
        if not np.isfinite(delta):
            raise RuntimeError("PageRank iterate became non-finite; check P for NaN/inf entries")
        if delta < eps and t > min_iter:
            return x_new
        x = x_new
    raise RuntimeError(f"PageRank did not converge within {max_iter} iterations")





# PageRank scores for the vertex-independent hypergraph, flattened to 1-D.
rankings_hg_ind = compute_pr(P, r, n, eps=1e-8).flatten()

# NOTE(review): scores are not re-normalised to sum to 1 here (the
# normalisation line was commented out in the original cell).
assert len(rankings_hg_ind) == len(universe), "rankings_hg and universe have diff length！"

# DataFrame holding each author and the corresponding score.
ranking_df = pd.DataFrame({
    'Name': universe,
    'PageRank_Score': rankings_hg_ind
})

# Sort by PageRank score, descending; after reset_index the index is the
# zero-based rank.
ranking_df_sorted = ranking_df.sort_values(by='PageRank_Score', ascending=False).reset_index(drop=True)

bignames = ['Richard Socher', 'Zhongzhi Shi', 'Daniel Rueckert',
            'Lars Schmidt-Thieme', 'Tat-Seng Chua', 'Ian J. Goodfellow']

# Work on an explicit copy: adding a column to a filtered slice triggers
# SettingWithCopyWarning and may not write where intended.
bignames_only_ht = ranking_df_sorted[ranking_df_sorted['Name'].isin(bignames)].copy()

# Zero-based position in the sorted ranking.
bignames_only_ht["H_T Rank"] = bignames_only_ht.index

# Use the author name as the index of the displayed table.
bignames_only_ht = bignames_only_ht.set_index("Name")

bignames_only_ht

In [None]:
# Rank agreement between the vertex-independent and vertex-dependent
# PageRank score vectors (both aligned with `universe`).
tau, p_value = kendalltau(x=rankings_hg_ind, y=rankings_hg)

print("Kendall's Tau:", tau)