In [None]:
import os
import sys
import fnmatch
import zipfile
import xmltodict
import numpy as np
import pandas as pd
import json
import gzip
import pickle
import csv
import scipy.sparse

In [None]:
def _read_gz_pickle(path):
    """Return the object stored in the gzip-compressed pickle at ``path``."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with gzip.open(path, 'rb') as handle:
        return pickle.load(handle)


# Venue list, author list and paper records produced by an earlier step.
all_venues = _read_gz_pickle('useful_venue_list.pkl.gz')
all_authors = _read_gz_pickle('useful_authors_list.pkl.gz')
all_papers = _read_gz_pickle('useful_papers.pkl.gz')

In [None]:
# Position lookups: venue name -> index and author name -> index, following
# the order of the loaded lists (these indices address the graph matrices below).
conf_idx = {venue: pos for pos, venue in enumerate(all_venues)}
name_idx = {author: pos for pos, author in enumerate(all_authors)}
n_confs = len(all_venues)
n_auths = len(all_authors)

# Use a context manager so the file handle is closed deterministically
# (the bare ``pickle.load(open(...))`` form leaked it).
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('old_version/r1_confs.pkl', 'rb') as fp:
    r1_confs = pickle.load(fp)

# Membership lookup: every entry of r1_confs maps to 1.
r1_confs_dict = dict.fromkeys(r1_confs, 1)

In [None]:
# Faculty affiliation table, read as a DataFrame.
faculty_affil = pd.read_csv('faculty-affiliations.csv')
def csv2dict_str_str(fname, encoding=None):
    """Read a two-column CSV file into a ``{first_column: second_column}`` dict.

    Both values are stripped of surrounding whitespace. Every row with at
    least two columns contributes an entry (a header row, if present, is
    treated like any other row); when a key repeats, the last row wins.
    Rows with fewer than two columns, such as blank lines, are skipped
    instead of raising ``IndexError``.

    Parameters
    ----------
    fname : str or path-like
        Path of the CSV file.
    encoding : str, optional
        Text encoding passed to ``open``. ``None`` (the default) keeps the
        platform default encoding.

    Returns
    -------
    dict
        Mapping from the stripped first column to the stripped second column.
    """
    # newline='' lets the csv module handle line endings and quoted embedded
    # newlines itself, as its documentation recommends.
    with open(fname, mode='r', newline='', encoding=encoding) as infile:
        rdr = csv.reader(infile)
        d = {row[0].strip(): row[1].strip() for row in rdr if len(row) >= 2}
    return d
# Author alias lookup: first CSV column -> second CSV column
# (used below as ``aliasdict.get(name, name)`` to resolve author names).
aliasdict = csv2dict_str_str('dblp-aliases.csv')

In [None]:
def pg(M,alpha=0.85,tol=1e-6,max_iter=1,verbose=False):
    """Compute PageRank scores for a weighted adjacency matrix by power iteration.

    Each row of ``M`` is normalised by its row sum to form the transition
    matrix. Rows whose sum is zero (dangling nodes) redistribute their mass
    uniformly, and the personalization vector is uniform as well.

    Parameters
    ----------
    M : scipy sparse matrix, shape (N, N)
        Edge weights; ``M[i, j]`` is the weight of the edge i -> j.
    alpha : float
        Damping factor.
    tol : float
        Iteration stops once the L1 change between successive vectors drops
        below ``N * tol``.
    max_iter : int
        Maximum number of power-iteration steps. Note the default is 1, so
        callers normally pass a larger value.
    verbose : bool
        If true, print the iteration number and L1 error at every step.

    Returns
    -------
    numpy.ndarray, shape (N,)
        The score vector after convergence or after ``max_iter`` steps,
        whichever comes first (no error is raised on non-convergence).
    """
    N = M.shape[0]
    if N == 0:
        # Nothing to rank; also avoids the 1 / N division below.
        return np.zeros(0)

    # Row sums as a flat float array. The float cast matters: with an
    # integer-dtype matrix the in-place inversion would truncate 1/sum to 0.
    row_sums = np.asarray(M.sum(axis=1), dtype=float).ravel()
    is_dangling = np.where(row_sums == 0)[0]

    # Inverse row sums, leaving zero rows at zero.
    inv_row_sums = np.zeros_like(row_sums)
    has_out = row_sums != 0
    inv_row_sums[has_out] = 1.0 / row_sums[has_out]

    # Row-normalise: diag(1 / row_sum) @ M.
    Q = scipy.sparse.spdiags(inv_row_sums, 0, *M.shape, format='csr')
    M = Q @ M

    # Uniform personalization vector; also the start vector and the
    # distribution that dangling-node mass is spread over.
    p = np.repeat(1.0 / N, N)
    dangling_weights = p
    x = p.copy()

    # Power iteration: make up to max_iter iterations.
    for step in range(max_iter):
        xlast = x
        x = alpha * (x @ M + x[is_dangling].sum() * dangling_weights) + \
            (1 - alpha) * p
        # Check convergence in the L1 norm.
        err = np.absolute(x - xlast).sum()
        if verbose:
            print(step, err)
        if err < N * tol:
            break

    return x

In [None]:
import itertools

# Co-authorship counts keyed by (author_index, author_index).
# Every ordered pair of a paper's known authors is counted, including each
# author paired with themself, so the result is symmetric and its diagonal
# holds per-author paper counts.
g_auth = {}

for paper in all_papers:
    tag,title, authors, conf, pages, startPage,paper_year,volume,number,url,publtype,eb_toofew,eb_skip = paper

    # Resolve aliases once per author (not once per pair) and keep only the
    # authors that have an index in name_idx.
    author_ids = []
    for raw_name in authors:
        canonical = aliasdict.get(raw_name, raw_name)
        if canonical in name_idx:
            author_ids.append(name_idx[canonical])

    for pair in itertools.product(author_ids, repeat=2):
        g_auth[pair] = g_auth.get(pair, 0) + 1

# Build the sparse matrix through the public COO constructor rather than the
# private ``dok_matrix._update`` helper, which is not part of SciPy's API.
# The reshape keeps the empty case well-formed.
_pairs = np.array(list(g_auth.keys()), dtype=np.intp).reshape(-1, 2)
_counts = np.fromiter(g_auth.values(), dtype=np.float64, count=len(g_auth))
gauth_auth = scipy.sparse.coo_matrix(
    (_counts, (_pairs[:, 0], _pairs[:, 1])),
    shape=(n_auths, n_auths),
    dtype=np.float64,
).todok()

In [None]:
# Convert the co-author graph to CSR before running the power iteration.
gauth_auth = scipy.sparse.csr_matrix(gauth_auth)

# PageRank over authors: up to 100 iterations, printing the error per step.
pr = pg(
    gauth_auth,
    tol=1e-12,
    max_iter=100,
    verbose=True,
)

# Number of rows (nodes) in the author graph.
print(gauth_auth.shape[0])

In [None]:
# Author indices ordered from highest to lowest PageRank score.
pr_s = np.argsort(pr)[::-1]
top_k = 100

# Print the top_k authors with their scores.
# The earlier loop incremented its cursor before indexing, so the
# highest-ranked author (pr_s[0]) was never printed, and it raised IndexError
# whenever there were not more than top_k authors. Slicing fixes both.
for idx in pr_s[:top_k]:
    print(all_authors[idx],pr[idx])

In [None]:
#pickle.dump(pr,open('new_pagerank_people.pkl','wb'))

In [None]:
from collections import defaultdict

# Map each author (after alias resolution) to the set of venue indices in
# which they have at least one paper.
auth_confs = defaultdict(set)

for paper in all_papers:
    tag,title, authors, conf, pages, startPage,paper_year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    for raw_name in authors:
        # Alias resolution falls back to the raw name when no alias exists.
        auth_confs[aliasdict.get(raw_name, raw_name)].add(conf_idx[conf])


In [None]:
# Replace each author's venue collection with a list (and the defaultdict
# with a plain dict).
auth_confs = dict(
    (author, list(venues)) for author, venues in auth_confs.items()
)

In [None]:
import itertools

# For every author, an iterator over all unordered venue pairs (with
# repetition, so each venue is also paired with itself).
# NOTE: these are one-shot iterators; once consumed they yield nothing, so
# this cell must be re-run before anything iterates over them again.
auth_confs_iter = {}
for author, venues in auth_confs.items():
    auth_confs_iter[author] = itertools.combinations_with_replacement(venues, 2)

In [None]:
import itertools

# Venue co-occurrence counts keyed by (venue_index, venue_index): for every
# author, each unordered pair of their venues (including a venue with itself)
# adds one, and the count is mirrored so the matrix is symmetric.
#
# The pairs are generated directly from ``auth_confs`` instead of consuming
# the one-shot iterators stored in ``auth_confs_iter``; that makes this cell
# safe to re-run (previously a second run silently produced an empty graph).
dconf = {}
for venues in auth_confs.values():
    for i, j in itertools.combinations_with_replacement(venues, 2):
        count = 1 + dconf.get((i, j), 0)
        dconf[(i, j)] = count
        if i != j:
            dconf[(j, i)] = count

# Build the sparse matrix through the public COO constructor rather than the
# private ``dok_matrix._update`` helper, which is not part of SciPy's API.
# The reshape keeps the empty case well-formed.
_pairs = np.array(list(dconf.keys()), dtype=np.intp).reshape(-1, 2)
_counts = np.fromiter(dconf.values(), dtype=np.float64, count=len(dconf))
gconf_conf = scipy.sparse.coo_matrix(
    (_counts, (_pairs[:, 0], _pairs[:, 1])),
    shape=(n_confs, n_confs),
    dtype=np.float64,
).todok()

In [None]:
# Convert the venue graph to CSR before running the power iteration.
gconf_conf = scipy.sparse.csr_matrix(gconf_conf)

# PageRank over venues: up to 100 iterations, printing the error per step.
prc = pg(
    gconf_conf,
    tol=1e-12,
    max_iter=100,
    verbose=True,
)

In [None]:
# Venue indices ordered from highest to lowest PageRank score.
prc_s = np.argsort(prc)[::-1]
top_k = 100

# Print the top_k venues with their scores. Slicing (rather than a manual
# counter) also copes with fewer than top_k venues instead of raising
# IndexError.
for idx in prc_s[:top_k]:
    print(all_venues[idx],prc[idx])

In [None]:
# Persist the venue PageRank vector. The context manager guarantees the file
# is flushed and closed (the bare ``open(...)`` form left the handle open).
with open('new_pagerank_conf.pkl','wb') as fp:
    pickle.dump(prc, fp)