In [None]:
import os
import sys
import fnmatch
import numpy as np
import pandas as pd
import json
import gzip
import pickle
import csv
import scipy.sparse
# NOTE(review): `Xauth` is not referenced by any cell visible in this notebook — confirm whether it is still needed.
Xauth = None
from collections import defaultdict

In [None]:
# Expose a working bulk-insert on dok_matrix as `my_update`. Depending on the
# scipy release, the public `update` may refuse to run while the private
# `_update` does the insert, so each candidate is probed on a scratch matrix.
def _working_dok_update():
    """Return a callable ``f(matrix, mapping)`` that inserts ``{(row, col): value}`` items.

    The public ``update`` is tried first, then the private ``_update``; a
    candidate is accepted only if the probe value is actually stored. If
    neither works, a plain item-assignment loop is returned so that
    ``my_update`` is always usable.
    """
    for attr in ('update', '_update'):
        candidate = getattr(scipy.sparse.dok_matrix, attr, None)
        if candidate is None:
            continue
        probe = scipy.sparse.dok_matrix((10, 10))
        try:
            candidate(probe, {(0, 0): 1.0})
        except Exception:
            # `Exception` (not a bare except) so Ctrl-C still interrupts the cell.
            continue
        if probe[0, 0] == 1.0:
            return candidate

    def _assign_items(self, data):
        """Fallback: store every entry through ordinary item assignment."""
        for key, value in data.items():
            self[key] = value

    return _assign_items


scipy.sparse.dok_matrix.my_update = _working_dok_update()

In [None]:
def _load_gz_pickle(path):
    """Unpickle and return the object stored in the gzip-compressed file at `path`."""
    with gzip.open(path, 'rb') as stream:
        return pickle.load(stream)


# NOTE: unpickling runs arbitrary code from the file; only load trusted files.
all_venues = _load_gz_pickle('useful_venue_list.pkl.gz')
all_authors = _load_gz_pickle('useful_authors_list.pkl.gz')
all_papers = _load_gz_pickle('useful_papers.pkl.gz')

In [None]:
# Position lookups: venue -> index in all_venues, author name -> index in all_authors.
n_confs = len(all_venues)
n_auths = len(all_authors)
conf_idx = dict(zip(all_venues, range(n_confs)))
name_idx = dict(zip(all_authors, range(n_auths)))
print(n_confs, n_auths)

In [None]:
# Two reference tables read straight from CSV.
# NOTE(review): neither frame is used by the cells visible here — confirm they are still needed.
faculty_affil, ranks = (
    pd.read_csv(csv_path)
    for csv_path in ('faculty-affiliations.csv', 'other_ranks/ranks.csv')
)
def csv2dict_str_str(fname, encoding=None):
    """Read a CSV file into a ``{first column: second column}`` dict of stripped strings.

    Args:
        fname: Path of the CSV file to read.
        encoding: Text encoding passed to ``open``; ``None`` keeps the
            platform default, which is what the function always used.

    Returns:
        dict mapping the stripped first field of each row to its stripped
        second field. Every row is treated as data (a header row, if present,
        becomes an entry too); when a key repeats, the last row wins. Rows
        with fewer than two fields (e.g. blank lines) are skipped instead of
        raising IndexError; fields beyond the second are ignored.
    """
    # newline='' lets the csv module handle line endings itself, as its docs require.
    with open(fname, mode='r', newline='', encoding=encoding) as infile:
        return {
            row[0].strip(): row[1].strip()
            for row in csv.reader(infile)
            if len(row) >= 2
        }
# Lookup built from the first two columns of the alias CSV; consulted further
# down as `aliasdict.get(name, name)` to replace a name when it has an entry.
aliasdict = csv2dict_str_str('dblp-aliases-expanded.csv')

In [None]:
# Field 6 of a paper record is its year (see the tuple unpacking in the
# co-author cell). Taking it from the first and last record
# presumably relies on all_papers being sorted by year — TODO confirm.
min_year, max_year = all_papers[0][6], all_papers[-1][6]
span_years = 1 + (max_year - min_year)
print(min_year, max_year, span_years)

In [None]:
# Earlier roster. NOTE: this binding is overwritten by the second
# `curious_names` assignment further down, so it has no effect on the results.
curious_names = [
    "Aditya Dhawale", "Tesca Fitzgerald", "Adam W. Harley", "Xiaolong Wang 0004",
    "Judy Hoffman", "Paris Siminelakis", "Roie Levin", "Leonid Keselman",
    "Rick Goldstein", "Nicholas Rhinehart", "Vincent Sitzmann", "Siddharth Ancha",
    "Xingyu Lin", "Humphrey Hu", "David F. Fouhey", "Chelsea Finn",
    "Dinesh Jayaraman", "Wen Sun 0002", "Lerrel Pinto", "Justin Johnson 0001",
    "Amir Zamir", "Dominik Peters", "Jonathan T. Barron", "Dorsa Sadigh",
    "Derek Hoiem", "Vaggos Chatziafratis", "Brian Okorn", "David Held",
]

# Alternative roster; not referenced by the cells visible in this notebook.
curious_names2 = [
    "Pulkit Agrawal",
    "Joydeep Biswas",
    "Katherine L. Bouman",
    "David Braun",
    "Jia Deng",
    "Naomi T. Fitter",
    "David F. Fouhey",
    "Saurabh Gupta",
    "Judy Hoffman",
    "Hanbyul Joo",
    "Honglak Lee",
    "Changliu Liu",
    "Petter Nilsson",
    "Matthew O'Toole",
    "Alessandro Roncone",
    "Alanson P. Sample",
    "Manolis Savva",
    "Adriana Schulz",
    "Amy Tabb",
    "Fatma Zeynep Temel",
    "Long Wang",
    "Ling-Qi Yan",
]

# Roster actually analysed: every later cell iterates over `interesting_set`.
curious_names = [
    "Xiaolong Wang 0004",
    "Judy Hoffman",
    "Paris Siminelakis",
    "Roie Levin",
    "Leonid Keselman",
    "Nicholas Rhinehart",
    "Vincent Sitzmann",
    "Siddharth Ancha",
    "Xingyu Lin",
    "Humphrey Hu",
    "Aditya Dhawale",
    "Nick Gisolfi",
    "Andrey Kurenkov",
    "David F. Fouhey",
    "Chelsea Finn",
    "Akshara Rai",
    "Ankit Bhatia",
    "Lerrel Pinto",
    "Graeme Best",
    "Alexander Spitzer",
    "Roberto Shu",
    "Amir Abboud",
    "Justin Johnson 0001",
    "Kumar Shaurya Shankar",
    "Ellen A. Cappo",
    "Amir Zamir",
    "Dominik Peters",
    "Jonathan T. Barron",
    "Dorsa Sadigh",
    "Derek Hoiem",
    "Vaggos Chatziafratis",
    "Brian Okorn",
    "David Held",
]

interesting_set = {*curious_names}

In [None]:
# Unique, alias-resolved names of the rows tagged 'RI' in the CMU faculty CSV.
# itertuples() puts the index at position 0, so row[1] and row[2] are the
# first and second CSV columns.
ri_names = list({
    aliasdict.get(record[1], record[1])
    for record in pd.read_csv('other_ranks/cmu_faculty.csv').itertuples()
    if record[2] == 'RI'
})

In [None]:
# Per-author score tables; later cells index this as scoreV[key][author_index].
# Loading stays best-effort: on failure scoreV remains None and the reason is
# printed, so the cause is visible instead of surfacing later as a TypeError.
scoreV = None
try:
    with gzip.open('scoresV2.pkl.gz','rb') as fp:
        scoreV = pickle.load(fp)
except Exception as err:
    print('failed!', repr(err))

clf = np.load('clf_gold.pkl.npy')
# Rows of clf per venue, and how many such blocks fit into the covered year span.
# NOTE(review): presumably clf holds one weight per (venue, year-block) — confirm.
years_per_conf = clf.shape[0]//n_confs
YEAR_BLOCKS = span_years//years_per_conf
# Use a context manager so the file handle is closed after unpickling.
with open('new_pagerank_people.pkl','rb') as fp:
    pr_full = pickle.load(fp)

In [None]:
# `unidecode` is used by a later cell to ASCII-fold co-author names.
from unidecode import unidecode

# One plain-dict record per tracked author, keyed by author name; later cells
# keep adding columns to these records.
author_results = defaultdict(dict)
# NOTE(review): `faculty_lookup` is not assigned in any cell visible in this
# notebook — on a fresh kernel this loop raises NameError. Confirm where it
# should come from (possibly derived from `faculty_affil`).
for person in interesting_set:
    row_idx = name_idx[person]
    author_results[person]['Affiliation'] = faculty_lookup.get(person, 'Unknown')
    author_results[person]['Years'] = scoreV['working_years'][row_idx]

In [None]:
# Composite per-author "meta metric" built from the 1/position score tables.
#
# `auth_years` was never assigned in this notebook, so the cell failed with
# NameError on a fresh kernel. It is taken from scoreV['auth_years'], the
# table the results cell reads for 'From' ([0]) and 'Until' ([1]).
# NOTE(review): assumes that table converts to an (n_authors, 2) array — confirm.
auth_years = np.asarray(scoreV['auth_years'])
working_years = (auth_years[:,1] - auth_years[:,0]+1)

# Total score per working year (careers shorter than 2 years count as 2).
v = scoreV['1/i_total_1970']/(np.maximum(2,working_years.astype(np.float32)))
# Scale by the square root of the RI share of the total score; both sides are
# floored at 1e-3 to avoid dividing by zero.
ratio_v = np.maximum(1e-3,scoreV['1/i_RI_1970'])/np.maximum(1e-3,scoreV['1/i_total_1970'])
v *= np.sqrt(ratio_v)
# Additional damping by the log of career length.
v *= 1/np.log(np.maximum(2,working_years.astype(np.float32)))
# Sum of the eight 'apm' score tables, floored at 10 before taking the log.
tv = np.zeros_like(scoreV['1/i_total_1970'])
for n in ['_apmFalse', '_apmTrue',  'pw_apmFalse', 'pw_apmTrue', 'pweff_apmFalse', 'pweff_apmTrue', 'pwunk_apmFalse','pwunk_apmTrue']:
    tv +=  scoreV[n]
v *= np.log(np.maximum(tv,10))
# Weight by the log of (best-scoring year - 1965).
v *= np.log(scoreV['1/i_max_yr']-1965)
# Replace nan/inf produced by the logs above with finite numbers.
v = np.nan_to_num(v)
meta_metric = v

In [None]:
# Per-author tallies, filled only for authors in `interesting_set`.
coauthors = defaultdict(lambda: defaultdict(int))          # author -> co-author -> shared paper count
coauthors_frac = defaultdict(lambda: defaultdict(float))   # author -> co-author -> sum of 1/n over shared papers
coauthors_num = defaultdict(list)                          # author -> author count of each paper
author_pos = defaultdict(list)                             # author -> (1-based position)/n for each paper
conf_paper_frac = defaultdict(lambda: defaultdict(int))    # author -> venue -> sum of 1/n
paper_num = defaultdict(int)                               # author -> number of papers
paper_frac = defaultdict(float)                            # author -> sum of 1/n

for paper in all_papers:
    tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    # Skip papers with no tracked author.
    if interesting_set.isdisjoint(authors):
        continue
    n = len(authors)
    share = 1/n
    for pos, a in enumerate(authors, start=1):
        if a not in interesting_set:
            continue
        conf_paper_frac[a][venue] += share
        coauthors_num[a].append(n)
        author_pos[a].append(pos/n)
        paper_num[a] += 1
        paper_frac[a] += share
        # Credit every other name on the paper as a collaborator of `a`.
        for a2 in authors:
            if a2 != a:
                coauthors[a][a2] += 1
                coauthors_frac[a][a2] += share


In [None]:
# Score columns for every tracked author. Keys are inserted in a fixed order
# because the DataFrame built below takes its column order from the first record.
for name in interesting_set:
    idx = name_idx[name]
    row = author_results[name]

    total = scoreV['1/i_total_1970'][idx]
    ri_total = scoreV['1/i_RI_1970'][idx]
    active_years = scoreV['working_years'][idx]
    # One shared floor for all percentage denominators: previously only the
    # sub-area percentages were guarded, so 'RI %' and 'Other %' became
    # nan/inf for an author whose total score is zero.
    safe_total = max(1e-9, total)

    row['MetaMetric'] = meta_metric[idx]
    row['Score (1/pos)'] = total

    row['RIScore'] = ri_total
    row['RI %'] = ri_total/safe_total

    sum_v = 0
    for sub in ['ROB','CV','GR','ML']:
        den = scoreV['1/i_{}_1970'.format(sub)][idx]
        row[sub + ' %'] = den/safe_total
        sum_v += den
    # Whatever is not covered by the four listed sub-areas, clamped at zero.
    row['Other %'] = max(0, total - sum_v)/safe_total

    row['From'] = scoreV['auth_years'][idx][0]
    row['Until'] = scoreV['auth_years'][idx][1]

    row['YearlyScore (1/pos)'] = total/active_years
    row['YearlyRIScore'] = ri_total/active_years

# Names of each tracked author's top collaborators (most frequent and highest
# scoring), collected while the collaboration columns are filled in.
new_set = set()

# Collaboration, venue and remaining score columns. Keys are inserted in a
# fixed order because the DataFrame built below takes its column order from
# the first record.
for name in interesting_set:
    idx = name_idx[name]
    row = author_results[name]

    team_sizes = coauthors_num[name]
    positions = author_pos[name]
    venue_shares = conf_paper_frac[name]

    # Authors without any parsed paper have empty tallies: report nan directly
    # rather than taking the mean of an empty array (RuntimeWarning).
    row['avgCoauthor'] = np.mean(team_sizes) if team_sizes else np.nan
    # (fractional weight, co-author), heaviest first.
    colabs = sorted([(w,c) for c,w in coauthors_frac[name].items()],reverse=True)
    # Same weights multiplied by the co-author's own total score.
    fam_colab = sorted([(w*scoreV['1/i_total_1970'][name_idx[c]],c) for c,w in coauthors_frac[name].items()],reverse=True)

    # Co-authors sharing at least 4 papers.
    freq_colabs = sorted([(cnt,c) for c,cnt in coauthors[name].items() if cnt >= 4],reverse=True)
    if len(colabs) > 0:
        row['mostCoauthorName'] = unidecode(colabs[0][1])
        # NOTE: this is the summed 1/n weight, not a paper count.
        row['mostCoauthorTimes'] = colabs[0][0]
        new_set.add(colabs[0][1])
    else:
        row['mostCoauthorName'] = ''
        row['mostCoauthorTimes'] = 0

    if len(fam_colab) > 0:
        row['famCoauthorName'] = unidecode(fam_colab[0][1])
        new_set.add(fam_colab[0][1])
    else:
        row['famCoauthorName'] = ''

    row['authorPosition%'] = np.mean(positions) if positions else np.nan
    row['totalCoauth'] = len(colabs)
    row['freqCoauth (> 3 papers)'] = len(freq_colabs)
    row['famCoauthFrac'] = sum([pair[0] for pair in fam_colab])
    row['totalCoauthFrac'] = sum([pair[0] for pair in colabs])

    # Venue with the largest fractional paper share; '' when the author has no
    # venues (the previous sorted(...)[0] raised IndexError in that case).
    if venue_shares:
        row['mostPaperConf'] = max((share, conf) for conf, share in venue_shares.items())[1]
    else:
        row['mostPaperConf'] = ''
    row['venuesPublishedIn'] = len(venue_shares)
    row['pageRank'] = pr_full[idx]
    #author_results[name]['pageRankRI'] = pr_ri[idx]

    row['numPapers'] = paper_num[name]
    row['numPapersFrac'] = paper_frac[name]

    row['YearlyScore (1/n)'] = scoreV['1/n_total_1970'][idx]/scoreV['working_years'][idx]
    row['YearlyScore (Full)'] = scoreV['full_total_1970'][idx]/scoreV['working_years'][idx]
    row['Score (1/n)'] = scoreV['1/n_total_1970'][idx]
    row['Score (Full)'] = scoreV['full_total_1970'][idx]

    row['MaxScore'] = scoreV['1/i_max'][idx]
    row['MaxScore (1/n)'] = scoreV['1/n_max'][idx]
    row['MaxScore (Full)'] = scoreV['full_max'][idx]

    row['BestYear'] = scoreV['1/i_max_yr'][idx]
    row['BestYear (1/n)'] = scoreV['1/n_max_yr'][idx]
    row['BestYear (Full)'] = scoreV['full_max_yr'][idx]

    # Adv1..Adv8 mirror the eight 'apm' score tables, in this order.
    for i,n in enumerate(['_apmFalse', '_apmTrue',  'pw_apmFalse', 'pw_apmTrue', 'pweff_apmFalse', 'pweff_apmTrue', 'pwunk_apmFalse','pwunk_apmTrue']):
        row['Adv'+str(i+1)] =  scoreV[n][idx]
# Flatten the per-author records into one table indexed by author name.
# Each record gets its own name stored under 'Name'; column order follows the
# key order of the first record.
for author, record in author_results.items():
    record['Name'] = author
results_list = list(author_results.values())
def_order = list(results_list[0])
df_results = pd.DataFrame(results_list)[def_order].set_index('Name')

In [None]:
# Show the table sorted by 'Adv1' (descending) with a colour gradient.
# `axis` and `ascending` are passed by keyword: pandas 2.0 made every
# sort_values argument after `by` keyword-only, so the positional form
# ('Adv1', 0, False) raises TypeError there.
# NOTE(review): no column name contains 'Adv!', so this filter keeps every
# column — possibly a typo; confirm the intended pattern.
shown_columns = [col for col in df_results.columns if 'Adv!' not in col]
df_results[shown_columns].sort_values('Adv1', axis=0, ascending=False).style.background_gradient()