In [None]:
import os
import sys
import fnmatch
import numpy as np
import pandas as pd
import json
import gzip
import pickle
import csv
import scipy.sparse
Xauth = None  # sentinel: the matrix-building cell further down only runs while Xauth is still None
from collections import defaultdict
import matplotlib.pyplot as plt

In [None]:
# scipy has exposed the bulk "load a {(row, col): value} dict into a dok_matrix"
# operation under different names across releases (public `update`, private
# `_update`), and some releases offer neither in usable form.  Probe the known
# entry points on a scratch matrix and publish whichever works as
# `dok_matrix.my_update`, so the rest of the notebook calls a single name.
def _dok_setitem_update(self, data):
    """Fallback bulk update: assign each {(row, col): value} entry individually."""
    for (row, col), value in data.items():
        self[row, col] = value

for _candidate in ('update', '_update'):
    a = scipy.sparse.dok_matrix((10,10))
    _method = getattr(scipy.sparse.dok_matrix, _candidate, None)
    if _method is None:
        continue
    try:
        _method(a, {(0,0):1.0})
    except (NotImplementedError, AttributeError, TypeError):
        # Only "this entry point is unusable" errors are skipped; the previous
        # bare `except:` also swallowed KeyboardInterrupt and genuine bugs.
        continue
    if a[0,0] == 1.0:
        # the probe value really landed in the matrix, so this entry point works
        scipy.sparse.dok_matrix.my_update = _method
        break
else:
    # neither entry point worked: fall back to plain item assignment
    scipy.sparse.dok_matrix.my_update = _dok_setitem_update

In [None]:
def _load_gz_pickle(path):
    """Return the object unpickled from the gzip-compressed file at `path`."""
    with gzip.open(path,'rb') as handle:
        return pickle.load(handle)

# venue list, author list and paper records prepared elsewhere
all_venues = _load_gz_pickle('useful_venue_list.pkl.gz')
all_authors = _load_gz_pickle('useful_authors_list.pkl.gz')
all_papers = _load_gz_pickle('useful_papers.pkl.gz')


In [None]:
# Index 6 of a paper record is its publication year; the first and last records
# supply the year range (this presumes all_papers is ordered by year — TODO confirm).
min_year, max_year = all_papers[0][6], all_papers[-1][6]
span_years = 1 + (max_year - min_year)
print(min_year,max_year,span_years)

# position lookups: venue name -> index, author name -> index
conf_idx = {venue: pos for pos, venue in enumerate(all_venues)}
name_idx = {author: pos for pos, author in enumerate(all_authors)}
n_confs, n_auths, n_papers = len(all_venues), len(all_authors), len(all_papers)
print(n_confs,n_auths,n_papers)

In [None]:
# clf is a flat array indexed as years_per_conf*venue_index + year_block
# (see how the paper loops below build their index).
clf = np.load('clf_gold.pkl.npy')
years_per_conf = len(clf) // n_confs
# NOTE(review): this recovers the block width by integer division; if span_years
# is not a multiple of years_per_conf the derived width may differ from the one
# used when clf was produced — confirm against the code that wrote clf_gold.
YEAR_BLOCKS = span_years // years_per_conf
clf[2323]


In [None]:
import scipy.sparse
import gc  # the only import of gc in the notebook; later cells rely on it
if Xauth is None:
    # A single pass over the papers fills three things:
    #   per_author_val[n]        harmonic credit split (1, 1/2, ..., 1/n, normalised
    #                            to sum to 1) for an n-author paper
    #   xdict[(author, column)]  credit accumulated per author and venue/year column
    #   auth_years[author]       [first, last] publication year
    # The earlier version materialised a per-paper list (`paper_vecs`) and then
    # cleared `paper_vec` by mistake, so that list was never released before
    # gc.collect(); accumulating directly avoids building it at all.  The
    # accumulation order (papers in order, authors in order) is unchanged.
    per_author_val = {}
    xdict = {}
    auth_years = np.ones((n_auths,2)) * np.array([3000,1000])  # [min, max] start beyond any real year
    for paper in all_papers:
        tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
        n = len(authors)
        if n not in per_author_val:
            author_scores = 1/(np.arange(n)+1)
            per_author_val[n] = author_scores/author_scores.sum()
        # column = venue block offset + year block within the venue
        j = years_per_conf*conf_idx[venue] + (year-min_year)//YEAR_BLOCKS

        for a,v in zip(authors,per_author_val[n]):
            i = name_idx[a]
            xdict[(i,j)] = v + xdict.get((i,j),0)
            auth_years[i,0] = min(auth_years[i,0],year)
            auth_years[i,1] = max(auth_years[i,1],year)

    Xauth = scipy.sparse.dok_matrix((n_auths,years_per_conf*n_confs))
    Xauth.my_update(xdict)
    Xauth = scipy.sparse.csr_matrix(Xauth)

    # release the (large) accumulation dict before moving on
    xdict = {}
    gc.collect()

In [None]:
# Faculty list restricted to rows whose dept column is 'RI'.
cmu_uni = pd.read_csv('other_ranks/cmu_faculty.csv')
cmu_uni = cmu_uni.fillna('Other')
cmu_uni = cmu_uni[cmu_uni.dept == 'RI']
#uni_names = ['Andrea Tagliasacchi','Paul G. Kry']#['Xuemin Shen','H. Vincent Poor','Kang G. Shin','Mohamed-Slim Alouini','Lajos Hanzo']#list(cmu_uni.name)
#uni_names = list(faculty_affil[faculty_affil.affiliation == 'Johns Hopkins University'].name)
uni_names = list(cmu_uni.name)
print(len(uni_names))
# Set copy for the membership test below: testing against the list was a linear
# scan for every author of every paper.  uni_names itself stays a list.
uni_name_set = set(uni_names)
conf_counts = {}        # venue -> sum of 1/n over matching authors (papers from 2000 on)
conf_counts_value = {}  # venue -> same sum weighted by the paper's clf value

for paper in all_papers:
    tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    if year < 2000:
        continue
    n = len(authors)
    for a in authors:
        if a in uni_name_set:
            conf_counts[venue] = 1/n + conf_counts.get(venue,0)
            conf_counts_value[venue] = clf[years_per_conf*(conf_idx[venue]) + (year-min_year)//YEAR_BLOCKS]/n + conf_counts_value.get(venue,0)
# turn the weighted sums into an average clf value per venue
conf_counts_value = {k: v/conf_counts[k] for k,v in conf_counts_value.items()}
# tuples of (count*average, venue, count, average), ordered by count, largest first
ri_fav_confs = [
    (conf_counts[venue]*conf_counts_value[venue], venue, conf_counts[venue], conf_counts_value[venue])
    for _count, venue in sorted(((v,k) for k,v in conf_counts.items() if v > 0), reverse=True)
]

In [None]:
# venues whose count field (third entry of each ri_fav_confs tuple) is at least 1.0,
# taken in descending tuple order
confs_to_filter = [venue for _score, venue, count, _avg in sorted(ri_fav_confs,reverse=True) if count >= 1.0]

In [None]:
only_after = 1970
start_year_idx = max(0,only_after-min_year)//YEAR_BLOCKS

def _year_mask(venues=None):
    """Return a (venue, year block) 0/1 mask with year blocks from start_year_idx on enabled.

    With venues=None every venue row is enabled; otherwise only the rows of the
    named venues are.
    """
    mask = np.zeros_like(clf).reshape((-1,years_per_conf))
    if venues is None:
        mask[:,start_year_idx:] = 1
    else:
        for conf in venues:
            mask[conf_idx[conf],start_year_idx:] = 1
    return mask

# total score: all venues (pass confs_to_filter to _year_mask to restrict it)
year_filter = _year_mask()
total_scores = Xauth.dot(clf * year_filter.reshape((-1)))

# RI score: only the venues in confs_to_filter
year_filter = _year_mask(confs_to_filter)
ri_scores = Xauth.dot(clf * year_filter.reshape((-1)))


In [None]:
# ten highest total scores, best first
row_fmt = '{:30s}\t{:.2f}'
best_idx = np.argsort(total_scores)[::-1]
for rank in range(10):
    idx = best_idx[rank]
    print(row_fmt.format(all_authors[idx],total_scores[idx]))

# Build the APM matrix

In [None]:
Xauth = None  # drop the reference to the current matrix before the next one is built
gc.collect()  # collect now; as the cell's last expression its return value is also displayed

In [None]:
# Paper-by-author design matrix: row i holds 1/n for each of paper i's n authors;
# y[i] is the clf value of that paper's venue/year column.
xdict = {}
y = np.zeros(n_papers)

for i,paper in enumerate(all_papers):
    tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    n = len(authors)
    xdict.update({(i,name_idx[a]): 1/n for a in authors})
    y[i] = clf[years_per_conf*conf_idx[venue] + (year-min_year)//YEAR_BLOCKS]

# fill a DOK matrix in one go, then convert to CSR for the regression
dok = scipy.sparse.dok_matrix((n_papers,n_auths))
dok.my_update(xdict)
Xauth = scipy.sparse.csr_matrix(dok)
del dok

paper_vec = []
xdict = {}
gc.collect()

In [None]:
# 1) raw target values
_ = plt.hist(y,50)

# 2) standardized target values
plt.figure()
_ = plt.hist((y-y.mean())/y.std(),50)

# 3) positive targets only: log, standardize, write back, then clip everything to +/-3.5
plt.figure()
positive = y > 0
log_pos = np.log(y[positive])
yt = np.copy(y)
yt[positive] = (log_pos-log_pos.mean())/log_pos.std()
yt = yt.clip(-3.5,3.5)
_ = plt.hist(yt,50)


In [None]:
FI = False  # passed as fit_intercept to the SGDRegressor fits; also appended to the saved file names

In [None]:
from sklearn.linear_model import SGDRegressor
# Huber-loss linear fit of the per-paper target on the paper-by-author matrix.
# `loss` is given by keyword: scikit-learn constructor parameters are
# keyword-only, so the former positional 'huber' is rejected by current releases.
clf2 = SGDRegressor(loss='huber',alpha=1e-3,penalty='l2',tol=1e-6,max_iter=1000,average=True,verbose=1,fit_intercept=FI)
#clf2.fit(Xauth,yt)
clf2.fit(Xauth,y)

In [None]:
apm_weights = clf2.coef_
# Keep the raw statistics: the intercept is meant to go through the same
# standardization as the coefficients.  Previously it was shifted/scaled with
# the mean/std of the already-standardized weights (about 0 and 1), which left
# it effectively unchanged.
apm_mean, apm_std = apm_weights.mean(), apm_weights.std()
apm_weights = (apm_weights-apm_mean)/apm_std
_ = plt.hist(apm_weights,100,log=True)
print((clf2.intercept_-apm_mean)/apm_std)

# Cumulative, pairwise APM
Some open questions:
* Should the coefficients be 1 and 0.5?
* Should credit be assigned per paper or in total?

In [None]:
import itertools

# Give every co-author pair (names in sorted order) a dense id, in first-seen order.
pairwise_authors = {}
for paper in all_papers:
    tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    for ap in itertools.combinations(sorted(authors), 2):
        # setdefault leaves an existing id untouched
        pairwise_authors.setdefault(ap, len(pairwise_authors))
n_pairwise_authors = len(pairwise_authors)

In [None]:
# Rows 0..n_auths-1: one per author, a single 1 on the diagonal.
# Rows n_auths..   : one per co-author pair, 0.5 in each member's column.
# y accumulates clf/n for every paper the author (or the pair) appears on.
xdict = {}
y = np.zeros(n_auths+n_pairwise_authors)

for paper in all_papers:
    tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    n = len(authors)
    ji = years_per_conf*conf_idx[venue] + (year-min_year)//YEAR_BLOCKS
    per_person_credit = clf[ji]/n

    for j in map(name_idx.__getitem__, authors):
        xdict[(j,j)] = 1
        y[j] += per_person_credit

    for first, second in itertools.combinations(sorted(authors), 2):
        j2 = n_auths + pairwise_authors[(first, second)]
        xdict[(j2,name_idx[first])] = 0.5
        xdict[(j2,name_idx[second])] = 0.5
        y[j2] += per_person_credit

# fill a DOK matrix in one go, then convert to CSR for the regression
dok = scipy.sparse.dok_matrix((n_auths+n_pairwise_authors,n_auths))
dok.my_update(xdict)
Xauth = scipy.sparse.csr_matrix(dok)
del dok

paper_vec = []
xdict = {}
gc.collect()

In [None]:
from sklearn.linear_model import SGDRegressor
# Same Huber-loss fit as before, now on the author + pair matrix.  `loss` is
# passed by keyword because scikit-learn constructor parameters are keyword-only.
clf3 = SGDRegressor(loss='huber',alpha=1e-3,penalty='l2',tol=1e-6,max_iter=1000,average=True,verbose=1,fit_intercept=FI)
clf3.fit(Xauth,y)

In [None]:
pw_apm_weights = clf3.coef_
# Standardize the intercept with the raw mean/std of the coefficients; using the
# statistics of the already-standardized weights (about 0 and 1) left it unchanged.
pw_apm_mean, pw_apm_std = pw_apm_weights.mean(), pw_apm_weights.std()
pw_apm_weights = (pw_apm_weights-pw_apm_mean)/pw_apm_std
_ = plt.hist(pw_apm_weights,100,log=True)
print((clf3.intercept_-pw_apm_mean)/pw_apm_std)

In [None]:
# Same row layout as the previous pairwise matrix, but entries accumulate:
# an author row gains 1 per paper, a pair row gains 0.5 per member per joint paper.
xdict = {}
y = np.zeros(n_auths+n_pairwise_authors)

for paper in all_papers:
    tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    n = len(authors)
    ji = years_per_conf*conf_idx[venue] + (year-min_year)//YEAR_BLOCKS
    per_person_credit = clf[ji]/n

    for j in map(name_idx.__getitem__, authors):
        xdict[(j,j)] = 1 + xdict.get((j,j),0)
        y[j] += per_person_credit

    for first, second in itertools.combinations(sorted(authors), 2):
        j2 = n_auths + pairwise_authors[(first, second)]
        for member in (name_idx[first], name_idx[second]):
            xdict[(j2,member)] = 0.5 + xdict.get((j2,member),0)
        y[j2] += per_person_credit

# fill a DOK matrix in one go, then convert to CSR for the regression
dok = scipy.sparse.dok_matrix((n_auths+n_pairwise_authors,n_auths))
dok.my_update(xdict)
Xauth = scipy.sparse.csr_matrix(dok)
del dok

paper_vec = []
xdict = {}
gc.collect()

In [None]:
from sklearn.linear_model import SGDRegressor
# Huber-loss fit on the accumulated pairwise matrix.  `loss` is passed by
# keyword because scikit-learn constructor parameters are keyword-only.
clf4 = SGDRegressor(loss='huber',alpha=1e-3,penalty='l2',tol=1e-6,max_iter=1000,average=True,verbose=1,fit_intercept=FI)
clf4.fit(Xauth,y)

In [None]:
pweff_apm_weights = clf4.coef_
# Standardize the intercept with the raw mean/std of the coefficients; using the
# statistics of the already-standardized weights (about 0 and 1) left it unchanged.
pweff_mean, pweff_std = pweff_apm_weights.mean(), pweff_apm_weights.std()
pweff_apm_weights = (pweff_apm_weights-pweff_mean)/pweff_std
_ = plt.hist(pweff_apm_weights,100,log=True)
print((clf4.intercept_-pweff_mean)/pweff_std)

In [None]:
if False:
    # Disabled experiment: pairwise model with one extra column (index n_auths)
    # that stands for an "unknown author" absorbing the remaining co-authors.
    Xauth = scipy.sparse.dok_matrix((n_auths+n_pairwise_authors,n_auths + 1))
    y = np.zeros(n_auths+n_pairwise_authors)
    xdict = {}

    for i,paper in enumerate(all_papers):
        tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
        n = len(authors)
        ji = years_per_conf*conf_idx[venue] + (year-min_year)//YEAR_BLOCKS
        per_person_credit = clf[ji]
        for a in authors:
            j = name_idx[a]
            xdict[(j,j)] = 1 + xdict.get((j,j),0)
            # an unknown author
            xdict[(j,n_auths)] = (n-1) + xdict.get((j,n_auths),0)

            y[j] += per_person_credit


        authors_s = sorted(authors)
        for ap in itertools.combinations(authors_s, 2):
            # Pair rows start right after the n_auths author rows.  The extra
            # "+ 1" applies to the column count only; as a row offset it pushed
            # the last pair one row past the end of both Xauth and y.
            j2 = pairwise_authors[ap] + n_auths
            a1 = name_idx[ap[0]]
            a2 = name_idx[ap[1]]
            xdict[(j2,a1)] = 1.0 + xdict.get((j2,a1),0)
            xdict[(j2,a2)] = 1.0 + xdict.get((j2,a2),0)
            y[j2] += per_person_credit
            xdict[(j2,n_auths)] = (n-2) + xdict.get((j2,n_auths),0)



    Xauth.my_update(xdict)

    Xauth = scipy.sparse.csr_matrix(Xauth)
    xdict = {}
    gc.collect()

In [None]:
if False:
    # Disabled: fit for the unknown-author experiment above.  `loss` is passed
    # by keyword because scikit-learn constructor parameters are keyword-only.
    from sklearn.linear_model import SGDRegressor
    clf5 = SGDRegressor(loss='huber',alpha=1e-3,penalty='l2',tol=1e-6,max_iter=25,average=True,verbose=1,fit_intercept=FI)
    clf5.fit(Xauth,y)

In [None]:
if False:
    # Disabled: standardize the per-author coefficients of the unknown-author
    # model and print the unknown-author coefficient (index n_auths) on the same
    # scale.  The raw mean/std are kept for that; the statistics of the
    # already-standardized weights are about 0 and 1.
    pwunk_apm_weights = clf5.coef_[:n_auths]
    pwunk_mean, pwunk_std = pwunk_apm_weights.mean(), pwunk_apm_weights.std()
    pwunk_apm_weights = (pwunk_apm_weights-pwunk_mean)/pwunk_std
    _ = plt.hist(pwunk_apm_weights,100,log=True)
    print((clf5.coef_[n_auths]-pwunk_mean)/pwunk_std)

In [None]:
# Distinct publication years, keyed both by author name and by sorted co-author pair.
auth_pro_years = defaultdict(set)

for paper in all_papers:
    tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    keys = itertools.chain(authors, itertools.combinations(sorted(authors), 2))
    for key in keys:
        auth_pro_years[key].add(year)


In [None]:
# keep only the number of distinct years per key, then release the sets themselves
auth_pro_years_count = dict(zip(auth_pro_years.keys(), map(len, auth_pro_years.values())))
auth_pro_years = None
gc.collect()

In [None]:
# number of distinct publication years per author, in all_authors order
year_span2 = np.array(list(map(auth_pro_years_count.__getitem__, all_authors)))

In [None]:
# Position-weighted variant: each author's share c comes from per_author_val[n]
# (by position in the author list).  Author rows accumulate c on the diagonal,
# pair rows accumulate each member's c; targets accumulate share * clf value.
xdict = {}
y = np.zeros(n_auths+n_pairwise_authors)

for paper in all_papers:
    tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    n = len(authors)
    credit = clf[years_per_conf*conf_idx[venue] + (year-min_year)//YEAR_BLOCKS]
    weighted = list(zip(authors, per_author_val[n]))

    for a,c in weighted:
        j = name_idx[a]
        xdict[(j,j)] = c + xdict.get((j,j),0)
        y[j] += c*credit

    # sorting the (name, share) tuples orders them by name, matching the pair keys
    for (a1,c1),(a2,c2) in itertools.combinations(sorted(weighted), 2):
        j2 = pairwise_authors[(a1,a2)] + n_auths
        i1, i2 = name_idx[a1], name_idx[a2]
        xdict[(j2,i1)] = c1 + xdict.get((j2,i1),0)
        xdict[(j2,i2)] = c2 + xdict.get((j2,i2),0)
        y[j2] += (c1+c2)*credit

# fill a DOK matrix in one go, then convert to CSR for the regression
dok = scipy.sparse.dok_matrix((n_auths+n_pairwise_authors,n_auths))
dok.my_update(xdict)
Xauth = scipy.sparse.csr_matrix(dok)
del dok

paper_vec = []
xdict = {}
gc.collect()

In [None]:
# yn[row] = number of distinct publication years of the author or pair behind that row
yn = np.zeros_like(y)
for author, row in name_idx.items():
    yn[row] = auth_pro_years_count[author]
for pair, offset in pairwise_authors.items():
    # pair rows follow the n_auths author rows
    yn[n_auths+offset] = auth_pro_years_count[pair]

In [None]:
from sklearn.linear_model import SGDRegressor
# Huber-loss fit on targets divided by the number of active years (y/yn).
# `loss` is passed by keyword because scikit-learn constructor parameters are keyword-only.
clf5 = SGDRegressor(loss='huber',alpha=1e-3,penalty='l2',tol=1e-6,max_iter=25,average=True,verbose=1,fit_intercept=FI)
clf5.fit(Xauth,y/yn)

In [None]:
pwunk_apm_weights = clf5.coef_
# Standardize the intercept with the raw mean/std of the coefficients; using the
# statistics of the already-standardized weights (about 0 and 1) left it unchanged.
pwunk_mean, pwunk_std = pwunk_apm_weights.mean(), pwunk_apm_weights.std()
pwunk_apm_weights = (pwunk_apm_weights-pwunk_mean)/pwunk_std
_ = plt.hist(pwunk_apm_weights,100,log=True)
print((clf5.intercept_-pwunk_mean)/pwunk_std)

In [None]:
#year_span2 = (auth_years[:,1] - auth_years[:,0]) + 1
year_span2 = np.maximum(1,year_span2)  # guard the division below against zero years

eff_scores = total_scores/year_span2

# one column per scoring method, one row per author
metric_names = ['Total','Per Year','APM','PW APM','PW Eff APM','PW APM Yearly']
metric_values = [total_scores,eff_scores,apm_weights,pw_apm_weights,pweff_apm_weights,pwunk_apm_weights]
df_clfs2 = pd.DataFrame(np.column_stack(metric_values),columns=metric_names)
#print(df_clfs2.corr('spearman').to_latex())
# rank correlation between the methods, and each method's mean correlation
corr = df_clfs2.corr('spearman')
print(corr.mean(0))
corr

In [None]:
# top 180 by APM weight: name, total score, APM weight
row_fmt = '{:30s}\t{:.2f}\t{:.4f}'
best_idx = np.argsort(apm_weights)[::-1]
for rank in range(180):
    idx = best_idx[rank]
    name = all_authors[idx]
    # a leading '.' flags names that also exist with a ' 0001' suffix (they are still printed)
    marker = '.' if (name + ' 0001') in name_idx else ''
    print(marker + row_fmt.format(all_authors[idx],total_scores[idx],apm_weights[idx]))

In [None]:
# top 180 by pairwise APM weight: name, total score, pairwise APM, APM
row_fmt = '{:30s}\t{:.2f}\t{:.2f}\t{:.2f}'
best_idx = np.argsort(pw_apm_weights)[::-1]
for rank in range(180):
    idx = best_idx[rank]
    name = all_authors[idx]
    # a leading '.' flags names that also exist with a ' 0001' suffix (they are still printed)
    marker = '.' if (name + ' 0001') in name_idx else ''
    print(marker + row_fmt.format(name,total_scores[idx],pw_apm_weights[idx],apm_weights[idx]))

In [None]:
# top 180 by accumulated pairwise APM weight: name, total score, that weight, APM
row_fmt = '{:30s}\t{:.2f}\t{:.2f}\t{:.2f}'
best_idx = np.argsort(pweff_apm_weights)[::-1]
for rank in range(180):
    idx = best_idx[rank]
    name = all_authors[idx]
    # a leading '.' flags names that also exist with a ' 0001' suffix (they are still printed)
    marker = '.' if (name + ' 0001') in name_idx else ''
    print(marker + row_fmt.format(name,total_scores[idx],pweff_apm_weights[idx],apm_weights[idx]))

In [None]:
# top 180 by yearly pairwise APM weight: name, total score, that weight, APM
row_fmt = '{:30s}\t{:.2f}\t{:.2f}\t{:.2f}'
best_idx = np.argsort(pwunk_apm_weights)[::-1]
for rank in range(180):
    idx = best_idx[rank]
    name = all_authors[idx]
    # a leading '.' flags names that also exist with a ' 0001' suffix (they are still printed)
    marker = '.' if (name + ' 0001') in name_idx else ''
    print(marker + row_fmt.format(name,total_scores[idx],pwunk_apm_weights[idx],apm_weights[idx]))

In [None]:
# uni_names present in name_idx, ranked by pwunk_apm_weights (highest first)
uni_scores = sorted(
    ((pwunk_apm_weights[name_idx[person]], person) for person in uni_names if person in name_idx),
    reverse=True,
)
uni_scores

In [None]:
# uni_names present in name_idx, ranked by pweff_apm_weights (highest first)
uni_scores = sorted(
    ((pweff_apm_weights[name_idx[person]], person) for person in uni_names if person in name_idx),
    reverse=True,
)
uni_scores

In [None]:
# uni_names present in name_idx, ranked by pw_apm_weights (highest first)
uni_scores = sorted(
    ((pw_apm_weights[name_idx[person]], person) for person in uni_names if person in name_idx),
    reverse=True,
)
uni_scores

In [None]:
# uni_names present in name_idx, ranked by apm_weights (highest first)
uni_scores = sorted(
    ((apm_weights[name_idx[person]], person) for person in uni_names if person in name_idx),
    reverse=True,
)
uni_scores

In [None]:
# uni_names present in name_idx, ranked by total_scores (highest first)
uni_scores = sorted(
    ((total_scores[name_idx[person]], person) for person in uni_names if person in name_idx),
    reverse=True,
)
uni_scores

In [None]:
# Hand-picked author names that the following cells look up in each ranking
# (names missing from name_idx are skipped there).
curious_names = ['Xiaolong Wang 0004','Judy Hoffman','Paris Siminelakis','Roie Levin','Leonid Keselman',
                 'Nicholas Rhinehart','Vincent Sitzmann','Siddharth Ancha','Xingyu Lin',
                 'Humphrey Hu','Aditya Dhawale','Nick Gisolfi','Andrey Kurenkov',
                 'David F. Fouhey','Chelsea Finn','Akshara Rai','Ankit Bhatia',
                 'Lerrel Pinto','Graeme Best','Alexander Spitzer','Roberto Shu','Amir Abboud',
                 'Justin Johnson','Kumar Shaurya Shankar','Ellen A. Cappo',
                 'Amir Roshan Zamir','Dominik Peters','Jonathan T. Barron','Dorsa Sadigh','Derek Hoiem','Vaggos Chatziafratis',
                 'Brian Okorn','David Held']

In [None]:
# curious_names present in name_idx, ranked by pwunk_apm_weights (highest first)
uni_scores = sorted(
    ((pwunk_apm_weights[name_idx[person]], person) for person in curious_names if person in name_idx),
    reverse=True,
)
uni_scores

In [None]:
# curious_names present in name_idx, ranked by pweff_apm_weights (highest first)
uni_scores = sorted(
    ((pweff_apm_weights[name_idx[person]], person) for person in curious_names if person in name_idx),
    reverse=True,
)
uni_scores

In [None]:
# curious_names present in name_idx, ranked by pw_apm_weights (highest first)
uni_scores = sorted(
    ((pw_apm_weights[name_idx[person]], person) for person in curious_names if person in name_idx),
    reverse=True,
)
uni_scores

In [None]:
# curious_names present in name_idx, ranked by apm_weights (highest first)
uni_scores = sorted(
    ((apm_weights[name_idx[person]], person) for person in curious_names if person in name_idx),
    reverse=True,
)
uni_scores

In [None]:
# curious_names present in name_idx, ranked by total_scores (highest first)
uni_scores = sorted(
    ((total_scores[name_idx[person]], person) for person in curious_names if person in name_idx),
    reverse=True,
)
uni_scores

In [None]:
# First prev_cand list: names that the following cells look up in each ranking
# (names missing from name_idx are skipped there). Reassigned further down.
prev_cand = ['Pulkit Agrawal',
 'Joydeep Biswas',
 'Katherine L. Bouman',
 'David Braun',
 'Naomi T. Fitter',
 'David F. Fouhey',
 'Saurabh Gupta 0001',
 'Judy Hoffman',
 'Hanbyul Joo',
 'Changliu Liu',
 'Petter Nilsson',
 "Matthew O'Toole",
 'Alessandro Roncone',
 'Alanson P. Sample',
 'Manolis Savva',
 'Adriana Schulz',
 'Amy Tabb',
 'Fatma Zeynep Temel',
 'Long Wang 0007',
 'Cathy Wu',
 'Ling-Qi Yan']

In [None]:
# prev_cand names present in name_idx, ranked by pwunk_apm_weights (highest first)
uni_scores = sorted(
    ((pwunk_apm_weights[name_idx[person]], person) for person in prev_cand if person in name_idx),
    reverse=True,
)
uni_scores

In [None]:
# prev_cand names present in name_idx, ranked by pweff_apm_weights (highest first)
uni_scores = sorted(
    ((pweff_apm_weights[name_idx[person]], person) for person in prev_cand if person in name_idx),
    reverse=True,
)
uni_scores

In [None]:
# prev_cand names present in name_idx, ranked by pw_apm_weights (highest first)
uni_scores = sorted(
    ((pw_apm_weights[name_idx[person]], person) for person in prev_cand if person in name_idx),
    reverse=True,
)
uni_scores

In [None]:
# prev_cand names present in name_idx, ranked by apm_weights (highest first)
uni_scores = sorted(
    ((apm_weights[name_idx[person]], person) for person in prev_cand if person in name_idx),
    reverse=True,
)
uni_scores

In [None]:
# prev_cand names present in name_idx, ranked by total_scores (highest first)
uni_scores = sorted(
    ((total_scores[name_idx[person]], person) for person in prev_cand if person in name_idx),
    reverse=True,
)
uni_scores

In [None]:
# Second prev_cand list (replaces the earlier one for the cells that follow).
prev_cand = [
    'Yonatan Bisk', 'Angela Dai', 'Abe Davis', 'Tali Dekel',
    'Jaime F. Fisac', 'Zakia Hammal', 'Josie Hughes', 'László A. Jeni',
    'Angjoo Kanazawa', 'Deepak Pathak', 'Lerrel Pinto', 'Elaine Short',
    'Wen Sun 0002', 'Jiajun Wu 0001',
    #'Ji Zhang', # disambig
    'Jun-Yan Zhu', 'Yuke Zhu',
]
# displayed result: the names above that are missing from name_idx
[candidate for candidate in prev_cand if candidate not in name_idx]

In [None]:
# prev_cand names present in name_idx, ranked by pwunk_apm_weights (highest first)
uni_scores = sorted(
    ((pwunk_apm_weights[name_idx[person]], person) for person in prev_cand if person in name_idx),
    reverse=True,
)
uni_scores

In [None]:
# prev_cand names present in name_idx, ranked by pweff_apm_weights (highest first)
uni_scores = sorted(
    ((pweff_apm_weights[name_idx[person]], person) for person in prev_cand if person in name_idx),
    reverse=True,
)
uni_scores

In [None]:
# prev_cand names present in name_idx, ranked by pw_apm_weights (highest first)
uni_scores = sorted(
    ((pw_apm_weights[name_idx[person]], person) for person in prev_cand if person in name_idx),
    reverse=True,
)
uni_scores

In [None]:
# prev_cand names present in name_idx, ranked by apm_weights (highest first)
uni_scores = sorted(
    ((apm_weights[name_idx[person]], person) for person in prev_cand if person in name_idx),
    reverse=True,
)
uni_scores

In [None]:
# prev_cand names present in name_idx, ranked by total_scores (highest first)
uni_scores = sorted(
    ((total_scores[name_idx[person]], person) for person in prev_cand if person in name_idx),
    reverse=True,
)
uni_scores

In [None]:
faculty_affil = pd.read_csv('faculty-affiliations.csv')

# inclusive number of calendar years between each author's first and last paper
year_span = auth_years[:,1] - auth_years[:,0] + 1
# first CSV column -> second CSV column (used below as author name -> affiliation)
faculty_lookup = {row[0]: row[1] for row in faculty_affil.itertuples(index=False)}


In [None]:
first_paper_year = 2013
min_pub_years = 2
# 1.0 for authors whose first paper is in/after first_paper_year and who have at
# least min_pub_years distinct publication years, else 0.0.  Uses the builtin
# float: the `np.float` alias was removed from NumPy and raises AttributeError.
eligible = ((auth_years[:,0] >= first_paper_year) & (year_span2 >= min_pub_years)).astype(float)
cand_total = total_scores * eligible

cand_ri = ri_scores * eligible

# per-active-year ("effective") versions of both scores
cand_total_ef = cand_total/year_span2
cand_ri_ef = cand_ri/year_span2

print('{}\t{:30s}\t{:s}\t{:s}\t{:s}\t{:s}\t{:s}\t{:20s}'.format('Rank','Author',
                                                      'RI','Total',
                                                      'eRI','eTotal',
                                                      'Since','Affiliation'))
# ranked by effective total score; rank numbers keep counting across skipped names
for num,idx in enumerate(np.argsort(cand_total_ef)[::-1][:50000]):
    uni = faculty_lookup.get(all_authors[idx],'None')
    # skip names that also exist with a ' 0001' suffix
    if all_authors[idx] + ' 0001' in name_idx:
        continue
    print('{}\t{:30s}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.0f}\t{:20s}'.format(num+1,all_authors[idx],
                                                      cand_ri[idx],cand_total[idx],
                                                      cand_ri_ef[idx],cand_total_ef[idx],
                                                      auth_years[idx,0],uni))



In [None]:
first_paper_year = 2013
min_pub_years = 2
# Pairwise APM weight, zeroed for authors outside the candidate window.  Uses the
# builtin float: the `np.float` alias was removed from NumPy and raises AttributeError.
# NOTE(review): this cell filters on year_span (last - first + 1) while the
# previous table filters on year_span2 (distinct publication years) — confirm
# the difference is intended.
eligible = ((auth_years[:,0] >= first_paper_year) & (year_span >= min_pub_years)).astype(float)
cand_apm_total = pw_apm_weights * eligible


print('{}\t{:30s}\t{:s}\t{:s}\t{:s}\t{:s}\t{:20s}'.format('Rank','Author',
                                                      'APM','Total','eTotal',
                                                      'Since','Affiliation'))
# ranked by masked pairwise APM; rank numbers keep counting across skipped names
for num,idx in enumerate(np.argsort(cand_apm_total)[::-1][:50000]):
    uni = faculty_lookup.get(all_authors[idx],'None')
    # skip names that also exist with a ' 0001' suffix
    if all_authors[idx] + ' 0001' in name_idx:
        continue
    print('{}\t{:30s}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.0f}\t{:20s}'.format(num+1,all_authors[idx],
                                                      cand_apm_total[idx],total_scores[idx],eff_scores[idx],
                                                      auth_years[idx,0],uni))



In [None]:
# Per-paper breakdown for one author: credit per person, author count, number of
# co-author pairs, and the lowest/highest total score among the paper's authors.
name_to_test = 'Alanson P. Sample'
print(total_scores[name_idx[name_to_test]])
row_fmt = '{:.2f}\t{:d} {:d}\t{:.1f}\t{:.1f}'
for paper in all_papers:
    tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    if name_to_test in authors:
        n = len(authors)
        ji = years_per_conf*conf_idx[venue] + (year-min_year)//YEAR_BLOCKS
        per_person_credit = clf[ji]/n
        n_pairs = len(list(itertools.combinations(sorted(authors), 2)))
        coauthor_totals = [total_scores[name_idx[who]] for who in authors]
        print(row_fmt.format(per_person_credit,n,n_pairs,min(coauthor_totals),max(coauthor_totals)))

In [None]:
# Persist every weight vector; file stems carry the fit_intercept flag
# (e.g. 'apmFalse'), and np.save appends '.npy'.  The 'apm' array used to be
# written twice; each array is now saved once.
weights_by_stem = (
    ('apm', apm_weights),
    ('pwapm', pw_apm_weights),
    ('pweffapm', pweff_apm_weights),
    ('pwunkapm', pwunk_apm_weights),
)
for stem, weights in weights_by_stem:
    np.save(stem+str(FI),weights)
np.save('total',total_scores)