In [None]:
import os
import sys
import fnmatch
import numpy as np
import pandas as pd
import json
import gzip
import pickle
import csv
import scipy.sparse
Xauth = None
from collections import defaultdict
import matplotlib.pyplot as plt

In [None]:
# Install a bulk-update method on dok_matrix under the name ``my_update``.
# Depending on the scipy version, the public ``update`` may refuse to run and
# the private ``_update`` may or may not exist, so each candidate is probed on
# a throw-away matrix and the first one that works is used.
def _dok_setitem_update(self, values):
    """Fallback bulk update: assign every ``{(row, col): value}`` entry one at a time."""
    for key, value in values.items():
        self[key] = value

a = scipy.sparse.dok_matrix((10,10))
for _candidate in ('update', '_update'):
    _method = getattr(scipy.sparse.dok_matrix, _candidate, None)
    if _method is None:
        continue
    try:
        _method(a, {(0,0):1.0})
    except (NotImplementedError, AttributeError, TypeError, ValueError):
        # This candidate is unusable on this scipy version; try the next one.
        continue
    scipy.sparse.dok_matrix.my_update = _method
    break
else:
    # Neither method is usable: fall back to plain item assignment.
    scipy.sparse.dok_matrix.my_update = _dok_setitem_update

In [None]:
# Load the three gzip-compressed pickles (venues, authors, papers) the rest of
# the notebook works from.
# NOTE(review): pickle.load can execute arbitrary code -- only open files that
# were produced by a trusted pipeline.
with gzip.open('useful_venue_list.pkl.gz','rb') as fp:
    all_venues = pickle.load(fp)
with gzip.open('useful_authors_list.pkl.gz','rb') as fp:
    all_authors = pickle.load(fp)
with gzip.open('useful_papers.pkl.gz','rb') as fp:
    all_papers = pickle.load(fp)


In [None]:

# Year range taken from field 6 of the first and last paper records
# (presumably all_papers is sorted by year -- TODO confirm).
min_year, max_year = all_papers[0][6], all_papers[-1][6]
span_years = 1 + max_year - min_year
print(min_year,max_year,span_years)

# Name -> position lookups for venues and authors.
conf_idx = dict(zip(all_venues, range(len(all_venues))))
name_idx = dict(zip(all_authors, range(len(all_authors))))

n_confs, n_auths, n_papers = len(all_venues), len(all_authors), len(all_papers)
print(n_confs,n_auths,n_papers)

In [None]:
# Load the weight vector that later cells index as
# clf[years_per_conf*venue_index + year_block].
clf =  np.load('clf_gold.pkl.npy')
# Number of clf entries per venue, and the number of calendar years that share one entry.
years_per_conf = clf.shape[0]//n_confs
# NOTE(review): if span_years is not a multiple of years_per_conf, then
# (year-min_year)//YEAR_BLOCKS can reach years_per_conf for the latest years and
# index into the next venue's entries -- confirm the shapes line up.
YEAR_BLOCKS = span_years//years_per_conf
import gzip
import pickle
# Pre-computed per-author score arrays, keyed by score name.
# NOTE(review): pickle.load on an untrusted file can execute arbitrary code.
with gzip.open('scoresV2.pkl.gz','rb') as fp:
    scoreV = pickle.load(fp)

In [None]:
# Per-author accumulators filled in one pass over the papers:
#   author_totals     - sum of position-weighted venue value
#   author_coauthors  - value-weighted mean number of authors per paper
#   author_coauth_set - everyone sharing a paper with the author (author included)
#   last_years        - latest publication year seen
author_totals = np.zeros(n_auths)
author_coauthors = np.zeros(n_auths)
author_coauth_set = defaultdict(set)
author_vecs = {}   # author count -> normalised 1/position weights
last_years = np.zeros(n_auths)

for paper in all_papers:
    tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    n = len(authors)
    clf_pos = years_per_conf*(conf_idx[venue]) + (year-min_year)//YEAR_BLOCKS
    # Floor the venue weight so it is never zero or negative.
    c_value = max(clf[clf_pos],1e-9)
    if n not in author_vecs:
        harmonic = 1/(np.arange(n)+1)
        author_vecs[n] = harmonic/harmonic.sum()

    for ai,a in enumerate(authors):
        idx = name_idx[a]
        last_years[idx] = max(last_years[idx],year)
        w = c_value*author_vecs[n][ai]
        author_totals[idx] += w
        author_coauthors[idx] += n*w
        author_coauth_set[a].update(authors)

# Turn the weighted sum of author counts into a weighted mean (guarding against /0).
author_coauthors /= np.maximum(1e-12,author_totals)
author_coauth_num = np.array([len(author_coauth_set[name]) for name in all_authors])

In [None]:
# Build a combined ranking vector: start from ones, add the z-score of every
# selected column, then divide by the number of columns.
# Note the initial ones are divided too, so vec = (1 + sum of z-scores)/len(prod_col).
vec = np.ones_like(scoreV['1/i_CV_1970'])
# Alternative column selections kept for experimentation:
#prod_col = [v for k,v in scoreV.items() if 'apm' in k] + [1/scoreV['working_years'],1/author_coauthors,1/author_coauth_num,scoreV['1/i_CV_1970'],scoreV['1/i_GR_1970'],scoreV['full_total_1970']]
#prod_col = [scoreV['1/i_total_1970']] #+ [scoreV['pw_apmTrue']]
#prod_col = [v for k,v in scoreV.items() if 'apm' in k]
prod_col = [scoreV['_apmTrue'],scoreV['_apmFalse']]
#prod_col = [1/scoreV['working_years'],1/author_coauthors,1/author_coauth_num,scoreV['1/i_CV_1970'],scoreV['1/i_GR_1970'],scoreV['full_total_1970']]
#prod_col = [scoreV['1/i_CV_1970'],scoreV['1/i_GR_1970'],scoreV['1/i_ML_1970'],scoreV['full_total_1970'],1/author_coauthors,1/scoreV['working_years']]

# Raw total score, printed next to the combined score in the ranking table.
tot_score = scoreV['full_total_1970']
for col in prod_col:
    # Standardise each column before summing so the columns are comparable.
    vec += (col-col.mean())/col.std()
vec = (vec)/len(prod_col)

In [None]:
# Table of the top 8000 authors by combined score, restricted to those who
# published in 2018 or later and who pass the career-length / collaboration filters.
print('{:20s}\t{}\t{}\t{}\t{}\t{}'.format('name','scoreV','score','coauth','auth','yr'))
row_fmt = '{:20s}\t{:.2f}\t{:.0f}\t{:.0f}\t{:.1f}\t{:d}'
ranked = np.argsort(vec)[::-1]
for i in ranked[:8000]:
    yz = int(scoreV['working_years'][i])
    pz_per_year = author_coauth_num[i]/yz
    if not (last_years[i] < 2018):
        passes_filters = yz < 80 and pz_per_year < 150 and author_coauthors[i] <= 13.5
        if passes_filters:
            print(row_fmt.format(all_authors[i],vec[i],tot_score[i],pz_per_year,author_coauthors[i],yz))

In [None]:
# Show which score arrays are available in scoreV.
scoreV.keys()

In [None]:
# Check the shape of one of the score arrays.
scoreV['1/i_total_1970'].shape

In [None]:
# Hand-picked authors to look up in the ranking.
curious_names = ['Xiaolong Wang 0004','Judy Hoffman','Paris Siminelakis','Roie Levin','Leonid Keselman',
                 'Nicholas Rhinehart','Vincent Sitzmann','Siddharth Ancha','Xingyu Lin',
                 'Humphrey Hu','Aditya Dhawale','Nick Gisolfi','Andrey Kurenkov','Micah Corah',
                 'David F. Fouhey','Chelsea Finn','Akshara Rai','Ankit Bhatia','Xuning Yang',
                 'Lerrel Pinto','Alexander Spitzer','Roberto Shu','Allison Del Giorno','Nadine Chang',
                 'Justin Johnson 0001','Kumar Shaurya Shankar','Ellen A. Cappo','Hunter Goforth',
                 'Amir Roshan Zamir','Jonathan T. Barron','Dorsa Sadigh','Derek Hoiem','Vaggos Chatziafratis',
                 'Brian Okorn','David Held','Adam W. Harley','Hsiao-Yu Fish Tung','Tess Lee Hellebrekers']
print('{:20s}\t{}\t{}\t{}\t{}'.format('name','score','coauth/yr','avg auth','yr'))
# Walk the ranking from best to worst and print only the selected authors.
curious_lookup = set(curious_names)
curious_fmt = '{:20s}\t{:.1f}\t{:.1f}\t\t{:.1f}\t\t{:d}'
for i in np.argsort(vec)[::-1]:
    if all_authors[i] in curious_lookup:
        yz = int(scoreV['working_years'][i])
        pz_per_year = author_coauth_num[i]/yz
        print(curious_fmt.format(all_authors[i],vec[i],pz_per_year,author_coauthors[i],yz))

In [None]:
# For every 'apm' score, show the ratio between the values of two specific authors.
[(k,v[name_idx['Martial Hebert']]/v[name_idx['Abhinav Gupta 0001']]) for k,v in scoreV.items() if 'apm' in k]

In [None]:
def csv2dict_str_str(fname, encoding=None):
    """Read a two-column CSV file into a ``{first column: second column}`` dict.

    Both values are stripped of surrounding whitespace. Rows with fewer than
    two columns (for example blank lines) are skipped instead of raising
    IndexError; any columns beyond the second are ignored. When a key occurs
    more than once, the last row wins.

    Parameters
    ----------
    fname : path of the CSV file to read.
    encoding : text encoding passed to ``open``; ``None`` keeps the platform default.

    Returns
    -------
    dict mapping str -> str.
    """
    # newline='' lets the csv module handle line endings inside quoted fields itself.
    with open(fname, mode='r', newline='', encoding=encoding) as infile:
        rdr = csv.reader(infile)
        return {row[0].strip(): row[1].strip() for row in rdr if len(row) >= 2}
# Alias table: first CSV column -> second CSV column.
aliasdict = csv2dict_str_str('dblp-aliases-expanded.csv')
# Unique faculty names whose second CSV column equals 'RI', with names found in
# the alias table replaced by their mapped value (row[0] is the DataFrame
# index, so row[1]/row[2] are the first two columns).
ri_names = list(set([aliasdict.get(row[1],row[1]) for row in pd.read_csv('other_ranks/cmu_faculty.csv').itertuples() if row[2] == 'RI']))

In [None]:
# Same ranking table as above, restricted to the names in ri_names.
print('{:20s}\t{}\t{}\t{}\t{}'.format('name','score','coauth/yr','avg auth','yr'))
ri_lookup = set(ri_names)
ri_fmt = '{:20s}\t{:.1f}\t{:.0f}\t\t{:.1f}\t\t{:d}'
for i in np.argsort(vec)[::-1]:
    if all_authors[i] in ri_lookup:
        yz = int(scoreV['working_years'][i])
        pz_per_year = author_coauth_num[i]/yz
        print(ri_fmt.format(all_authors[i],vec[i],pz_per_year,author_coauthors[i],yz))

In [None]:
# associated_colab[x][y] accumulates the weighted value author y earned on
# papers that x is also an author of (x == y gives x's own value).
associated_colab = defaultdict(lambda: defaultdict(float))

for paper in all_papers:
    tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    n = len(authors)
    clf_pos = years_per_conf*(conf_idx[venue]) + (year-min_year)//YEAR_BLOCKS
    c_value = clf[clf_pos]
    for ai,a in enumerate(authors):
        w = c_value*author_vecs[n][ai]
        # Credit a's share to everyone on the paper, a included.
        for a2 in authors:
            associated_colab[a2][a] += w

In [None]:
# List one author's associated collaborators, highest accumulated value first.
sorted([(v,k) for k,v in associated_colab['Jonathan T. Barron'].items()],reverse=True)

In [None]:
# For every author with collaboration data compute
#   ratio_score: value earned by the others on their papers / the author's own value
#   ent_score:   entropy of the value distribution across everyone on their papers
# The locals used to be called `os`, `ms` and `ms_os`; `os` overwrote the `os`
# module imported at the top of the notebook, so they are renamed here.
ratio_score = np.zeros(n_auths)
ent_score = np.zeros(n_auths)

for a in associated_colab:
    cl = associated_colab[a]
    self_w = cl[a]
    vvec = np.array(list(cl.values()))
    total_w = max(1e-9,vvec.sum())
    others_w = max(1e-9,total_w-self_w)

    # Normalise to a distribution; the floor inside the log avoids log(0).
    vvec_N = vvec/total_w
    entr = -(np.log(np.maximum(1e-9,vvec_N))*vvec_N).sum()
    ratio_score[name_idx[a]] = others_w/max(1e-9,self_w)
    ent_score[name_idx[a]] = entr

In [None]:
# Selected authors ordered by ratio_score (highest first), with ratio and entropy columns.
print('{:20s}\t{}\t{}\t{}\t{}\t{}\t{}'.format('name','score','coauth/yr','avg auth','yr','ratio','ent'))
ratio_fmt = '{:20s}\t{:.1f}\t{:.0f}\t\t{:.1f}\t\t{:d}\t{:.1f}\t{:.1f}'
wanted = set(curious_names)
for i in np.argsort(ratio_score)[::-1]:
    if all_authors[i] in wanted:
        yz = int(scoreV['working_years'][i])
        pz_per_year = author_coauth_num[i]/yz
        print(ratio_fmt.format(all_authors[i],vec[i],pz_per_year,author_coauthors[i],yz,ratio_score[i],ent_score[i]))

In [None]:
# Sorted list of everyone who shares a paper with any name in ri_names.
# set().union(...) replaces sum(list_of_lists, []), which copies the growing
# list on every addition and is therefore quadratic in the total size.
ri_colab_set = sorted(set().union(*(author_coauth_set[n] for n in ri_names)))

In [None]:
#df_a = pd.DataFrame([all_authors] + [_ for _ in scoreV.values() if len(_.shape) == 1])

In [None]:
# One row per author, holding the metrics computed in the cells above.
df_a = pd.DataFrame(np.array(all_authors),columns=['name'])

# Value earned by everyone else on each author's papers (own share excluded).
valO_v = np.array([
    sum(w for other,w in associated_colab[a].items() if other != a)
    for a in all_authors
])

metric_columns = [
    ('val', author_totals),
    ('adv', scoreV['pw_apmTrue']),
    ('coauth', author_coauth_num),
    ('avg_co', author_coauthors),
    ('ratio', ratio_score),
    ('ratio2', author_totals/(valO_v+author_totals)),
    ('ent', ent_score),
    ('valO', valO_v),
    ('adv2', scoreV['_apmTrue'] + scoreV['_apmFalse']),
]
for col_name,col_values in metric_columns:
    df_a[col_name] = col_values

In [None]:
# RI collaborators ordered by 'adv', highest first.
# `ascending` is passed by keyword: pandas >= 2.0 rejects the positional
# (by, axis, ascending) form used previously.
df2 = df_a[df_a.name.isin(ri_colab_set)].sort_values('adv', ascending=False)

In [None]:
import seaborn as sns
# Scatter of collaborator-to-own value ratio against own value for the RI collaborators.
plt.scatter(df2.ratio,df2.val)

In [None]:
import scipy.stats
import statsmodels.api as sm
# Drop rows with a ratio of 10 or more, then regress value on number of co-authors.
df3 = df2[df2.ratio < 10]
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(df3['coauth'],df3['val'])
# Keyword arguments: recent seaborn versions no longer accept x/y/data positionally.
sns.lmplot(x='coauth', y='val', data=df3)
slope,intercept


In [None]:
#regf = sm.OLS(df_r2['val'],df_r2['coauth']).fit()
#regf.summary()

In [None]:
plt.style.use('fivethirtyeight')
#plt.figure(figsize=(4,4))
# RI faculty with value below 300. The explicit .copy() makes df_r an
# independent frame, so adding 'pval' below neither triggers
# SettingWithCopyWarning nor depends on view/copy semantics.
df_r = df_a[df_a.name.isin(ri_names)]
df_r = df_r[df_r.val < 300].copy()
# Predicted value: a fixed 0.62 per distinct co-author.
df_r['pval'] = df_r.coauth*0.62
# Keyword arguments: recent seaborn versions no longer accept x/y/data positionally.
sns.lmplot(x='val', y='pval', data=df_r)
#plt.scatter(df_r.val,df_r.coauth*0.25)
#plt.xlabel('academic value')
#plt.ylabel('coauthors times 0.25')
#plt.grid(True)
#plt.axis('equal')
#plt.xlim(0,200)
plt.ylabel('CoAuthors x 0.62')
plt.xlabel('Academic Contribution')
plt.title('RI Faculty')

In [None]:
#df_r2['pval'] = 0.6*df_r2.coauth
#df_r2['overExpect'] = (df_r2.val - df_r2.pval)/df_r2.val
#df_r2.sort_values('overExpect',0,False)

In [None]:
from bs4 import BeautifulSoup
# Parse the saved students page; the context manager closes the file handle,
# which the previous bare open(...).read() left open.
with open('download/ri_students.html','rt') as html_file:
    soup = BeautifulSoup(html_file.read(), 'html.parser')

In [None]:
# Extract, for every person block on the students page: the name (first
# non-empty text line), the advisor names (from the line containing 'Advisor',
# split on ' and '), and whether the block text mentions 'PhD'.
stud_names, advisor_set, is_phd = [], [], []
students = soup.find_all('div',{'class':'RIpersoninfo'})
for stud in students:
    cleaned = (raw.strip().replace('\t',' ') for raw in stud.get_text().split('\n'))
    filt = [line for line in cleaned if line]
    stud_names.append(filt[0])

    advisors = [line.strip() for line in filt if 'Advisor' in line]
    a_set = []
    if advisors:
        # Keep only what follows the last ':' on the advisor line.
        advisor_text = advisors[0].split(':')[-1].strip()
        a_set = [part.strip() for part in advisor_text.split(' and ')]
        a_set = [part for part in a_set if part]

    advisor_set.append(a_set)
    is_phd.append('PhD' in stud.get_text())

In [None]:
from collections import Counter
# (number of students, advisor name) pairs, most students first.
advisor_tally = Counter(name for advisors_of_student in advisor_set for name in advisors_of_student)
a_count = sorted(((count,name) for name,count in advisor_tally.items()), reverse=True)
# Re-read the faculty list with an explicit encoding and rebuild the RI name /
# collaborator collections (ri_colab_set is a set from here on).
faculty_rows = pd.read_csv('other_ranks/cmu_faculty.csv',encoding='utf8').itertuples()
ri_names = list({aliasdict.get(row[1],row[1]) for row in faculty_rows if row[2] == 'RI'})
ri_colab_set = set().union(*(author_coauth_set[n] for n in ri_names))

In [None]:
import thefuzz.process
# Disabled one-off job (guarded by `if False`): fuzzy-match every student who
# has at least one advisor against the RI collaborator names and write the
# candidates to 'stud_lookup2.csv', which the next cell reads back.
if False:
    dblp_stud = []
    dblp_cand_s = []
    for stud_n,advis_n,isP in zip(stud_names,advisor_set,is_phd):
        if len(advis_n) == 0:
            continue
        res = thefuzz.process.extract(stud_n,ri_colab_set)
        # Accept the best fuzzy match when its score (res[0][1]) is at least 88.
        if res[0][1] >= 88:
            r = res[0][0]
            dblp_stud.append(res[0][0])
        # Otherwise accept an exact name, but only when no ' 0001' variant of it exists.
        elif stud_n in name_idx and stud_n + ' 0001' not in name_idx:
            r = stud_n
        else:
            # No acceptable match: leave blank and print the candidates for manual review.
            r = ''
            print(res,stud_n,advis_n)
        dblp_cand_s.append([stud_n,advis_n,isP,r])
    df_s_out = pd.DataFrame(dblp_cand_s,columns = ['name','advis','phd','dblp name'])
    df_s_out.to_csv('stud_lookup2.csv',index=False)

In [None]:
# Read the student lookup table back from disk.
df_s_out = pd.read_csv('stud_lookup2.csv')
# PhD students that have a DBLP name (string cell) ...
filt_set1 = df_s_out[df_s_out['dblp name'].map(lambda x: type(x) is str) & df_s_out.phd]
# ... and PhD students whose DBLP name is missing (NaN cell).
filt_set2 = df_s_out[df_s_out['dblp name'].map(lambda x: type(x) is float and np.isnan(x)) & df_s_out.phd]

dblp_stud = list(filt_set1['dblp name'])
# Display the unmatched PhD students.
filt_set2

In [None]:
# Spot-check the advisor list parsed for the first student.
advisor_set[0]

In [None]:
# Display the DBLP names of the matched PhD students.
dblp_stud

In [None]:
# Print every advisor whose best fuzzy match among the RI faculty names scores below 80.
for n in (pair[1] for pair in a_count):
    res = thefuzz.process.extract(n,ri_names)
    best_match = res[0]
    if best_match[1] < 80:
        print(n,best_match)


In [None]:
# Rows of df_a belonging to the matched PhD students.
df_s = df_a[df_a.name.isin(dblp_stud)]

# OLS of own value on collaborators' value; no constant column is added, so the
# fit has no intercept term.
s_clf = sm.OLS(df_s['val'],df_s[['valO']]).fit()
s_clf.summary()

In [None]:
plt.style.use('fivethirtyeight')
#plt.figure(figsize=(4,4))
#df_s = df_s[df_s.coauth < 100]
#df_s = df_s[df_s.val < 20]

# df_s was obtained by filtering df_a; take an explicit copy before adding
# columns so the assignments do not trigger SettingWithCopyWarning.
df_s = df_s.copy()
# In-sample predictions from the fitted OLS model.
df_s['pval'] = s_clf.predict()

df_s['diff'] = (df_s.val - df_s.pval)#/(df_s.valO+df_s.val)
# Keyword arguments: recent seaborn versions no longer accept x/y/data positionally.
sns.lmplot(x='val', y='pval', data=df_s)
#plt.scatter(df_r.val,df_r.coauth*0.25)
#plt.xlabel('academic value')
#plt.ylabel('coauthors times 0.25')
#plt.grid(True)
#plt.axis('equal')
#plt.xlim(0,200)
plt.ylabel('Predicted value')
plt.xlabel('Academic Contribution')
plt.title('RI Students')
#plt.ylim(0,20)
#plt.xlim(0,20)

In [None]:
# Show up to 300 rows, then list the students by 'ratio2', highest first.
# `ascending` is passed by keyword: pandas >= 2.0 rejects the positional form.
pd.set_option('display.max_rows', 300)
df_s.sort_values('ratio2', ascending=False)

In [None]:
# Parse the saved alumni page; the context manager closes the file handle,
# which the previous bare open(...).read() left open.
with open('download/ri_alumni.html','rt') as alumni_file:
    soup2 = BeautifulSoup(alumni_file.read(), 'html.parser')

In [None]:
# Collect (name, completion year) for every alumni block that has a line equal
# to 'Program: PhD' and a 'Date Completed' line.
# The per-block locals used to be named `is_phd` / `is_complete`; `is_phd`
# silently replaced the per-student boolean list built when parsing the
# students page, so they are renamed here.
alum = soup2.find_all('div',{'class':'col-lg-12 col-md-12 col-sm-12 col-xs-12 alumni-block'})
alum_names = []
for stud in alum:
    res = [_ for _ in stud.get_text().split('\n') if len(_) > 0]

    phd_lines = [_ for _ in res if 'Program: PhD' == _]
    completed_lines = [_ for _ in res if 'Date Completed' in _]
    if phd_lines and len(completed_lines) > 0:
        #print(res)
        # The year is read from the last four characters of the completion line.
        alum_names.append((res[0],int(completed_lines[0][-4:])))

In [None]:
# Display the parsed (name, completion year) pairs.
alum_names

In [None]:
# Ad-hoc lookup: collaborator names containing 'Matt'.
[_ for _ in ri_colab_set if 'Matt' in  _]

In [None]:
# Disabled one-off job (guarded by `if False`): fuzzy-match alumni names
# against the RI collaborator names and write the result to 'alum_lookup2.csv'.
# NOTE(review): the following cell reads 'alum_lookup.csv' (no '2') --
# presumably a hand-corrected copy; confirm which file is intended.
if False:
    import thefuzz.process
    dblp_alum = []
    dblp_alum_yr = []
    failed_alum_find = []
    for stud_n,yr in alum_names:
        if len(ri_colab_set) == 0:
            continue
        res = thefuzz.process.extract(stud_n,ri_colab_set)
        # Best match below 88 counts as a failure: print it and leave the name blank.
        if res[0][1] < 88:
            print(res,stud_n,yr)
            r = ''
        else:
            r = res[0][0]
            dblp_alum.append(r)
            dblp_alum_yr.append((r,yr))
        # Despite its name, this list receives every alumnus, matched or not.
        failed_alum_find.append([stud_n,yr,r])
        
    df_a_out = pd.DataFrame(failed_alum_find,columns = ['name','year','dblp name'])
    df_a_out.to_csv('alum_lookup2.csv',index=False)

In [None]:
# Read the alumni lookup table.
# NOTE(review): the disabled matching cell writes 'alum_lookup2.csv', not this
# file -- confirm this is the intended input.
df_a_out = pd.read_csv('alum_lookup.csv')
# Display the alumni with no DBLP name (NaN cell).
df_a_out[df_a_out['dblp name'].map(lambda x: type(x) is float and np.isnan(x))]

In [None]:
plt.style.use('fivethirtyeight')
#plt.figure(figsize=(4,4))
# Rows of df_a for the alumni with a DBLP name. The explicit .copy() makes df_l
# an independent frame, so the column assignments below do not trigger
# SettingWithCopyWarning.
df_l = df_a[df_a.name.isin(set(df_a_out['dblp name']))].copy()
#df_l = df_l[df_l.coauth < 100]
#df_l = df_l[df_l.val < 20]

# Predicted value: a fixed 0.45 per distinct co-author.
df_l['pval'] = 0.45*df_l.coauth
# Signed gap between actual and predicted value, relative to the larger of the two.
df_l['diff'] = (df_l.val - df_l.pval)/np.maximum(df_l.pval,df_l.val)
# Keyword arguments: recent seaborn versions no longer accept x/y/data positionally.
sns.lmplot(x='val', y='pval', data=df_l)
#plt.scatter(df_r.val,df_r.coauth*0.25)
#plt.xlabel('academic value')
#plt.ylabel('coauthors times 0.25')
#plt.grid(True)
#plt.axis('equal')
#plt.xlim(0,200)
plt.ylabel('CoAuthors')
plt.xlabel('Academic Contribution')
plt.title('RI Students')

In [None]:
# Alumni ordered by value, highest first (`ascending` by keyword: pandas >= 2.0
# rejects the positional form).
df_l.sort_values('val', ascending=False)

In [None]:
# Print the second then the first column of every lookup row.
for record in df_a_out.itertuples(index=False):
    print(record[1],record[0])

In [None]:
# Map DBLP name (third column) -> year (second column) for rows that have a
# string DBLP name.
dblp_alum_yr = {}
for rec in df_a_out.itertuples():
    if type(rec[3]) == str:
        dblp_alum_yr[rec[3]] = rec[2]

# Names removed by hand from the mapping.
bad_list = ['Eric Huang','David Silver']
for n in bad_list:
    dblp_alum_yr.pop(n, None)

# Report mapped names that are not known authors.
for n in dblp_alum_yr:
    if n not in name_idx:
        print(n)

In [None]:
# For each alumnus, accumulate over papers published in or before their mapped
# year: own weighted value, the rest of the paper's value, and the co-author lists.
author_vals = defaultdict(float)
author_vals_O = defaultdict(float)
author_cos = defaultdict(list)

for paper in all_papers:
    tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    n = len(authors)
    clf_pos = years_per_conf*(conf_idx[venue]) + (year-min_year)//YEAR_BLOCKS
    c_value = clf[clf_pos]
    for ai,a in enumerate(authors):
        w = c_value*author_vecs[n][ai]
        if a not in dblp_alum_yr or year > dblp_alum_yr[a]:
            continue
        # New papers' authors are prepended to the running list.
        author_cos[a] = authors + author_cos[a]
        author_vals[a] += w
        author_vals_O[a] += c_value-w

In [None]:
# Number of distinct names on each alumnus's pre-graduation papers
# (the alumnus is included in the count).
author_cos_l = {}
for alum_name,names_on_papers in author_cos.items():
    author_cos_l[alum_name] = len(set(names_on_papers))

In [None]:
#df_a[df_a.name.isin(dblp_alum_yr)].sort_values('adv2',0,False)

In [None]:
# Build one row per alumnus from the four dicts (one dict per row, then transposed).
df_a2 = pd.DataFrame([author_cos_l,author_vals,author_vals_O,dblp_alum_yr]).T
df_a2.columns = ['coauth','val','valO','year']
# Alumni missing from a dict get 0 for that column.
df_a2 = df_a2.fillna(0)
# Predicted value: a fixed 0.5 per distinct co-author.
df_a2['pval'] = df_a2.coauth*0.5
# Signed gap between actual and predicted value, relative to the larger of the
# two (NaN when both are 0).
df_a2['diff'] =  (df_a2.val - df_a2.pval)/np.maximum(df_a2.pval,df_a2.val)

In [None]:
# Sanity check: rows whose 'coauth' is NaN (the fillna(0) applied when df_a2
# was built should leave none).
df_a2[df_a2.coauth.map(lambda x:np.isnan(x))]

In [None]:
# Working copy with rows containing NaN removed.
df_a3 = df_a2.copy()
df_a3 = df_a3.dropna()
#[df_a2.coauth < 190]
#df_a3 = df_a3[df_a3.val < 60]
#df_a3 = df_a3[df_a3.year >= 2010]
# Inspect the 1994 cohort, highest value first (`ascending` by keyword:
# pandas >= 2.0 rejects the positional form).
df_a3[df_a3.year == 1994].sort_values('val', ascending=False)

In [None]:
# Median pre-graduation value per graduation year.
median_val_by_year = df_a3.groupby('year')['val'].median()
plt.plot(median_val_by_year)
plt.xlabel('Graduation Year')
plt.ylabel('Median Production')
plt.title('RI PhD students')

In [None]:
# Actual value against the 0.5-per-co-author prediction.
plt.scatter(df_a3.val,df_a3.coauth*0.5)

In [None]:
# OLS of value on co-author count and collaborators' value; no constant column
# is added, so the fit has no intercept term.
regf = sm.OLS(df_a3['val'],df_a3[['coauth','valO']]).fit()
regf.summary()

In [None]:
# Histogram (50 bins) of pre-graduation value, titled with the median and the
# 25th/75th percentiles.
plt.hist(df_a3.val,50)
#plt.xlim(0,20)
median_val = np.median(df_a3.val)
lower_quartile = np.quantile(df_a3.val,0.25)
upper_quartile = np.quantile(df_a3.val,0.75)
plt.title('RI PhD Student Productivity\nMedian: {:.2f}, IQR: [{:.1f} to {:.1f}]'.format(median_val,lower_quartile,upper_quartile))
plt.xlabel('Production before graduating')

In [None]:
# In-sample OLS predictions and the relative gap between actual and predicted value.
df_a3['pred'] = regf.predict()
df_a3['diff2'] =  (df_a3.val - df_a3.pred)/np.maximum(df_a3.pred,df_a3.val)
# Largest over-performers first (`ascending` by keyword: pandas >= 2.0 rejects
# the positional form).
df_a3 = df_a3.sort_values('diff2', ascending=False)

In [None]:
# Display the DBLP name -> year mapping.
dblp_alum_yr

In [None]:
# Alumni ordered by 'diff', highest first (`ascending` by keyword:
# pandas >= 2.0 rejects the positional form).
df_a3.sort_values('diff', ascending=False)