## Ideas for statistics


#### 3 types of authorship model scores (full, 1/n, 1/position)
* Mean Scores across years (x)
* Max Scores across year & Year of Max ( )
* Average value in last 7 years  ( )
* Most productive co-author (x)

#### Remaining Analysis from 1/position
* "RI" Conf sub-score (X)
* "Top Graphics","Top Vision", "Top Robotics", "Top ML", "Other" sub-scores (x)
* Average number of authors (x)
* Average Author Position (x)
* Average & Median "quality" of collab ( )
* Current Affiliation (x)
* Total number of collab ( )
* Top 3 collabs (x)
* Top 3 conferences from generated value ( )
* Career length (x)
* Number of collabs w/ more than 4 papers

#### Advanced Stats from 1/n
* 5 unlabeled variants of plus-minus (w/ intercept) (x)
* 5 unlabeled variants of plus-minus (w/o intercept) (x)

#### NSF Data
* Total number of grants (x)
* Total grant money (x)
* fractional grant money (x)
* grant money of collabs ( )

In [None]:
import os
import sys
import fnmatch
import numpy as np
import pandas as pd
import json
import gzip
import pickle
import csv
import scipy.sparse
Xauth = None
from collections import defaultdict
import matplotlib.pyplot as plt

In [None]:
# Install `dok_matrix.my_update`, a bulk "insert many entries from a dict"
# helper that works regardless of which update method this SciPy exposes.
# Probe order: public `update`, then private `_update`, then a plain
# item-by-item fallback. Only the failures a disabled/missing method can
# produce are caught, so unrelated errors are no longer swallowed.
a = scipy.sparse.dok_matrix((10,10))
try:
    a.update({(0,0):1.0})
    scipy.sparse.dok_matrix.my_update = scipy.sparse.dok_matrix.update
except (NotImplementedError, AttributeError, TypeError):
    try:
        a._update({(0,0):1.0})
        scipy.sparse.dok_matrix.my_update = scipy.sparse.dok_matrix._update
    except (NotImplementedError, AttributeError, TypeError):
        def _dok_itemwise_update(self, data):
            """Insert every ``(row, col) -> value`` pair of *data* one at a time."""
            for key, value in data.items():
                self[key] = value
        scipy.sparse.dok_matrix.my_update = _dok_itemwise_update

In [None]:
# Load the three gzip-compressed pickles this notebook works from: the venue
# list, the author list and the paper records.
# NOTE(review): pickle.load can execute arbitrary code; these files are
# assumed to be locally generated and trusted.
with gzip.open('useful_venue_list.pkl.gz','rb') as fp:
    all_venues = pickle.load(fp)
with gzip.open('useful_authors_list.pkl.gz','rb') as fp:
    all_authors = pickle.load(fp)
with gzip.open('useful_papers.pkl.gz','rb') as fp:
    all_papers = pickle.load(fp)


In [None]:

# Field 6 of a paper record is its year (see the 13-field unpacking used in
# the loops below). Taking the first/last record as min/max
# assumes all_papers is sorted by year -- TODO confirm.
min_year = all_papers[0][6]
max_year = all_papers[-1][6]
span_years = max_year - min_year + 1
print(min_year,max_year,span_years)
# Name -> position lookups for venues and authors.
conf_idx = {v:i for i,v in enumerate(all_venues)}
name_idx = {v:i for i,v in enumerate(all_authors)}
n_confs = len(all_venues)
n_auths = len(all_authors)
n_papers = len(all_papers)
print(n_confs,n_auths,n_papers)

In [None]:
# scoreV collects every per-author score array computed in this notebook.
scoreV = {}
# clf is a flat vector with one entry per (venue, year-block); other cells
# index it as years_per_conf*conf_idx[venue] + (year-min_year)//YEAR_BLOCKS.
clf =  np.load('clf_gold.pkl.npy')
# Number of year-blocks stored per venue, and how many calendar years each
# block spans (integer division: assumes both divide evenly -- TODO confirm).
years_per_conf = clf.shape[0]//n_confs
YEAR_BLOCKS = span_years//years_per_conf
clf[2323]


In [None]:

# Load the four adjusted-plus-minus score arrays for both values of FI.
# scoreV key prefix -> file name prefix; the suffix is str(FI) in both.
_APM_SOURCES = (
    ('_apm', 'apm'),
    ('pw_apm', 'pwapm'),
    ('pweff_apm', 'pweffapm'),
    ('pwunk_apm', 'pwunkapm'),
)
for FI in (False, True):
    for _key_prefix, _file_prefix in _APM_SOURCES:
        scoreV[_key_prefix + str(FI)] = np.load(_file_prefix + str(FI) + '.npy')
    print(scoreV['pwunk_apm' + str(FI)].shape)


In [None]:
# Try to reuse the cached score dictionary; fall back to recomputing it.
# PROCESS_DATA tells the later cells whether the expensive per-author
# matrices must be rebuilt; USE_LIMITS toggles the alternative credit rule.
if True:
    import gzip
    import pickle
    try:
        with gzip.open('scoresV2.pkl.gz','rb') as fp:
            scoreV = pickle.load(fp)
    except (OSError, EOFError, pickle.UnpicklingError) as err:
        # Missing/corrupt cache: keep whatever scoreV already holds and
        # report why, instead of hiding every possible error.
        print('failed!', repr(err))
    # A complete cache holds at least 13 entries; fewer means recompute.
    PROCESS_DATA = len(scoreV) < 13
    print(scoreV['pwunk_apm' + str(FI)].shape)
    USE_LIMITS = False
    if PROCESS_DATA:
        # The recompute cells store `last_years` into scoreV, so it has to
        # exist on this path exactly as it does on the else path.
        last_years = np.load('last_years.npy')
else:
    PROCESS_DATA = True
    USE_LIMITS = False
    last_years = np.load('last_years.npy')

In [None]:
[(k,_.shape) for k,_ in scoreV.items() if _.shape[0] == 2337504]

In [None]:
# First candidate set: names from the JSON export (only its size is printed;
# the set itself is replaced at the end of this cell).
with open('top_ri_Metric.json','rt') as fp:
    interesting_set = set(json.load(fp))
print(len(interesting_set))
# Hand-picked names, kept for ad-hoc experiments (see the commented line).
curious_names = ['Xiaolong Wang 0004','Judy Hoffman','Paris Siminelakis','Roie Levin','Leonid Keselman',
                 'Nicholas Rhinehart','Vincent Sitzmann','Siddharth Ancha','Xingyu Lin',
                 'Humphrey Hu','Aditya Dhawale','Nick Gisolfi','Andrey Kurenkov','Micah Corah',
                 'David F. Fouhey','Chelsea Finn','Akshara Rai','Ankit Bhatia','Xuning Yang',
                 'Lerrel Pinto','Graeme Best','Alexander Spitzer','Roberto Shu','Amir Abboud',
                 'Justin Johnson','Kumar Shaurya Shankar','Ellen A. Cappo',
                 'Amir Roshan Zamir','Dominik Peters','Jonathan T. Barron','Dorsa Sadigh','Derek Hoiem','Vaggos Chatziafratis',
                 'Brian Okorn','David Held']
#interesting_set = set(curious_names)
# The set actually used downstream: candidate names from the pickle file.
with open('ri_cand_names.pkl','rb') as fp:
    interesting_set = set(    pickle.load(fp))

In [None]:
#interesting_set.remove('David P. Hayden')
#interesting_set.add('David S. Hayden')

In [None]:
#with open('ri_cand_names.pkl','wb') as fp:
#     pickle.dump(interesting_set,fp)

In [None]:

# Corpus-wide frequency of every title word. Titles are lower-cased, commas,
# colons and periods are dropped, and hyphens become spaces before splitting.
_TITLE_PUNCT = str.maketrans({',': None, '-': ' ', ':': None, '.': None})
words_count = {}
for paper in all_papers:
    tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    for token in title.lower().translate(_TITLE_PUNCT).split():
        words_count[token] = words_count.get(token, 0) + 1

In [None]:
sorted([(v,k) for k,v in words_count.items()],reverse=True)[:100]

In [None]:
# For every author, collect one "title word commonness" value per paper:
# the mean corpus frequency of the paper's title words, ignoring words seen
# only once and words at least as frequent as 'an'.
words_total =sum(words_count.values())
word_person = {}
# Loop-invariant cutoff, looked up once instead of once per word.
common_cutoff = words_count['an']
for paper in all_papers:
    tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    freqs = [words_count.get(w,0) for w in title.lower().replace(',','').replace('-',' ').replace(':','').replace('.','').split()]
    wc = [c for c in freqs if 1 < c < common_cutoff]
    if len(wc) > 0:
        wc = np.mean(wc)
        for a in authors:
            # Append in place; the previous `get(a,[]) + [wc]` copied the
            # author's whole list on every paper.
            word_person.setdefault(a, []).append(wc)


In [None]:
# Collapse each author's list of per-paper values to the mean of their logs.
# NOTE(review): this rebinds word_person from lists to scalars, so running
# the cell twice applies the log again to the already-reduced values.
word_person = {k: np.mean(np.log(v)) for k,v in word_person.items()}

In [None]:
# Reference group of names; later cells add them to the RI name set and use
# them as the normalisation baseline.
prev_cand = [
    'Changliu Liu',
    "Matthew O'Toole",
    'Jun-Yan Zhu',
    'Wenzhen Yuan',
    'Oliver Kroemer',
    'James McCann',
    'Ioannis Gkioulekas',
    'Keenan Crane',
    'Henny Admoni',
    'Shubham Tulsiani',
    'Melisa Orta Martinez',
    'Fatma Zeynep Temel',
    'Deepak Pathak',
    'David Held',
    'Zachary Manchester',
]



In [None]:
# Build the set of RI names: rows of the CMU faculty table whose dept is
# 'RI', plus everyone in prev_cand.
cmu_uni = pd.read_csv('other_ranks/cmu_faculty.csv')
cmu_uni = cmu_uni.fillna('Other')
cmu_uni = cmu_uni[cmu_uni.dept == 'RI']
uni_names = set(list(cmu_uni.name))
for n in prev_cand:
    uni_names.add(n)
print(len(uni_names))
# Per venue: fractional paper count (1/n per RI author) and the matching
# clf-weighted count, restricted to papers from 2004 onwards.
conf_counts = {}
conf_counts_value = {}

#interesting_set = uni_names

for paper in all_papers:
    tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    if year < 2004:
        continue
    n = len(authors)
    for a in authors:
        if a in uni_names:
            conf_counts[venue] = 1/n + conf_counts.get(venue,0)
            conf_counts_value[venue] = clf[years_per_conf*(conf_idx[venue]) + (year-min_year)//YEAR_BLOCKS]/n + conf_counts_value.get(venue,0)
# Turn the weighted sum into an average clf value per fractional paper.
conf_counts_value = {k: v/conf_counts[k] for k,v in conf_counts_value.items()}
# Tuples of (count*avg value, venue, count, avg value), built in descending
# order of fractional count.
ri_fav_confs = [(conf_counts[_[1]]*conf_counts_value[_[1]],_[1],conf_counts[_[1]],conf_counts_value[_[1]]) for _ in sorted([(v,k) for k,v in conf_counts.items() if v > 0],reverse=True)]

In [None]:
# Venues ordered by total weighted value, keeping only those whose fractional
# paper count (tuple element -2) is at least 1.25.
ri_confs = [_[1] for _ in sorted(ri_fav_confs,reverse=True) if _[-2] >= 1.25]
#confs_to_filter =['ICRA','IROS','Robotics: Science and Systems']
ri_confs

In [None]:
# First and last publication year per author, plus the derived career length.
if PROCESS_DATA:
    # Column 0 starts above any real year, column 1 below, so the first
    # paper seen for an author always replaces both sentinels.
    auth_years = np.ones((n_auths,2)) * np.array([3000,1000])
    for paper in all_papers:
        tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
        for a in authors:
            i = name_idx[a]
            if year < auth_years[i,0]:
                auth_years[i,0] = year
            if year > auth_years[i,1]:
                auth_years[i,1] = year
    # Inclusive span: a single-year career counts as 1.
    working_years = auth_years[:,1] - auth_years[:,0] + 1
    scoreV['working_years'] = working_years
    scoreV['auth_years'] = auth_years
    scoreV['last_years'] = last_years


In [None]:
# Collect every distinct author-count that occurs in the corpus; the credit
# vectors below are precomputed once per such count.
if PROCESS_DATA:
    valid_ns = set()
    for paper in all_papers:
        tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
        n = len(authors)
        valid_ns.add(n)


In [None]:
# With USE_LIMITS, make sure every author-count below the maximum is present.
if USE_LIMITS:
    valid_ns.update(range(max(valid_ns)))

In [None]:
# Venue groups used for the per-area sub-scores. 'RI' is data-driven (the
# ri_confs list computed above); the others are fixed venue lists.
# Every name listed here is later looked up in conf_idx.
conf_types = {
        'RI': ri_confs,
        'ML':['NIPS','ICML','AAAI','AISTATS','IJCAI','UAI','CoRL','ICLR'],
        'CV':['CVPR','ICCV','ECCV','IEEE Trans. Pattern Anal. Mach. Intell.','FGR','Int. J. Comput. Vis.','WACV','BMVC','ACCV'],
        'ROB':['HRI','Int. J. Robotics Res.','Robotics: Science and Systems','Humanoids','WAFR','IROS','ICRA','FSR','ISER','ISRR','AAMAS','IEEE Robotics Autom. Lett.','IEEE Trans. Robotics and Automation'],
        'GR':['ACM Trans. Graph.','Comput. Graph. Forum','SIGGRAPH','SIGGRAPH Asia','Symposium on Computer Animation'],
             }

In [None]:
# Authorship-credit models: full credit to every author, equal split (1/n),
# and position-weighted (1/i, normalised).
am_types = ['full','1/n','1/i']
# NOTE(review): year_filters is not referenced anywhere in the visible code.
year_filters = [1970,1990,2000,2010]

In [None]:
# Build YearConf, a sparse projection from (venue, year-block) rows to
# (year-block, venue-group) columns. For year-block i, column i*confTypeN
# holds the clf weights of all venues in that block, and columns
# i*confTypeN+1.. hold the same weights restricted to each conf_types group
# (in conf_types iteration order).
if PROCESS_DATA:
    # +1 for the unrestricted "all venues" column.
    confTypeN = len(conf_types)+1
    YearConf = scipy.sparse.lil_matrix((n_confs*years_per_conf,years_per_conf*confTypeN))
    for i in range(years_per_conf):
        # Mask viewed as (venue, year-block); select year-block i everywhere.
        year_filter = np.zeros_like(clf).reshape((-1,years_per_conf))
        year_filter[:,i] = 1
        YearConf[:,i*confTypeN] = (clf * year_filter.reshape(clf.shape))[:,np.newaxis]
        j = 1
        for f_type, f_confs in conf_types.items():
            # Same mask, but only for the venues of this group.
            year_filter = np.zeros_like(clf).reshape((-1,years_per_conf))
            for conf in f_confs:
                year_filter[conf_idx[conf],i] = 1
            YearConf[:,i*confTypeN+j] = (clf * year_filter.reshape(clf.shape))[:,np.newaxis]
            j+=1
    # CSR for the matrix product performed in the next cell.
    YearConf = scipy.sparse.csr_matrix(YearConf)

In [None]:
import scipy.sparse
import gc
# For each authorship-credit model, build the sparse author x (venue,
# year-block) credit matrix and project it through YearConf into per-author,
# per-year-block, per-venue-group scores stored in scoreV[amt].
if PROCESS_DATA:
    for amt in am_types:

        # per_author_val[n] is the credit vector for a paper with n authors.
        per_author_val = {}

        if amt == 'full':
            for n in valid_ns:
                per_author_val[n] = np.ones(n)
        elif amt == '1/n':
            for n in valid_ns:
                author_scores = np.ones(n)
                per_author_val[n] = author_scores/author_scores.sum()
        elif amt == '1/i':
            for n in valid_ns:
                author_scores = 1/(np.arange(n)+1)
                per_author_val[n] = author_scores/author_scores.sum()
        else:
            # A bare `raise` here had no active exception to re-raise.
            raise ValueError('unknown authorship model: {!r}'.format(amt))

        # One list of (author index, column index, credit) per paper.
        paper_vecs = []
        for paper in all_papers:

            tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
            n = len(authors)
            j = years_per_conf*conf_idx[venue] + (year-min_year)//YEAR_BLOCKS

            author_scores = per_author_val[n]
            if USE_LIMITS:
                # Drop the last author on multi-author papers, and skip any
                # author whose last_years entry precedes this paper's year.
                tmpp = []
                tmpapers = authors[:-1] if n >= 2 else authors
                tmpscores = author_scores[:-1] if n >= 2 else author_scores

                for a,v in zip(tmpapers,tmpscores):
                    idx = name_idx[a]
                    if year > last_years[idx]:
                        continue
                    tmpp.append((idx,j,v))
                paper_vecs.append(tmpp)
            else:
                paper_vecs.append([(name_idx[a],j,v) for a,v in zip(authors,author_scores)])

        Xauth = scipy.sparse.dok_matrix((n_auths,years_per_conf*n_confs))
        # Accumulate in a plain dict first, then bulk-insert once.
        xdict = {}

        for paper_vec in paper_vecs:
            for i,j,v in paper_vec:
                xdict[(i,j)] = v + xdict.get((i,j),0)

        Xauth.my_update(xdict)

        Xauth = scipy.sparse.csr_matrix(Xauth)

        scoreV[amt] = Xauth @ YearConf

        # Release the large temporaries before the next model. The original
        # reset `paper_vec` (a typo), which left `paper_vecs` alive.
        paper_vecs = []
        xdict = {}
        gc.collect()

In [None]:
# Persist the freshly computed scores so later runs can skip the rebuild.
# This runs before the smoothing cell, so the cache holds the unsmoothed
# results of the matrix product.
if PROCESS_DATA:
    import gzip
    import pickle
    with gzip.open('scoresV2.pkl.gz','wb') as fp:
        pickle.dump(scoreV,fp)

In [None]:
print(len(scoreV),PROCESS_DATA)#,years_per_conf


In [None]:
import scipy.ndimage
# Densify each credit model's scores to (author, venue-group, year-block) and
# smooth along the year-block axis with a Gaussian (sigma = 1 block).
# The kernel matrix depends only on years_per_conf, so it is built once
# instead of once per model.
smooth_kernel = scipy.ndimage.gaussian_filter1d(np.identity(years_per_conf,np.float32),1)
for am in am_types:
    #scores = np.array(scoreV[am]).reshape((n_auths,years_per_conf,-1)).astype(np.float32)
    scores = np.array(scoreV[am].todense()).reshape((n_auths,years_per_conf,-1)).astype(np.float32)
    # Move year-blocks to the last axis so the kernel applies via matmul.
    scores = np.transpose(scores,(0,2,1))
    scores = scores @ smooth_kernel
    # NOTE(review): this replaces the sparse matrix with a dense array, so
    # re-running the cell fails on `.todense()`.
    scoreV[am] = scores

In [None]:
# Names of the venue-group axis: the all-venues column first, then the
# conf_types groups in their iteration order.
sTypes = ['Full', *conf_types]

In [None]:
scores.dtype,scores.nbytes,gc.collect()

In [None]:
# Career summaries from the position-weighted (1/i) model.
auth_years = scoreV['auth_years']
working_years = scoreV['working_years']

# Sum over year-blocks for all venues and for the RI venue group.
total_scores = scoreV['1/i'][:,sTypes.index('Full')].sum(1)
ri_scores = scoreV['1/i'][:,sTypes.index('RI')].sum(1)
ri_eff_scores = ri_scores/working_years#,np.maximum(auth_years[:,1]-2000,1))

# Best single year-block and the calendar year it starts in. Both now come
# from the same '1/i' model; the year was previously taken from '1/n', so the
# printed peak value and peak year could refer to different blocks.
ri_scores_max = scoreV['1/i'][:,sTypes.index('RI')].max(1)
ri_scores_max_yr = np.argmax(scoreV['1/i'][:,sTypes.index('RI')],axis=1)*YEAR_BLOCKS + min_year


In [None]:
#best_idx = np.argsort(total_scores)[::-1]
#for k in range(10):
#    idx = best_idx[k]
#    print('{:30s}\t{:.2f}'.format(all_authors[idx],total_scores[idx]))

In [None]:
# Rebuild interesting_set. The JSON names are loaded only to print their
# count; the set is then replaced by the RI names that exist in name_idx.
interesting_set = set()
with open('top_ri_Metric.json','rt') as fp:
    interesting_set = set(json.load(fp))
    print(len(interesting_set))
#interesting_set.add('Jeff Clune')
interesting_set = set([_ for _ in uni_names if _ in name_idx])
print(len(interesting_set))
interesting_set.add('Dinesh Jayaraman')

In [None]:
# Table from pot_export.csv; later cells index it by its 'Author' column.
pot_cand_df = pd.read_csv('pot_export.csv',index_col=0)
#interesting_set = set(pot_cand_df.Author)


In [None]:
# Candidate names to profile, with one name corrected and one added by hand.
with open('ri_cand_names.pkl','rb') as fp:
    profile_set = set(pickle.load(fp))
profile_set.remove('David Rosen')
profile_set |= {'David M. Rosen', 'Vaggos Chatziafratis'}

# Everyone analysed below: the candidates plus the prev_cand reference group.
interesting_set = profile_set.union(prev_cand)

In [None]:
# Three reference names that are always included in the analysis set
# (they are filtered out again before the final ranking).
Directors = ['Takeo Kanade','Martial Hebert','Matthew T. Mason']
labels = ['TK','MH','MM','RD']
interesting_set.update(Directors)

In [None]:
# Top 1000 authors by peak RI score, ignoring careers of 5 years or less.
best_idx = np.argsort(ri_scores_max * (working_years > 5))[::-1]
_ROW_FMT = '{}\t{:30s}\t{:.2f}\t{:.2f}\t{:.2f}\t{:d}\t{:d}'
for k in range(1000):
    idx = best_idx[k]
    #interesting_set.add(all_authors[idx])
    _fields = (
        k,
        all_authors[idx],
        ri_scores_max[idx],
        ri_eff_scores[idx],
        ri_scores[idx],
        ri_scores_max_yr[idx],
        int(working_years[idx]),
    )
    print(_ROW_FMT.format(*_fields))

In [None]:
# Top 150 authors by RI score per working year (careers longer than 5 years).
best_idx = np.argsort(ri_eff_scores * (working_years > 5))[::-1]
_ROW_FMT = '{}\t{:30s}\t{:.2f}\t{:.2f}\t{:d}'
for k in range(150):
    idx = best_idx[k]
    #interesting_set.add(all_authors[idx])
    _fields = (k, all_authors[idx], ri_eff_scores[idx], ri_scores[idx], int(working_years[idx]))
    print(_ROW_FMT.format(*_fields))

In [None]:
# Top 150 authors by total RI score, with their RI share and first year.
best_idx = np.argsort(ri_scores)[::-1]
_ROW_FMT = '{}\t{:30s}\t{:.2f}\t{:.2f}\t{:d}'
for k in range(150):
    idx = best_idx[k]
    #interesting_set.add(all_authors[idx])

    _ri_share = ri_scores[idx]/total_scores[idx]
    print(_ROW_FMT.format(k, all_authors[idx], ri_scores[idx], _ri_share, int(auth_years[idx,0])))

In [None]:
# Walk the full ranking by total score and print the row (rank k included)
# of one hard-coded author index.
best_idx = np.argsort(total_scores)[::-1]
for k in range(len(best_idx)):
    idx = best_idx[k]
    #interesting_set.add(all_authors[idx])
    # NOTE(review): 228644 is a magic author index; the next cell looks up
    # name_idx['Benjamin A. Newman'], presumably the same person -- confirm.
    if idx == 228644:
        print('{}\t{:30s}\t{:.2f}\t{:.2f}\t{:d}'.format(k,all_authors[idx],ri_scores[idx],ri_scores[idx]/total_scores[idx],int(auth_years[idx,0])))

In [None]:
name_idx['Benjamin A. Newman']

In [None]:
#with open('top_ri3.json','wt') as fp:
#    json.dump(sorted(list(interesting_set)),fp,sort_keys=True,indent=4, separators=(',', ': '))

In [None]:
# Name -> affiliation lookup from the first two columns of the affiliations
# CSV, then forced to CMU for three named people and every cmu_uni row.
faculty_affil = pd.read_csv('faculty-affiliations.csv')
year_span = (auth_years[:,1] - auth_years[:,0]) + 1
faculty_lookup = {}
for _rec in faculty_affil.itertuples():
    faculty_lookup[_rec[1]] = _rec[2]
_CMU = 'Carnegie Mellon University'
for _manual in ('Reid G. Simmons', 'Sebastian Scherer', 'Jeff G. Schneider'):
    faculty_lookup[_manual] = _CMU
for row in cmu_uni.itertuples():
    faculty_lookup[row[1]] = _CMU

## Author affiliated stats

In [None]:
def _two_level(leaf_factory):
    """Return a defaultdict whose missing keys become defaultdict(leaf_factory)."""
    return defaultdict(lambda: defaultdict(leaf_factory))


# Per-author accumulators filled by the paper loop in the next cell.
coauthors = _two_level(int)            # author -> co-author -> shared papers
coauthors_frac = _two_level(float)     # author -> co-author -> sum of 1/n
coauthors_num = defaultdict(list)      # author -> author-count of each paper
author_pos = defaultdict(list)         # author -> (position+1)/n per paper
conf_paper_frac = _two_level(int)      # author -> venue -> sum of 1/n
paper_num = defaultdict(int)           # author -> number of papers
paper_frac = defaultdict(float)        # author -> sum of 1/n


In [None]:
# Fill the per-author accumulators from every paper that has at least one
# author in interesting_set.
for paper in all_papers:
    tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    n = len(authors)
    if not any(member in interesting_set for member in authors):
        continue
    for i,a in enumerate(authors):
        if a not in interesting_set:
            continue
        conf_paper_frac[a][venue] += 1/n
        coauthors_num[a].append(n)
        author_pos[a].append((i+1)/n)
        paper_num[a] += 1
        paper_frac[a] += 1/n
        # Credit every other author on the paper as a collaborator of `a`.
        for a2 in authors:
            if a2 != a:
                coauthors[a][a2] += 1
                coauthors_frac[a][a2] += 1/n
                    

In [None]:
sorted([(v,k) for k,v in coauthors['Martial Hebert'].items()],reverse=True)

In [None]:
np.array(coauthors_num['Martial Hebert']).mean()

In [None]:
# Load the two PageRank vectors (all venues, RI venues) and scale each so its
# maximum is 1. Context managers close the files, which the previous bare
# open() calls never did.
with open('new_pagerank_people.pkl','rb') as fp:
    pr_full = pickle.load(fp)
with open('new_pagerank_people_ri.pkl','rb') as fp:
    pr_ri = pickle.load(fp)
pr_full /= pr_full.max()
pr_ri /= pr_ri.max()
print(pr_ri.shape,pr_full.shape)

# Build it

In [None]:
# Names excluded from the analysis set (dropped only if present).
remove_names = ['Huijuan Xu','David A. B. Hyde','Elahe Soltanaghaei','Ankit Shah 0003']
interesting_set.difference_update(remove_names)

In [None]:
from unidecode import unidecode
# One result row (dict) per analysed author. Key insertion order matters:
# a later cell uses the first row's key order as the DataFrame column order.
author_results = defaultdict(dict)
for name in interesting_set:
    # Raises KeyError if a name in interesting_set is not a known author.
    idx = name_idx[name]
    author_results[name]['Affiliation'] = faculty_lookup.get(name,'Unknown')
    author_results[name]['Years'] = scoreV['working_years'][idx]

In [None]:
# Career totals, peak values and peak years derived from the smoothed
# (author, venue-group, year-block) score tensors.
_full_col = sTypes.index('Full')

# Position-weighted totals: all venues first, then each venue group.
scoreV['1/i_total_1970'] = scoreV['1/i'][:,_full_col].sum(1)
for sub in ['RI','ROB','CV','GR','ML']:
    den = scoreV['1/i_{}_1970'.format(sub)] = scoreV['1/i'][:,sTypes.index(sub)].sum(1)

# All-venue totals for the other two credit models.
for _scheme in ('full','1/n'):
    scoreV[_scheme + '_total_1970'] = scoreV[_scheme][:,_full_col].sum(1)

# Best year-block value per author and the calendar year that block starts.
for _scheme in ('1/n','1/i','full'):
    _per_block = scoreV[_scheme][:,_full_col]
    scoreV[_scheme + '_max'] = _per_block.max(1)
    scoreV[_scheme + '_max_yr'] = np.argmax(_per_block,axis=1)*YEAR_BLOCKS+min_year

In [None]:
for k,v in scoreV.items():
    print(k,v.shape)

In [None]:
# Per-author position statistics over multi-author papers only.
avg_auth_pos = np.zeros(n_auths)      # running sum of relative positions
avg_auth_cnt = np.zeros(n_auths)      # number of multi-author papers
auth_been_last = np.zeros(n_auths)    # times the author was listed last
# Earliest year each author was last author; 3000 means "never".
auth_first_last_year = 3000*np.ones(n_auths)

for paper in all_papers:
    tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    n = len(authors)
    if n  > 1:
        for i,a in enumerate(authors):
            idx = name_idx[a]
            # Relative position: 0 for first author, 1 for last author.
            pos = i/(n-1)
            auth_been_last[idx] += int(pos == 1)
            avg_auth_pos[idx] += pos
            avg_auth_cnt[idx] += 1
        auth_first_last_year[name_idx[authors[-1]]] = min(year,auth_first_last_year[name_idx[authors[-1]]])
# np.save appends '.npy', so this writes last_years.npy (read back below).
np.save('last_years',auth_first_last_year)

In [None]:
last_years = np.load('last_years.npy')
scoreV['last_years'] = last_years

In [None]:
# Hand-tuned composite "meta metric", one value per author.
# Start from the yearly position-weighted score (career length floored at 2).
v = scoreV['1/i_total_1970']/(np.maximum(2,working_years.astype(np.float32)))
# Scale by the square root of the RI share (both terms floored at 1e-3).
ratio_v = np.maximum(1e-3,scoreV['1/i_RI_1970'])/np.maximum(1e-3,scoreV['1/i_total_1970'])
v *= np.sqrt(ratio_v)
# Dampen long careers.
v *= 1/np.log(np.maximum(2,working_years.astype(np.float32)))
# Sum of the eight plus-minus variants, floored at 10 before the log.
tv = np.zeros_like(scoreV['1/i_total_1970'])
for n in ['_apmFalse', '_apmTrue',  'pw_apmFalse', 'pw_apmTrue', 'pweff_apmFalse', 'pweff_apmTrue', 'pwunk_apmFalse','pwunk_apmTrue']:
    print(n,scoreV[n].shape)
    tv +=  scoreV[n]
v *= np.log(np.maximum(tv,10))
# Log of the full-credit total, floored at e so the factor is at least 1.
v *= np.log(np.maximum(np.exp(1),scoreV['full'][:,sTypes.index('Full')].sum(1)))
# Favour recent peak years.
# NOTE(review): a peak year of 1965 or earlier gives log of a non-positive
# number; nan_to_num below then replaces the NaN/inf values.
v *= np.log(scoreV['1/i_max_yr']-1965)
v = np.nan_to_num(v)
meta_metric = v

In [None]:
idx = name_idx['Maria Bauzá']
scoreV['1/i_{}_1970'.format('CV')][idx]

In [None]:
# Fill the score columns of every author's result row. Key insertion order
# is kept exactly, because it later defines the DataFrame column order.
for name in interesting_set:
    idx = name_idx[name]
    total = scoreV['1/i_total_1970'][idx]
    # Denominator for the two shares that were previously unguarded: an
    # exact-zero total is replaced so 0/0 gives 0 instead of NaN.
    nonzero_total = total if total != 0 else 1e-9
    # Read the metric through `meta_metric`: the short global `v` it aliases
    # is rebound by a later `for k,v in ...` loop, which breaks re-runs.
    author_results[name]['MetaMetric'] = meta_metric[idx]
    author_results[name]['Score (1/pos)'] = total
    author_results[name]['RIScore'] = scoreV['1/i_RI_1970'][idx]
    author_results[name]['RI %'] = scoreV['1/i_RI_1970'][idx]/nonzero_total

    # Share of each fixed venue group; whatever remains is 'Other'.
    sum_v = 0
    for sub in ['ROB','CV','GR','ML']:
        den = scoreV['1/i_{}_1970'.format(sub)][idx]
        den = den if den != 0.0 else 0
        author_results[name][sub + ' %'] = den/max(1e-9,total)
        sum_v += den
    author_results[name]['Other %'] = max(0,total - sum_v)/nonzero_total

    author_results[name]['From'] = scoreV['auth_years'][idx][0]
    author_results[name]['Until'] = scoreV['auth_years'][idx][1]

    author_results[name]['YearlyScore (1/pos)'] = total/scoreV['working_years'][idx]
    author_results[name]['YearlyRIScore'] = scoreV['1/i_RI_1970'][idx]/scoreV['working_years'][idx]

# Names of each author's top collaborators (most frequent and "most famous").
new_set = set()

# Second pass over the analysed authors: collaboration, venue, PageRank and
# per-model score columns. Key insertion order defines column order later.
for name in interesting_set:
    idx = name_idx[name]

    author_results[name]['avgCoauthor'] = np.array(coauthors_num[name]).mean()
    # Collaborators by fractional shared papers, and the same weighted by the
    # collaborator's own total score. name_idx[k] raises KeyError for a
    # collaborator missing from the author index.
    colabs = sorted([(v,k) for k,v in coauthors_frac[name].items()],reverse=True)
    fam_colab = sorted([(v*scoreV['1/i_total_1970'][name_idx[k]],k) for k,v in coauthors_frac[name].items()],reverse=True)
    
    # Collaborators with at least 4 shared papers.
    freq_colabs = sorted([(v,k) for k,v in coauthors[name].items() if v >= 4],reverse=True)
    if len(colabs) > 0:
        author_results[name]['mostCoauthorName'] = unidecode(colabs[0][1])
        author_results[name]['mostCoauthorTimes'] = colabs[0][0]
        new_set.add(colabs[0][1])
    else:
        author_results[name]['mostCoauthorName'] = ''
        author_results[name]['mostCoauthorTimes'] = 0

    if len(fam_colab) > 0:
        author_results[name]['famCoauthorName'] = unidecode(fam_colab[0][1])
        new_set.add(fam_colab[0][1])
    else:
        author_results[name]['famCoauthorName'] = ''

    author_results[name]['authorPosition%'] = np.array(author_pos[name]).mean()
    author_results[name]['totalCoauth'] = len(colabs)
    author_results[name]['freqCoauth (> 3 papers)'] = len(freq_colabs)
    author_results[name]['famCoauthFrac'] = sum([_[0] for _ in fam_colab])
    author_results[name]['totalCoauthFrac'] = sum([_[0] for _ in colabs])
    
    # NOTE(review): the [0] raises IndexError for an author with no entries
    # in conf_paper_frac.
    author_results[name]['mostPaperConf'] = sorted([(v,k) for k,v in conf_paper_frac[name].items()],reverse=True)[0][1]
    author_results[name]['venuesPublishedIn'] = len(conf_paper_frac[name].items())
    author_results[name]['pageRank'] = pr_full[idx]
    #author_results[name]['pageRankRI'] = pr_ri[idx]
    
    
    author_results[name]['numPapers'] = paper_num[name]
    author_results[name]['numPapersFrac'] = paper_frac[name]


    author_results[name]['YearlyScore (1/n)'] = scoreV['1/n_total_1970'][idx]/scoreV['working_years'][idx]
    author_results[name]['YearlyScore (Full)'] = scoreV['full_total_1970'][idx]/scoreV['working_years'][idx]
    author_results[name]['Score (1/n)'] = scoreV['1/n_total_1970'][idx]
    author_results[name]['Score (Full)'] = scoreV['full_total_1970'][idx]
    
    author_results[name]['MaxScore'] = scoreV['1/i_max'][idx]
    author_results[name]['MaxScore (1/n)'] = scoreV['1/n_max'][idx]
    author_results[name]['MaxScore (Full)'] = scoreV['full_max'][idx]
    
    author_results[name]['BestYear'] = scoreV['1/i_max_yr'][idx]
    author_results[name]['BestYear (1/n)'] = scoreV['1/n_max_yr'][idx]
    author_results[name]['BestYear (Full)'] = scoreV['full_max_yr'][idx]
    
    # The eight plus-minus variants become columns Adv1..Adv8.
    for i,n in enumerate(['_apmFalse', '_apmTrue',  'pw_apmFalse', 'pw_apmTrue', 'pweff_apmFalse', 'pweff_apmTrue', 'pwunk_apmFalse','pwunk_apmTrue']):
        author_results[name]['Adv'+str(i+1)] =  scoreV[n][idx]


In [None]:
# Turn the per-author dicts into a DataFrame indexed by author name.
# The loop variables are deliberately not `k`/`v`: at notebook scope `v`
# is the meta-metric array, and rebinding it to a dict broke any re-run of
# the cell that reads it.
results_list = []
for author_name, author_row in author_results.items():
    # Adds the 'Name' key to the stored row itself (not to a copy).
    author_row['Name'] = author_name
    results_list.append(author_row)
# Column order follows the key order of the first author's row.
def_order = list(author_results[list(author_results.keys())[0]].keys())
df_results = pd.DataFrame(results_list)[def_order].set_index('Name')

In [None]:
df_results = df_results.fillna(0.0)
df_results


In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Standardise the numeric columns and append the first five whitened
# principal components as columns pca0..pca4.
ss = StandardScaler()
# NOTE(review): _get_numeric_data is a private pandas method.
vecs = ss.fit_transform(df_results._get_numeric_data())
pca = PCA(n_components=5,whiten=True)
res = pca.fit_transform(vecs)
pca.explained_variance_
for i in range(5):
    df_results['pca'+str(i)] = res[:,i]

In [None]:
# Build the working table: drop the Directors rows, then attach plus-minus
# summaries, log-scaled web hit counts and log-scaled Semantic Scholar totals.
df_results_out = df_results[~df_results.index.isin(Directors)]
pot_name = pot_cand_df.set_index('Author')
pot_name.index = pot_name.index.rename('Name')
df_results_out2= df_results_out.copy()
#df_results_out2 = df_results_out.join(pot_name[pot_name.columns.difference(df_results_out.columns)])

# Summaries over the Adv* columns, taken before any other columns are joined.
_adv_cols = [_ for _ in df_results_out2.columns if 'Adv' in _ and 'NSF' not in _]
_adv_frame = df_results_out2[_adv_cols]
adv_totals = _adv_frame.sum(1)
adv_min = _adv_frame.min(1)
adv_max = _adv_frame.max(1)

df_hits = pd.read_excel('google hits.xlsx')
df_hits= df_hits.set_index('Name')
df_results_out2 = df_results_out2.join(df_hits)

df_results_out2['AdvMin'] = adv_min
df_results_out2['AdvMax'] = adv_max
df_results_out2['AdvTotal'] = adv_totals
df_results_out.shape,df_results_out2.shape
df_results_out2['hits'] = np.log(1+df_results_out2['hits'])

with open('s2.pkl','rb') as fp:
    s2 = pickle.load(fp)
# Alias the record under the corrected author name.
s2['David M. Rosen'] = s2['David Rosen']
_S2_FIELDS = ['iTot','iInf','tTot','tInf']
s2r = {}
for n in df_results_out2.index:
    _record = s2[n]
    s2r[n] = {k: np.log(1+_record[k]) for k in _S2_FIELDS}
df_results_out2 = df_results_out2.join(pd.DataFrame(s2r).T)

In [None]:
prev_v = df_results[df_results.index.isin(profile_set)]['Score (1/pos)']
#rev_v = prev_v[1:]
prev_v.mean(), prev_v.std(),prev_v.max(),prev_v.min()

#df_results.loc['Maggie Wigness']
#prev_v.shape
#((prev_v > 15) &(prev_v < 25)).sum()
#df_results[[_ for _ in df_results.columns if 'score' in _.lower()]]

In [None]:
# Provisional MetaMetric: yearly RI score + sqrt(yearly score)
# + 0.1 * non-RI score + 0.1 * summed plus-minus variants.
_non_ri = df_results_out2['Score (1/pos)'] - df_results_out2['RIScore']
df_results_out2['MetaMetric'] = (
    (df_results_out2['YearlyRIScore'] + np.sqrt(df_results_out2['YearlyScore (1/pos)']))
    + 0.1*_non_ri
)
df_results_out2['MetaMetric'] = df_results_out2['MetaMetric'] + 0.1 * adv_totals


def _positive_spread(shares):
    """Std-dev of the strictly positive entries; 0 when fewer than two exist."""
    positive = [_ for _ in shares if _ > 0]
    return np.std(positive) if len(positive) > 1 else 0


# Spread of an author's score across the area shares.
_share_cols = ['ROB %','CV %', 'GR %', 'ML %', 'Other %']
vecL = [_positive_spread(row[1:]) for row in df_results_out2[_share_cols].itertuples()]

df_results_out2['Var'] = vecL
df_results_out2['Words'] = [word_person[a] for a in df_results_out2.index]



In [None]:
# Rank by the provisional MetaMetric. `ascending` is passed by keyword:
# pandas 2.x accepts only `by` positionally, so the old
# sort_values(col, 0, False) form raises TypeError there.
df_results_out2 = df_results_out2.sort_values('MetaMetric', ascending=False)
#df_results_out2[[_ for _ in df_results_out2.columns if '%' in _ and 'author' not in _ and 'Other' not in _]].sum(1)
df_results_out2#[df_results_out2.MetaMetric > 0]
#df_out2 = df_results_out2[(df_results_out2['RI %'] > 0.1) & (df_results_out2.MetaMetric > 0) & (df_results_out2['Score (1/pos)'] > 5)]
#.loc['Júlia Borràs Sol']

numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
newdf = df_results_out2.select_dtypes(include=numerics)
df_prev_cand = newdf[newdf.index.isin(prev_cand)]

# Rescale every numeric column so the prev_cand group has mean 50, std 10.
normed_df = 10*(newdf-df_prev_cand.mean())/df_prev_cand.std()+50
normed_df['Prod (I)'] = normed_df['Score (1/pos)']
normed_df['Prod (T)'] = normed_df['Score (Full)']
normed_df['Cite (I)'] = normed_df['iTot']
normed_df['Cite (T)'] = normed_df['tTot']
normed_df['CiteI (I)'] = normed_df['iInf']
normed_df['CiteI (T)'] = normed_df['tInf']

normed_df['Fit'] = normed_df['RI %']
# Inverted: lower normalised 'Words' gives higher 'Unique'.
normed_df['Unique'] = 100-normed_df['Words']
normed_df['News'] = normed_df['hits']
normed_df['Network'] = normed_df['pageRank']
normed_df['Variance'] = normed_df['Var']

# The eleven features of the estimate, in the same order as the weights.
est_cols = ['Prod (I)','Prod (T)','Cite (I)','Cite (T)','CiteI (I)','CiteI (T)','Fit','Unique','News','Network','Variance']

w = np.array([60,20,7,3,7,3,20,10,20,5,1])
#w = np.array([10., 10.,  9.,  8.,  7.,  7., 19.,  3., 17.,  8.,  3.])
#w = np.array([10., 10.,  7.,  7.,  6.,  6., 19.,  4., 20.,  8.,  4.])
#w = np.array([13.,  5.,  1., 13., 18.,  0., 11.,  8., 14.,  9.,  8.])
w = w/w.sum()
print(w)
# Weighted sum of the features, each floored at 20.
normed_df['Est'] = sum([wt*np.maximum(20,normed_df[col]) for wt,col in zip(w,est_cols)])

# Select the feature columns by name rather than "the last 12 columns".
out_df = normed_df[est_cols + ['Est']].sort_values('Est', ascending=False)
# Index-aligned assignment: MetaMetric becomes the weighted estimate.
df_results_out2['MetaMetric'] = out_df['Est']
df_results_out2 = df_results_out2.sort_values('MetaMetric', ascending=False)

#out_df = out_df[out_df.index.isin(profile_set)]
out_df.round(1)

In [None]:
import statsmodels.api as sm
#clf = sm.Logit(np.array('Carnegie' in df_results_out2.Affiliation).astype(int),np.array(out_df)[:,:-1]).fit()
#clf
# Logistic regression: "is affiliated with CMU" ~ the eleven features.
xv = out_df[[c for c in out_df.columns if c != 'Est']]
# Take the labels in xv's row order explicitly; previously the two frames
# were matched only because both had just been sorted the same way.
affiliation = df_results_out2.Affiliation.reindex(xv.index)
yv = ('Carnegie Mellon University'== np.array(affiliation)).astype(int)
# NOTE(review): this rebinds `clf`, which earlier cells use for the
# per-venue weight vector; re-running those cells afterwards will fail.
clf= sm.Logit(yv,xv).fit()

from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
clf.summary()

In [None]:
#clf =LogisticRegressionCV(25,max_iter=1e3,class_weight='balanced').fit(xv,yv)
#clf =LogisticRegression(C=1e-3,max_iter=1e3).fit(xv,yv)

# Turn the fitted coefficients into non-negative weights that sum to 100.
# Work on a private float copy: the previous in-place `-=` operated on the
# object returned by clf.params and could alter the fitted model's values.
w = np.array(clf.params, dtype=float)
w = w - np.min(w)
w = 100*w/w.sum()
w.round()

In [None]:
#df_results_out2['Pred'] = clf.predict(xv)
#df_results_out2=df_results_out2.sort_values('Pred')
#df_results_out2

In [None]:
(w * 100).round(1)

In [None]:

# Feature table for the prev_cand group, best estimate first
# (`ascending` must be a keyword argument on pandas 2.x).
out_df[out_df.index.isin(prev_cand)].round(0).astype(int).sort_values('Est', ascending=False)

In [None]:
# Feature table for the profiled candidates, best estimate first
# (`ascending` must be a keyword argument on pandas 2.x).
pd.set_option('display.max_rows', 150)
out_df[out_df.index.isin(profile_set)].round(0).astype(int).sort_values('Est', ascending=False)

In [None]:
# Mark everyone in prev_cand as CMU-affiliated, then export the raw table
# joined with the normalised feature table.
df_results_out2.loc[df_results_out2.index.isin(prev_cand),'Affiliation'] = 'Carnegie Mellon University'
df_results_out2.join(out_df).to_excel('filter_names.xlsx')


In [None]:
newdf.mean(),newdf.std()

In [None]:
raise

In [None]:
with open('s2.pkl','rb') as fp:
    s2 = pickle.load(fp)
with open('s2p.pkl','rb') as fp:
    s2p = pickle.load(fp)

In [None]:
# Grab the first (key, record) pair of s2p for interactive inspection;
# k and p stay undefined if s2p is empty.
for k,p in s2p.items():
    break
# One row per s2p key, one column per record field.
dfs2p = pd.DataFrame(s2p).T

In [None]:
#(np.array(v.mean())+np.array(v.std())).shape
#dfs2p= pd.to_numeric(dfs2p.stack(), errors='coerce').unstack()
p['citations'][0],p['year'],len(p['references'])

In [None]:
# Citation count per paper (+1; entries that are not lists count as 0), then
# plot the per-year mean.
_cite_counts = dfs2p['citations'].map(lambda x: len(x) if type(x) == list else 0)
dfs2p['sc'] = _cite_counts + 1
v = dfs2p[['year','sc']].groupby('year')
_yearly_mean = v.mean()
plt.plot(_yearly_mean.index, _yearly_mean)
#plt.fill_between(v.mean().index,np.array(v.mean())[:,0]+np.array(v.std())[:,0],np.array(v.mean())[:,0]-np.array(v.std())[:,0],alpha=0.5)
plt.xlim(2010,2021)
plt.ylim(0,100)

In [None]:
import scipy.optimize as opt
baseline = np.array(v.mean().iloc[-8:])[:,0]
def curve_fit(x):
    """Return the L2 error of a scaled tanh curve against ``baseline``.

    ``x`` holds (amplitude, rate, shift). The rate ``x[1]`` is clipped to
    [0, 1e9] in place, so the caller's array is modified. The curve is
    evaluated at the eight offsets -7..0 and compared with the module-level
    ``baseline`` vector.
    """
    x[1] = np.clip(x[1],0,1e9)
    offsets = np.arange(-7,1)
    #vres = np.log(-x[1]*vec+1+x[0])
    #vres = (1.0-1/(1+np.exp(-vec*x[1])))*x[0]
    model = x[0]*np.tanh(x[2] - offsets*x[1])
    return np.linalg.norm(baseline - model)
# Fit (amplitude, rate, shift) of the tanh curve to the baseline.
res = opt.minimize(curve_fit,np.array([1,1,1]))

vec = np.arange(-7,1)
# Plot the observed baseline without its first point.
plt.plot(vec[1:],baseline[1:])
vec = np.arange(-20,1)
# Citation-growth model as a function of (year - current year).
# NOTE(review): the constants are hard-coded rather than read from res.x --
# presumably copied from an earlier fit; confirm they are still current.
cite_corr = lambda x: np.tanh(-x*0.338-0.1436)*52.49
vres = cite_corr(vec)
plt.plot(vec,vres)
# Reversed limits: the x axis runs from -1 down to -20.
plt.xlim(-1,-20)

res.x,res.fun

In [None]:
curr_year = dfs2p.year.max()
def cite_correct(year,cite):
    """Rescale a citation count to what the paper would have at age ten.

    ``year`` is the publication year (``None`` is treated as ten years
    before ``curr_year``; ``curr_year`` itself is treated as one year old,
    because the growth curve is not positive at offset 0). ``cite`` is the
    raw citation count. Uses the module-level ``curr_year`` and
    ``cite_corr`` (expected citations as a function of year offset).

    Returns ``cite * cite_corr(-10) / cite_corr(year - curr_year)``.
    """
    if year is None:
        year = curr_year - 10
    elif year == curr_year:
        year = curr_year - 1
    curr = cite_corr(year-curr_year)
    # Reference point is the ten-year offset. The previous `10-curr_year`
    # evaluated the curve roughly two thousand years back (its saturated
    # maximum), not at age ten.
    teny = cite_corr(-10)
    return cite*teny/curr
cite_corr(-1),cite_corr(-20),cite_corr(-10),cite_corr(-5)


In [None]:
# Store age-corrected citation counts on every paper record:
# 'sc' from all citations, 'sci' from the influential ones only.
for k in s2p:
    _entry = s2p[k]
    year = _entry.get('year')
    if year is None:
        # A missing or null year is treated as the current year.
        year = curr_year
    _citations = _entry['citations'] if 'citations' in _entry else []
    cite = len(_citations)
    _entry['sc'] = cite_correct(year,cite)
    cite = sum(1 for _c in _citations if _c['isInfluential'])
    _entry['sci'] = cite_correct(year,cite)

In [None]:
s2p[k]['sci'],s2p[k]['sc'],len(s2p[k]['citations'])


In [None]:
# For every author record, accumulate corrected citation totals over their
# papers: tTot/tInf give full credit per paper, iTot/iInf scale each paper by
# the author's normalised 1/position share.
_KEPT_FIELDS = ['Materials Science','Computer Science','Engineering','Medicine','Psychology','Mathematics']
for a in s2:
    # 4 things
    d = {
        'tInf':0,
        'iInf':0,
        'tTot':0,
        'iTot':0,
    }
    # authorId may be a single id or a list of ids; normalise to ints.
    aid = s2[a]['authorId']
    if not isinstance(aid, (list, tuple, set)):
        aid = [aid]
    aid = [int(_) for _ in aid]
    for p in s2[a]['papers']:
        paper = s2p[p['paperId']]
        # Skip papers whose non-empty field list contains none of the kept
        # fields; papers with no field information are kept.
        fields = paper.get('fieldsOfStudy')
        if isinstance(fields, list) and len(fields) > 0 and not any(k in fields for k in _KEPT_FIELDS):
            continue
        d['tTot'] += paper['sc']
        d['tInf'] += paper['sci']

        tot = 0
        val = 0
        for i,n in enumerate(paper['authors']):
            tot += 1/(1+i)
            if n.get('authorId') is not None and int(n['authorId']) in aid:
                val = 1/(1+i)
        # An empty author list leaves tot at 0; give such a paper no
        # individual credit instead of dividing by zero.
        mul = (val/tot) if tot else 0
        d['iTot'] += mul*paper['sc']
        d['iInf'] += mul*paper['sci']
    s2[a].update(d)


In [None]:
# Debug probe on the current value of `paper`: shows whether the key is
# present, whether its value is a list, and (when the list is non-empty)
# how many of the listed fields it contains.
# NOTE(review): this list omits 'Materials Science', which the author loop's
# filter includes -- confirm whether that is intentional.
'fieldsOfStudy' in paper , type(paper['fieldsOfStudy']) == list , len(paper['fieldsOfStudy']) > 0 and sum([k in paper['fieldsOfStudy'] for k in ['Computer Science','Engineering','Medicine','Psychology','Mathematics']])
#with open('s2.pkl','wb') as fp:
#    pickle.dump(s2,fp)

In [None]:
# A bare `raise` with no exception being handled raises RuntimeError, so a
# top-to-bottom run stops here (presumably a deliberate guard in front of
# the browser-scraping cells below).
raise

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
import time
# Creates a Selenium Firefox driver; it is used by the scraping cell below.
browser = webdriver.Firefox()

In [None]:
# Scrape results keyed by author name. It lives in its own cell, so the
# scraping cell can be re-run and will resume from len(res) without this
# dict being reset.
res = {}

In [None]:
# Disabled by default (flip the guard to run): looks up every author in
# df_results_out2 on the Semantic Scholar search page and records the
# (profile path, numeric id) pairs of the matched authors in `res`.
# Slicing by len(res) lets an interrupted run pick up where it stopped.
if False:
    for n in list(df_results_out2.index)[len(res):]:
        # Keep the original index label as the result key; strings are
        # immutable, so no copy is needed.
        on = n
        # Drop a trailing token that starts with '00' before searching
        # (presumably a numeric disambiguation suffix -- confirm).
        if n.split()[-1][:2] == '00':
            n = ' '.join(n.split()[:-1])
        browser.get('https://www.semanticscholar.org/search?q={}&sort=relevance'.format(n))

        # Block (up to 10 s) until the matched-author element is present.
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "matched-author-shoveler__author-title"))
        )
        txt = browser.page_source
        # Raw string: the pattern contains `\/`, which is an invalid escape
        # sequence in a normal string literal. The regex itself is unchanged.
        ids = re.findall(r'<a class="matched-author-shoveler__author-link" href="(\/author\/.*?)\/([0-9]+)">',txt)
        res[on] = ids
        #if len(ids) > 1:
        #    time.sleep(10)

In [None]:
# One record per looked-up name: the id taken from the first hit, plus the
# number of hits so ambiguous names can be reviewed by hand.
resd = []
for k, v in res.items():
    top_hit = v[0]
    resd.append({'name': k, 'ids': top_hit[1], 'multi': len(v)})
#pd.DataFrame(resd).set_index('name').to_csv('s2.csv')

In [None]:
# A bare `raise` with no exception being handled raises RuntimeError, so a
# top-to-bottom run stops here (presumably a deliberate guard in front of
# the API-fetching cells below).
raise

In [None]:
import ast

dfs2 = pd.read_csv('s2.csv').set_index('name')

# Every author id already present in s2, as ints. A record's 'authorId' is
# either a single id or a list of merged ids.
done_set = {
    int(x)
    for rec in s2.values()
    for x in (rec['authorId'] if isinstance(rec['authorId'], list) else [rec['authorId']])
}

# (name, ids) for each CSV row that has at least one id not yet in s2.
proc_set = []
for n in dfs2.itertuples():
    # n[1] is the first data column (presumably 'ids' -- confirm against
    # s2.csv). It holds either one id or a list literal such as "[1, 2]".
    # literal_eval parses the same literals as eval without executing
    # arbitrary code from the file; non-string cells (pandas parses an
    # all-integer column as ints) are used as they are.
    t = ast.literal_eval(n[1]) if isinstance(n[1], str) else n[1]
    if not isinstance(t, (list, tuple)):
        t = [t]
    if any(i not in done_set for i in t):
        proc_set.append((n[0],t))
proc_set

In [None]:
import requests
import time

# For every (name, ids) pair still to be processed: download one author
# record per id, then fold the extra records into the first and store the
# merged record in s2 under the author's name.
for row in proc_set:
    res = []
    for i in row[1]:
        resp = requests.get('https://api.semanticscholar.org/v1/author/{}'.format(i))
        time.sleep(3.1)  # pause between requests
        res.append(json.loads(resp.content))

    d, extra = res[0], res[1:]
    d['authorId'] = [int(d['authorId'])]
    for d2 in extra:
        d['aliases'] = d.get('aliases', []) + d2.get('aliases', [])
        d['authorId'] = d['authorId'] + [int(d2['authorId'])]
        d['influentialCitationCount'] += d2['influentialCitationCount']
        d['papers'] = d['papers'] + d2['papers']

    s2[row[0]] = d

In [None]:
#with open('s2.pkl','wb') as fp:
#    pickle.dump(s2,fp)

In [None]:
import requests

# Gather the paperId of every paper listed under any author in s2, then
# report how many distinct ids exist and how many are not yet in s2p.
res3 = []
for k, v in s2.items():
    res3.extend(entry['paperId'] for entry in v['papers'])
res3s = set(res3)
print(len(res3s), sum(1 for pid in res3s if pid not in s2p))


In [None]:
# Download every paper record that is referenced by an author but absent
# from s2p. The to-do list is built once, before any record is added.
_missing_ids = [pid for pid in res3s if pid not in s2p]
for i in _missing_ids:
    resp = requests.get('https://api.semanticscholar.org/v1/paper/{}'.format(i))
    time.sleep(3.2)  # pause between requests
    s2p[i] = json.loads(resp.content)

In [None]:
# Print every stored paper record that carries an 'error' or 'message'
# key, i.e. responses that are not regular paper payloads.
for k, v in s2p.items():
    if any(flag in v for flag in ('error', 'message')):
        print(v)
#with open('s2p.pkl','wb') as fp:
#    pickle.dump(s2p,fp)

In [None]:
# A bare `raise` with no exception being handled raises RuntimeError, so a
# top-to-bottom run stops here (presumably a deliberate guard in front of
# the cells below).
raise

In [None]:
# Keep the names from interesting_set and new_set whose 'working_years'
# score is not below 10.
new_new = set()
for name in set(list(interesting_set) +list(new_set)):
    idx = name_idx[name]
    if not (scoreV['working_years'][idx] < 10):
        new_new.add(name)

In [None]:

# Rows of `res` belonging to each name in Directors (located by position
# in df_results.index), followed by one extra row holding their mean.
_index_labels = list(df_results.index)
spec_vectors = [res[_index_labels.index(_)] for _ in Directors]
spec_vectors.append(np.array(spec_vectors).mean(0))
spec_vectors = np.array(spec_vectors)

In [None]:
# Show the df_results row at the position where column 0 of `res` is largest.
df_results.iloc[res[:, 0].argmax()]

In [None]:
# Boolean mask over df_results marking the rows whose label is in Directors.
DIR_IDX = df_results.index.isin(Directors)

# First two columns of `res` for every row, then the same two columns of
# spec_vectors drawn on top as a second series.
plt.scatter(x=res[:, 0], y=res[:, 1])
plt.scatter(x=spec_vectors[:, 0], y=spec_vectors[:, 1])
#df_results.iloc[6737]
#np.where(res[:,0] > 7) # 2004, 6737

In [None]:
from scipy.spatial.distance import cdist,squareform

# For each metric, add one df_results column per entry in `labels`, named
# '<prefix><label>', holding the distance from that spec_vectors row to
# every row of `res`.
for metric, prefix in (('euclidean', 'euc'), ('cosine', 'cos')):
    dists = cdist(spec_vectors, res, metric)
    for l, d in zip(labels, dists):
        df_results[prefix + l] = d

# Scale MetaMetric in place by 1/log(50 * max(0.07, cosRD)).
# Re-running this cell applies the factor again.
_cos_floor = np.maximum(0.07, df_results['cosRD'])
df_results['MetaMetric'] *= (1 / np.log(_cos_floor * 50))

In [None]:
# Write the results ordered by ascending 'cosRD', and the Spearman
# correlation matrix of df_results, to CSV files.
df_results.sort_values(by='cosRD').to_csv('profile_results5.csv')
df_results.corr(method='spearman').to_csv('corr.csv')

In [None]:
# Show the rows ordered by 'NSF YearlyAward', largest first. `axis` and
# `ascending` are passed by keyword because pandas >= 2.0 no longer accepts
# them positionally.
df_results.sort_values('NSF YearlyAward', axis=0, ascending=False)

In [None]:
# List the column labels currently present in df_results.
df_results.columns

In [None]:
plt.style.use('fivethirtyeight')

# One line per author: the 'Full' row of their scoreV['1/i'] entry against
# the block start years, labelled with the first token of the name.
name_plot = ['David P. Woodruff',"Ryan O'Donnell",'Anupam Gupta']
_block_years = np.arange(min_year, max_year, YEAR_BLOCKS)
_full_row = sTypes.index('Full')
for name in name_plot:
    first_name = name.split()[0]
    plt.plot(_block_years, scoreV['1/i'][name_idx[name], _full_row], label=first_name)

plt.legend()
plt.xlim(left=2000)

In [None]:
#=D3*SQRT(F3)*(1/LOG(C3))*LOG(MAX(10,SUM(AN3:AU3)))*MAX(1,LOG(AE3))*(1/LOG(MAX(0.07,BN3)*50))*LOG(AM3-1965)

In [None]:
# Start from an empty set, then replace it with the names stored in
# top_ri3.json and report how many were loaded.
interesting_set = set()
with open('top_ri3.json', 'rt') as fp:
    loaded_names = json.load(fp)
    interesting_set = set(loaded_names)
    print(len(interesting_set))
#interesting_set.add('Jeff Clune')


In [None]:
# Indices into meta_metric, ordered from the largest value to the smallest.
best_v = np.argsort(meta_metric)[::-1]
# NOTE(review): `v` is not assigned in this cell, so this shows the length of
# whatever `v` currently holds -- possibly len(best_v) was intended; confirm.
len(v)

In [None]:
# Add the authors behind the beST_N highest meta_metric values to
# interesting_set, skipping anyone whose working_years entry is not >= 10.
# The set size is printed before and after; in between, the score at rank
# beST_N is printed next to the scores of two named authors.
print(len(interesting_set))
beST_N = 15000
for i in range(beST_N):
    idx = best_v[i]
    if working_years[idx] >= 10:
        interesting_set.add(all_authors[idx])
    #print(idx,all_authors[idx],v[idx],ratio_v[idx])
print(
    meta_metric[best_v[beST_N]],
    meta_metric[name_idx['Odest Chadwicke Jenkins']],
    meta_metric[name_idx['Jeff Clune']],
)
print(len(interesting_set))

In [None]:
# Write the selected names, sorted, as an indented JSON array.
with open('top_ri_Metric.json', 'wt') as fp:
    ordered_names = sorted(interesting_set)
    json.dump(
        ordered_names,
        fp,
        sort_keys=True,
        indent=4,
        separators=(',', ': '),
    )