In [None]:
import os
import sys
import fnmatch
import zipfile
import xmltodict
import numpy as np
import pandas as pd
import json
import gzip
import pickle
import csv
import scipy.sparse

In [None]:
# NSF award tables, unpickled from files expected in the working directory.
# `df2` is the table the later cells iterate over; `df` is loaded alongside it.
df, df2 = (pd.read_pickle(pkl_name) for pkl_name in ('nsf.pkl', 'nsf2.pkl'))
# Author-by-(venue, year) matrix; left unset here and built on demand in a later cell.
Xauth = None

In [None]:
# Load the venue collection stored in r1_confs.pkl and index it for O(1) membership tests.
# NOTE: pickle.load can execute arbitrary code from the file - only load trusted pickles.
with open('r1_confs.pkl','rb') as fp:  # context manager closes the handle deterministically
    r1_confs = pickle.load(fp)
r1_confs_dict = {conf: 1 for conf in r1_confs}

In [None]:
# Publication data ("the big paper thing"): the papers table plus the author-name
# and venue vocabularies that define the row / column order of the matrices below.
# NOTE: pickle.load can execute arbitrary code from the file - only load trusted pickles.
papers = pd.read_hdf('papers.h5','table')
with open('big_names.pkl','rb') as fp:  # `with` closes the handle instead of leaking it
    unique_names = pickle.load(fp)
with open('confs.pkl','rb') as fp:
    unique_confs = pickle.load(fp)

# Faculty name -> affiliation table, and a ranks table (not used in the visible cells).
faculty_affil = pd.read_csv('faculty-affiliations.csv')
ranks = pd.read_csv('ranks.csv')
def csv2dict_str_str(fname):
    """Read a CSV file into a ``{first column: second column}`` dict.

    Both values are whitespace-stripped. Columns beyond the second are
    ignored, and when a key occurs more than once the last row wins.
    Blank lines and rows with fewer than two columns are skipped instead
    of raising ``IndexError``.

    Args:
        fname: Path of the CSV file to read.

    Returns:
        dict mapping stripped column-0 strings to stripped column-1 strings.
    """
    # newline='' hands newline handling to the csv module, as its documentation requires.
    with open(fname, mode='r', newline='') as infile:
        return {
            row[0].strip(): row[1].strip()
            for row in csv.reader(infile)
            if len(row) >= 2
        }
# Alias -> canonical author name, read from the two-column CSV.
aliasdict = csv2dict_str_str('dblp-aliases.csv')
# Venue name -> index and author name -> index lookups (used below to address matrix columns / rows).
# NOTE: pickle.load can execute arbitrary code from the file - only load trusted pickles.
with open('conf_idx.pkl','rb') as fp:  # `with` closes the handle instead of leaking it
    conf_idx = pickle.load(fp)
with open('name_idx.pkl','rb') as fp:
    name_idx = pickle.load(fp)

In [None]:
# Venue name variants grouped under a short key. Within each list the FIRST entry is
# treated as the canonical name: the code right after this literal maps every later
# entry of a list onto its first entry.
# NOTE(review): the strings look like DBLP venue labels - confirm against the source of conf_idx.
areadict = {
    'icse' : ['ICSE', 'ICSE (1)'],
    'fse'  : ['SIGSOFT FSE', 'ESEC/SIGSOFT FSE'],
    'usenixatc' : ['USENIX Annual Technical Conference', 'USENIX Annual Technical Conference, General Track'], # next tier
    'imc': ['IMC', 'Internet Measurement Conference'],
    'sigmetrics': ['SIGMETRICS', 'SIGMETRICS/Performance', 'POMACS'],
    'mobicom' : ['MobiCom', 'MOBICOM'],
    'rtas' : ['RTAS', 'IEEE Real-Time and Embedded Technology and Applications Symposium'],
    'ccs': ['CCS', 'ACM Conference on Computer and Communications Security'],
    'oakland' : ['IEEE Symposium on Security and Privacy'],
    'usenixsec' : ['USENIX Security Symposium', 'USENIX Security'],
    'pets' : ['PoPETs', 'Privacy Enhancing Technologies'],
    'cav': ['CAV', 'CAV (1)', 'CAV (2)'],
    'lics' : ['LICS', 'CSL-LICS'],
    'nips': ['NIPS', 'NeurIPS'],
    'icml': ['ICML', 'ICML (1)', 'ICML (2)', 'ICML (3)'],
    'aaai': ['AAAI', 'AAAI/IAAI'],
    'ubicomp' : ['UbiComp', 'Ubicomp', 'IMWUT', 'Pervasive'],
    'emnlp': ['EMNLP', 'EMNLP-CoNLL', 'HLT/EMNLP'],
    'acl' : ['ACL', 'ACL (1)', 'ACL (2)', 'ACL/IJCNLP', 'COLING-ACL'],
    'naacl' : ['NAACL', 'HLT-NAACL', 'NAACL-HLT'],
    'cvpr': ['CVPR', 'CVPR (1)', 'CVPR (2)'],
    'eccv': ['ECCV', 'ECCV (1)', 'ECCV (2)', 'ECCV (3)', 'ECCV (4)', 'ECCV (5)', 'ECCV (6)', 'ECCV (7)', 'ECCV (8)', 'ECCV (9)', 'ECCV (10)', 'ECCV (11)', 'ECCV (12)', 'ECCV (13)', 'ECCV (14)', 'ECCV (15)', 'ECCV (16)'],
    'icra': ['ICRA', 'ICRA (1)', 'ICRA (2)'],
    'rss': ['Robotics: Science and Systems'],
    'crypto': ['CRYPTO', 'CRYPTO (1)', 'CRYPTO (2)', 'CRYPTO (3)'],
    'eurocrypt': ['EUROCRYPT', 'EUROCRYPT (1)', 'EUROCRYPT (2)', 'EUROCRYPT (3)'],
}
# Alias venue name -> canonical venue name (the first entry of each areadict list).
inverse_area_dict = {
    alias: names[0]
    for names in areadict.values()
    for alias in names[1:]
}
# Point each alias at the column index of its canonical venue, but only when both
# names already have an index. This updates conf_idx in place.
for alias, canonical in inverse_area_dict.items():
    if alias in conf_idx and canonical in conf_idx:
        conf_idx[alias] = conf_idx[canonical]

In [None]:
# Year range covered by the papers table; span_years is the number of year slots
# each venue occupies in the matrices built below.
min_year, max_year = papers.year.min(), papers.year.max()
span_years = 1 + (max_year - min_year)

print(span_years, min_year, max_year)

In [None]:
# Number of rows in df2 (the column count is discarded); the bare name on the
# last line makes the value the cell's displayed output.
nsf_paper_n, _ = df2.shape
nsf_paper_n

In [None]:
import ftfy
from unidecode import unidecode

from collections import Counter,defaultdict
# create or load some author data
def dd():
    """Return an empty ``defaultdict(list)``.

    Used as the default factory of ``author_papers``.
    NOTE(review): presumably a named module-level function rather than a lambda
    so that the nested defaultdict can be pickled - confirm.
    """
    return defaultdict(list)
# Build, or load from cache, two lookups derived from `papers`:
#   author_papers[first_last_lowercase][year] -> list of (venue, n) tuples
#   papers_per_year[venue]                    -> array of paper counts, one slot per year
# The lookups are rebuilt only when a cache file is missing; delete the .pkl files
# to force a rebuild (this replaces hand-editing an `if False:` switch).
AUTHOR_CACHE = 'nsf_auth.pkl'
PAPERS_PER_YEAR_CACHE = 'papers_per_year.pkl'
if os.path.exists(AUTHOR_CACHE) and os.path.exists(PAPERS_PER_YEAR_CACHE):
    # NOTE: pickle.load can execute arbitrary code - only load trusted cache files.
    with open(AUTHOR_CACHE,'rb') as fp:
        author_papers = pickle.load(fp)
    with open(PAPERS_PER_YEAR_CACHE,'rb') as fp:
        papers_per_year = pickle.load(fp)
else:
    papers_per_year = {}
    author_papers = defaultdict(dd)

    for row in papers.itertuples():
        # Positional access into the itertuples() row (position 0 is the index).
        # NOTE(review): positions are taken from how the values are used - confirm
        # that 10 = year, 2 = venue, 3 = author list, 4 = n (used as 1/n credit later).
        paper_year = row[10]
        conf = row[2]
        n = row[4]
        authors = row[3]
        # Size and offset the counts with span_years / min_year so the arrays line up
        # with the np.arange(min_year, max_year+1) axis they are plotted against.
        if conf not in papers_per_year:
            papers_per_year[conf] = np.zeros(span_years)
        papers_per_year[conf][paper_year-min_year] += 1
        for a in authors:
            a = unidecode(ftfy.fix_encoding(a))
            split_name = a.split(' ')
            # Drop a trailing non-alphabetic token (e.g. a numeric suffix) when
            # picking the last name.
            if not split_name[-1].isalpha() and len(split_name) > 2:
                first_last = split_name[0] +' ' + split_name[-2]
            else:
                first_last = split_name[0] +' ' + split_name[-1]
            author_papers[first_last.lower()][paper_year].append((conf,n))
    # Persist BOTH lookups so the load branch finds everything it expects next run.
    with open(AUTHOR_CACHE,'wb') as fp:
        pickle.dump(author_papers,fp)
    with open(PAPERS_PER_YEAR_CACHE,'wb') as fp:
        pickle.dump(papers_per_year,fp)

In [None]:
# Write the per-venue yearly paper counts to their cache file. When papers_per_year
# was itself just loaded from this file, this rewrites the same content.
with open('papers_per_year.pkl','wb') as fp:
    pickle.dump(papers_per_year,fp)

In [None]:
def ddn():
    """Return an empty ``defaultdict(int)``; default factory for ``author_amounts``."""
    return defaultdict(int)
# Cumulative funding per investigator:
#   author_amounts[first_last_lowercase][yr] = sum of that person's share of every
#   award whose year is <= yr, for yr up to (not including) LAST_YEAR_EXCLUSIVE.
LAST_YEAR_EXCLUSIVE = 2020
author_amounts = defaultdict(ddn)
for row in df2.itertuples():
    # NOTE(review): positions 3/4/5 are assumed to be (investigator names, award
    # year, award amount) from how they are used - confirm against nsf2.pkl.
    authors, year, amount = row[3],row[4],row[5]
    # some infinite amounts exist! bad!
    if not np.isfinite(amount):
        continue
    if len(authors) == 0:
        continue

    # Split the award evenly between its investigators. (The previous code divided
    # by len(a), i.e. the number of characters in each name.)
    share = amount/len(authors)
    for a in authors:
        split_name = a.split(' ')
        # Same normalisation as the cells that later look these keys up, so that a
        # name ending in a non-alphabetic token is stored and read under one key.
        if not split_name[-1].isalpha() and len(split_name) > 2:
            first_last = split_name[0] +' ' + split_name[-2]
        else:
            first_last = split_name[0] +' ' + split_name[-1]
        key = first_last.lower()
        # Index inside the loop so no empty per-author record is created when the
        # year range is empty.
        for yr in range(int(year),LAST_YEAR_EXCLUSIVE):
            author_amounts[key][yr] += share


In [None]:
# Sanity check: show the rows whose `infaward` value exceeds one billion.
df2.loc[df2.infaward > 1e9]

In [None]:
# Peak cumulative amount per (non-empty) investigator name, largest first.
peak_by_author = [
    (max(yearly.values()), author)
    for author, yearly in author_amounts.items()
    if author
]
sorted(peak_by_author, reverse=True)

In [None]:
import scipy.stats
import matplotlib.pyplot as plt
# One smoothing kernel per year slot: a Gaussian bump (width `sigma`, in years)
# centred on that slot, with values below 0.05 zeroed and the rest scaled to unit
# L2 norm. weights[c][k] is the weight linking centre slot c to slot k.
sigma = 2
year_slots = range(span_years)
weights = []
for center in year_slots:
    kernel = np.array([scipy.stats.norm.pdf((slot - center) / sigma) for slot in year_slots])
    kernel[kernel < 0.05] = 0
    weights.append(kernel / np.linalg.norm(kernel))
# Show the kernel centred on the year 2000 as a visual check.
_ = plt.plot(min_year + np.arange(span_years), weights[2000 - min_year])
plt.grid(True)

In [None]:
import itertools
# Block-diagonal smoothing operator: one copy of the (span_years x span_years)
# kernel matrix `weights` per venue, so that
#   wsa[c*span_years + j, c*span_years + k] == weights[j][k]
# and every entry linking two different venues is zero.
# kron(I, W) builds exactly that structure while storing only the non-zero kernel
# entries. The previous version filled a Python dict with
# n_confs * span_years**2 items, mostly zeros, and pushed it through the private
# dok_matrix._update().
n_confs = unique_confs.shape[0]
wsa = scipy.sparse.kron(
    scipy.sparse.identity(n_confs, format='csr'),
    scipy.sparse.csr_matrix(np.array(weights)),
    format='csr',
)

In [None]:
# create design matrix
# Row i describes award i of df2. Column span_years*venue + (year - min_year)
# accumulates 1/paper[1] for every paper that one of the award's investigators
# published in that venue and year, restricted to years <= the award year.
rows, cols, vals = [], [], []
for i,row in enumerate(df2.itertuples()):
    authors, year, amount = row[3],row[4],row[5]

    for a in authors:
        split_name = a.split(' ')
        if not split_name[-1].isalpha() and len(split_name) > 2:
            first_last = split_name[0] +' ' + split_name[-2]
        else:
            first_last = split_name[0] +' ' + split_name[-1]
        # .get() rather than []: author_papers is a defaultdict, so indexing an
        # unknown name would INSERT an empty record and make every investigator
        # look "known" to the later `in author_papers` checks.
        papers_by_year = author_papers.get(first_last.lower())
        if not papers_by_year:
            continue
        for year_a,conf_list in papers_by_year.items():
            if year_a <= year:
                for paper in conf_list:
                    rows.append(i)
                    cols.append(span_years*conf_idx[paper[0]] + year_a-min_year)
                    vals.append(1/paper[1])
# Converting COO -> CSR sums entries that share an (i, j) position, so several
# papers in the same venue and year add up (as they do when Xauth is built)
# instead of the last one overwriting the others.
X = scipy.sparse.coo_matrix(
    (vals,(rows,cols)),
    shape=(nsf_paper_n,span_years*unique_confs.shape[0]),
    dtype=np.float64,
).tocsr()
print(X.sum())

In [None]:
# Spread each paper's credit over neighbouring years by applying the smoothing operator.
# CAUTION: X is overwritten with its smoothed version, so re-running this cell
# without rebuilding X first applies the smoothing a second time.
wsa = scipy.sparse.csr_matrix(wsa)
X = scipy.sparse.csr_matrix(X) @ wsa

In [None]:
# Regression target. For award row i, y[i] is the summed cumulative funding
# (author_amounts, looked up at the award year) of its investigators. A row keeps
# y[i] == 0 - and is masked out of the fit through `skipped_data` - when the amount
# is non-finite or below MIN_AWARD_AMOUNT, when it has no investigators, or when
# fewer than half of them have a publication record in author_papers.
# (A disabled `if False:` variant that used the raw per-award amount with a
# log-compressed tail above 1e7 was dead code and has been removed.)
MIN_AWARD_AMOUNT = 1000
year_amounts = np.zeros(span_years,dtype=np.float32)
y = np.zeros(nsf_paper_n,dtype=np.float32)

for i,row in enumerate(df2.itertuples()):
    authors, year, amount = row[3],row[4],row[5]

    # some infinite amounts exist! bad!
    if not np.isfinite(amount) or amount < MIN_AWARD_AMOUNT:
        continue

    # Normalise every investigator to the lower-cased "first last" lookup key.
    name_keys = []
    for a in authors:
        split_name = a.split(' ')
        if not split_name[-1].isalpha() and len(split_name) > 2:
            first_last = split_name[0] +' ' + split_name[-2]
        else:
            first_last = split_name[0] +' ' + split_name[-1]
        name_keys.append(first_last.lower())

    total_authors = len(name_keys)
    # Test for a NON-EMPTY record rather than key membership: author_papers is a
    # defaultdict and a plain [] lookup elsewhere can leave empty records behind.
    found_authors = sum(1 for key in name_keys if author_papers.get(key))
    if total_authors == 0 or found_authors < 0.5 * total_authors:  # need at least half
        continue

    # .get() keeps these reads from inserting new keys into author_amounts.
    cumulative = sum(author_amounts.get(key, {}).get(year, 0) for key in name_keys)
    y[i] = cumulative
    # Offset by min_year (not a hard-coded 1970) so the slots match the
    # min_year..max_year axis year_amounts is plotted against; skip out-of-range years.
    year_slot = int(year) - min_year
    if 0 <= year_slot < span_years:
        year_amounts[year_slot] += cumulative

# Column mask that zeroes every (CoRR, year) feature, and row mask that zeroes
# every award without a target.
nonarxiv = np.ones(span_years*len(unique_confs))
nonarxiv[span_years*conf_idx['CoRR']:span_years*(conf_idx['CoRR']+1)] = 0
skipped_conf = scipy.sparse.diags(nonarxiv)
skipped_data = scipy.sparse.diags((y != 0).astype(float))
print(X.shape,skipped_conf.shape,skipped_data.shape)
y_orig = np.copy(y)
_ = plt.hist(y,100)

In [None]:
# Summarise the raw targets, then replace the masked (zero) entries with the mean
# of the non-zero ones before the targets are standardised for fitting.
print(y_orig.min(),y_orig.max())
print((y_orig > 0).sum())
use_log_target = False  # set True to work with log(1 + amount) instead of the raw amount
if use_log_target:
    y = np.copy(np.log(1+y_orig))
    y[y == np.log(1)] = y[y != np.log(1)].mean()
else:
    y = y_orig.copy()
    missing = y == 0
    y[missing] = y[~missing].mean()

# `figure` and `hist` are also used unqualified by later cells.
from matplotlib.pyplot import figure,hist
standardised = (y-y.mean())/y.std()
hist(standardised,100)
figure()
_ = hist(y,100)

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR

# Huber-loss linear model with L2 penalty and weight averaging, fitted on the
# masked design matrix (rows without a target and all CoRR columns zeroed)
# against the standardised target.
# random_state pins the shuffling done by SGD so that the coefficients - which are
# pickled and ranked below - are the same on every Restart & Run All.
X = scipy.sparse.csr_matrix(X)
clf = SGDRegressor(loss='huber',tol=1e-9,max_iter=100,verbose=0,penalty='l2',alpha=1e-3,epsilon=0.01,average=True,random_state=0)

clf.fit(skipped_data @ X @ skipped_conf, (y-y.mean())/y.std())

In [None]:
# Inspect the fitted weights: clf.coef_ holds one coefficient per (venue, year) column.
conf_ord = np.argsort(np.squeeze(clf.coef_))  # column indices, smallest coefficient first
conf_choice = ['SIGGRAPH','HRI','ECCV','Comput. Graph. Forum','Shape Modeling International','Symposium on Geometry Processing','Computer Aided Geometric Design','I. J. Robotics Res.','CVPR','International Journal of Computer Vision','Robotics: Science and Systems','ICRA','WACV','ICML','AISTATS','CoRR','SIGGRAPH Asia','ECCV','ICCV','ISER','Humanoids','3DV','IROS','CoRL','Canadian Conference on AI','ACCV','Graphics Interface','CRV','BMVC']
conf_choice_set = set(conf_choice)  # O(1) membership inside the per-column loop
# Indicator over columns: 1 for every (venue, year) whose venue is in conf_choice.
ri_confs = np.zeros(len(unique_confs)*span_years)
print(clf.intercept_)
ms = clf.coef_.mean()
ss = clf.coef_.std()
seen = set()
# Walk the columns from the largest coefficient down; print each chosen venue once,
# at its highest-scoring year, as a z-score over all coefficients.
for idx in conf_ord[::-1]:
    conf_name = unique_confs[idx//span_years]
    if conf_name not in conf_choice_set:
        continue
    ri_confs[idx] = 1
    if conf_name not in seen:
        conf_score = clf.coef_[idx]
        print('{:20s}{}\t{:.1f}'.format(conf_name[:20],str(min_year + (idx % span_years)),(conf_score-ms)/ss))
        seen.add(conf_name)

conf_choice2 = ['SIGGRAPH','BMVC','AAAI','NIPS','CVPR','ICRA','ICML','ICCV','ECCV','IROS',
               'International Journal of Computer Vision','Robotics: Science and Systems']
# (peak coefficient, venue) pairs, used to order the plotted curves / legend.
conf_choice3 = []
vs = clf.coef_.std()
for conf in conf_choice2:
    idx = conf_idx[conf]
    s = max(clf.coef_[span_years*idx:span_years*(idx+1)])
    conf_choice3.append((s,conf))
years_axis = np.arange(min_year,max_year+1)

# Figure 1: per-venue coefficient over time, in units of the coefficient std.
plt.figure(figsize=(12,8))
for s,conf in sorted(conf_choice3,reverse=True):
    idx = conf_idx[conf]
    _ = plt.plot(years_axis,(clf.coef_[span_years*idx:span_years*(idx+1)]/vs),label=conf)
plt.grid()
plt.xlabel('year')
plt.ylabel('value')
plt.legend()
#plt.show()
plt.savefig('nsf-fixed-total-2008-nonlog-names.pdf')

# Figure 2: total target amount accumulated per award year.
plt.figure()
plt.plot(years_axis,year_amounts)

# Figure 3: papers per year for the same venues.
plt.figure(figsize=(12,8))
for s,conf in sorted(conf_choice3,reverse=True):
    idx = conf_idx[conf]
    _ = plt.plot(years_axis,papers_per_year[conf],label=conf)

# `with` guarantees the pickle is flushed and the handle closed.
with open('nsf_fixed_total-nonlog-names.pkl','wb') as fp:
    pickle.dump(clf.coef_,fp)


In [None]:
# Print the top_k venues ranked by their single best (venue, year) coefficient.
# Columns: venue, best year, 100 * coefficient, z-score over all coefficients
# (ms / ss come from the cell that computes the coefficient mean / std).
top_k = 50
seen = set()
# Iterate over the ranking itself so the loop simply ends when fewer than top_k
# distinct venues exist, instead of indexing past the end of conf_ord.
for idx in conf_ord[::-1]:
    if len(seen) >= top_k:
        break
    conf_name = unique_confs[idx//span_years]
    if conf_name in seen:
        continue
    seen.add(conf_name)
    conf_score = clf.coef_[idx]
    print('{:20s}\t{}\t\t{:.3f}\t{:.2f}'.format(conf_name[:18],min_year + (idx % span_years),100*conf_score,(conf_score-ms)/ss))

In [None]:
# Display the shape of the fitted coefficient array.
clf.coef_.shape

In [None]:
# Distribution of the fitted coefficients (70 bins), then save them for reuse.
_ = hist(clf.coef_,70)
with open('nsf_indep2.pkl','wb') as fp:  # `with` flushes and closes the file
    pickle.dump(clf.coef_,fp)


In [None]:
# Author-by-(venue, year) publication matrix, rebuilt only when it is missing or
# its column count no longer matches the current venue / year layout.
#   Xauth[name_idx[author], span_years*conf_idx[venue] + (year - min_year)]
#       = sum of 1/n over that author's papers in that venue and year
#   auth_years[author] = [first publication year, last publication year]
if Xauth is None or (Xauth.shape[1] != span_years*unique_confs.shape[0]):
    xdict = {}
    # Sentinels 3000 / 1000 are replaced by the min / max year seen per author.
    auth_years = np.ones((len(unique_names),2)) * np.array([3000,1000])
    for row in papers.itertuples():
        # NOTE(review): positional fields assumed to be 10 = year, 2 = venue,
        # 3 = author list, 4 = n - confirm against the papers table.
        paper_year = row[10]
        conf = row[2]
        n = row[4]
        authors = row[3]
        j = span_years*conf_idx[conf] + (paper_year-min_year)
        for a in authors:
            i = name_idx[a]
            xdict[(i,j)] = 1/n + xdict.get((i,j),0)
            auth_years[i,0] = min(auth_years[i,0],paper_year)
            auth_years[i,1] = max(auth_years[i,1],paper_year)
    # Build through the public COO constructor instead of the private
    # dok_matrix._update(); .todok() keeps the resulting type unchanged.
    positions = np.array(list(xdict), dtype=np.int64).reshape(-1, 2)
    Xauth = scipy.sparse.coo_matrix(
        (list(xdict.values()), (positions[:,0], positions[:,1])),
        shape=(len(unique_names),span_years*unique_confs.shape[0]),
        dtype=np.float64,
    ).todok()

In [None]:
# Per-author model scores with the intercept removed, in total and per active year.
intercept = np.squeeze(clf.intercept_)
years_working = 1 + auth_years[:,1] - auth_years[:,0]  # inclusive first..last publication year

scores = clf.predict(Xauth) - intercept
value_scores = scores
norm_scores = value_scores / years_working

# Same scores computed with every column outside the chosen venues (ri_confs) zeroed.
ri_filter_mat = scipy.sparse.diags(ri_confs)
ri_scores = clf.predict(Xauth.dot(ri_filter_mat)) - intercept
ri_norm_scores = ri_scores / years_working

In [None]:
def _print_score_table(names):
    """Print one row per name (rate, total, RI score, active years), highest total first.

    Every name must be a key of ``name_idx``; an unknown name raises ``KeyError``.
    """
    print('{:20s}\t{:4s}\t{:4s}\t{:4s}\t{}'.format('name','rate','total','ri','years'))
    ranked = sorted(((value_scores[name_idx[name]], name) for name in names), reverse=True)
    for _score, name in ranked:
        ni = name_idx[name]
        print('{:20s}\t{:.2f}\t{:.2f}\t{:.2f}\t{:.0f}'.format(name,100*norm_scores[ni],100*value_scores[ni],100*ri_scores[ni],years_working[ni]))


prev_cand = [
    'Pulkit Agrawal',
    'Joydeep Biswas',
    'Katherine L. Bouman',
    'David Braun',
    'Jia Deng',
    'Naomi T. Fitter',
    'David F. Fouhey',
    'Saurabh Gupta',
    'Judy Hoffman',
    'Hanbyul Joo',
    'Honglak Lee',
    'Changliu Liu',
    'Petter Nilsson',
    "Matthew O'Toole",
    'Alessandro Roncone',
    'Alanson P. Sample',
    'Manolis Savva',
    'Adriana Schulz',
    'Amy Tabb',
    'Fatma Zeynep Temel',
    'Long Wang',
    'Cathy Wu',
    'Ling-Qi Yan',
]
curious_names = [
    'Xiaolong Wang 0004',
    'Judy Hoffman',
    'Paris Siminelakis',
    'Nicholas Rhinehart',
    'Humphrey Hu',
    'David F. Fouhey',
    'Lerrel Pinto',
    'Justin Johnson',
    'Amir Roshan Zamir',
    'Brian Okorn',
    'David Held',
]

# Two tables separated by a blank line: first prev_cand, then curious_names.
_print_score_table(prev_cand)
print('')
_print_score_table(curious_names)

In [None]:
# Shortlist drawn from the 38000 highest-scoring authors: keep people with 3-8
# active years, an RI score of at least 0.008 and a publication in 2017 or later,
# then print them ordered by total score.
print('\n best overall \n')
cmu_scores = []

best_scores = np.argsort(value_scores)[::-1]
# name -> affiliation lookup. setdefault keeps the FIRST row of a repeated name,
# which is what list.index() returned; the dict replaces a linear scan of the
# faculty list for every candidate.
affiliation_of = {}
for fa_name, fa_affil in zip(faculty_affil.name, faculty_affil.affiliation):
    affiliation_of.setdefault(fa_name, fa_affil)
uni_names = [unique_names[i] for i in best_scores[:38000]]
for name in set([aliasdict.get(n, n) for n in uni_names]):
    if name not in name_idx:
        continue
    ni = name_idx[name]
    if years_working[ni] < 3 or years_working[ni] > 8:
        continue
    if ri_scores[ni] < 0.008:
        continue
    if auth_years[ni,1] < 2017:
        continue
    uni = affiliation_of.get(name, 'unknown')
    # Tuple order matters: sorting below ranks by total score first.
    cmu_scores.append((value_scores[ni],ri_scores[ni],norm_scores[ni],uni,name,auth_years[ni],ri_norm_scores[ni]))
print('{:22s}\t{:15s}\t{:5s}\t{:3s}\t{:4s}\t{:4s}\t{} {}'.format('name','uni','rate','RI-t','total','RI-r','start','end'))
for vs,ris,s,u,p,yrs,rir in sorted(cmu_scores,reverse=True):
    print('{:22s}\t{:15s}\t{:.3f}\t{:.1f}\t{:.2f}\t{:.2f}\t{} {}'.format(p[:22],u[:15],s*100,ris*100,vs*100,rir*100,int(yrs[0]),int(yrs[1])))


In [None]:
# Total scores for the faculty of one institution plus a hand-picked list of extra
# names, highest score first. Names missing from name_idx are silently skipped.
extra_names = ['Nicholas Rhinehart','Jacob Walker','Lerrel Pinto','Brian Okorn','Leonid Keselman','Siddharth Ancha','Humphrey Hu']
at_institution = faculty_affil.affiliation == 'Carnegie Mellon University'
uni_faculty = faculty_affil[at_institution]
uni_names = list(np.array(uni_faculty.name)) + extra_names

# Resolve aliases first, de-duplicating the resulting canonical names.
canonical_names = {aliasdict.get(n, n) for n in uni_names}
cmu_scores = [
    (scores[name_idx[name]], name)
    for name in canonical_names
    if name in name_idx
]
for s,p in sorted(cmu_scores,reverse=True):
    print('{:30s}\t\t{:.3f}'.format(p,s*100))


In [None]:
du -h *.pkl

In [None]:
import gc
# Run a garbage-collection pass now; the value returned by gc.collect() is shown
# as the cell output.
gc.collect()

In [None]:
# Save the author matrix; `with` guarantees the file is flushed and closed.
with open('xauth.pkl','wb') as fp:
    pickle.dump(Xauth,fp)
