In [None]:
import os
import sys
import fnmatch
import numpy as np
import pandas as pd
import json
import gzip
import pickle
import csv
import scipy.sparse
# Sentinel: the cell that builds the author matrix only runs while Xauth is None.
Xauth = None
from collections import defaultdict
import matplotlib.pyplot as plt

In [None]:
# Install `dok_matrix.my_update`, a bulk "load these {(row, col): value} entries"
# method. The dict-style update of dok_matrix is not reliably available under one
# name (this notebook has to try both `update` and `_update`), so each candidate
# is probed on a scratch matrix and the first one that really stores the entry is
# used. If neither works, entries are written one at a time through indexing.
def _dok_assign_items(self, data):
    """Fallback bulk loader: write every ``(row, col) -> value`` pair via indexing."""
    for key, value in data.items():
        self[key] = value


def _select_dok_update():
    """Return the first dok_matrix bulk-update callable that stores a probe entry."""
    for method_name in ('update', '_update'):
        candidate = getattr(scipy.sparse.dok_matrix, method_name, None)
        if candidate is None:
            continue
        probe = scipy.sparse.dok_matrix((10, 10))
        try:
            candidate(probe, {(0, 0): 1.0})
            if probe[0, 0] == 1.0:
                return candidate
        except Exception:
            # Only ordinary errors mean "this variant is unusable"; unlike a bare
            # `except:`, KeyboardInterrupt/SystemExit still propagate.
            continue
    return _dok_assign_items


scipy.sparse.dok_matrix.my_update = _select_dok_update()

In [None]:
def _read_gz_pickle(path):
    """Unpickle and return the object stored in the gzip-compressed file at `path`."""
    # NOTE(review): pickle executes code on load; only use files produced locally.
    with gzip.open(path, 'rb') as handle:
        return pickle.load(handle)


# Venue list, author list and paper records, in that order.
all_venues = _read_gz_pickle('useful_venue_list.pkl.gz')
all_authors = _read_gz_pickle('useful_authors_list.pkl.gz')
all_papers = _read_gz_pickle('useful_papers.pkl.gz')


In [None]:
# Field 6 of a paper record is its year (the 13-way unpacking used further down
# names it `year`). The first and last records are taken as the earliest and
# latest years, which presumes all_papers is ordered by year -- TODO confirm.
min_year, max_year = all_papers[0][6], all_papers[-1][6]
span_years = 1 + max_year - min_year
print(min_year,max_year,span_years)

# Sizes of the three collections.
n_confs = len(all_venues)
n_auths = len(all_authors)
n_papers = len(all_papers)

# Position lookups: venue name -> index and author name -> index.
conf_idx = dict(zip(all_venues, range(n_confs)))
name_idx = dict(zip(all_authors, range(n_auths)))
print(n_confs,n_auths,n_papers)

In [None]:
# Per-slot values loaded from disk. Other cells index this array as
# years_per_conf*venue_index + year_block, i.e. a fixed run of slots per venue.
clf = np.load('clf_gold.pkl.npy')
years_per_conf = len(clf) // n_confs

# Width (in years) of one slot, recovered from the overall year span.
# NOTE(review): presumably this must match the block size used when clf was
# produced -- confirm against the script that wrote clf_gold.pkl.npy.
YEAR_BLOCKS = span_years // years_per_conf

In [None]:
import scipy.sparse
import gc

# Build Xauth: one row per author, one column per (venue, year-block) slot, holding
# the author's accumulated positional credit in that slot. Guarded by the Xauth
# sentinel so re-running the cell does not rebuild the matrix.
if Xauth is None:
    # Credit split for a paper with n authors: the k-th listed author (1-based)
    # gets weight proportional to 1/k, normalised so the weights sum to 1.
    # Cached per author count because the vector depends only on n.
    per_author_val = {}

    # Credit is accumulated straight into the sparse dict. The previous version
    # first materialised a per-paper list of triples (`paper_vecs`) and then tried
    # to release it by resetting `paper_vec` -- the loop variable -- so that large
    # list stayed alive. Accumulation order is unchanged, so the sums are identical.
    xdict = {}
    for paper in all_papers:
        tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
        n = len(authors)
        if n not in per_author_val:
            author_scores = 1/(np.arange(n)+1)
            per_author_val[n] = author_scores/author_scores.sum()

        # Column of this paper's (venue, year-block) slot.
        j = years_per_conf*conf_idx[venue] + (year-min_year)//YEAR_BLOCKS

        for a, v in zip(authors, per_author_val[n]):
            key = (name_idx[a], j)
            xdict[key] = v + xdict.get(key, 0)

    Xauth = scipy.sparse.dok_matrix((n_auths,years_per_conf*n_confs))
    Xauth.my_update(xdict)

    # CSR is the format used for the matrix-vector products that follow.
    Xauth = scipy.sparse.csr_matrix(Xauth)

    # Release the (large) dict before moving on.
    xdict = {}
    gc.collect()

In [None]:
# Faculty list restricted to rows whose dept is 'RI'; missing cells become 'Other'.
cmu_uni = pd.read_csv('other_ranks/cmu_faculty.csv')
cmu_uni = cmu_uni.fillna('Other')
cmu_uni = cmu_uni[cmu_uni.dept == 'RI']
#uni_names = ['Andrea Tagliasacchi','Paul G. Kry']#['Xuemin Shen','H. Vincent Poor','Kang G. Shin','Mohamed-Slim Alouini','Lajos Hanzo']#list(cmu_uni.name)
#uni_names = list(faculty_affil[faculty_affil.affiliation == 'Johns Hopkins University'].name)
uni_names = list(cmu_uni.name)
print(len(uni_names))

# Membership is tested once per author of every paper, so use a set for O(1)
# lookups instead of scanning the list each time. `uni_names` itself stays a list.
uni_name_set = set(uni_names)

# Per venue: fractional paper count credited to the listed faculty (1/n per
# matching author of an n-author paper) and, in parallel, the same credit
# weighted by the clf value of the paper's (venue, year-block) slot.
conf_counts = {}
conf_counts_value = {}

for paper in all_papers:
    tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    n = len(authors)
    for a in authors:
        if a in uni_name_set:
            slot_value = clf[years_per_conf*(conf_idx[venue]) + (year-min_year)//YEAR_BLOCKS]
            conf_counts[venue] = 1/n + conf_counts.get(venue,0)
            conf_counts_value[venue] = slot_value/n + conf_counts_value.get(venue,0)

# Turn the weighted sums into a credit-weighted mean clf value per venue.
conf_counts_value = {k: v/conf_counts[k] for k,v in conf_counts_value.items()}

# One tuple per venue: (count * mean value, venue, count, mean value), listed in
# descending order of (count, venue name).
ri_fav_confs = [
    (conf_counts[conf]*conf_counts_value[conf], conf, conf_counts[conf], conf_counts_value[conf])
    for count, conf in sorted(((v, k) for k, v in conf_counts.items() if v > 0), reverse=True)
]

In [None]:
# Venue names whose fractional paper count is at least 1, ordered by the
# ri_fav_confs tuples from largest to smallest.
confs_to_filter = [
    conf
    for _weighted, conf, count, _mean_value in sorted(ri_fav_confs, reverse=True)
    if count >= 1.0
]

In [None]:
# Year blocks starting before `only_after` are masked out of the score.
only_after = 1970
start_year_idx = max(0, only_after - min_year) // YEAR_BLOCKS

# Mask with one row per venue and one column per year block (1 = counts).
year_filter = np.zeros_like(clf).reshape((-1, years_per_conf))

# Off by default: when enabled, only the venues in confs_to_filter are kept.
restrict_to_filtered_confs = False
if restrict_to_filtered_confs:
    for conf in confs_to_filter:
        year_filter[conf_idx[conf], start_year_idx:] = 1
else:
    year_filter[:, start_year_idx:] = 1

# Author score = credit in each slot times that slot's (masked) clf value.
masked_clf = clf * year_filter.ravel()
total_scores = Xauth.dot(masked_clf)

In [None]:
# Author indices ordered by total score, highest first; print the first ten.
best_idx = np.argsort(total_scores)[::-1]
row_template = '{:30s}\t{:.2f}'
for k in range(10):
    idx = best_idx[k]
    print(row_template.format(all_authors[idx], total_scores[idx]))

# Build the APM matrix

In [None]:
# Drop the reference to the author-by-slot matrix built earlier, then run the
# garbage collector before the next matrix is allocated under the same name.
Xauth = None
gc.collect()

In [None]:
# Regression inputs: Xauth has one row per paper and one column per author, with
# each listed author given an equal 1/n share; y holds the clf value of the
# paper's (venue, year-block) slot.
Xauth = scipy.sparse.dok_matrix((n_papers, n_auths))
y = np.zeros(n_papers)
xdict = {}

for i, paper in enumerate(all_papers):
    (tag, title, authors, venue, pages, startPage, year,
     volume, number, url, publtype, eb_toofew, eb_skip) = paper
    n = len(authors)
    # 1/n is evaluated per author, so a paper with no authors adds nothing.
    xdict.update({(i, name_idx[a]): 1/n for a in authors})
    ji = years_per_conf*conf_idx[venue] + (year - min_year)//YEAR_BLOCKS
    y[i] = clf[ji]

Xauth.my_update(xdict)
Xauth = scipy.sparse.csr_matrix(Xauth)

# Reset the scratch containers and reclaim memory.
paper_vec = []
xdict = {}
gc.collect()

In [None]:
# Raw distribution of the per-paper targets.
_ = plt.hist(y,50)
plt.title('y')

# Same targets after standardising to zero mean and unit variance.
plt.figure()
_ = plt.hist((y-y.mean())/y.std(),50)
plt.title('standardised y')

# A bare expression in the middle of a cell is silently discarded, so print the
# statistics explicitly (order kept: std, then mean).
print(y.std(),y.mean())

# Alternative target yt: positive entries are replaced by their standardised log;
# everything is then clipped to [-3.5, 3.5]. Non-positive entries keep their raw
# value (subject to the same clip).
plt.figure()
yt = np.copy(y)
where = np.where(y > 0)
yt2 = np.log(yt[where])
yt[where] = (yt2-yt2.mean())/yt2.std()
yt = np.clip(yt,-3.5,3.5)
_ = plt.hist(yt,50)
plt.title('clipped standardised log(y)')


In [None]:
from sklearn.linear_model import SGDRegressor

# Huber-loss linear model with L2 penalty, one coefficient per author, fitted on
# the paper-by-author matrix. SGD shuffles the samples, so a fixed random_state
# is set to make the fitted coefficients (and every ranking derived from them)
# reproducible across kernel restarts. `loss` is passed by keyword for clarity.
clf2 = SGDRegressor(loss='huber',alpha=1e-3,penalty='l2',tol=1e-6,max_iter=1000,
                    average=True,verbose=1,random_state=0) #,fit_intercept=False
#clf2.fit(Xauth,yt)
clf2.fit(Xauth,y)

In [None]:
# Per-author coefficients of the fitted model, standardised to zero mean and
# unit variance, shown on a log-count histogram.
raw_coef = clf2.coef_
apm_weights = (raw_coef - raw_coef.mean()) / raw_coef.std()
_ = plt.hist(apm_weights, 100, log=True)

In [None]:
# `import scipy.sparse` does not guarantee that `scipy.stats` is loaded; without
# this import the cell only works if some other import happened to pull it in.
import scipy.stats

# Rank correlation between the regression weights and the venue-weighted scores.
scipy.stats.spearmanr(apm_weights,total_scores)

In [None]:
# Authors ordered by standardised regression weight, highest first; print the
# first 180 as name, total score, weight.
best_idx = np.argsort(apm_weights)[::-1]
row_template = '{:30s}\t{:.2f}\t{:.4f}'
for k in range(180):
    idx = best_idx[k]
    print(row_template.format(all_authors[idx], total_scores[idx], apm_weights[idx]))

In [None]:
# (regression weight, name) for every faculty name found in the author index,
# largest weight first.
uni_scores = sorted(
    ((apm_weights[name_idx[name]], name) for name in uni_names if name in name_idx),
    reverse=True,
)
uni_scores

In [None]:
# (total score, name) for every faculty name found in the author index,
# largest score first.
uni_scores = sorted(
    ((total_scores[name_idx[name]], name) for name in uni_names if name in name_idx),
    reverse=True,
)
uni_scores

In [None]:
# Hand-picked author names to look up in the rankings below.
curious_names = [
    'Xiaolong Wang 0004',
    'Judy Hoffman',
    'Paris Siminelakis',
    'Roie Levin',
    'Leonid Keselman',
    'Nicholas Rhinehart',
    'Vincent Sitzmann',
    'Siddharth Ancha',
    'Xingyu Lin',
    'Humphrey Hu',
    'David F. Fouhey',
    'Chelsea Finn',
    'Lerrel Pinto',
    'Justin Johnson',
    'Amir Roshan Zamir',
    'Dominik Peters',
    'Jonathan T. Barron',
    'Dorsa Sadigh',
    'Derek Hoiem',
    'Vaggos Chatziafratis',
    'Brian Okorn',
    'David Held',
]

In [None]:
# (regression weight, name) for each name in curious_names that exists in the
# author index, largest weight first.
uni_scores = sorted(
    ((apm_weights[name_idx[name]], name) for name in curious_names if name in name_idx),
    reverse=True,
)
uni_scores

In [None]:
# (total score, name) for each name in curious_names that exists in the author
# index, largest score first.
uni_scores = sorted(
    ((total_scores[name_idx[name]], name) for name in curious_names if name in name_idx),
    reverse=True,
)
uni_scores

In [None]:
# First batch of names scored by the two cells that follow.
prev_cand = [
    'Pulkit Agrawal',
    'Joydeep Biswas',
    'Katherine L. Bouman',
    'David Braun',
    'Naomi T. Fitter',
    'David F. Fouhey',
    'Saurabh Gupta',
    'Judy Hoffman',
    'Hanbyul Joo',
    'Changliu Liu',
    'Petter Nilsson',
    "Matthew O'Toole",
    'Alessandro Roncone',
    'Alanson P. Sample',
    'Manolis Savva',
    'Adriana Schulz',
    'Amy Tabb',
    'Fatma Zeynep Temel',
    'Long Wang',
    'Cathy Wu',
    'Ling-Qi Yan',
]

In [None]:
# (regression weight, name) for each name in prev_cand that exists in the author
# index, largest weight first.
uni_scores = sorted(
    ((apm_weights[name_idx[name]], name) for name in prev_cand if name in name_idx),
    reverse=True,
)
uni_scores

In [None]:
# (total score, name) for each name in prev_cand that exists in the author
# index, largest score first.
uni_scores = sorted(
    ((total_scores[name_idx[name]], name) for name in prev_cand if name in name_idx),
    reverse=True,
)
uni_scores

In [None]:
# Second batch of names; this rebinds prev_cand, replacing the earlier list.
prev_cand = [
    'Yonatan Bisk',
    'Angela Dai',
    'Abe Davis',
    'Tali Dekel',
    'Jaime F. Fisac',
    'Zakia Hammal',
    'Josie Hughes',
    'László A. Jeni',
    'Angjoo Kanazawa',
    'Deepak Pathak',
    'Lerrel Pinto',
    'Elaine Short',
    'Wen Sun 0002',
    'Jiajun Wu',
    #'Ji Zhang', # disambig
    'Jun-Yan Zhu',
    'Yuke Zhu',
]

# Names with no entry in the author index (these are skipped by the score cells).
[name for name in prev_cand if name not in name_idx]

In [None]:
# (regression weight, name) for each name in the current prev_cand list that
# exists in the author index, largest weight first.
uni_scores = sorted(
    ((apm_weights[name_idx[name]], name) for name in prev_cand if name in name_idx),
    reverse=True,
)
uni_scores

In [None]:
# (total score, name) for each name in the current prev_cand list that exists in
# the author index, largest score first.
uni_scores = sorted(
    ((total_scores[name_idx[name]], name) for name in prev_cand if name in name_idx),
    reverse=True,
)
uni_scores