# configure weight generation

In [None]:
import os
import sys
import fnmatch
import numpy as np
import pandas as pd
import json
import gzip
import pickle
import csv
import scipy.sparse
Xauth = None
from collections import defaultdict

In [None]:
# scipy's dok_matrix blocks the public dict-style ``update`` in many releases
# and its private bulk loader ``_update`` is not available in all of them.
# Probe what the installed scipy actually supports and expose a working bulk
# loader under one name, ``dok_matrix.my_update``, which later code calls.
def _dok_update_by_assignment(self, data):
    """Store every ``(row, col) -> value`` pair of *data* via item assignment."""
    for key, value in data.items():
        self[key] = value


def _dok_bulk_update_works(method_name):
    """Return True if ``dok_matrix.<method_name>`` accepts a dict and stores it."""
    probe = scipy.sparse.dok_matrix((10,10))
    try:
        getattr(probe, method_name)({(0,0):1.0})
    except Exception:
        # Capability probe: any failure just means "not usable here".
        # Unlike a bare ``except:`` this lets KeyboardInterrupt/SystemExit through.
        return False
    # make sure the value really ended up in the matrix
    return bool(probe[0,0] == 1.0)


if _dok_bulk_update_works('update'):
    scipy.sparse.dok_matrix.my_update = scipy.sparse.dok_matrix.update
elif _dok_bulk_update_works('_update'):
    scipy.sparse.dok_matrix.my_update = scipy.sparse.dok_matrix._update
else:
    # slower, but relies only on the public item-assignment API
    scipy.sparse.dok_matrix.my_update = _dok_update_by_assignment


In [None]:
# Run configuration. The REGRESSION_* environment variables override the
# defaults so the same code can be executed from the command line.
weight_to_get_idx = int(os.environ.get('REGRESSION_TASK_IDX',0))
NORM_VOLUME = float(os.environ.get('REGRESSION_SIZE_NORM',2))
NORM_CONF_NUM = bool(int(os.environ.get('REGRESSION_NORM_CONF_NUM',1)))

# only be chatty when no task index was supplied through the environment
be_verbose = 'REGRESSION_TASK_IDX' not in os.environ

# which target the regression is fitted against
WEIGHT_TO_GET = ('faculty','nsfmarginal','nsftotal','salary')[weight_to_get_idx]
USE_LOG = weight_to_get_idx == 1
# TOP_K is only meaningful for the 'faculty' target
TOP_K = 75 if WEIGHT_TO_GET == 'faculty' else 0
SGD_ITER = 80
YEAR_BLOCKS = 2 # 1 uses a by-year model
BY_YEAR_SIGMA = {0: 2, 1: 2, 2: 4, 3:4}[weight_to_get_idx] # how many years to splat the by-year model
weight_file_template = 'weights_{}_above6_{}_{}_{}_{}_{}.pkl'
L2REG = 3e-3
LRPRINT = -int(np.log10(L2REG)*10)
NORM_YEARS = False


def _weight_file_name(run_no):
    """Fill the file-name template for the current settings and run number."""
    return weight_file_template.format(WEIGHT_TO_GET,'log' if USE_LOG else 'linear',YEAR_BLOCKS,TOP_K,LRPRINT,run_no)


if 'REGRESSION_TASK_IDX' in os.environ:
    # command line execution always overwrites run 0
    i = 0
else:
    # first run number (0..49) whose file does not exist yet; 49 if all exist
    i = next((run_no for run_no in range(50) if not os.path.exists(_weight_file_name(run_no))), 49)
weight_file = _weight_file_name(i)
weight_file

# load data

In [None]:
def _load_gz_pickle(path):
    """Return the object stored in the gzip-compressed pickle file *path*."""
    with gzip.open(path,'rb') as handle:
        return pickle.load(handle)


# NOTE: unpickling executes code embedded in the file; only load trusted data.
all_venues = _load_gz_pickle('useful_venue_list.pkl.gz')
all_authors = _load_gz_pickle('useful_authors_list.pkl.gz')
all_papers = _load_gz_pickle('useful_papers.pkl.gz')

In [None]:
# Lookup tables: venue name -> venue index, author name -> author index.
conf_idx = {v:i for i,v in enumerate(all_venues)}
name_idx = {v:i for i,v in enumerate(all_authors)}
n_confs = len(all_venues)
n_auths = len(all_authors)
# Read through a context manager so the file handle is closed right away
# instead of being left to the garbage collector.
with open('old_version/r1_confs.pkl','rb') as fp:
    r1_confs = pickle.load(fp)
# membership table for the venues listed in r1_confs
r1_confs_dict = {_:1 for _ in r1_confs}

In [None]:
# Faculty roster; later code reads its `name` and `affiliation` columns.
faculty_affil = pd.read_csv('faculty-affiliations.csv')
# University ranking table; later code reads its `uni` column row by row.
ranks = pd.read_csv('other_ranks/ranks.csv')
def csv2dict_str_str(fname):
    """Read a CSV file into a ``{first column: second column}`` dict.

    Both fields are stripped of surrounding whitespace and any further
    columns are ignored.  When a key occurs more than once the last row wins.
    Blank lines and rows with fewer than two fields are skipped instead of
    raising ``IndexError``.

    Args:
        fname: path of the CSV file to read.

    Returns:
        dict mapping the stripped first field to the stripped second field.
    """
    # newline='' is what the csv module expects so it can handle line endings
    # (including newlines embedded in quoted fields) itself
    with open(fname, mode='r', newline='') as infile:
        return {
            row[0].strip(): row[1].strip()
            for row in csv.reader(infile)
            if len(row) >= 2
        }
# Alias table built from the first two columns of the file; later code uses
# it as aliasdict.get(name, name), i.e. unknown names map to themselves.
aliasdict = csv2dict_str_str('dblp-aliases-expanded.csv')

In [None]:
# The NSF table is only needed for the two NSF targets
# ('nsfmarginal' and 'nsftotal' both contain 'nsf').
if 'nsf' in WEIGHT_TO_GET :
    df_nsf = pd.read_pickle('nsf2.pkl')

In [None]:
# Register a "First Last" shorthand for every author so that sources which
# only carry a first and last name can still be resolved to an author index.
# When the name has more than two parts and its final part is not purely
# alphabetic (e.g. a numeric suffix), the part before it is used as the last
# name.  Existing keys are never overwritten, so the first author wins.
for author_pos, full_name in enumerate(all_authors):
    name_parts = full_name.split(' ')
    skip_final_part = len(name_parts) > 2 and not name_parts[-1].isalpha()
    last_name = name_parts[-2] if skip_final_part else name_parts[-1]
    name_idx.setdefault(name_parts[0] + ' ' + last_name, author_pos)

In [None]:
# Authors whose name ends in a purely numeric token.
numbered_names = {name for name in all_authors if name.split(' ')[-1].isdigit()}

# Their name with the number removed, plus "first token + token before the number".
ambi_numbers = [' '.join(name.split(' ')[:-1]) for name in numbered_names]
ambi_numbers.extend(name.split(' ')[0] + ' ' + name.split(' ')[-2] for name in numbered_names)

# Count how many authors would end up under each full or shortened name.
clobber_names = {}


def _count_name(key):
    """Increase the usage count of *key* in ``clobber_names`` by one."""
    clobber_names[key] = clobber_names.get(key,0) + 1


for name in all_authors:
    _count_name(name)
for name in all_authors:
    ns = name.split(' ')
    if ns[-1].isdigit():
        # the name without its number, and (for longer names) first + last
        _count_name(' '.join(ns[:-1]))
        if len(ns) > 3:
            _count_name(ns[0] + ' ' + ns[-2])
    elif len(ns) > 2:
        # the first + last shorthand of a name with middle parts
        _count_name(ns[0] + ' ' + ns[-1])

# any name claimed more than once cannot be matched safely
clobbered_names = [k for k,v in clobber_names.items() if v > 1]
ambi_numbers = set(ambi_numbers+clobbered_names)
len(clobbered_names),len(ambi_numbers)

In [None]:
# Names that must not be matched automatically: start from the numbered /
# clobbered names computed above, then append hand-collected names.
# Repeated entries are harmless because the list is turned into a set below.
ambiguous_matches = list(ambi_numbers)
ambiguous_matches += [ 'John Anderson','George Lakoff','Barbara White','William Bechtel', 'Hooman Darabi',  'Ermanno Bencivenga','George Lakoff','Robert Martin','David Kaplan','Seth Yalcin','Giacomo Bonanno','Adam Sennet','John Strain','Ohyun Kwon','Thomas Schwartz','Wendy Fong','Steven Lehman','Jeffrey Hoyt','Michael Katz','Gregory J. O. Beran','Kenneth Leung','Won Kim','Michael Ball','William Evans','Zheng Sun','John Kim','Tao Wang','William Smith','Richard Wallace','David Cohen','Wei Liu','Wei Wei','Bo Liu','Yuan Xie','Bo Li','Song-Chun Zhu','Vladimir Chernov','Richard Wilson','Gang Li','Jun Liu','Li Tian','Li Li','Ran Wei','Lin Zhang','Feng Wang','Wei Wang','Xi Zhang','Lei Wang','Yu Zhang','Jun Li','Gang Liu','Dong Wang','Gang Zeng','Yue Zhang','Jing Huang','Wei Chen','Yue Wang','Jing Liu','Han Li','Xi Chen','Yang Yang','Xiangmin Xu','Rong Wang','Xin Liu','Lei He','Feng Guo','Yan Zhang','Wei Xiong','Zheng Zhang','Fei Xu','Rui Wang','Yuan Zhang','Yu Zhou','Jing Yang','Li Zhang','Ke Zhang','Jiang Du','Kwang-Ting Cheng','Kai Liu','Hong Liu','Xia Li','Zhu Wang','Xiang Zhang','Liming Zhang','Qi Zhang','Feng Liu','Xu Chen','Bingyuan Liu','Xiaohua Huang','Yu Qiao','Qing Zhao','Chao Wang','Bin Chen','Pei Chen','Xu Yang','Yue Liu','Aditi Chandra','Yang Liu','Dong Li','Chen Li','Zhenbiao Yang','Yi Zhang','Tao Jiang','Jian Zhang']
ambiguous_matches += ['Curtis Roads','David Wessel','Kaija Saariaho','Stephen Morris','George Taylor','William Miller','Behzad Razavi','Yu Hu','Ning Li','Gert Cauwenberghs','Ying Wei','Payam Heydari','Ping Liang','Babak Daneshrad','Zhiying Wang','Alexander Vardy','Wenjun Zhang','Rajarshi Mukherjee','Sangho Shin','Majid Sarrafzadeh','Upamanyu Madhow','Hao Chen','Edward Lee','Yi Chen','Yong Kim','Michael Cheng','Xiaolong Li','Kenneth Rose','Ke Xu','Chen-Nee Chuah','Wentai Liu','Qiang Zhou','Jiong Li','Hadi Esmaeilzadeh']
ambiguous_matches += ['William White','Robert Rosenthal','Daniel Schneider','Min Zhao','Mark Wilson']
# disabled batch of names, kept for reference:
#ambiguous_matches += ['Ming Liu','Shantanu Sinha','Jishen Zhao','Yanhong Li','Lifeng Lai','Young-Han Kim','Somayeh Sojoudi','Shuguang Cui','Mahnoosh Alizadeh','Kannan Ramchandran','Tara Javidi','Anant Sahai','Steven Weber','Shaolei Ren','Jing Xu','Lara Dolecek','Kamalika Chaudhuri','Massimo Franceschetti','Ramtin Pedarsani','Bin Yu','Francesco Bullo','David Tse','Hao Li','Benjamin Recht','Marco Levorato','Duncan Callaway','Lior Pachter','Paulo Tabuada','Timothy Brown','Joseph Wang','Fabio Pasqualetti','Sharon Aviran','Kang Zhang','Jing Wang','Michael Bell','Anil Aswani','George Varghese','Francesco Borrelli','Phong Nguyen','Athina Markopoulou','Hamid Jafarkhani','Po-Ning Chen','Wotao Yin','Khoa Nguyen','Shu Lin','Kameshwar Poolla','Yueyue Fan','Dipak Ghosal','Abhay Parekh']
ambiguous_matches +=['Zhe Chen','Yizhou Sun','Yanhong Liu','Shachar Lovett','Rina Dechter','Borivoje Nikolic','Krste Asanovic','Elad Alon','Ying Hu','Ye Li','Xin Chen',]
ambiguous_matches +=['Steven Lee','Wen Jiang','Yu Huang','John Campbell','Wei Zhou','Gang Chen','Shivendu Shivendu','Vijay Khatri','Yi Sun','Yong Chen','Bin Liu','Mark Anderson','Erik Rolland','Jing Zhao','Li Fan','Yi Xie','Li Cai','Jia Shen','Lili Yang','Tong Wang','John Campbell','Wei Zhou','Gang Chen','Lixia Zhang','Sujit Dey','Jiasi Chen','Jin Zhang','Lin Liu','Mary Hegarty',]
ambiguous_matches += ['Xin Zhou','Min Li','Huiying Li','Dong Yu','Yong Huang','Jing Shi','Zhiwei Zhang','Xiao Hu']
ambiguous_matches += ['Joseph Barton','Jason Woo','Evelyn S. Tecoma','Bei Wang']
ambiguous_matches += ['Bin Yu','Yi Tang','Jing Wang','Wei Ren']
# 'Lin Lin' and 'Zhi Ding' are two separate names; without the comma Python
# joins the adjacent literals into the single bogus entry 'Lin LinZhi Ding'.
ambiguous_matches += ['Ming Gu','Wei Xu','Bo Yu','Tao Yang','Lin Lin','Zhi Ding','Tao Ye','Kai Zhu','Steven Weber']
# Further hand-collected names that must not be matched automatically.
ambiguous_matches += ['Muhammad Arif','Ke Li','Ming Liu','Jiawei Chen','Hao Li','Yang Xu','Xin Guo','Hao Cheng','Ye Zhang']
ambiguous_matches += ['David Pearce','Ilya Dumer','Richard Bamler','Pablo Tamayo','Pamela Samuelson','Michael Pratt','Bruce Blumberg','Patrick Farrell','Albert Lai','Phong Nguyen','Michael Levine','Marco Conti','Oliver Arnold','Deepak Gupta','Chun-Nan Hsu','Jun Wu','Hui Sun','Michael Franklin','Richard Allen','Ida Sim']
ambiguous_matches += ['Michael Rios','Joel Watson','Lara Buchak','Michael Weiner','Venkatesan Sundaresan','Hoori Ajami','Neil Jones','Yi-Lin Yang','Adrian Preda','Jonathan Wurtele','Kumar Sharma','Phioanh Nghiemphu','Mark Asta','Lei Song','Boris Maciejovsky','Nader Pourmand','Fang Wei','Jie Zheng','Shane White','Stephen Small','Xiaobin Yang','Steven Evans','Jiming Jiang']
ambiguous_matches += ['Robin Hill','Marcus Opp','Yimin Zou','Lera Boroditsky','Morana Alac','Diba Mirza','Volkan Rodoplu','Michael Ryan','Grace Chang']

# names found through the GLOBECOM and ICC venues
ambiguous_matches += ['Izhak Rubin','Zhi Ding','Shuguang Cui','Albert Chan','Lei Cao','Robert Cohen','Ming Xiao','Hyong Kim','Massimo Tornatore','Bill Lin','Daniel Lee',"Daniel O'Neill",'Lin Tian','Go Hasegawa','Meng Chen','Haitao Zheng','Jing Xu','Biao He','Bo Wei','Eric Wong','Lin Lin','Xu Wang','Tao Chen','Hong Zhou']

# still not easy
ambiguous_matches += ['Wu Li','Thomas Strohmer','Alfred Kobsa','Itay Neeman','Jonathan Furner','Jinyi Qi','Peng Ding','Alexandru Nicolau','Itay Neeman','Kang Zhang','Ilan Adler',"Barry O'Neill",'Gideon Weiss']

# the pagerank gods
ambiguous_matches += ['Wei Zhang','Wei Li','Lei Wang','Jing Li','Yang Liu','Yu Zhang','Lei Zhang','Jun Wang','Li Zhang','Jing Wang','Xin Wang','Hai Jin','Hui Li','Jian Wang','Yan Li','Jing Zhang','Wen Gao','Li Li','Wei Chen','Wei Liu','Yang Li','Yan Zhang','Yang Yang','Jun Zhang','Yong Wang','Xin Li','Yan Wang']

# a set gives fast membership tests and removes the repeated entries
ambiguous_matches = set(ambiguous_matches)
if WEIGHT_TO_GET == 'salary':
    # Match University of California salary records to known authors.
    # Result: ca_pay_prof maps the salary-record name to
    # (author index, highest 'Total Pay & Benefits' over the loaded years).

    uc_profs = faculty_affil[faculty_affil.affiliation.str.contains('University of California')]
    # salary data
    dt = {'Employee Name': str,
    'Job Title': str,
    'Base Pay': float,
    'Overtime Pay': float,
    'Other Pay': float,
    'Benefits': float,
    'Total Pay': float,
    'Total Pay & Benefits': float,
    'Year': float,
    'Notes': str,
    'Agency': str,
    'Status': str}
    na_values = [ 'Aggregate','#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan', 'null']
    dfs = [pd.read_csv('./download/university-of-california-{}.csv'.format(i),dtype=dt,na_values=na_values) for i in range(2015,2019)]
    # keep job titles containing 'PROF', and only the two columns used below
    dfs = [_[_['Job Title'].str.contains('PROF')] for _ in dfs]
    #dfs = [_[_['Job Title'].str.contains('B/E/E')] for _ in dfs]
    dfs = [_[['Employee Name','Total Pay & Benefits']] for _ in dfs]
    dfs = [_.reset_index(drop=True) for _ in dfs]
    from collections import defaultdict
    # highest pay seen per employee name across all years
    ca_pay = defaultdict(int)
    for df in dfs:
        df = df.fillna(0)
        for row in df.itertuples():
            # row[0] is the index, row[1] the name, row[2] the pay
            ca_pay[row[1]] = max(ca_pay[row[1]],row[2])

    keys = list(ca_pay.keys())
    ca_pay_prof = {}
    for name in keys:
        if not isinstance(name, str):
            # fillna(0) above turns a missing employee name into the number 0
            continue
        if name in ambiguous_matches:
            continue
        name_s = name.split(' ')
        # name_idx stores the shorthand as "First Last" WITH a space; the
        # previous space-less concatenation could never match any key.
        first_last = name_s[0] + ' ' + name_s[-1]
        if name in name_idx:
            n = name
        elif first_last in name_idx and first_last not in ambiguous_matches:
            # the shorthand is subject to the same ambiguity filter
            n = first_last
        else:
            continue
        ca_pay_prof[name] = (name_idx[n] ,ca_pay[name])

    ca_prof_n = len(ca_pay_prof)

In [None]:
#[df.shape for df in dfs]

In [None]:
if WEIGHT_TO_GET == 'salary':
    # membership table of the UC faculty names (only consumed by the
    # currently disabled filter on ca_pay_prof)
    uc_prof_dict = dict.fromkeys(uc_profs.name, 1)
    i = 0
    # report how many salary records were matched to an author
    ca_prof_n = len(ca_pay_prof)
    print('faculty found:',ca_prof_n)

In [None]:
# Disabled (never executed): an alternative salary loader that reads from
# 'downloads/' rather than './download/' and additionally keeps only job
# titles containing 'B/E/E'.
if False:
    # salary data
    dt = {'Employee Name': str,
    'Job Title': str,
    'Base Pay': float,
    'Overtime Pay': float,
    'Other Pay': float,
    'Benefits': float,
    'Total Pay': float,
    'Total Pay & Benefits': float,
    'Year': float,
    'Notes': str,
    'Agency': str,
    'Status': str}
    na_values = [ 'Aggregate','#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan', 'null']
    dfs = [pd.read_csv('downloads/university-of-california-{}.csv'.format(i),dtype=dt,na_values=na_values) for i in range(2015,2019)]
    # notebook leftover: the value of this expression is discarded
    [_.shape for _ in dfs],sum([_.shape[0] for _ in dfs])
    dfs = [_[_['Job Title'].str.contains('PROF')] for _ in dfs]
    dfs = [_[_['Job Title'].str.contains('B/E/E')] for _ in dfs]


In [None]:
#dfs[0][dfs[0]['Total Pay & Benefits'] > 1e6]
#dfs[0][dfs[0]['Employee Name'] == 'Stefano Soatto']
#ASSOC PROF-AY-B/E/E #PROF-AY-B/E/E #PROF-AY-B/E/E PROF-AY-B/E/E 	73921.0
#PROF-AY  B/E/E
# dfs[0]

# generate years and authorship matrix

In [None]:
# plt is needed below and by later cells regardless of BY_YEAR_SIGMA, so it is
# imported unconditionally (it used to be imported only inside the
# BY_YEAR_SIGMA branch while being used outside of it).
import matplotlib.pyplot as plt

# Field 6 of a paper record is its year; the first and last record supply the
# range (assumes all_papers is ordered by year - TODO confirm).
min_year = all_papers[0][6]
max_year = all_papers[-1][6]
span_years = max_year - min_year + 1


if YEAR_BLOCKS!=0:
    # block index of every year in the span, e.g. [0,0,1,1,...] for YEAR_BLOCKS == 2
    offset_years = [i//YEAR_BLOCKS for i in range(span_years)]
    year_ind = max(offset_years)+1
    # printable label per block: last two digits of first year + 't' + last year
    year_span_printable = {}
    for i in range(year_ind):
        start_year = offset_years.index(i) + min_year
        end_year = len(offset_years) - 1 - offset_years[::-1].index(i) + min_year
        year_span_printable[i] = str(start_year)[-2:] +'t' + str(end_year)[-2:]
years_per_conf = year_ind if BY_YEAR_SIGMA != 0 else span_years

if BY_YEAR_SIGMA != 0:
    import scipy.stats
    plt.style.use('fivethirtyeight')
    plt.style.use('default')

    # weights[i] is a unit-norm Gaussian kernel centred on year block i,
    # with entries below 0.05 cut to zero
    weights = []
    for i in range(years_per_conf):
        a = np.array([scipy.stats.norm.pdf( (j-i)/BY_YEAR_SIGMA) for j in range(years_per_conf)])
        a[a < 0.05] = 0
        weights.append(a/np.linalg.norm(a))

    # plot the kernels of the blocks containing 2000 and 2018
    plt.figure(figsize=(6,3))
    plt.subplot(1,2,1)
    _ = plt.plot(YEAR_BLOCKS*np.arange(years_per_conf)+min_year,weights[(2000-min_year)//YEAR_BLOCKS],lw=4)
    plt.xlim(1970,2020)

    plt.xticks(np.arange(1970,2021,10),[str(_) for _ in np.arange(1970,2021,10)])
    plt.grid(True)
    plt.tight_layout()

    plt.subplot(1,2,2)
    _ = plt.plot(YEAR_BLOCKS*np.arange(years_per_conf)+min_year,weights[(2018-min_year)//YEAR_BLOCKS],lw=4)
    plt.xlim(1970,2020)
    plt.xticks(np.arange(1970,2021,10),[str(_) for _ in np.arange(1970,2021,10)])
    plt.grid(True)
    plt.tight_layout()

    # the figure only exists in this branch, so it is finished and saved here
    plt.tight_layout()
    plt.savefig('tgauss.pdf')

In [None]:
# Disabled (never executed) experiment: load a venue blacklist and set up a
# table for splitting selected venues into pseudo-venues of fixed relative sizes.
# NOTE(review): uses `itertools`, which is not imported at this point of the
# file, and would override NORM_VOLUME if it were ever enabled.
if False:
    with open('blacklist.pkl','rb') as fp:
        BLACKLIST = pickle.load(fp)
    BLACKLIST = set(BLACKLIST)
    len(BLACKLIST)
    import random
    CONFS_TO_SPLIT = set(['ICRA','CDC','CVPR','NIPS','AAAI','CHI','ICML','IJCAI','CIKM'])
    SPLIT_SIZES = [0.5,0.25,0.13,0.12]
    # running totals of SPLIT_SIZES
    SPLIT_CONST = []
    i =0 
    for s in SPLIT_SIZES:
        i+=s
        SPLIT_CONST.append(i)
    # '<venue>_<size>' -> (blacklisted venue, (venue, size))
    LOOKUP = {}
    for p in zip(BLACKLIST,itertools.product(CONFS_TO_SPLIT,SPLIT_SIZES)):
        k = p[1][0] + '_' + str(p[1][1])
        #print(p,k)

        LOOKUP[k] = p
    NORM_VOLUME = np.sqrt(2)

In [None]:
# Number of papers per (venue, year-block) cell.  Cells are laid out venue by
# venue: cell = years_per_conf * venue index + year block.
count_of_papers = np.zeros(years_per_conf*n_confs)
# debugging aid: the 2018 'IEEE Access' papers
paper_tmp=[]
for paper in all_papers:
    tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    cell = conf_idx[venue]*years_per_conf + (year-min_year)//YEAR_BLOCKS
    count_of_papers[cell] += 1
    if (venue, year) == ('IEEE Access', 2018):
        paper_tmp.append(paper)
# safe divide


In [None]:
# Histogram (300 bins) of the non-zero per-cell paper counts raised to the
# power 1/1.618.
_ = plt.hist(np.power(count_of_papers[np.where(count_of_papers > 0)],1/1.618),300)

In [None]:
#for maxi in np.argsort(count_of_papers)[::-1][:5]:
#maxi= np.argmax(count_of_papers)
    #print(count_of_papers[maxi],all_venues[maxi//years_per_conf],(maxi%years_per_conf)*YEAR_BLOCKS + 1970,len(paper_tmp))

In [None]:
# One row per venue, one column per year block.
tmp_reshape = count_of_papers.reshape((-1,years_per_conf))
# number of venues with at least one paper in each year block
number_of_confs_per_year = np.minimum(tmp_reshape,1).sum(0)

# x axis derived from the data (same formula as the kernel plots) instead of
# a hard-coded 1970-2020 range, which breaks as soon as the data covers a
# different span than that range assumes
block_start_years = min_year + YEAR_BLOCKS*np.arange(years_per_conf)

plt.plot(block_start_years,number_of_confs_per_year)
plt.grid(True)
plt.figure()
plt.plot(block_start_years,tmp_reshape.sum(0))
plt.figure()
# average papers per active venue; empty blocks are shown as 0 instead of 0/0
plt.plot(block_start_years,tmp_reshape.sum(0)/np.maximum(number_of_confs_per_year,1))

if not NORM_CONF_NUM:
    number_of_confs_per_year = np.ones_like(number_of_confs_per_year)
# Per-year-block weight, inversely proportional to the number of active
# venues and rescaled to mean 1.  A block without any venue would give 1/0 =
# inf and turn the mean (and with it every weight) into inf/nan, so the count
# is clamped to at least 1; such a block has no papers and is never used.
confs_norm_vector = 1.0/np.maximum(number_of_confs_per_year,1)
confs_norm_vector /= confs_norm_vector.mean()

In [None]:
# 0/1 indicator per (venue, year-block) cell: does the cell contain any paper?
papers_exist = count_of_papers.copy()
papers_exist[papers_exist > 0] = 1

# Turn the raw counts into a volume normaliser (later code divides by it):
#   NORM_VOLUME == 0 -> no normalisation (all ones)
#   NORM_VOLUME  < 0 -> log(count + 1)
#   otherwise        -> count ** (1 / NORM_VOLUME)
# Empty cells are treated as holding one paper; the result is rescaled to mean 1.
at_least_one = np.maximum(1,count_of_papers)
if NORM_VOLUME == 0:
    count_of_papers = np.ones(years_per_conf*n_confs)
elif NORM_VOLUME < 0:
    count_of_papers = np.log(at_least_one+1)
else:
    count_of_papers = np.power(at_least_one,1/NORM_VOLUME)
count_of_papers /= count_of_papers.mean()

papers_exist.sum()


In [None]:
if BY_YEAR_SIGMA != 0:
    import itertools

    # Block-diagonal smoothing matrix: inside each venue's block of
    # years_per_conf cells, entry (src, dst) is weights[src][dst]; cells of
    # different venues are not connected.
    cell_triples = itertools.product(range(n_confs),range(years_per_conf),range(years_per_conf))
    wdict = {
        (conf*years_per_conf+src, conf*years_per_conf+dst): weights[src][dst]
        for conf, src, dst in cell_triples
    }
    wsa = scipy.sparse.dok_matrix((years_per_conf*n_confs,years_per_conf*n_confs))
    wsa.my_update(wdict)

In [None]:
#wsa.sum(1)

In [None]:
def dd():
    """Return a new ``defaultdict(list)``; used as the factory of ``author_papers``."""
    return defaultdict(list)
if 'nsf' in WEIGHT_TO_GET or 'salary' in WEIGHT_TO_GET:
    from unidecode import unidecode
    # author_papers[lower-cased "first last"][year] -> list of
    # (venue, number of authors), one entry per paper of that person
    author_papers = defaultdict(dd)

    for paper in all_papers:
        tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
        n = len(authors)

        for raw_name in authors:
            # transliterate to ASCII before splitting the name
            name_parts = unidecode(raw_name).split(' ')
            # skip a non-alphabetic final part when the name has more than two parts
            skip_final_part = len(name_parts) > 2 and not name_parts[-1].isalpha()
            last_name = name_parts[-2] if skip_final_part else name_parts[-1]
            first_last = name_parts[0] +' ' + last_name
            # the ambiguity check is done on the original casing
            if first_last not in ambiguous_matches:
                author_papers[first_last.lower()][year].append((venue,n))

In [None]:
def ddn():
    """Return a new ``defaultdict(int)``; used as the factory of ``author_amounts``."""
    return defaultdict(int)
if 'nsf' in WEIGHT_TO_GET:
    # number of grants that passed the filters below
    nsf_paper_n = 0
    # author_amounts[lower-cased "first last"][year]: total amount of NSF
    # funding received by a person up to (and including) a given year
    author_amounts = defaultdict(ddn)
    for i,row in enumerate(df_nsf.itertuples()):
        authors, year, amount = row[3],row[4],row[5]
        # some infinite amounts exist! bad!
        if not np.isfinite(amount):
            continue
        # ignore tiny amounts
        if amount < 1000:
            continue
        for a in authors:
            a = aliasdict.get(a,a)
            split_name = a.split(' ')
            first_last = split_name[0] +' ' + split_name[-1]
            # Split the grant evenly between its investigators.  This used to
            # divide by len(a), i.e. by the number of characters in the name.
            share = amount/len(authors)
            # credit the share to the award year and every later year
            for yr in range(int(year),max_year+1):
                author_amounts[first_last.lower()][yr] += share
        nsf_paper_n += 1

In [None]:
# Per author: [first publication year, last publication year].  The rows
# start at the sentinels [3000, 1000] so that any real year replaces them.
auth_years = np.tile(np.array([3000.0,1000.0]),(n_auths,1))
for paper in all_papers:
    tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    for a in authors:
        i = name_idx[a]
        if year < auth_years[i,0]:
            auth_years[i,0] = year
        if year > auth_years[i,1]:
            auth_years[i,1] = year

In [None]:
if WEIGHT_TO_GET == 'faculty':
    # count_vecs[n]: credit split for a paper with n authors - the author at
    # (1-based) position p gets 1/p, normalised so the credits sum to 1
    count_vecs = {}
    # one list per paper of (author index, venue/year-block cell, credit)
    paper_vecs = []
    for paper in all_papers:
        tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper

        n = len(authors)
        j = years_per_conf*conf_idx[venue] + (year-min_year)//YEAR_BLOCKS

        author_scores = count_vecs.get(n)
        if author_scores is None:
            # first paper with this many authors: compute and cache the split
            by_position = 1/(np.arange(n)+1)
            author_scores = by_position / by_position.sum()
            count_vecs[n] = author_scores
        paper_vecs.append([(name_idx[a],j,v) for a,v in zip(authors,author_scores)])

In [None]:
# Build the sparse design matrix Xreg: one row per sample and one column per
# (venue, year-block) cell (column = years_per_conf * venue index + year block).
# Every paper adds   confs_norm_vector[year block] * credit / count_of_papers[cell]
# to its cell.  All three branches ACCUMULATE into the cell; the nsf and
# salary branches used to overwrite it, so only the last paper that hit a
# cell was counted.
if WEIGHT_TO_GET == 'faculty':
    import scipy.sparse

    # one row per author; the credit comes from the author's position on the paper
    Xauth = scipy.sparse.dok_matrix((n_auths,years_per_conf*n_confs))
    xdict = {}
    for paper_vec in paper_vecs:
        for i,j,v in paper_vec:
            xdict[(i,j)] = confs_norm_vector[j%years_per_conf] * v/count_of_papers[j] + xdict.get((i,j),0) #

    Xauth.my_update(xdict)

    Xauth = scipy.sparse.csr_matrix(Xauth)


    Xreg = scipy.sparse.csr_matrix.copy(Xauth)
    #print(Xauth.shape,Xreg.shape)
elif 'nsf' in WEIGHT_TO_GET:
    # create design matrix: one row per grant
    nsf_paper_n = df_nsf.shape[0]
    Xreg = scipy.sparse.dok_matrix((df_nsf.shape[0],years_per_conf*n_confs))
    xdict = {}
    y = np.zeros(nsf_paper_n,dtype=np.float32)
    for i,row in enumerate(df_nsf.itertuples()):
        authors, year, amount = row[3],row[4],row[5]
        # some infinite amounts exist! bad!
        if not np.isfinite(amount):
            continue
        # ignore tiny amounts
        if amount < 1000:
            continue
        for a in authors:
            a = aliasdict.get(a,a)
            split_name = a.split(' ')
            first_last = split_name[0] +' ' + split_name[-1]
            for year_a,conf_list in author_papers[first_last.lower()].items():
                # only papers published up to the year of the grant count
                if year_a <= year:
                    for paper in conf_list:
                        # paper is (venue, number of authors): each author gets 1/n credit
                        j = years_per_conf*conf_idx[paper[0]] + (year_a-min_year)//YEAR_BLOCKS
                        xdict[(i,j)] = confs_norm_vector[j%years_per_conf] *(1/paper[1])/count_of_papers[j] + xdict.get((i,j),0)

    Xreg.my_update(xdict)

    #print(Xreg.sum())
elif 'salary' == WEIGHT_TO_GET:
    # one row per matched salary record
    Xreg = scipy.sparse.dok_matrix((ca_prof_n,years_per_conf*n_confs))
    xdict = {}
    # row index -> set of distinct venues the person published in
    y_unique_confs = {}
    y = np.zeros(ca_prof_n,dtype=np.float32)
    # fractional (1/n) paper count per person
    y_paper = np.zeros(ca_prof_n,dtype=np.float32)
    for idx,d in enumerate(ca_pay_prof.items()):
        k,v = d
        # v is (author index, pay)
        a = all_authors[v[0]]
        y[idx] = v[1]
        sum_paper = 0
        for year_a,conf_list in author_papers[a.lower()].items():
            for paper in conf_list:
                #if paper[0] not in r1_confs_dict:
                #    continue
                j = years_per_conf*conf_idx[paper[0]] + (year_a-min_year)//YEAR_BLOCKS
                xdict[(idx,j)] = confs_norm_vector[j%years_per_conf]*(1/paper[1])/count_of_papers[j] + xdict.get((idx,j),0)
                sum_paper += 1.0/paper[1]
                y_unique_confs.setdefault(idx,set()).add(paper[0])
        y_paper[idx] = sum_paper
    Xreg.my_update(xdict)
    y_orig = np.copy(y)
    print(Xreg.sum())
    y_unique_confs_vec = np.zeros(ca_prof_n,dtype=np.float32)
    for i in range(ca_prof_n):
        y_unique_confs_vec[i] = len(y_unique_confs.get(i,set()))
print('Design matrix has shape',Xreg.shape)


In [None]:
#scaling_confs = np.nan_to_num(conf_sums.reshape((-1,years_per_conf)).sum(1)/new_conf_sums.reshape((-1,years_per_conf)).sum(1))

In [None]:
if BY_YEAR_SIGMA != 0:
    # Smooth ("splat") every venue's columns over neighbouring year blocks,
    # then rescale so each venue keeps its original total mass.
    Xreg = scipy.sparse.csr_matrix(Xreg)
    wsa = scipy.sparse.csr_matrix(wsa)

    # column sums before smoothing
    conf_sums = np.array(Xreg.sum(0))
    # apply the smoothing kernels
    Xreg = Xreg @ wsa

    # try to handle non-existing years correctly:
    # zero the columns of cells in which no paper exists
    clear_emptys = scipy.sparse.diags(papers_exist)
    Xreg = Xreg @ clear_emptys

    # per-venue ratio of the mass before and after; nan_to_num replaces the
    # non-finite ratios that arise for venues without any remaining mass
    new_conf_sums = np.array(Xreg.sum(0))
    mass_before = conf_sums.reshape((-1,years_per_conf)).sum(1)
    mass_after = new_conf_sums.reshape((-1,years_per_conf)).sum(1)
    scaling_confs = np.nan_to_num(mass_before/mass_after)

    # one factor per venue, repeated for each of its year-block columns
    norm_matrix = scipy.sparse.diags(np.repeat(scaling_confs,years_per_conf))
    Xreg = Xreg @ norm_matrix

In [None]:
# Ad-hoc lookup: list every venue whose name contains this fragment.
[_ for _ in all_venues if 'From Database to Cyb' in _]

In [None]:
confs_with_weird = ['GLOBECOM','ICC']
if weight_to_get_idx == 3: # TEST:to find bad alias names
    # salary-record names, in the row order of the design matrix
    ca_prof_names = list(ca_pay_prof.keys())
    # per person: total design-matrix weight inside the listed venues
    vector = np.zeros(Xreg.shape[0])
    for conf_to_test in confs_with_weird:
        # the venue owns the years_per_conf columns starting at first_col
        first_col = years_per_conf*conf_idx[conf_to_test]
        vector2 = np.array(Xreg[:,first_col:first_col+years_per_conf].todense())
        vector += vector2.sum(1)
    # To inspect the result, walk np.argsort(vector)[::-1] and print
    # ca_prof_names[idx] together with vector[idx] for the entries above zero.


In [None]:
# Build the target vector y for the selected task.
if WEIGHT_TO_GET == 'faculty':
    # 1 for every author listed as faculty at one of the first TOP_K ranked
    # universities, 0 otherwise
    y = np.zeros(n_auths)
    for i in range(TOP_K):
        uni_name = ranks.iloc[i]['uni']
        uni_faculty = faculty_affil[faculty_affil.affiliation == uni_name]
        uni_names = np.array(uni_faculty.name)
        for name in set([aliasdict.get(n, n) for n in uni_names]):
            if name in name_idx:
                y[name_idx[name]] = 1
elif WEIGHT_TO_GET == 'nsfmarginal':
    # per grant: the (capped) grant amount scaled by the fraction of its
    # investigators that are known authors
    year_amounts = np.zeros(span_years,dtype=np.float32)
    y = np.zeros(nsf_paper_n,dtype=np.float32)

    for i,row in enumerate(df_nsf.itertuples()):
        authors, year, amount = row[3],row[4],row[5]

        # normalise every investigator to "first last" after alias resolution
        authors2 = []
        for a in authors:
            a = aliasdict.get(a,a)
            split_name = a.split(' ')
            first_last = split_name[0] +' ' + split_name[-1]
            authors2.append(first_last)
        authors = authors2
        # some infinite amounts exist! bad!
        if not np.isfinite(amount):
            continue
        # ignore small grants
        if amount <= 20000:
            continue
        # giant grants are misleading: grow only logarithmically above 1e7
        if amount >= 1e7:
            amount = 1e7 + np.log((amount-1e7)+1)*1e5
        total_authors = len(authors)
        needed_authors = 0.5 * total_authors # half of all authors
        found_authors = sum([(a.lower() in author_papers) for a in authors])
        if needed_authors > 0 and needed_authors <= found_authors:
            y[i] = amount* (found_authors/total_authors)
            #year_amounts[year-min_year] += amount
elif WEIGHT_TO_GET == 'nsftotal':
    # per grant: the summed cumulative funding of its investigators.
    # y is initialised here explicitly; this branch used to rely on the zero
    # vector left behind by the design-matrix cell, which breaks (or reuses
    # stale values) when the cells are re-run or reordered.
    y = np.zeros(nsf_paper_n,dtype=np.float32)
    for i,row in enumerate(df_nsf.itertuples()):
        authors, year, amount = row[3],row[4],row[5]
        authors2 = []
        for a in authors:
            a = aliasdict.get(a,a)
            split_name = a.split(' ')
            first_last = split_name[0] +' ' + split_name[-1]
            authors2.append(first_last)
        authors = authors2

        # some infinite amounts exist! bad!
        if not np.isfinite(amount):
            continue

        if amount < 10000: #50000
            continue
        total_authors = len(authors)
        needed_authors = 0.5 * total_authors # half of all authors
        found_authors = sum([(a.lower() in author_papers) for a in authors])
        if needed_authors > 0 and needed_authors <= found_authors:
            y[i] = sum([author_amounts[first_last.lower()][year] for first_last in authors])
            #year_amounts[year-min_year] += sum([author_amounts[first_last.lower()][year] for first_last in authors])

# diagonal 0/1 matrix that zeroes the design-matrix rows whose target is 0
skipped_data = scipy.sparse.diags((y != 0).astype(float))
y_orig = np.copy(y)

In [None]:
if 'nsf' in WEIGHT_TO_GET:
    # optional log transform of the grant targets, then two diagnostic histograms
    if USE_LOG: # do log
        y = np.copy(np.log(1+y_orig))
        #y[y == np.log(1)] = y[y != np.log(1)].mean()
    else:
        y = np.copy(y_orig)
        #y[y == 0] = y[y != 0].mean()
    from matplotlib.pyplot import figure,hist
    hist((y-y.mean())/y.std(),100)
    figure()
    _ = hist(y,100)
    #print(skipped_data.sum())
if 'salary' in WEIGHT_TO_GET:
    y = np.copy(y_orig)
    # keep a record only if the pay is inside (120000, 800000) and the person
    # has a fractional paper count >= 3 in at least 3 distinct venues
    skipped_data_vec =  np.ones_like(y)  *(y < 800000) * (y > 120000) * (y_paper >= 3.0) * (y_unique_confs_vec >= 3) #* (y_paper < 500) #* (y_orig > 50000)
    #print(skipped_data_vec.sum())
    # np.float was only an alias of the builtin float and has been removed
    # from NumPy; the builtin yields the same float64 array
    skipped_data_vec = skipped_data_vec.astype(float)
    skipped_data = scipy.sparse.diags(skipped_data_vec)
    # excluded records get the mean pay of the kept ones
    y[skipped_data_vec == 0] = y[skipped_data_vec != 0].mean()

    if USE_LOG: # do log
        y = np.copy(np.log(1+y))
    else:
        y = np.copy(y)
    from matplotlib.pyplot import figure,hist
    hist((y-y.mean())/y.std(),50)
    figure()

    _ = hist(y[abs(y-y.mean()) > 3000],50)
    #print(skipped_data_vec.sum(),)
    print('faculty used ',skipped_data_vec.sum())

In [None]:

# Fit one linear model per task; its coefficients (one per venue/year-block
# cell) are the venue weights.
if WEIGHT_TO_GET == 'faculty':
    from sklearn.linear_model import SGDClassifier
    # A previously used variant (elastic-net penalty with repeated
    # partial_fit passes) is disabled; this is the simple single fit.
    clf = SGDClassifier(
        loss='modified_huber',
        average=False,
        verbose=be_verbose,
        tol=1e-9,
        max_iter=SGD_ITER,
        alpha=L2REG,
        epsilon=0.01,
    )
    clf.fit(Xreg,y)
if 'nsf' in WEIGHT_TO_GET:
    from sklearn.linear_model import SGDRegressor

    Xreg = scipy.sparse.csr_matrix(Xreg)
    clf = SGDRegressor(
        loss='huber',
        tol=1e-9,
        max_iter=SGD_ITER,
        penalty='l2',
        verbose=be_verbose,
        alpha=L2REG,
        epsilon=0.01,
    )
    # rows with a zero target are zeroed out by skipped_data
    clf.fit(skipped_data@Xreg ,y)
if 'salary' in WEIGHT_TO_GET:
    from sklearn.linear_model import SGDRegressor

    Xreg = scipy.sparse.csr_matrix(Xreg)
    # ten times the iteration budget of the other tasks
    clf = SGDRegressor(
        loss='huber',
        tol=1e-9,
        max_iter=SGD_ITER*10,
        penalty='l2',
        verbose=be_verbose,
        alpha=L2REG,
    )
    clf.fit(skipped_data @Xreg ,y)
# flat coefficient vector of the fitted model
result_clf = np.squeeze(clf.coef_)


In [None]:
#SS = result_clf.std()

In [None]:
#TMP = result_clf.reshape((-1,years_per_conf))
#RESULTS = defaultdict(list)
#for k,v in LOOKUP.items():
#    true_conf = v[1][0]
#    size = v[1][1]
#    RESULTS[true_conf].append((size,TMP[conf_idx[v[0]]].mean()/SS))

In [None]:
#p_vals = []
#for k,v in RESULTS.items():
#    A = np.array(v)
#    res = scipy.stats.pearsonr(A[:,0],A[:,1])
#    p_vals.append(res[1])
#p_vals = np.array(p_vals)
#p_vals.mean(),np.median(p_vals),p_vals.min(),p_vals.max()

In [None]:
#RESULTS

# evaluate model

In [None]:
# Disabled sanity check (guarded by `if False`): counts how many hand-labelled
# "better vs. worse" venue pairs from pairwise_tiers.csv are ordered the wrong
# way round by the learned weights, and what fraction of positive examples the
# classifier scores above zero.
if False and WEIGHT_TO_GET == 'faculty':
    import csv
    with open('pairwise_tiers.csv','rt') as csvfile:
        filereader = csv.reader(csvfile)
        # Drop empty cells and strip a possible byte-order mark from each token.
        conf_pairs = [[t.strip('\ufeff') for t in row if len(t)!=0] for row in filereader]
    classifier_cost = 0
    conf_ord = np.argsort(result_clf)
    conf_rank_dict = {}
    num_elem = n_confs*years_per_conf
    # Walk the weights in ascending order; a venue's rank ends up being the
    # position of its highest-weighted year slot.
    for i in range(num_elem):
        idx = conf_ord[i]
        conf_name = all_venues[idx//years_per_conf]
        conf_rank_dict[conf_name] = i
    # Rows come in (better, worse) pairs: row 2k lists venues that should
    # outrank every venue listed in row 2k+1.
    pair_len = len(conf_pairs)//2
    for i in range(pair_len):
        better = conf_pairs[2*i]
        worse = conf_pairs[2*i+1]
        for b in better:
            for w in worse:
                if conf_rank_dict[w] < conf_rank_dict[b]:
                    classifier_cost += 1
                    print(w,conf_rank_dict[w],'\t',b,conf_rank_dict[b])

    all_choices = clf.decision_function(Xauth)
    # `np.bool` was removed from NumPy; the builtin `bool` is the replacement.
    frac_correct = (all_choices[y.astype(bool)] > 0).sum()
    print(classifier_cost,frac_correct/y.sum())

In [None]:
#result_clf = np.copy(orig_clf)
import matplotlib.pyplot as plt

# Keep an untouched copy of the fitted weights before any per-year rescaling.
orig_clf = np.copy(result_clf)

if NORM_YEARS:
    # One row per venue, one column per year slot.
    per_year = result_clf.reshape((-1,years_per_conf))
    year_std = per_year.std(0)

    # Relative spread of the weights in each year slot.
    plt.plot(year_std/year_std.sum(),label='std')
    plt.legend()

    # Standardise every year slot (zero mean, unit std), then flatten again.
    result_clf = ((per_year-per_year.mean(0))/year_std).reshape((-1))


In [None]:
# Persist the weights copied before any per-year normalisation.
with open(weight_file,'wb') as weights_out:
    pickle.dump(orig_clf,weights_out)
print('saved {}'.format(weight_file))

# When driven through the REGRESSION_TASK_IDX environment variable, stop here:
# this is a deliberate early exit, not an error.
if os.environ.get('REGRESSION_TASK_IDX') is not None:
    sys.exit(0)

In [None]:
# Hand-picked venues of interest; their weight slots are flagged in `ri_confs`.
# NOTE(review): ' Computer Aided Geometric Design' and 'ACCV ' carry stray
# whitespace and '3DV' appears twice -- presumably typos; verify against all_venues.
conf_choice = [
    'SIGGRAPH','NIPS','3DV','HRI','Comput. Graph. Forum','Shape Modeling International',
    'Symposium on Geometry Processing',' Computer Aided Geometric Design','ICLR',
    'AAAI','I. J. Robotics Res.','CVPR','International Journal of Computer Vision',
    'Robotics: Science and Systems','ICRA','WACV','ICML','AISTATS','CoRR','SIGGRAPH Asia',
    'ECCV','ICCV','ISER','Humanoids','3DV','IROS','CoRL','Canadian Conference on AI',
    'ACCV ','Graphics Interface','CRV','BMVC',
]
ri_confs = np.zeros(n_confs*years_per_conf)
conf_ord = np.argsort(result_clf)
#print(clf.intercept_)
ms = result_clf.mean()
ss = result_clf.std()

# Visit weight slots from highest to lowest score.
for i in range(n_confs*years_per_conf):
    idx = conf_ord[-(i+1)]
    conf_name = all_venues[idx//years_per_conf]
    if conf_name not in conf_choice:
        continue
    ri_confs[idx] = 1
    # Only print the year slot `year_ind-2` of each selected venue.
    if (idx%years_per_conf) != (year_ind-2):
        continue
    print_name = conf_name + '_' + year_span_printable[idx%years_per_conf]
    print('{:40s}\t{:.1f}'.format(print_name[:35],(result_clf[idx]-ms)/ss))


In [None]:
# Print the `top_k` highest-weighted (venue, year-slot) entries as z-scores.
top_k = 250
for rank in range(1,top_k+1):
    idx = conf_ord[-rank]
    venue_pos,year_pos = idx//years_per_conf,idx%years_per_conf
    print_name = all_venues[venue_pos] + '_' + year_span_printable[year_pos]
    z_score = (result_clf[idx]-ms)/ss
    print('{:60s}\t{:.1f}'.format(print_name[:55],z_score))


In [None]:
# List up to 20000 of the highest-weighted entries, restricted to the year slot
# `year_ind-2`, as z-scores; also flag slots of the hand-picked venues in `ri_confs`.
# The count is clamped to the number of weights so that a smaller model does not
# run off the end of `conf_ord` with an IndexError.
for i in range(min(20000,len(conf_ord))):
    idx = conf_ord[-(i+1)]
    conf_name = all_venues[idx//years_per_conf]
    conf_score = result_clf[idx]
    if conf_name in conf_choice:
        ri_confs[idx] = 1
    if (idx%years_per_conf)==(year_ind-2):
        print_name =conf_name + '_' + year_span_printable[idx%years_per_conf]
        print('{:100s}\t{:.1f}'.format(print_name,(conf_score-ms)/ss))


In [None]:
import matplotlib.pyplot as plt
# Newer matplotlib releases renamed the bundled seaborn styles with a
# 'seaborn-v0_8-' prefix; try the historical name first and fall back.
try:
    plt.style.use('seaborn-whitegrid')
except OSError:
    plt.style.use('seaborn-v0_8-whitegrid')
conf_choice2 = ['SIGGRAPH','AAAI','NIPS','CVPR','ICRA','ICML','ICCV','ECCV',
               'International Journal of Computer Vision','Robotics: Science and Systems']
# One figure per group of venues.
conf_choices = [conf_choice2, 
                ['STOC','FOCS','SODA','EC','WINE'],
                ['UAI','AAAI','IJCAI','ICML','NIPS'],
                ['ICCV','ECCV','CVPR','International Journal of Computer Vision','3DV','WACV','IEEE Trans. Pattern Anal. Mach. Intell.'],
                ['ICRA','Robotics: Science and Systems','IROS','CoRL','HRI','ISER','FSR'],
                ['SIGGRAPH','SIGGRAPH Asia','ACM Trans. Graph.','Graphics Interface']
               ]
#conf_choices = [['Robotics: Science and Systems','IROS','ICRA','CoRL','WAFR','HRI','ISER']]
for conf_choice2 in conf_choices:
    # Pair every venue with the weight of its last year slot; this drives the
    # plotting (and therefore legend) order, highest first.
    conf_choice3 = []
    vs = result_clf.std()
    for conf in conf_choice2:
        idx = conf_idx[conf]
        #s = max(result_clf[years_per_conf*idx:years_per_conf*(idx+1)])
        s = result_clf[years_per_conf*(idx+1)-1]

        conf_choice3.append((s,conf))
    # A single figure per group (the extra empty `plt.figure()` calls that
    # only produced blank figures have been removed).
    plt.figure(figsize=(12,8))
    for s,conf in sorted(conf_choice3,reverse=True):
        idx = conf_idx[conf]
        # Weight per calendar year, in units of the overall weight std.
        weights = [result_clf[years_per_conf*idx + yr]/vs for yr in offset_years]
        _ = plt.plot(np.arange(min_year,max_year+1),weights,label=conf,lw=5)
    plt.grid(True)
    plt.xlabel('year')
    plt.ylabel('value')
    #plt.ylim(-5,20)
    plt.legend()
    #plt.show()


In [None]:
 # Scratch note: a bare tuple of matplotlib style names; nothing is assigned.
 'seaborn-white','seaborn','ggplot', 'seaborn-colorblind', 'seaborn-muted','seaborn-whitegrid'

In [None]:
# Style gallery, currently switched off by iterating over an empty list; swap in
# `plt.style.available` to redraw the last venue group once per style.
for style in []:#plt.style.available:
    plt.style.use(style)
    plt.figure(figsize=(12,8))
    ranked_confs = sorted(conf_choice3,reverse=True)
    for s,conf in ranked_confs:
        first_slot = years_per_conf*conf_idx[conf]
        weights = [result_clf[first_slot + yr]/vs for yr in offset_years]
        _ = plt.plot(np.arange(min_year,max_year+1),weights,label=conf,lw=5)
    plt.grid(True)
    plt.xlabel('year')
    plt.ylabel('value')
    plt.title(style)
    plt.legend()
    


In [None]:
# Force a rebuild of the author-by-venue matrix below.
Xauth = None
if Xauth is None:
    # count_vecs caches one per-author credit vector for each author count n.
    count_vecs = {}
    # paper_vecs holds, per paper, a list of (author index, weight column, credit).
    paper_vecs = []
    for paper in all_papers:
        tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
        n = len(authors)
        # Column of the (venue, year-block) weight this paper contributes to.
        j = years_per_conf*conf_idx[venue] + (year-min_year)//YEAR_BLOCKS

        if n not in count_vecs:
            # Every author gets full credit (1.0); other schemes were tried and disabled.
            count_vecs[n] = np.ones(n) #1/(np.arange(n)+1)
        author_scores = count_vecs[n]
        # Append for EVERY paper. Previously this sat inside the `else` branch, so
        # the first paper seen for each distinct author count was silently dropped.
        paper_vecs.append([(name_idx[a],j,v) for a,v in zip(authors,author_scores)])

In [None]:
#Xauth = None

In [None]:
# (Re)build the sparse author x (venue, year-slot) matrix when it is missing or
# its column count no longer matches the current weight layout.
if Xauth is None or Xauth.shape[1] != years_per_conf*n_confs:
    Xauth = scipy.sparse.dok_matrix((n_auths,years_per_conf*n_confs))
    # Accumulate the credits per (author, column) cell in a plain dict first;
    # an unreachable `if False:` variant that re-walked all_papers was removed.
    xdict = {}
    for paper_vec in paper_vecs:
        for i,j,v in paper_vec:
            xdict[(i,j)] = v + xdict.get((i,j),0)

    # `my_update` is the dok_matrix bulk-update alias installed at the top of the file.
    Xauth.my_update(xdict)

    # Switch to CSR for the matrix-vector products that follow.
    Xauth = scipy.sparse.csr_matrix(Xauth)

In [None]:
#result_clf = np.copy(orig_clf)

In [None]:

# Per-author aggregates derived from the author matrix and the venue weights.
years_working = (1+auth_years[:,1]-auth_years[:,0])
pub_num = Xauth.sum(1)

# Total score over all venues, and the same divided by `years_working`.
total_scores = Xauth.dot(result_clf)
value_scores = (total_scores)/years_working

# The same two quantities restricted to the columns flagged in `ri_confs`.
ri_filter_mat = scipy.sparse.diags(ri_confs)
ri_total_scores = Xauth.dot(ri_filter_mat).dot(result_clf)
ri_value_scores = ri_total_scores/years_working

# Means / standard deviations used to print z-scores later on.
rm,rs = ri_total_scores.mean(),ri_total_scores.std()
tm,ts = total_scores.mean(),total_scores.std()
vm,vs = value_scores.mean(),value_scores.std()

In [None]:
def _print_author_table(names):
    """Print a header plus one row per author, ordered by raw total score (highest first).

    Columns: z-scored total, z-scored per-year rate, z-scored score over the
    flagged venues, years worked and publication count.
    """
    print('{:20s}\t{:4s}\t{:4s}\t{:4s}\t{}\t{}'.format('name','total','rate','ri','years','pubs'))
    ranked = sorted([(total_scores[name_idx[n]],n) for n in names],reverse=True)
    for _,name in ranked:
        ni = name_idx[name]
        print('{:20s}\t{:.2f}\t{:.2f}\t{:.2f}\t{:.0f}\t{:.1f}'.format(
            name,
            (total_scores[ni]-tm)/ts,
            (value_scores[ni]-vm)/vs,
            (ri_total_scores[ni]-rm)/rs,
            years_working[ni],
            pub_num[ni,0]))


prev_cand = [
    'Pulkit Agrawal',
    'Joydeep Biswas',
    'Katherine L. Bouman',
    'David Braun',
    'Jia Deng',
    'Naomi T. Fitter',
    'David F. Fouhey',
    'Saurabh Gupta',
    'Judy Hoffman',
    'Hanbyul Joo',
    'Honglak Lee',
    'Changliu Liu',
    'Petter Nilsson',
    "Matthew O'Toole",
    'Alessandro Roncone',
    'Alanson P. Sample',
    'Manolis Savva',
    'Adriana Schulz',
    'Amy Tabb',
    'Fatma Zeynep Temel',
    'Long Wang',
    'Cathy Wu',
    'Ling-Qi Yan',
]
_print_author_table(prev_cand)
print('')
curious_names = [
    'Xiaolong Wang 0004','Judy Hoffman','Paris Siminelakis','Roie Levin','Leonid Keselman',
    'Nicholas Rhinehart','Vincent Sitzmann','Siddharth Ancha','Xingyu Lin',
    'Humphrey Hu','Avideh Zakhor',
    'David F. Fouhey','Chelsea Finn','Nathan Michael',
    'Lerrel Pinto','Wen Sun 0002','Samuel Clarke','Ge Lv',
    'Justin Johnson',
    'Amir Roshan Zamir','Dominik Peters','Jonathan T. Barron','Dorsa Sadigh','Derek Hoiem','Vaggos Chatziafratis',
    'Brian Okorn','David Held',
]
_print_author_table(curious_names)

In [None]:
# Rank the faculty of one institution by z-scored total venue score.
uni_faculty = faculty_affil[faculty_affil.affiliation == 'Carnegie Mellon University'] #Carnegie Mellon University
uni_names = list(np.array(uni_faculty.name))
# Resolve aliases first and de-duplicate; names missing from name_idx are skipped.
cmu_scores = [
    ((total_scores[name_idx[name]]-tm)/ts,name)
    for name in {aliasdict.get(n, n) for n in uni_names}
    if name in name_idx
]
for s,p in sorted(cmu_scores,reverse=True):
    print('{:30s}\t\t{:.3f}'.format(p,s))


In [None]:
from collections import Counter,defaultdict


def di():
    """Default factory for `author_by_year`: an empty year -> value mapping."""
    return defaultdict(float)


# author -> year -> summed value, where every paper contributes its venue/year
# weight split equally among its authors.
author_by_year = defaultdict(di)
for paper in all_papers:
    tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    n = len(authors)
    for a in authors:
        weight_col = years_per_conf*conf_idx[venue] + offset_years[year-min_year]
        author_by_year[a][year] += result_clf[weight_col]/n

In [None]:
# `scipy.ndimage.filters` is a deprecated namespace; import from `scipy.ndimage`.
from scipy.ndimage import gaussian_filter1d
plt.figure(figsize=(8,8))
example_names = ['Takeo Kanade','Martial Hebert','Christopher G. Atkeson','Howie Choset','Deva Ramanan','Jessica K. Hodgins'] #,'Pieter Abbeel'
year_axis = np.arange(min_year,max_year+1)
for example_name in example_names:
    # Yearly value for this author, indexed by (year - min_year).
    example_value = np.zeros(max_year+1-min_year)
    # The previously computed start/end/span values were unused and made the cell
    # crash (min() of an empty list) for an author without any recorded papers.
    for y,v in author_by_year[example_name].items():
        example_value[y-min_year] += v

    # The final year is left out of the plot; the curve is Gaussian-smoothed with sigma=3.
    plt.plot(year_axis[:-1],gaussian_filter1d(example_value[:-1], sigma=3),label=example_name,lw=3)
    #plt.plot(gaussian_filter1d(example_value[:span], sigma=2),label=example_name )

#plt.plot((val_by_year/v_count),label='average author')
plt.ylabel('annual value (3yr avg)')
#plt.xlabel('year since first publication')
plt.xlabel('working year')

plt.legend()
plt.grid()

In [None]:

# Load a table of per-author bibliometrics, overwrite its 'venue_score' column
# with the z-scored totals from this model, and correlate the metrics.
df_corr = pd.read_csv('other_ranks/correlation_cleaned.csv')
df_corr = df_corr.drop(columns=[c for c in df_corr.columns if 'Unnamed' in c])
df_corr = df_corr.drop(columns=['pms','n_papers'])
df_corr = df_corr.rename(columns={'totals': 'venue_score', 'csrp': 'csr_pubs','csrpn': 'csr_adj','gcite': 'influence'})
df_corr = df_corr[['name','papers', 'citations', 'h-index',
       'i10','csr_pubs', 'csr_adj','venue_score','influence']]
# `axis` must be passed by keyword on current pandas.
df_corr = df_corr.dropna(axis='index')
df_corr.index = df_corr.name

ts = total_scores.std()
tm = total_scores.mean()
for name in df_corr.name:
    if name in name_idx:
        idx = name_idx[name]
        df_corr.loc[name,'venue_score'] = (total_scores[idx]-tm)/ts

# Correlate numeric columns only: the frame still carries the string 'name'
# column, which newer pandas refuses to coerce inside DataFrame.corr.
df_corr_numeric = df_corr.select_dtypes(include=['number','bool'])
df_corr_spearman = df_corr_numeric.corr('spearman')
print(df_corr_spearman.loc['influence','venue_score'],df_corr_numeric.corr('kendall').loc['influence','venue_score'],df_corr_spearman.loc['h-index','venue_score'])
#if clfn == clfs_test.shape[-1]:
df_corr_spearman

In [None]:
# Write the copied (pre-normalisation) weights to `weight_file` once more.
with open(weight_file,'wb') as weights_out:
    pickle.dump(orig_clf,weights_out)
print('saved {}'.format(weight_file))

In [None]:
# Overall weight statistics, plus a venue -> paper-count histogram for one author.
clf_mean = result_clf.mean()
clf_std = result_clf.std()
one_auth_confs = {}
for paper in all_papers:
    tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    if 'Christopher G. Atkeson' not in authors:
        continue
    one_auth_confs[venue] = one_auth_confs.get(venue,0) + 1

In [None]:
#for v,k in sorted([(v,k) for k,v in one_auth_confs.items()],reverse=True):
#    print('{:40s}\t{}\t{:.1f}'.format(k,v,(result_clf[conf_idx[k]*years_per_conf+4]-clf_mean)/clf_std))

In [None]:
# Histogram of the `pages` field over all papers (value -> number of papers).
page_counts = {}
for paper in all_papers:
    tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    seen_so_far = page_counts.get(pages,0)
    page_counts[pages] = seen_so_far + 1

In [None]:
#sorted([(v,k) for k,v in page_counts.items()],reverse=True)

In [None]:
# Load the faculty table that the following cells filter and extend with venue scores.
df_merged = pd.read_csv('other_ranks/faculty_affil_scholar.csv')

In [None]:
# Accept fuzzy name matches above the threshold, keep only rows with a DBLP
# name, and attach each person's raw total venue score.
fuzzythresh=0.9 #0.9
good_fuzzy = df_merged.fuzzyscore > fuzzythresh
df_merged.loc[good_fuzzy,'dblpname'] = df_merged.loc[good_fuzzy,'fuzzyname']
df_merged.loc[good_fuzzy,'dblpexists'] = 1
df_merged = df_merged[df_merged.dblpexists == 1]
# Placeholder column of ones; overwritten below for every name found in name_idx.
df_merged['venue_score'] = np.ones_like(df_merged.dblpexists)
ts = total_scores.std()
tm = total_scores.mean()
df_merged = df_merged.set_index(df_merged.dblpname)
# seen_map records which names already received a score.
seen_map = {}
for name in df_merged.index:
    if name not in name_idx:
        continue
    idx = name_idx[name]
    df_merged.loc[name,'venue_score'] = total_scores[idx]
    seen_map[name] = 1


In [None]:
# Drop bookkeeping / identity columns that are not needed for the analysis below.
df_merged = df_merged.drop(columns=[
    'Unnamed: 0',
    'First Name',
    'Last Name',
    'Sholar link',
    'Rank (Full, Associate, Assistant, Other)',
    'Full Name',
    'University_y',
    'University_x',
    'Unnamed: 11',
    "ID",
    'fuzzyname',
    'dblpexists',
    'fuzzyscore',
    'UniversityID',
])

In [None]:
# Collect records for faculty that are known to name_idx but were not already
# scored via df_merged (tracked in seen_map).
new_datas = []
for row in faculty_affil.itertuples():
    # row[1] / row[2] are the first two columns -- presumably name and
    # affiliation; TODO confirm against faculty_affil's column order.
    name = row[1]
    if name in name_idx and name not in seen_map:
        seen_map[name] = 1
        new_data = {}
        new_data['dblpname'] = name
        #new_data['index'] = row[1]
        new_data['school'] = row[2]
        # Look up THIS person's score. The previous code indexed with a stale
        # `idx` left over from an earlier loop, giving everyone the same score.
        new_data['venue_score'] = total_scores[name_idx[name]]
        new_datas.append(new_data)
        #df_merged = df_merged.append([row[1],np.nan,np.nan,np.nan,row[2],row[1],(total_scores[idx]-tm)/ts])

In [None]:
# Turn the extra records into a frame indexed by DBLP name. The concat into
# df_merged is currently disabled, so both prints show the same shape.
print(df_merged.shape)
df_csr_to_add = pd.DataFrame(new_datas).set_index('dblpname')
#df_merged = pd.concat([df_merged,df_csr_to_add])
print(df_merged.shape)
#print(faculty_affil.shape)
#print(faculty_affil.shape)

In [None]:
# Keep only the rows that have a non-missing 't10-index' value.
df_t10 = df_merged[df_merged['t10-index'].notna()]
#df_t10.drop()

In [None]:
# Sum every column per school and rank schools by summed venue score and by
# summed t10-index (both descending). `axis` / `ascending` are passed by
# keyword because current pandas no longer accepts them positionally.
school_vals = df_t10.groupby('school').aggregate('sum').sort_values('venue_score',axis=0,ascending=False)
t10_schools = school_vals.sort_values('t10-index',axis=0,ascending=False)
# Correlate numeric columns only; newer pandas raises on non-numeric columns.
school_vals.select_dtypes(include=['number','bool']).corr('spearman')

In [None]:
 
# Per-person Spearman correlations, restricted to numeric columns because newer
# pandas raises when DataFrame.corr meets string columns such as 'school'.
df_t10.select_dtypes(include=['number','bool']).corr('spearman')


In [None]:
#df_hindex = df_merged[df_merged['h-index'].notna()]
#df_hindex.corr('spearman')


In [None]:
# Inspect the per-school sums, ordered by summed venue score.
school_vals

In [None]:
def _int_or_nan(x):
    """Return int(x) for an all-digit string, otherwise NaN."""
    return int(x) if x.isnumeric() else np.nan


# External university rankings, one CSV per source.
st2 = pd.read_csv('other_ranks/uni_rank_st.csv')
times = pd.read_csv('other_ranks/uni_rank_times.csv')
qt2 = pd.read_csv('other_ranks/uni_rank_qt.csv')
pr2 = pd.read_csv('other_ranks/uni_rank_pr.csv')
cm2 = pd.read_csv('other_ranks/uni_rank_cs.csv')
sr2 = pd.read_csv('other_ranks/uni_rank_sr.csv')
srf2 = pd.read_csv('other_ranks/uni_rank_mergedscholar.csv')
bp2 = pd.read_csv('other_ranks/uni_rank_bp.csv')
usn2 = pd.read_csv('other_ranks/uni_rank_usn.csv')
df_csr = pd.read_csv('other_ranks/ranks.csv')

# Non-numeric USN2010 entries become NaN so the column can be sorted and filtered.
pr2['USN2010'] = pr2['USN2010'].map(_int_or_nan)


In [None]:
# One entry per ranking source, in the same order as `dataset_names` below.
# Each entry is a list of (key, value) tuples built from positional itertuples()
# fields (row[0] is the frame index) -- presumably (school name, rank); the
# column positions depend on each CSV's layout, TODO confirm.
datasets = [
    [(row[2],row[1]) for row in st2.itertuples()],
    [(row[2],row[1]) for row in times.itertuples()],
    [(row[2],row[1]) for row in qt2.itertuples()],
    [(row[6],row[1]) for row in pr2.itertuples()],
    [(row[2],row[1]) for row in cm2.itertuples()],
    [(row[2],row[1]) for row in sr2.itertuples()],
    [(row[-1],row[2]) for row in srf2.itertuples()],

    # For the two frames computed in this notebook the value is the 1-based row position.
    [(row[0],idx+1) for idx,row in enumerate(t10_schools.itertuples())],
    [(row[2],row[1]) for row in df_csr.itertuples()],
    [(row[0],idx+1) for idx,row in enumerate(school_vals.itertuples())],
    [(row[2],row[1]) for row in bp2.itertuples()],
    [(row[6],row[4]) for row in pr2.sort_values('NRC95',ascending=True).itertuples() ],
    # Rows whose row[3] value is not finite are left out.
    [(row[6],row[3]) for row in pr2.sort_values('USN2010',ascending=True).itertuples() if np.isfinite(row[3]) ],
    [(row[2],row[1]) for row in usn2.itertuples()]
]
# Display names, index-aligned with `datasets`.
dataset_names = ['Shanghai','Times','QS','Prestige','CSMetrics',
                 'ScholarRank','ScholarRankFull','t10Sum','CSRankings','Mine','BestPaper','NRC95',"USN10",'USN18']
n_datasets = len(datasets)


In [None]:
import scipy.stats as stats

# Pairwise Spearman correlation between all ranking sources, computed over the
# names the two sources have in common; count_matrix holds the overlap sizes.
corr_matrix = np.zeros((n_datasets,n_datasets))
count_matrix = np.zeros((n_datasets,n_datasets))
name_datasets = [ [v[0] for v in d] for d in datasets ]
all_names = sorted(list(set(sum(name_datasets,[]))))
# Names present in all but at most four sources (kept for inspection; the
# correlation loop below does not restrict itself to this subset).
all_vec = [sum([name in d for d in name_datasets])>=(len(datasets)-4) for name in all_names]
subset_names = [name for name,vec in zip(all_names,all_vec) if vec]
# Sets make the membership tests inside the double loop O(1).
name_sets = [set(names) for names in name_datasets]
for i in range(n_datasets):
    inames = name_datasets[i]
    for j in range(i,n_datasets):
        jnames = name_datasets[j]

        # Boolean masks selecting the entries whose name also occurs in the other source.
        exist_1 = [(ni in name_sets[j]) for ni in inames]
        exist_2 = [(nj in name_sets[i]) for nj in jnames]

        d1 = np.array(datasets[i])[exist_1]
        d2 = np.array(datasets[j])[exist_2]
        # `np.float` was removed from NumPy; the builtin `float` is the replacement.
        v1 = d1[:,1].astype(float)
        # Align source j to source i's name order (first match per name).
        v2 = np.array([d2[np.where(d2[:,0] == name)[0][0],1] for name in d1[:,0]]).astype(float)
        c = stats.spearmanr(v1,v2)[0]
        corr_matrix[i][j] = c
        corr_matrix[j][i] = c
        count_matrix[i][j] = len(v1)
        count_matrix[j][i] = len(v2)
        #print(v1,v2)

In [None]:
# Print the correlation matrix, then the sources ranked by their mean
# correlation with all sources and by their correlation with the last source.
np.set_printoptions(precision=2)
print(corr_matrix)

for heading,source_scores in (('mean best',corr_matrix.mean(1)),('\n usnews best',corr_matrix[-1])):
    print(heading)
    for s,n in sorted(zip(source_scores,dataset_names),reverse=True):
        print('{:30s}\t{:.3f}'.format(n,s))
print('\n names')

for n in dataset_names:
    print(n)

In [None]:
# Inspect how many names each pair of ranking sources has in common.
count_matrix

In [None]:
# Compare venue weights over time with and without per-year standardisation.
weights_by_year = result_clf.reshape((-1,years_per_conf))
by_year_mean = weights_by_year.mean(0)
by_year_std = weights_by_year.std(0)
print(by_year_std)
plt.figure(figsize=(24,6))

# Venue group to plot; alternative groups are kept commented out for quick switching.
#confs_of_interest = ['SIGGRAPH','AAAI','NIPS','CVPR','ICRA','ICML','ICCV','ECCV', 'I. J. Robotics Res.',
#                'WACV','CHI','ACC','HRI',  'AAMAS','IJCAI',
#               'ISER','Robotics: Science and Systems','IROS','CoRL','ICLR','3DV']
#confs_of_interest = ['CVPR','ICRA',"ICCV",'CoRL','Robotics: Science and Systems','ECCV','WACV','IROS']
confs_of_interest = ['SIGIR','JCDL','CIKM','KDD','WWW','SIGMOD Conference','VLDB']
#confs_of_interest = ['AAAI',"NIPS",'ICML','IJCAI','UAI','AISTATS','COLT']
#confs_of_interest = ['ICRA',"IROS",'ISER','CoRL','Robotics: Science and Systems',"WAFR"]
#confs_of_interest = ['SODA','STOC','FOCS','WINE','EC','COLT','Theory of Computing']
#confs_of_interest =    ['CHI','ACM Trans. Comput.-Hum. Interact.','CSCW','UbiComp','UIST','ICWSM']

# Left panel: weights standardised within each year slot.
plt.subplot(1,2,1)
for conf in confs_of_interest:
    idx = conf_idx[conf]
    first_slot = years_per_conf*idx
    weights = [(result_clf[first_slot + yr]-by_year_mean[yr])/by_year_std[yr] for yr in offset_years]
    plt.plot(np.arange(min_year,max_year+1),weights,label=conf,lw=4)
plt.grid(True)
plt.legend()
plt.title('adjusted')

# Right panel: raw weights.
plt.subplot(1,2,2)
plt.title('unadjusted')
for conf in confs_of_interest:
    idx = conf_idx[conf]
    first_slot = years_per_conf*idx
    weights = [result_clf[first_slot + yr] for yr in offset_years]
    plt.plot(np.arange(min_year,max_year+1),weights,label=conf,lw=4)
plt.legend()
plt.grid(True)


In [None]:
# Headline numbers. Correlations are taken over numeric columns only, since
# newer pandas raises when DataFrame.corr meets string columns.
df_corr_spearman = df_corr.select_dtypes(include=['number','bool']).corr('spearman')
print(df_corr_spearman.loc['influence','venue_score'])
print(df_corr_spearman.loc['h-index','venue_score'])
# Last ranking source vs. the source at index 8 of `dataset_names`.
print(corr_matrix[-1][8])
print(df_t10.select_dtypes(include=['number','bool']).corr('spearman').loc['venue_score','t10-index'])

In [None]:
# `scipy.ndimage.filters` is a deprecated namespace; import from `scipy.ndimage`.
from scipy.ndimage import gaussian_filter1d

cmu_facutly = pd.read_csv('other_ranks/cmu_faculty.csv')
subdept = {}
subdept_count = {}

plt.figure(figsize=(8,8))
yearly_subdept = {}

seen = {}

# Rows without a department are assigned to 'CSD'.
cmu_facutly.dept = cmu_facutly.dept.fillna('CSD')

# Pass 1: total score and head count per department; each person is counted
# once, under the first department they are encountered in.
for sd in cmu_facutly.dept.unique():
    for row in cmu_facutly[cmu_facutly.dept == sd].itertuples():
        name = aliasdict.get(row[1],row[1])
        if name in name_idx and name not in seen:
            seen[name] = 1
            subdept[row[2]] = total_scores[name_idx[name]] + subdept.get(row[2],0)
            subdept_count[row[2]] = 1 + subdept_count.get(row[2],0)

seen = {}

# Pass 2: one smoothed yearly-value curve per department, highest total first.
for value, dept in sorted([(v,k) for k,v in subdept.items()],reverse=True):
    print(dept,value)
    example_value = np.zeros(max_year+1-min_year)
    count_value = np.zeros(max_year+1-min_year)

    for row in cmu_facutly[cmu_facutly.dept == dept].itertuples():
        name = aliasdict.get(row[1],row[1])
        if name in name_idx and name not in seen:
            seen[name] = 1
            years = author_by_year[name]
            # Only accumulate for people with at least one positive publication year.
            if any(y > 0 for y in years.keys()):
                for y,v in years.items():
                    example_value[y-min_year] += v
                    count_value[y-min_year] += 1

    # The last two years are left out of the plot; the legend shows the total in thousands.
    plt.plot(np.arange(min_year,max_year+1)[:-2],gaussian_filter1d((example_value)[:-2], sigma=2),label='{} ({:.1f})'.format(dept,subdept[dept]/1000),lw=3)
    #plt.plot(gaussian_filter1d(example_value[:span], sigma=2),label=example_name )

#plt.plot((val_by_year/v_count),label='average author')
plt.ylabel('annual value (2yr sigma smoothing)')
#plt.xlabel('year since first publication')
plt.xlabel('working year')

plt.legend()
plt.grid()
print(sum(subdept.values()))
print(total_scores[name_idx['Takeo Kanade']])


In [None]:
# Inspect the number of counted faculty per department.
subdept_count

In [None]:
# Read the raw venue table used as a fallback when the cached CSV is missing.
with open('other_ranks/msar.json') as msar_file:
    msar = json.load(msar_file)

In [None]:
# Load the cached venue table; if that fails for any reason, rebuild it from
# `msar` by attaching a DBLP venue name ('dblp_name') to every row and cache
# the result as CSV.
try:
    df_msar = pd.read_csv('other_ranks/traditional_conf_scores.csv')
except:
    from fuzzywuzzy import process, fuzz
    df_msar = pd.DataFrame(msar)
    dblp_conf_name = []
    # Normalised form of every known venue name, used for fuzzy comparison.
    # NOTE(review): `fuzz._process_and_sort` is a private fuzzywuzzy helper.
    matchable_names = [fuzz._process_and_sort(n,False) for n in all_venues]
    for row in df_msar.itertuples():
        # row[2] and row[-1] are two name fields of the record -- presumably a
        # short and a long venue name; TODO confirm against msar.json.
        #print(row[2],row[-1])
        try:
            # Exact lookups first: the last field, the row[2] field, then the
            # parts of a 'A/B/C' or 'A(B)' style name. The bare `raise`
            # statements just jump to the fuzzy-matching handler below.
            if row[-1] in conf_idx:
                dblp_conf_name.append(row[-1])
            elif row[2] in conf_idx:
                dblp_conf_name.append(row[2])
            elif len(row[2].split('/')) > 1:
                found = False
                for subname in row[2].split('/'):
                    if found == False and subname in conf_idx:
                        dblp_conf_name.append(subname)
                        found = True
                if found == False:
                    raise
            elif len(row[2].split('(')) > 1:
                substr = row[2].split('(')
                found = False
                # Try the text before '(' and the text inside the parentheses.
                for subname in [substr[0],substr[1][:-1]]:
                    if found == False and subname in conf_idx:
                        dblp_conf_name.append(subname)
                        found = True
                if found == False:
                    raise
            else:
                raise
        except:
            # Fuzzy fallback: find the most similar known venue for each of the
            # two name fields and accept a match only above a 0.96 ratio.
            bestshort,bestlong = None,None
            if row[2] != None:
                matcher = fuzz.SequenceMatcher(None, fuzz._process_and_sort(row[2],False))
                n3s = []
                for n2 in matchable_names:
                    matcher.set_seq2(n2)
                    n3s.append(matcher.ratio())
                v=np.argmax(n3s)
                bestshort = (all_venues[v],n3s[v])
                #print(bestshort[1:],end='\t')
            if row[-1] != None:
                matcher = fuzz.SequenceMatcher(None, fuzz._process_and_sort(row[-1],False))
                n3s = []
                for n2 in matchable_names:
                    matcher.set_seq2(n2)
                    n3s.append(matcher.ratio())
                v=np.argmax(n3s)
                bestlong = (all_venues[v],n3s[v])
                #print(bestlong[1:],end='\t')
            # Prefer the match found via the last field over the one via row[2].
            if bestlong and bestlong[-1] > 0.96:
                dblp_conf_name.append(bestlong[0])
                #print(bestlong,row)
            elif bestshort and bestshort[-1] > 0.96:
                dblp_conf_name.append(bestshort[0])
                #print(bestshort,row)
            else:
                # Sentinel for rows that could not be matched to any venue.
                #print(bestlong,bestshort,row[2],row[-1])
                dblp_conf_name.append('NotAConf')
    df_msar['dblp_name'] = dblp_conf_name
    df_msar.to_csv('other_ranks/traditional_conf_scores.csv')
    #ILPS/ISLP/NACLP/SLP
    #DISC(WDAG)


In [None]:
# Size of the full table vs. the rows left unmatched (sentinel 'NotAConf').
df_msar.shape,df_msar[df_msar.dblp_name == 'NotAConf'].shape


In [None]:
# For every matched venue take the largest weight over the year slots of
# 1984..2013; rows whose name is unknown to conf_idx get -1 and are printed.
df_msar_filt = df_msar[df_msar.dblp_name != 'NotAConf'].copy()#.sort_values('H',ascending=False)
window_slots = offset_years[1984-min_year:2014-min_year]
scores = []
for row in df_msar_filt.itertuples():
    conf = row[-1]
    if conf not in conf_idx: # some naming issue
        scores.append(-1)
        print(conf)
        continue
    idx = conf_idx[conf]
    weights = [result_clf[years_per_conf*idx + yr] for yr in window_slots]
    #scores.append(result_clf[years_per_conf*idx + offset_years[2014-min_year]])
    scores.append(np.max(np.array(weights)))

    #print(scores[-1],weights)

In [None]:
# Attach the scores, then keep only the first row for each DBLP name and the
# first row for each distinct score value.
df_msar_filt['venue_scores'] = scores
df_msar_filt = df_msar_filt.loc[~df_msar_filt['dblp_name'].duplicated()].copy()
df_msar_filt = df_msar_filt.loc[~df_msar_filt['venue_scores'].duplicated()].copy()
#df_msar_filt['h-approx'] = 0.54*np.sqrt(df_msar_filt.citations)

df_msar_filt.shape

In [None]:
# Spearman correlations among venues whose H value exceeds the threshold.
# Restricted to numeric columns: the frame carries string columns (e.g.
# 'dblp_name', 'category') that newer pandas refuses to coerce in DataFrame.corr.
thresh = 10
df_msar_high_h = df_msar_filt[df_msar_filt.H > thresh]
print(df_msar_high_h.shape)
df_msar_high_h.select_dtypes(include=['number','bool']).corr('spearman')


In [None]:
# Inspect the 'Computer Vision' venues, highest venue score first.
df_msar_filt[df_msar_filt.category == 'Computer Vision'].sort_values('venue_scores',ascending=False)

In [None]:
# Count rows whose score equals this exact float literal (exact equality, no
# tolerance). NOTE(review): presumably a value copied from one particular run;
# it will not match after refitting -- confirm whether this cell is still needed.
(df_msar_filt.venue_scores == -0.029198231466385834).sum()

In [None]:
# Inspect the distinct category labels in the venue table.
df_msar.category.unique()