## Ideas for statistics


#### 3 types of authorship model scores (full, 1/n, 1/position)
* Mean Scores across years (x)
* Max Scores across years & Year of Max ( )
* Average value in last 7 years  ( )
* Most productive co-author (x)

#### Remaining Analysis from 1/position
* "RI" Conf sub-score (X)
* "Top Graphics","Top Vision", "Top Robotics", "Top ML", "Other" sub-scores (x)
* Average number of authors (x)
* Average Author Position (x)
* Average & Median "quality" of collab ( )
* Current Affiliation (x)
* Total number of collab ( )
* Top 3 collabs (x)
* Top 3 conferences from generated value ( )
* Career length (x)
* Number of collabs w/ more than 4 papers

#### Advanced Stats from 1/n
* 5 unlabeled variants of plus-minus (w/ intercept) (x)
* 5 unlabeled variants of plus-minus (w/o intercept) (x)

#### NSF Data
* Total number of grants (x)
* Total grant money (x)
* fractional grant money (x)
* grant money of collabs ( )

In [None]:
import os
import sys
import fnmatch
import numpy as np
import pandas as pd
import json
import gzip
import pickle
import csv
import scipy.sparse
# Placeholder so the name exists up front; the scoring cell later rebinds it to a sparse matrix.
Xauth = None
from collections import defaultdict
import matplotlib.pyplot as plt

In [None]:
# Install `dok_matrix.my_update(data)`: bulk-load a {(row, col): value} dict into
# a DOK matrix.  Which method can do that differs between scipy releases, so each
# candidate is tried on a scratch matrix and only accepted if the value actually
# lands in the matrix.
def _dok_update_via_dict(self, data):
    """Bulk-load through a private `_dict` store (presumably only on newer scipy; probed, not assumed)."""
    self._dict.update(data)


def _dok_update_via_setitem(self, data):
    """Last-resort fallback: assign the entries one at a time through the public API."""
    for key, value in data.items():
        self[key] = value


_dok_update_candidates = [
    getattr(scipy.sparse.dok_matrix, 'update', None),
    getattr(scipy.sparse.dok_matrix, '_update', None),
    _dok_update_via_dict,
    _dok_update_via_setitem,
]
for _candidate in _dok_update_candidates:
    if _candidate is None:
        continue
    a = scipy.sparse.dok_matrix((10,10))
    try:
        _candidate(a, {(0,0):1.0})
    except Exception:
        # This strategy is unavailable on the installed scipy; try the next one.
        # (Unlike a bare `except:`, KeyboardInterrupt/SystemExit still propagate.)
        continue
    if a[0,0] == 1.0:
        scipy.sparse.dok_matrix.my_update = _candidate
        break
else:
    raise RuntimeError('no working bulk-update method found for scipy.sparse.dok_matrix')

In [None]:
def _read_gz_pickle(path):
    """Return the object unpickled from the gzip-compressed file at `path`."""
    with gzip.open(path,'rb') as fp:
        return pickle.load(fp)


# Venue list, author list and paper records used by the cells below.
all_venues = _read_gz_pickle('useful_venue_list.pkl.gz')
all_authors = _read_gz_pickle('useful_authors_list.pkl.gz')
all_papers = _read_gz_pickle('useful_papers.pkl.gz')


In [None]:

# Year range taken from the first and last paper records (field 6 is the year);
# assumes all_papers is ordered by year — TODO confirm.
min_year, max_year = all_papers[0][6], all_papers[-1][6]
span_years = 1 + max_year - min_year
print(min_year,max_year,span_years)

# Lookup tables from a venue / author name to its position in the loaded lists.
conf_idx = dict(zip(all_venues, range(len(all_venues))))
name_idx = dict(zip(all_authors, range(len(all_authors))))
n_confs, n_auths, n_papers = len(all_venues), len(all_authors), len(all_papers)
print(n_confs,n_auths,n_papers)

In [None]:
# scoreV collects every per-author statistic computed in this notebook.
scoreV = dict()
clf = np.load('clf_gold.pkl.npy')
# clf is indexed elsewhere in this file as years_per_conf*venue_index + year_block,
# so its length divided by the venue count gives the year blocks per venue.
years_per_conf = len(clf)//n_confs
# number of calendar years folded into one year block
YEAR_BLOCKS = span_years//years_per_conf
clf[2323]


In [None]:

# (scoreV key prefix, file-name prefix) of each saved plus-minus array; every one
# exists in a ...False.npy and a ...True.npy variant (FI presumably toggles the
# intercept — TODO confirm).
apm_sources = (
    ('_apm', 'apm'),
    ('pw_apm', 'pwapm'),
    ('pweff_apm', 'pweffapm'),
    ('pwunk_apm', 'pwunkapm'),
)
for FI in (False, True):
    suffix = str(FI)
    for key_prefix, file_prefix in apm_sources:
        scoreV[key_prefix + suffix] = np.load(file_prefix + suffix + '.npy')
    print(scoreV['pwunk_apm' + suffix].shape)


In [None]:
# Flip to True to reuse the cached score table ('scoresV2.pkl.gz') instead of
# recomputing everything from the paper list.
LOAD_CACHED_SCORES = False

if LOAD_CACHED_SCORES:
    try:
        with gzip.open('scoresV2.pkl.gz','rb') as fp:
            scoreV = pickle.load(fp)
    except Exception:
        # Best effort: keep whatever scoreV already holds.  Catching Exception
        # (not a bare `except:`) lets KeyboardInterrupt/SystemExit through.
        print('failed!')
    # Recompute when the table holds fewer than 13 entries.
    PROCESS_DATA = len(scoreV) < 13
    print(scoreV['pwunk_apm' + str(FI)].shape)
    USE_LIMITS = False
else:
    PROCESS_DATA = True
    USE_LIMITS = False
    # Written by the author-position cell of an earlier run (np.save('last_years', ...)).
    last_years = np.load('last_years.npy')

In [None]:
# Entries whose leading dimension is 2468621 (presumably the author count — TODO confirm).
[(key, arr.shape) for key, arr in scoreV.items() if arr.shape[0] == 2468621]

In [None]:
# Names of the CMU faculty rows whose department is 'RI' (blank cells become 'Other' first).
cmu_uni = pd.read_csv('other_ranks/cmu_faculty.csv').fillna('Other')
cmu_uni = cmu_uni[cmu_uni.dept == 'RI']
uni_names = set(cmu_uni.name)

print(len(uni_names))
# venue -> fractional paper count (1/n per RI author) and venue -> summed clf value
conf_counts = {}
conf_counts_value = {}

#interesting_set = uni_names

for paper in all_papers:
    tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    if year < 2004:
        continue
    n = len(authors)
    for a in authors:
        if a not in uni_names:
            continue
        conf_counts[venue] = 1/n + conf_counts.get(venue,0)
        # clf entry for this venue and the year block the paper falls into
        clf_row = years_per_conf*(conf_idx[venue]) + (year-min_year)//YEAR_BLOCKS
        conf_counts_value[venue] = clf[clf_row]/n + conf_counts_value.get(venue,0)

# Summed value -> average value per unit of fractional credit.
conf_counts_value = {venue_name: total/conf_counts[venue_name] for venue_name, total in conf_counts_value.items()}

# (count * mean value, venue, count, mean value), walked in descending count order.
by_count = sorted(((credit, venue_name) for venue_name, credit in conf_counts.items() if credit > 0), reverse=True)
ri_fav_confs = [
    (credit*conf_counts_value[venue_name], venue_name, credit, conf_counts_value[venue_name])
    for credit, venue_name in by_count
]

In [None]:
# Venues with a fractional RI paper count of at least 1.25, highest total value first.
ri_confs = [venue_name for _total, venue_name, credit, _mean in sorted(ri_fav_confs,reverse=True) if credit >= 1.25]
#confs_to_filter =['ICRA','IROS','Robotics: Science and Systems']
ri_confs

In [None]:
if PROCESS_DATA:
    # Column 0: earliest publication year, column 1: latest.  The 3000 / 1000
    # starting values are replaced by the first real year seen for an author.
    auth_years = np.tile(np.array([3000.0,1000.0]), (n_auths,1))
    for paper in all_papers:
        tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
        for a in authors:
            i = name_idx[a]
            first_seen, last_seen = auth_years[i]
            if year < first_seen:
                auth_years[i,0] = year
            if year > last_seen:
                auth_years[i,1] = year
    # inclusive span between first and last publication year
    working_years = np.diff(auth_years, axis=1).ravel() + 1
    scoreV.update({
        'working_years': working_years,
        'auth_years': auth_years,
        'last_years': last_years,
    })


In [None]:
if PROCESS_DATA:
    # Every distinct author-list length in the corpus (field 2 of a paper record is its author list).
    valid_ns = {len(paper[2]) for paper in all_papers}


In [None]:
if USE_LIMITS:
    # Also register every author count below the largest one observed (0 included).
    valid_ns.update(range(max(valid_ns)))

In [None]:
# Venue groups that each get their own sub-score.  Insertion order matters: it
# fixes the order of the per-group score columns built from this dict.
conf_types = {
    'RI': ri_confs,
    'ML': [
        'NIPS',
        'ICML',
        'AAAI',
        'AISTATS',
        'IJCAI',
        'UAI',
        'CoRL',
        'ICLR',
    ],
    'CV': [
        'CVPR',
        'ICCV',
        'ECCV',
        'IEEE Trans. Pattern Anal. Mach. Intell.',
        'FGR',
        'Int. J. Comput. Vis.',
        'WACV',
        'BMVC',
        'ACCV',
    ],
    'ROB': [
        'HRI',
        'Int. J. Robotics Res.',
        'Robotics: Science and Systems',
        'Humanoids',
        'WAFR',
        'IROS',
        'ICRA',
        'FSR',
        'ISER',
        'ISRR',
        'AAMAS',
        'IEEE Robotics Autom. Lett.',
        'IEEE Trans. Robotics and Automation',
    ],
    'GR': [
        'ACM Trans. Graph.',
        'Comput. Graph. Forum',
        'SIGGRAPH',
        'SIGGRAPH Asia',
        'Symposium on Computer Animation',
    ],
}

In [None]:
# Authorship credit models scored below: 'full' gives every author weight 1,
# '1/n' splits a paper equally among its n authors, and '1/i' weights the i-th
# listed author by 1/i (normalised so the weights of a paper sum to 1).
am_types = ['full','1/n','1/i']

In [None]:
if PROCESS_DATA:
    # One all-venue column plus one column per venue group, repeated for every year block.
    confTypeN = 1 + len(conf_types)
    YearConf = scipy.sparse.lil_matrix((n_confs*years_per_conf,years_per_conf*confTypeN))
    for i in range(years_per_conf):
        first_col = i*confTypeN

        # All venues: keep the clf values of year block i, zero everything else.
        year_filter = np.zeros_like(clf).reshape((-1,years_per_conf))
        year_filter[:,i] = 1
        YearConf[:,first_col] = (clf * year_filter.reshape(clf.shape))[:,np.newaxis]

        # Venue groups: same mask, restricted to the venues listed for the group.
        for j, (f_type, f_confs) in enumerate(conf_types.items(), start=1):
            year_filter = np.zeros_like(clf).reshape((-1,years_per_conf))
            for conf in f_confs:
                year_filter[conf_idx[conf],i] = 1
            YearConf[:,first_col+j] = (clf * year_filter.reshape(clf.shape))[:,np.newaxis]
    YearConf = YearConf.tocsr()

In [None]:
import scipy.sparse
import gc
if PROCESS_DATA:
    # For every authorship model, build the sparse author x (venue, year block)
    # credit matrix and project it through YearConf into per-year, per-group scores.
    for amt in am_types:

        # Credit vector (one weight per author position) for each author count n.
        per_author_val = {}

        if amt == 'full':
            for n in valid_ns:
                author_scores = np.ones(n)
                per_author_val[n] = author_scores
        elif amt == '1/n':
            for n in valid_ns:
                author_scores = (np.ones(n))
                per_author_val[n] = author_scores/author_scores.sum()
        elif amt == '1/i':
            for n in valid_ns:
                author_scores = 1/(np.arange(n)+1)
                per_author_val[n] = author_scores/author_scores.sum()
        else:
            # A bare `raise` here has no active exception to re-raise and would
            # surface as an unrelated RuntimeError; name the real problem instead.
            raise ValueError('unknown authorship model: {!r}'.format(amt))

        count_vecs = {}
        paper_vecs = []
        for paper in all_papers:

            tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
            n = len(authors)
            # column of Xauth (= row of YearConf) for this venue and year block
            j = years_per_conf*conf_idx[venue] + (year-min_year)//YEAR_BLOCKS

            author_scores = per_author_val[n]
            if USE_LIMITS:
                tmpp = []
                # With limits on, the last author of a multi-author paper gets no credit ...
                tmpapers = authors[:-1] if n >= 2 else authors
                tmpscores = author_scores[:-1] if n >= 2 else author_scores

                for a,v in zip(tmpapers,tmpscores):
                    idx = name_idx[a]
                    # ... and neither does anyone for papers dated after their last_years entry.
                    if year > last_years[idx]:
                        continue
                    tmpp.append((idx,j,v))
                paper_vecs.append(tmpp)
            else:
                paper_vecs.append([(name_idx[a],j,v) for a,v in zip(authors,author_scores)])

        Xauth = scipy.sparse.dok_matrix((n_auths,years_per_conf*n_confs))
        xdict = {}

        # Sum the credit of all papers that hit the same (author, venue-year) cell.
        for paper_vec in paper_vecs:
            for i,j,v in paper_vec:
                xdict[(i,j)] = v + xdict.get((i,j),0)

        Xauth.my_update(xdict)

        Xauth = scipy.sparse.csr_matrix(Xauth)



        scoreV[amt] = Xauth @ YearConf


        # Drop the large intermediates before the next model.  (The original
        # reset `paper_vec`, the loop variable, leaving `paper_vecs` alive.)
        paper_vecs = []
        xdict = {}
        gc.collect()

In [None]:
import scipy.ndimage
# Gaussian (sigma=1) smoothing matrix over the year-block axis.  It depends only
# on years_per_conf, so it is built once instead of once per authorship model.
smooth_kernel = scipy.ndimage.gaussian_filter1d(np.identity(years_per_conf,np.float32),1)
for am in am_types:
    #scores = np.array(scoreV[am]).reshape((n_auths,years_per_conf,-1)).astype(np.float32)
    # Cast to float32 while still sparse so no dense float64 copy is materialised,
    # then view the columns as (author, year block, score type).
    scores = scoreV[am].astype(np.float32).toarray().reshape((n_auths,years_per_conf,-1))
    # -> (author, score type, year block) so the matmul smooths along the years
    scores = np.transpose(scores,(0,2,1))
    scores = scores @ smooth_kernel
    scoreV[am] = scores

In [None]:
# Labels of the score-type axis: the all-venue column first, then the groups in conf_types order.
sTypes = ['Full', *conf_types]
scores.dtype,scores.nbytes,gc.collect()

In [None]:
auth_years = scoreV['auth_years']
working_years = scoreV['working_years']

# Per-author yearly series under the 1/i (position-weighted) model.
full_by_year = scoreV['1/i'][:,sTypes.index('Full')]
ri_by_year = scoreV['1/i'][:,sTypes.index('RI')]

total_scores = full_by_year.sum(1)
ri_scores = ri_by_year.sum(1)
# RI score per year between first and last publication
ri_eff_scores = ri_scores/working_years#,np.maximum(auth_years[:,1]-2000,1))

ri_scores_max = ri_by_year.max(1)
# The year of the peak must come from the same 1/i series as the peak itself
# (it used to be read from the '1/n' series).
ri_scores_max_yr = np.argmax(ri_by_year,axis=1)*YEAR_BLOCKS + min_year


In [None]:
full_col = sTypes.index('Full')

# Totals summed over all year blocks.
scoreV['1/i_total_1970'] = scoreV['1/i'][:,full_col].sum(1)
scoreV['1/i_RI_1970'] = scoreV['1/i'][:,sTypes.index('RI')].sum(1)
for sub in ('ROB','CV','GR','ML'):
    # `den` is also bound here but is not read anywhere in this cell
    den = scoreV['1/i_' + sub + '_1970'] = scoreV['1/i'][:,sTypes.index(sub)].sum(1)
for model in ('full','1/n'):
    scoreV[model + '_total_1970'] = scoreV[model][:,full_col].sum(1)

# Best single year block per model, and the calendar year that block starts at.
for model in ('1/n','1/i','full'):
    yearly = scoreV[model][:,full_col]
    scoreV[model + '_max'] = yearly.max(1)
    scoreV[model + '_max_yr'] = np.argmax(yearly,axis=1)*YEAR_BLOCKS+min_year

In [None]:
# Remove the per-year score tensors from scoreV, leaving only the summaries.
for s in am_types:
    scoreV.pop(s)

In [None]:
# List every remaining statistic with its array shape.
for k in scoreV:
    v = scoreV[k]
    print(k,v.shape)

In [None]:
# Running sums over multi-author papers only (single-author papers are skipped):
#   avg_auth_pos          sum of relative positions (0 = first author, 1 = last)
#   avg_auth_cnt          number of multi-author papers
#   auth_been_last        number of times listed last
#   auth_first_last_year  earliest year listed last (3000 = never)
avg_auth_pos = np.zeros(n_auths)
avg_auth_cnt = np.zeros(n_auths)
auth_been_last = np.zeros(n_auths)
auth_first_last_year = np.full(n_auths, 3000.0)

for paper in all_papers:
    tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    n = len(authors)
    if n <= 1:
        continue
    last_slot = n-1
    for i,a in enumerate(authors):
        idx = name_idx[a]
        pos = i/last_slot
        auth_been_last[idx] += int(i == last_slot)
        avg_auth_pos[idx] += pos
        avg_auth_cnt[idx] += 1
    closing_idx = name_idx[authors[-1]]
    auth_first_last_year[closing_idx] = min(year,auth_first_last_year[closing_idx])
# np.save appends the extension, so this writes 'last_years.npy'
np.save('last_years',auth_first_last_year)

In [None]:
# Re-read the freshly saved array so scoreV carries the values from this run.
scoreV['last_years'] = last_years = np.load('last_years.npy')

In [None]:
if PROCESS_DATA:
    # Persist the freshly computed statistics as a gzip-compressed pickle.
    scores_path = 'scoresV2.pkl.gz'
    with gzip.open(scores_path, mode='wb') as fp:
        pickle.dump(scoreV, fp)

In [None]:
# Final sanity check: number of stored statistics and whether they were recomputed in this run.
print(len(scoreV),PROCESS_DATA)#,years_per_conf