In [None]:
import os
import sys
import fnmatch
import numpy as np
import pandas as pd
import json
import gzip
import pickle
import csv
import scipy.sparse
Xauth = None
from collections import defaultdict
import matplotlib.pyplot as plt

In [None]:
# Expose a bulk "load this {(row, col): value} dict" operation on DOK matrices
# as dok_matrix.my_update. The public dok_matrix.update() may refuse to run
# (it raises on some scipy releases), and the private _update() may be absent,
# so both are probed and a plain item-assignment fallback is kept in reserve.
def _dok_update_by_assignment(self, data):
    """Store every (row, col) -> value pair of `data` via item assignment."""
    for key, value in data.items():
        self[key] = value


def _pick_dok_update():
    """Return the first dok_matrix bulk-update callable that really stores data."""
    for attr in ('update', '_update'):
        candidate = getattr(scipy.sparse.dok_matrix, attr, None)
        if candidate is None:
            continue
        probe = scipy.sparse.dok_matrix((10, 10))
        try:
            candidate(probe, {(0, 0): 1.0})
        except Exception:
            # Deliberately not a bare `except:` so that KeyboardInterrupt and
            # SystemExit still propagate.
            continue
        # Only accept the candidate if the probe value was actually stored.
        if probe[0, 0] == 1.0:
            return candidate
    return _dok_update_by_assignment


scipy.sparse.dok_matrix.my_update = _pick_dok_update()

In [None]:
# NOTE: pickle.load can execute arbitrary code embedded in the file; only use
# these archives if they come from a trusted source.
def _read_gz_pickle(path):
    """Return the object unpickled from the gzip-compressed file at `path`."""
    with gzip.open(path, 'rb') as handle:
        return pickle.load(handle)


all_venues = _read_gz_pickle('useful_venue_list.pkl.gz')
all_authors = _read_gz_pickle('useful_authors_list.pkl.gz')
all_papers = _read_gz_pickle('useful_papers.pkl.gz')


In [None]:
# Year range of the corpus, read from field 6 (the year) of the first and last
# paper only -- assumes all_papers is ordered by year; TODO confirm.
min_year, max_year = all_papers[0][6], all_papers[-1][6]
span_years = 1 + max_year - min_year
print(min_year,max_year,span_years)

# Position lookups: venue name -> index and author name -> index.
conf_idx = dict(zip(all_venues, range(len(all_venues))))
name_idx = dict(zip(all_authors, range(len(all_authors))))

n_confs, n_auths, n_papers = len(all_venues), len(all_authors), len(all_papers)
print(n_confs,n_auths,n_papers)

In [None]:
# Weight vector that is later multiplied into the author matrix (Xauth.dot).
clf = np.load('clf_gold.pkl.npy')

# clf is indexed as years_per_conf * venue_index + year_block, so its length
# divided by the number of venues gives the number of year blocks per venue.
years_per_conf = len(clf) // n_confs

# Number of calendar years that fall into one year block.
YEAR_BLOCKS = span_years // years_per_conf

In [None]:
import scipy.sparse
import gc
if Xauth is None:
    # Build the author x (venue, year-block) matrix once. The `Xauth is None`
    # guard skips the rebuild when this cell is re-run in the same kernel.

    # Credit given to each author position of an n-author paper, cached by n.
    # Every author currently gets 1.0; the alternatives tried before were
    # 1/(position+1) and normalising so that a paper's credits sum to 1.
    per_author_val = {}

    # (author row, column) -> accumulated credit.
    xdict = {}
    for paper in all_papers:
        tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
        n = len(authors)
        if n not in per_author_val:
            per_author_val[n] = np.ones(n)
        # Column = start of the venue's run of years_per_conf columns plus the
        # paper's year block.
        j = years_per_conf*conf_idx[venue] + (year-min_year)//YEAR_BLOCKS
        for a, v in zip(authors, per_author_val[n]):
            key = (name_idx[a], j)
            xdict[key] = v + xdict.get(key, 0)

    # Assemble the CSR matrix directly from (value, (row, col)) triplets using
    # the public constructor, instead of going through a DOK matrix.
    n_entries = len(xdict)
    rows = np.fromiter((r for r, _ in xdict), dtype=np.int64, count=n_entries)
    cols = np.fromiter((c for _, c in xdict), dtype=np.int64, count=n_entries)
    vals = np.fromiter(xdict.values(), dtype=np.float64, count=n_entries)
    Xauth = scipy.sparse.csr_matrix((vals, (rows, cols)),
                                    shape=(n_auths, years_per_conf*n_confs))

    # Drop the large intermediates before moving on.
    del xdict, rows, cols, vals
    gc.collect()

In [None]:
# Faculty list restricted to rows whose dept is 'RI'; missing fields are
# filled with 'Other' before filtering.
cmu_uni = pd.read_csv('other_ranks/cmu_faculty.csv')
cmu_uni = cmu_uni.fillna('Other')
cmu_uni = cmu_uni[cmu_uni.dept == 'RI']
#uni_names = ['Andrea Tagliasacchi','Paul G. Kry']#['Xuemin Shen','H. Vincent Poor','Kang G. Shin','Mohamed-Slim Alouini','Lajos Hanzo']#list(cmu_uni.name)
#uni_names = list(faculty_affil[faculty_affil.affiliation == 'Johns Hopkins University'].name)
uni_names = list(cmu_uni.name)
print(len(uni_names))

# Set copy for constant-time membership tests; the loop below checks every
# author of every paper against it.
_uni_name_set = set(uni_names)

# venue -> fractional paper count (1/n per matching author of an n-author paper)
conf_counts = {}
# venue -> sum of the matching clf weight, with the same 1/n fraction applied
conf_counts_value = {}

for paper in all_papers:
    tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip = paper
    n = len(authors)
    for a in authors:
        if a in _uni_name_set:
            conf_counts[venue] = 1/n + conf_counts.get(venue,0)
            conf_counts_value[venue] = clf[years_per_conf*(conf_idx[venue]) + (year-min_year)//YEAR_BLOCKS]/n + conf_counts_value.get(venue,0)

# Turn the weighted sums into a per-venue average weight.
conf_counts_value = {k: v/conf_counts[k] for k,v in conf_counts_value.items()}

# One (count * average weight, venue, count, average weight) tuple per venue,
# listed by descending count (ties broken by descending venue name).
ri_fav_confs = [
    (count*conf_counts_value[venue], venue, count, conf_counts_value[venue])
    for count, venue in sorted([(v,k) for k,v in conf_counts.items() if v > 0],reverse=True)
]

In [None]:
# Venues from ri_fav_confs whose fractional paper count is at least 1.0,
# listed in descending order of the tuples (leading field first).
confs_to_filter = [
    venue
    for _score, venue, count, _avg_weight in sorted(ri_fav_confs, reverse=True)
    if count >= 1.0
]

In [None]:
# Only year blocks at or after `only_after` contribute to the scores.
only_after = 2000
start_year_idx = max(only_after - min_year, 0) // YEAR_BLOCKS

# 0/1 mask with one row per venue and one column per year block.
year_filter = np.zeros_like(clf).reshape((-1, years_per_conf))

if False: # filter confs
    # Disabled alternative: keep recent blocks only for venues in confs_to_filter.
    for conf in confs_to_filter:
        year_filter[conf_idx[conf], start_year_idx:] = 1
else:
    # Active path: keep the recent blocks of every venue.
    year_filter[:, start_year_idx:] = 1

# Per-author score: author matrix times the masked weight vector.
masked_weights = clf * year_filter.ravel()
total_scores = Xauth.dot(masked_weights)

In [None]:
# Author indices ordered from highest to lowest total score.
best_idx = np.argsort(total_scores)[::-1]

# Print the top authors; slicing (rather than indexing 0..9) keeps this from
# raising IndexError when there are fewer than 10 authors.
for idx in best_idx[:10]:
    print('{:30s}\t{:.2f}'.format(all_authors[idx],total_scores[idx]))

# University rankings

In [None]:
# Faculty -> affiliation table, restricted to names present in name_idx.
# .copy() makes the filtered frame independent of the frame read from disk, so
# the column assignments below do not trigger SettingWithCopyWarning.
faculty_affil = pd.read_csv('csrankings.csv')
faculty_affil = faculty_affil[faculty_affil.name.isin(name_idx)].copy()

# Position of each faculty member in total_scores, looked up once.
_faculty_rows = np.array([name_idx[name] for name in faculty_affil.name], dtype=np.intp)

# score: the member's total score; count: 1 if that score is positive, else 0.
faculty_affil['score'] = total_scores[_faculty_rows]
faculty_affil['count'] = (total_scores[_faculty_rows] > 0).astype(int)

In [None]:
# Per-affiliation aggregates of the numeric columns. numeric_only=True keeps
# string columns out of the aggregation, and sort_values is called with keyword
# arguments because recent pandas no longer accepts axis/ascending positionally.
_by_affiliation = faculty_affil.groupby('affiliation')

# Total score per affiliation, best first.
scores_sum = _by_affiliation.sum(numeric_only=True).sort_values('score', ascending=False)

# Total score divided by log(count + 1).
scores_log = scores_sum.copy()
scores_log.score = scores_log.score/np.log(scores_log['count']+1)
scores_log = scores_log.sort_values('score', ascending=False)

# Total score divided by count (the number of members with a positive score).
scores_mean = scores_sum.copy()
scores_mean.score = scores_mean.score/scores_mean['count']
scores_mean = scores_mean.sort_values('score', ascending=False)

# Median member score per affiliation.
scores_median = _by_affiliation.median(numeric_only=True).sort_values('score', ascending=False)
print(scores_sum['count'].mean())

In [None]:
# For each aggregation: print the top 10 affiliations and plot a 100-bin
# histogram of the scores in its own figure.
for lbl,scores in zip(['sum','log','mean','median'],[scores_sum,scores_log,scores_mean,scores_median]):
    # Keyword form: recent pandas rejects the positional axis argument of drop().
    scores = scores.drop(columns=['count'])
    print('\n{}'.format(lbl))
    print(scores.head(10))
    plt.figure()
    # The log and mean variants divide by a quantity that is 0 when count is 0,
    # which yields NaN/inf; histogram only the finite values.
    finite_scores = scores.score[np.isfinite(scores.score)]
    plt.hist(finite_scores, bins=100)
    plt.title(lbl)

# University rankings by sampling

In [None]:
from collections import defaultdict

SAMPLE_N = 20
TRIAL_N = 100

# affiliation -> one sampled score per trial / one 1-based rank per trial
uni_scores = defaultdict(list)
uni_ranks = defaultdict(list)

# Nothing about a university's roster changes between trials, so gather the
# member scores, the sampling probabilities and the full-roster score once
# instead of re-filtering the frame on every trial.
_uni_pools = []
for uni in faculty_affil.affiliation.unique():
    faculty = faculty_affil[faculty_affil.affiliation == uni]
    member_scores = [total_scores[name_idx[name]] for name in faculty.name]
    uni_score = faculty.score.sum()
    # Each member's share of the university's score, used as sampling probability.
    p_vec = np.array(faculty.score/uni_score)
    _uni_pools.append((uni, member_scores, p_vec, sum(member_scores)))

for ti in range(TRIAL_N):
    unis = []
    for uni, member_scores, p_vec, full_score in _uni_pools:
        if SAMPLE_N < (p_vec > 0).sum():
            # Draw SAMPLE_N distinct members, weighted by score share.
            picks = np.random.choice(len(member_scores),SAMPLE_N,False,p_vec)
            sample_score = sum([member_scores[i] for i in picks])
        else:
            # Too few positively scored members to sample: use the whole roster.
            sample_score = full_score
        uni_scores[uni].append(sample_score)
        unis.append((sample_score,uni))
    # Rank universities within this trial, best sampled score first.
    ranks = sorted(unis,reverse=True)
    for ri,s in enumerate(ranks):
        uni_ranks[s[1]].append(ri+1)

In [None]:
# Per-university summary of the trial ranks:
# (mean, std, min, max, value at sorted position 0.4*TRIAL_N,
#  value at sorted position 0.6*TRIAL_N, affiliation), ascending.
uni_means = []
for affiliation, trial_ranks in uni_ranks.items():
    uni_means.append((
        np.mean(trial_ranks),
        np.std(trial_ranks),
        np.min(trial_ranks),
        np.max(trial_ranks),
        sorted(trial_ranks)[int(0.4*TRIAL_N)],
        sorted(trial_ranks)[int(0.6*TRIAL_N)],
        affiliation,
    ))
uni_means.sort()

In [None]:
# Display the current uni_means list. When the cells are run in order this is
# the rank summary: (mean, std, min, max, value at sorted position
# 0.4*TRIAL_N, value at sorted position 0.6*TRIAL_N, affiliation).
uni_means

In [None]:
# Per-university summary of the trial scores:
# (mean, std, min, max, affiliation), highest mean first.
uni_means = [
    (np.mean(trial_scores), np.std(trial_scores), np.min(trial_scores), np.max(trial_scores), affiliation)
    for affiliation, trial_scores in uni_scores.items()
]
uni_means.sort(reverse=True)

In [None]:
# Mean and standard deviation per university, in uni_means order.
uni_mean_scores = np.array([entry[0] for entry in uni_means])
uni_std_scores = np.array([entry[1] for entry in uni_means])

STD = 1
# For each university: the 1-based rank its mean would hold among all means
# after shifting it up (ub) or down (lb) by STD standard deviations.
uni_dist_scores = [
    (
        (uni_mean_scores > mean + STD*std).sum() + 1,
        (uni_mean_scores > mean - STD*std).sum() + 1,
    )
    for mean, std in zip(uni_mean_scores, uni_std_scores)
]

In [None]:
# Print each affiliation (last field of its summary tuple) with its (ub, lb) pair.
for (*_stats, affiliation), rank_bounds in zip(uni_means, uni_dist_scores):
    print(affiliation, rank_bounds)

# University rankings by DP

In [None]:
from collections import defaultdict

TRIAL_N = 100

# affiliation -> one sampled score per trial / one 1-based rank per trial
uni_scores = defaultdict(list)
uni_ranks = defaultdict(list)

# The roster of each university is the same in every trial, so look up the
# member scores once instead of re-filtering the frame on every trial.
_uni_pools = []
for uni in faculty_affil.affiliation.unique():
    faculty = faculty_affil[faculty_affil.affiliation == uni]
    member_scores = np.array([total_scores[name_idx[name]] for name in faculty.name])
    _uni_pools.append((uni, member_scores))

for ti in range(TRIAL_N):
    unis = []
    for uni, member_scores in _uni_pools:
        # Keep each member independently with probability 0.5.
        keep = np.random.rand(len(member_scores)) > 0.5

        sample_score = sum(member_scores[keep])
        uni_scores[uni].append(sample_score)
        unis.append((sample_score,uni))
    # Rank universities within this trial, best sampled score first.
    ranks = sorted(unis,reverse=True)
    for ri,s in enumerate(ranks):
        uni_ranks[s[1]].append(ri+1)

In [None]:
# Per-university summary of the trial ranks:
# (mean, std, min, max, value at sorted position 0.4*TRIAL_N,
#  value at sorted position 0.6*TRIAL_N, affiliation), ascending.
def _rank_summary(affiliation, trial_ranks):
    """Return the summary tuple for one university's list of trial ranks."""
    return (
        np.mean(trial_ranks),
        np.std(trial_ranks),
        np.min(trial_ranks),
        np.max(trial_ranks),
        sorted(trial_ranks)[int(0.4*TRIAL_N)],
        sorted(trial_ranks)[int(0.6*TRIAL_N)],
        affiliation,
    )


uni_means = sorted(_rank_summary(k, l) for k, l in uni_ranks.items())

In [None]:
# Display the current uni_means list. When the cells are run in order this is
# the rank summary of the coin-flip trials: (mean, std, min, max, value at
# sorted position 0.4*TRIAL_N, value at sorted position 0.6*TRIAL_N, affiliation).
uni_means

In [None]:
# Per-university summary of the trial scores:
# (mean, std, min, max, affiliation), highest mean first.
uni_means = []
for affiliation, trial_scores in uni_scores.items():
    uni_means.append((
        np.mean(trial_scores),
        np.std(trial_scores),
        np.min(trial_scores),
        np.max(trial_scores),
        affiliation,
    ))
uni_means.sort(reverse=True)

In [None]:
# Mean and standard deviation per university, in uni_means order.
uni_mean_scores = np.array([summary[0] for summary in uni_means])
uni_std_scores = np.array([summary[1] for summary in uni_means])

STD = 1


def _rank_of(value):
    """1-based rank `value` would hold among the per-university means."""
    return (uni_mean_scores > value).sum() + 1


# (ub, lb): rank of each mean shifted up / down by STD standard deviations.
uni_dist_scores = [
    (_rank_of(mean + STD*std), _rank_of(mean - STD*std))
    for mean, std in zip(uni_mean_scores, uni_std_scores)
]

In [None]:
# Print each affiliation (last field of its summary tuple) with its (ub, lb) pair.
for summary, rank_bounds in zip(uni_means, uni_dist_scores):
    affiliation = summary[-1]
    print(affiliation, rank_bounds)

# University rankings by LP

In [None]:
# Score of the top-ranked author (best_idx is ordered best-first).
# NOTE(review): presumably GS/eps/lv stand for a sensitivity, a privacy
# parameter and a noise scale -- confirm with the author.
top_author = best_idx[0]
GS = total_scores[top_author]
eps = 0.1
lv = GS / eps
(GS, lv)

This seems really rough; I don't think I can do this.

# University rankings by random conference subsample

In [None]:
from collections import defaultdict

TRIAL_N = 100

# affiliation -> one total score per trial / one 1-based rank per trial
uni_scores = defaultdict(list)
uni_ranks = defaultdict(list)

# Position of each faculty member in total_scores; fixed across trials, so it
# is looked up once here rather than inside the loop.
_faculty_rows = np.array([name_idx[name] for name in faculty_affil.name], dtype=np.intp)

# NOTE: every trial overwrites the notebook-wide `total_scores` and
# faculty_affil['score']; after this cell they hold the values of the last
# random trial, so re-run the scoring cells before reusing them.
for ti in range(TRIAL_N):
    # Keep or drop each venue as a whole (all of its year blocks) at random.
    filter_year = np.ones_like(clf).reshape((-1,years_per_conf))
    # Builtin float: the np.float alias no longer exists in current NumPy.
    filter_year *= np.random.randint(2,size=(n_confs,1)).astype(float)
    total_scores = Xauth.dot(clf * filter_year.ravel())
    faculty_affil['score'] = total_scores[_faculty_rows]

    # Total score per affiliation, best first. Selecting the 'score' column by
    # name avoids depending on column order or on how string columns aggregate.
    uni_totals = faculty_affil.groupby('affiliation')['score'].sum().sort_values(ascending=False)
    for ri,(uni,sample_score) in enumerate(uni_totals.items()):
        uni_scores[uni].append(sample_score)
        uni_ranks[uni].append(ri+1)

In [None]:
# Per-university summary of the trial ranks:
# (mean, std, min, max, value at sorted position 0.4*TRIAL_N,
#  value at sorted position 0.6*TRIAL_N, affiliation), ascending.
uni_means = []
for affiliation, trial_ranks in uni_ranks.items():
    ordered_ranks = sorted(trial_ranks)
    uni_means.append((
        np.mean(trial_ranks),
        np.std(trial_ranks),
        np.min(trial_ranks),
        np.max(trial_ranks),
        ordered_ranks[int(0.4*TRIAL_N)],
        ordered_ranks[int(0.6*TRIAL_N)],
        affiliation,
    ))
uni_means = sorted(uni_means)

In [None]:
# Display the current uni_means list. When the cells are run in order this is
# the rank summary of the venue-subsample trials: (mean, std, min, max, value
# at sorted position 0.4*TRIAL_N, value at sorted position 0.6*TRIAL_N, affiliation).
uni_means

In [None]:
# Per-university summary of the trial scores:
# (mean, std, min, max, affiliation), highest mean first.
def _score_summary(affiliation, trial_scores):
    """Return the summary tuple for one university's list of trial scores."""
    return (
        np.mean(trial_scores),
        np.std(trial_scores),
        np.min(trial_scores),
        np.max(trial_scores),
        affiliation,
    )


uni_means = sorted((_score_summary(k, l) for k, l in uni_scores.items()), reverse=True)

In [None]:
# Mean and standard deviation per university, in uni_means order.
uni_mean_scores = np.array([row[0] for row in uni_means])
uni_std_scores = np.array([row[1] for row in uni_means])

STD = 2
# For each university: the 1-based rank its mean would hold among all means
# after shifting it up (ub) or down (lb) by STD standard deviations.
uni_dist_scores = []
for mean, std in zip(uni_mean_scores, uni_std_scores):
    shift = STD*std
    ranks_if_shifted = tuple(
        (uni_mean_scores > bound).sum() + 1 for bound in (mean + shift, mean - shift)
    )
    uni_dist_scores.append(ranks_if_shifted)

In [None]:
# Print each affiliation (last field of its summary tuple) with its (ub, lb) pair.
for position, summary in enumerate(uni_means[:len(uni_dist_scores)]):
    print(summary[-1], uni_dist_scores[position])