In [None]:
import csv
import fnmatch
import gzip
import json
import os
import pickle
import sys
import zipfile

import numpy as np
import pandas as pd
import scipy.sparse

import xmltodict

In [None]:
# Load the paper table from the HDF5 store (key "table"). Later cells iterate over its
# rows and read its `year` column.
papers = pd.read_hdf("papers.h5", "table")
# Sentinel: the author x (venue, year) matrix is built by a later cell while this is None.
Xauth = None

In [None]:
# Author names and venue names, index-aligned with name_idx / conf_idx used below.
# NOTE(security): pickle.load executes arbitrary code; only load files you trust.
with open("big_names.pkl", "rb") as fh:
    unique_names = pickle.load(fh)
with open("confs.pkl", "rb") as fh:
    unique_confs = pickle.load(fh)

In [None]:
# Lookup tables: venue name -> venue index, author name -> row index.
# NOTE(security): pickle.load executes arbitrary code; only load files you trust.
with open("conf_idx.pkl", "rb") as fh:
    conf_idx = pickle.load(fh)
with open("name_idx.pkl", "rb") as fh:
    name_idx = pickle.load(fh)

In [None]:
# Venue-name variants, keyed by a short venue tag. The first entry of every list is
# used as the canonical name by the alias-merging code that follows; the remaining
# entries are alternative spellings / multi-volume names for the same venue.
areadict = {
    "icse": ["ICSE", "ICSE (1)"],
    "fse": ["SIGSOFT FSE", "ESEC/SIGSOFT FSE"],
    "usenixatc": [
        "USENIX Annual Technical Conference",
        "USENIX Annual Technical Conference, General Track",
    ],  # next tier
    "imc": ["IMC", "Internet Measurement Conference"],
    "sigmetrics": ["SIGMETRICS", "SIGMETRICS/Performance", "POMACS"],
    "mobicom": ["MobiCom", "MOBICOM"],
    "rtas": [
        "RTAS",
        "IEEE Real-Time and Embedded Technology and Applications Symposium",
    ],
    "ccs": ["CCS", "ACM Conference on Computer and Communications Security"],
    "oakland": ["IEEE Symposium on Security and Privacy"],
    "usenixsec": ["USENIX Security Symposium", "USENIX Security"],
    "pets": ["PoPETs", "Privacy Enhancing Technologies"],
    "cav": ["CAV", "CAV (1)", "CAV (2)"],
    "lics": ["LICS", "CSL-LICS"],
    "nips": ["NIPS", "NeurIPS"],
    "icml": ["ICML", "ICML (1)", "ICML (2)", "ICML (3)"],
    "aaai": ["AAAI", "AAAI/IAAI"],
    "ubicomp": ["UbiComp", "Ubicomp", "IMWUT", "Pervasive"],
    "emnlp": ["EMNLP", "EMNLP-CoNLL", "HLT/EMNLP"],
    "acl": ["ACL", "ACL (1)", "ACL (2)", "ACL/IJCNLP", "COLING-ACL"],
    "naacl": ["NAACL", "HLT-NAACL", "NAACL-HLT"],
    "cvpr": ["CVPR", "CVPR (1)", "CVPR (2)"],
    "eccv": [
        "ECCV",
        "ECCV (1)",
        "ECCV (2)",
        "ECCV (3)",
        "ECCV (4)",
        "ECCV (5)",
        "ECCV (6)",
        "ECCV (7)",
        "ECCV (8)",
        "ECCV (9)",
        "ECCV (10)",
        "ECCV (11)",
        "ECCV (12)",
        "ECCV (13)",
        "ECCV (14)",
        "ECCV (15)",
        "ECCV (16)",
    ],
    "icra": ["ICRA", "ICRA (1)", "ICRA (2)"],
    "rss": ["Robotics: Science and Systems"],
    "crypto": ["CRYPTO", "CRYPTO (1)", "CRYPTO (2)", "CRYPTO (3)"],
    "eurocrypt": ["EUROCRYPT", "EUROCRYPT (1)", "EUROCRYPT (2)", "EUROCRYPT (3)"],
}
# Map every alternative venue name to the canonical (first-listed) name of its group.
inverse_area_dict = {
    alias: variants[0] for variants in areadict.values() for alias in variants[1:]
}
# Point each known alias at the canonical venue's index so both names share one column
# block. Pairs where either name is missing from conf_idx are left untouched.
for alias, canonical in inverse_area_dict.items():
    if alias in conf_idx and canonical in conf_idx:
        conf_idx[alias] = conf_idx[canonical]

In [None]:
# import ftfy
# from unidecode import unidecode
# for i in range(len(unique_names)):
#    unique_names[i] = unidecode(ftfy.fix_encoding(unique_names[i]))
#    name_idx[unique_names[i]] = i

In [None]:
# Year range covered by the paper table; span_years is the inclusive number of years and
# is the width of each venue's block of columns in the (venue, year) layout used below.
min_year = papers.year.min()
max_year = papers.year.max()
span_years = max_year - min_year + 1

In [None]:
# Build the sparse author x (venue, year) credit matrix. It is rebuilt only when it is
# missing or its column count no longer matches the current venue/year grid.
n_cols = span_years * unique_confs.shape[0]
if Xauth is None or (Xauth.shape[1] != n_cols):
    xdict = {}
    # Per-author [first_year, last_year]. 3000 / 1000 are sentinels that any real
    # publication year replaces through the min / max updates below.
    auth_years = np.ones((len(unique_names), 2)) * np.array([3000, 1000])
    for row in papers.itertuples():
        # Positional access into the itertuples() row (position 0 is the index).
        # NOTE(review): assumes position 10 is the year, 2 the venue, 3 the author list
        # and 4 the author count -- confirm against the schema of papers.h5.
        paper_year = row[10]
        conf = row[2]
        n = row[4]
        authors = row[3]
        # Column = venue block offset + year offset within the block.
        j = span_years * conf_idx[conf] + (paper_year - min_year)
        for a in authors:
            i = name_idx[a]
            # Each author receives an equal 1/n share of the paper.
            xdict[(i, j)] = 1 / n + xdict.get((i, j), 0)
            auth_years[i, 0] = min(auth_years[i, 0], paper_year)
            auth_years[i, 1] = max(auth_years[i, 1], paper_year)
    # Assemble through the public COO constructor instead of the private
    # dok_matrix._update, and bind Xauth only once the loop has finished, so a failure
    # part-way through cannot leave an empty matrix that passes the guard above.
    keys = list(xdict)
    Xauth = scipy.sparse.coo_matrix(
        (
            np.array([xdict[key] for key in keys], dtype=float),
            (
                np.array([r for r, _ in keys], dtype=int),
                np.array([c for _, c in keys], dtype=int),
            ),
        ),
        shape=(len(unique_names), n_cols),
    ).todok()

In [None]:
# Faculty roster; later cells read its `name` and `affiliation` columns.
faculty_affil = pd.read_csv("faculty-affiliations.csv")


def csv2dict_str_str(fname):
    """Read a CSV file into a dict mapping its first column to its second.

    Both fields are stripped of surrounding whitespace. Rows with fewer than two
    fields (for example blank lines) are skipped instead of raising IndexError;
    columns beyond the second are ignored. When a key occurs more than once the
    last row wins.

    Args:
        fname: path of the CSV file to read.

    Returns:
        dict of str -> str.
    """
    # newline="" lets the csv module do its own line-ending handling, as its
    # documentation recommends.
    with open(fname, mode="r", newline="") as infile:
        return {
            row[0].strip(): row[1].strip()
            for row in csv.reader(infile)
            if len(row) >= 2
        }


# Name-alias table read from the first two columns of dblp-aliases.csv; later cells use
# aliasdict.get(name, name) to normalise author names before looking them up.
aliasdict = csv2dict_str_str("dblp-aliases.csv")

In [None]:
# Load the fitted venue-weight vectors, standardise them, and average them into a single
# (venue, year) weight vector `clf`.
# NOTE(security): pickle.load executes arbitrary code; only load files you trust.
clf_files = [
    "faculty_indep-fixed.pkl",
    "faculty_indep-fixed_5.pkl",
    "faculty_indep-fixed_16.pkl",
    "nsf_fixed_total-log-names.pkl",
    "nsf_fixed_marginal-log-names.pkl",
    "salary-fixed-nonlog-names.pkl",
]
clfs = []
for clf_file in clf_files:
    with open(clf_file, "rb") as fh:
        clfs.append(np.squeeze(pickle.load(fh)))
print([c.shape for c in clfs])
clp = 2000  # 7
clfs2 = []
for result_clf in clfs:
    # View the flat vector as (venue, year) so each year column is standardised on its own.
    result_clf = result_clf.reshape((-1, span_years))

    print(abs(result_clf.mean(0)).mean(), abs(result_clf.std(0)).mean())
    # NOTE(review): a year column with zero variance makes std(0) zero and yields
    # NaN/inf here -- confirm that cannot happen for these inputs.
    result_clf = (result_clf - result_clf.mean(0)) / result_clf.std(0)
    result_clf = result_clf.reshape((-1))
    clfs2.append(result_clf)
clfs = clfs2
# Standardise each vector globally and clip to +/- clp before stacking.
clfs = [np.minimum(clp, np.maximum(-clp, (c - c.mean()) / c.std())) for c in clfs]
clfs = np.vstack(clfs)
# Replaces any auth_years computed while building Xauth with the pickled copy.
with open("auth_years.pkl", "rb") as fh:
    auth_years = pickle.load(fh)
clf = np.mean(clfs, 0)

# remove arXiv: zero every year column that belongs to CoRR
print(clf.shape[0], conf_idx["CoRR"])
corr_start = span_years * conf_idx["CoRR"]
non_arxiv = np.ones(clf.shape[0])
non_arxiv[corr_start : corr_start + span_years] = 0
clf = clf * non_arxiv
clf_gold = np.copy(clf)

In [None]:
# Sort all (venue, year) columns by weight, flag the columns that belong to the chosen
# venues in `ri_confs`, and print the best-scoring year of each chosen venue.
conf_ord = np.argsort(np.squeeze(clf))
#'Scientometrics','JCDL','NIPS',
conf_choice = [
    "ICRA (1)",
    "ICML (2)",
    "SIGGRAPH",
    "HRI",
    "ECCV (8)",
    "ECCV (1)",
    "Comput. Graph. Forum",
    "Shape Modeling International",
    "Symposium on Geometry Processing",
    "Computer Aided Geometric Design",
    "I. J. Robotics Res.",
    "CVPR",
    "International Journal of Computer Vision",
    "Robotics: Science and Systems",
    "ICRA",
    "WACV",
    "ICML",
    "AISTATS",
    "CoRR",
    "SIGGRAPH Asia",
    "ECCV",
    "ICCV",
    "ISER",
    "Humanoids",
    "3DV",
    "IROS",
    "CoRL",
    "Canadian Conference on AI",
    "ACCV",
    "Graphics Interface",
    "CRV",
    "BMVC",
]
total_cols = len(unique_confs) * span_years
ri_confs = np.zeros(total_cols)
ms = clf.mean()
ss = clf.std()
np.set_printoptions(precision=1)
seen = {}
# Walk the columns from highest to lowest weight.
for rank in range(1, total_cols + 1):
    idx = conf_ord[-rank]
    conf_name = unique_confs[idx // span_years]
    conf_score = clf[idx]
    if conf_name not in conf_choice:
        continue
    ri_confs[idx] = 1
    if conf_name in seen:
        continue
    # First (i.e. best-weighted) year seen for this venue: print its z-score together
    # with the individual vectors' values for that column.
    seen[conf_name] = 1
    vec = clfs[:, idx]
    print(
        "{:20s}{}\t{:.1f}\t{}".format(
            conf_name[:20],
            str(min_year + (idx % span_years)),
            (conf_score - ms) / ss,
            vec,
        )
    )
ri_confs.shape, ri_confs.sum()

In [None]:
# Print the top_k venues by their single best (venue, year) weight. Iterating over the
# sorted order directly stops cleanly when fewer than top_k distinct venues exist
# (the previous index-based while loop ran off the end of conf_ord in that case).
top_k = 50
seen = {}
for idx in conf_ord[::-1]:
    if len(seen) >= top_k:
        break
    conf_name = unique_confs[idx // span_years]
    if conf_name in seen:
        continue
    seen[conf_name] = 1
    conf_score = clf[idx]
    print(
        "{:20s}\t{}\t\t{:.3f}\t{:.2f}".format(
            conf_name[:18],
            min_year + (idx % span_years),
            100 * conf_score,
            (conf_score - ms) / ss,
        )
    )

In [None]:
import matplotlib.pyplot as plt

# Plot weight-over-time curves for several groups of related venues, one figure per
# group, with the legend ordered by each venue's weight in `sort_year`.
clf = clf_gold
sort_year = 2015
conf_choice2 = [
    "SIGGRAPH",
    "AAAI",
    "NIPS",
    "CVPR",
    "ICRA",
    "ICML",
    "ICCV",
    "ECCV",
    "International Journal of Computer Vision",
    "Robotics: Science and Systems",
]
conf_choices = [
    conf_choice2,
    [
        "STOC",
        "FOCS",
        "SODA",
        "EC",
        "WINE",
        "Electronic Colloquium on Computational Complexity (ECCC)",
    ],
    ["UAI", "AAAI", "IJCAI", "ICML", "NIPS"],
    [
        "ECCV",
        "ICCV",
        "CVPR",
        "BMVC",
        "CRV",
        "International Journal of Computer Vision",
        "3DV",
        "WACV",
    ],
    [
        "Robotics: Science and Systems",
        "IROS",
        "CoRL",
        "WAFR",
        "HRI",
        "ISER",
        "ICRA",
    ],
    ["SIGGRAPH", "SIGGRAPH Asia", "ACM Trans. Graph.", "Graphics Interface"],
]
vs = clf.std()
plot_years = np.arange(min_year, max_year + 1)
for conf_group in conf_choices:
    # Offset within a venue block is relative to min_year (the same layout used when
    # the columns were built), not to a hard-coded 1970.
    conf_choice3 = [
        (clf[span_years * conf_idx[conf] + sort_year - min_year], conf)
        for conf in conf_group
    ]
    # One figure per group; no empty placeholder figures are created.
    plt.figure(figsize=(12, 8))
    for s, conf in sorted(conf_choice3, reverse=True):
        idx = conf_idx[conf]
        # The last four years are left off the plot.
        _ = plt.plot(
            plot_years[:-4],
            (clf[span_years * idx : span_years * (idx + 1)] / vs)[:-4],
            label=conf,
            lw=5,
        )
    plt.grid()
    plt.xlabel("year")
    plt.ylabel("value")
    plt.legend()

In [None]:
import matplotlib.pyplot as plt

# Histogram (70 bins) of the (venue, year) weights; `_ =` hides the returned arrays.
_ = plt.hist(clf, 70)

In [None]:
import itertools

import matplotlib.pyplot as plt
import scipy.stats

# Temporal smoothing: spread each paper's credit over neighbouring years of the same
# venue with a truncated Gaussian kernel.
sigma = 3
year_offsets = np.arange(span_years)
weights = []
for i in range(span_years):
    # Gaussian density centred on year i; entries below 0.05 are zeroed and the
    # remaining kernel is scaled to unit L2 norm.
    a = scipy.stats.norm.pdf((year_offsets - i) / sigma)
    a[a < 0.05] = 0
    weights.append(a / np.linalg.norm(a))
# Show the kernel for the year 2000 as a sanity check.
_ = plt.plot(np.arange(span_years) + min_year, weights[2000 - min_year])
plt.grid(True)

# Block-diagonal smoothing matrix: entry (v*S + j, v*S + k) = weights[j][k] for every
# venue v, i.e. kron(I_venues, W). Built directly instead of through a Python dict with
# n_venues * span_years**2 entries and the private dok_matrix._update.
wsa = scipy.sparse.kron(
    scipy.sparse.identity(unique_confs.shape[0], format="csr"),
    np.vstack(weights),
    format="csr",
)
Xauth = scipy.sparse.csr_matrix(Xauth)
# NOTE: this overwrites Xauth with its smoothed version, so re-running the cell without
# rebuilding Xauth applies the smoothing a second time.
Xauth = Xauth @ wsa

In [None]:
# Per-author score: author credit matrix times the averaged venue weights.
scores = Xauth.dot(clf_gold)
# Inclusive career length, from the first/last publication years in auth_years.
years_working = 1 + auth_years[:, 1] - auth_years[:, 0]
value_scores = scores
# Score per active year.
norm_scores = (value_scores) / years_working
# Diagonal mask built from ri_confs, so only the flagged (venue, year) columns contribute.
ri_filter_mat = scipy.sparse.diags(ri_confs)
ri_scores = Xauth.dot(ri_filter_mat).dot(clf_gold)
ri_norm_scores = ri_scores / years_working
# Row sums of Xauth.
# NOTE(review): if Xauth has already been multiplied by the smoothing matrix, this is a
# smoothed credit total rather than a raw fractional paper count -- confirm.
pub_num = Xauth.sum(1)
total_scores = np.copy(scores)

In [None]:
def _print_score_table(names, row_format):
    """Print a header line, then one line per name sorted by total score (descending).

    `row_format` is applied to (name, rate, total, ri score, years active, pubs).
    """
    print(
        "{:20s}\t{:4s}\t{:4s}\t{:4s}\t{}\t{}".format(
            "name", "rate", "total", "ri", "years", "pubs"
        )
    )
    ranked = sorted(((value_scores[name_idx[n]], n) for n in names), reverse=True)
    for _score, name in ranked:
        ni = name_idx[name]
        print(
            row_format.format(
                name,
                norm_scores[ni],
                value_scores[ni],
                ri_scores[ni],
                years_working[ni],
                pub_num[ni, 0],
            )
        )


prev_cand = [
    "Pulkit Agrawal",
    "Joydeep Biswas",
    "Katherine L. Bouman",
    "David Braun",
    "Jia Deng",
    "Naomi T. Fitter",
    "David F. Fouhey",
    "Saurabh Gupta",
    "Judy Hoffman",
    "Hanbyul Joo",
    "Honglak Lee",
    "Changliu Liu",
    "Petter Nilsson",
    "Matthew O'Toole",
    "Alessandro Roncone",
    "Alanson P. Sample",
    "Manolis Savva",
    "Adriana Schulz",
    "Amy Tabb",
    "Fatma Zeynep Temel",
    "Long Wang",
    "Cathy Wu",
    "Ling-Qi Yan",
]
# The years column is printed with no decimals in this table.
_print_score_table(prev_cand, "{:20s}\t{:.2f}\t{:.2f}\t{:.2f}\t{:.0f}\t{:.1f}")
print("")
curious_names = [
    "Xiaolong Wang 0004",
    "Judy Hoffman",
    "Paris Siminelakis",
    "Roie Levin",
    "Leonid Keselman",
    "Rick Goldstein",
    "Nicholas Rhinehart",
    "Vincent Sitzmann",
    "Siddharth Ancha",
    "Xingyu Lin",
    "Humphrey Hu",
    "David F. Fouhey",
    "Chelsea Finn",
    "Dinesh Jayaraman",
    "Wen Sun 0002",
    "Lerrel Pinto",
    "Justin Johnson",
    "Amir Roshan Zamir",
    "Dominik Peters",
    "Jonathan T. Barron",
    "Dorsa Sadigh",
    "Derek Hoiem",
    "Vaggos Chatziafratis",
    "Brian Okorn",
    "David Held",
]
# The years column is printed with default formatting in this table.
_print_score_table(curious_names, "{:20s}\t{:.2f}\t{:.2f}\t{:.2f}\t{}\t{:.1f}")

In [None]:
# Rank one university's faculty by total score, expressed in standard deviations of the
# score distribution over all authors.
uni_faculty = faculty_affil[
    faculty_affil.affiliation == "Carnegie Mellon University"
]  # Carnegie Mellon University
uni_names = list(uni_faculty.name)
ts = value_scores.std()
# Resolve aliases first; names that are missing from name_idx are skipped.
cmu_scores = [
    (total_scores[name_idx[name]], name)
    for name in {aliasdict.get(n, n) for n in uni_names}
    if name in name_idx
]
for s, p in sorted(cmu_scores, reverse=True):
    print("{:30s}\t\t{:.1f}".format(p, s / ts))

In [None]:
# for clp in [200]:#range(1,15,1):
# Re-derive the averaged venue weights with a tighter clip, then report for every
# individual weight vector and for their mean how the resulting author scores
# rank-correlate with the external metrics in correlation_cleaned.csv.
# NOTE(security): pickle.load executes arbitrary code; only load files you trust.
clf_files = [
    "faculty_indep-fixed.pkl",
    "faculty_indep-fixed_5.pkl",
    "faculty_indep-fixed_16.pkl",
    "nsf_fixed_total-log-names.pkl",
    "nsf_fixed_marginal-log-names.pkl",
    "salary-fixed-nonlog-names.pkl",
]
clfs = []
for clf_file in clf_files:
    with open(clf_file, "rb") as fh:
        clfs.append(np.squeeze(pickle.load(fh)))
clp = 100
clfs2 = []
for result_clf in clfs:
    # Standardise each year column separately in the (venue, year) view.
    result_clf = result_clf.reshape((-1, span_years))

    print(abs(result_clf.mean(0)).mean(), abs(result_clf.std(0)).mean())
    result_clf = (result_clf - result_clf.mean(0)) / result_clf.std(0)
    result_clf = result_clf.reshape((-1))
    clfs2.append(result_clf)
clfs = clfs2
clfs = [np.minimum(clp, np.maximum(-clp, (c - c.mean()) / c.std())) for c in clfs]
clfs = np.vstack(clfs)
with open("auth_years.pkl", "rb") as fh:
    auth_years = pickle.load(fh)
# NOTE(review): unlike the earlier averaging cell, the CoRR columns are not zeroed
# here -- confirm that this is intended.
clf_gold = np.mean(clfs, 0)

# Individual vectors followed by their mean as the last row.
clfs_test = np.vstack([clfs, clf_gold.reshape(1, -1)])

# The full option key is required: a bare "precision" matches more than one option on
# newer pandas.
pd.set_option("display.precision", 2)

# Load and tidy the reference table once; each iteration works on a copy.
df_corr_base = pd.read_csv("correlation_cleaned.csv")
df_corr_base = df_corr_base.drop(
    columns=[c for c in df_corr_base.columns if "Unnamed" in c]
)
df_corr_base = df_corr_base.drop(columns=["pms", "n_papers"])
df_corr_base = df_corr_base.rename(
    columns={
        "totals": "venue_score",
        "csrp": "csr_pubs",
        "csrpn": "csr_adj",
        "gcite": "influence",
    }
)
df_corr_base = df_corr_base[
    [
        "name",
        "papers",
        "citations",
        "h-index",
        "i10",
        "csr_pubs",
        "csr_adj",
        "venue_score",
        "influence",
    ]
]
df_corr_base = df_corr_base.dropna(axis="index")
df_corr_base.index = df_corr_base.name

for clfn, clf in enumerate(clfs_test):
    total_scores = Xauth.dot(clf)
    df_corr = df_corr_base.copy()

    ts = total_scores.std()
    # Overwrite venue_score for every person found in name_idx; everyone else keeps
    # the value that came from the CSV.
    for name in df_corr.name:
        if name in name_idx:
            idx = name_idx[name]
            df_corr.loc[name, "venue_score"] = total_scores[idx] / ts
    # Correlate numeric columns only: the string `name` column is not a valid input
    # to corr() on newer pandas.
    numeric_cols = df_corr.select_dtypes(include="number")
    spearman = numeric_cols.corr("spearman")
    kendall = numeric_cols.corr("kendall")
    print(
        clp,
        clfn,
        spearman.loc["influence", "venue_score"],
        kendall.loc["influence", "venue_score"],
        spearman.loc["h-index", "venue_score"],
    )
# After the loop df_corr / clf / total_scores hold the values for the mean vector.
df_corr.select_dtypes(include="number").corr("spearman")

In [None]:
# Spearman correlation of venue_score with every other numeric metric. The string
# `name` column is excluded explicitly because corr() rejects it on newer pandas.
df_corr.select_dtypes(include="number").corr("spearman").loc["venue_score"]

In [None]:
ls *names*.pkl

In [None]:
# Length of the weight vector divided by the years per venue, i.e. the number of venue
# blocks it covers.
clf.shape[0] / span_years

# Venue embeddings: placing authors in venue-embedding space

In [None]:
# Venue embedding vectors and a flat per-venue indicator of which venues have one.
# NOTE(security): pickle.load executes arbitrary code; only load files you trust.
with open("top_conf_embed.pkl", "rb") as fh:
    vec_mat = pickle.load(fh)
with open("top_conf_ind.pkl", "rb") as fh:
    ind_mat = np.array(pickle.load(fh)).reshape((-1))

In [None]:
# Map each selected venue's position in ind_mat to its rank among the selected venues
# (0, 1, 2, ... in order of appearance).
selected_positions = [pos for pos, flag in enumerate(ind_mat) if flag]
full_to_sub = {pos: rank for rank, pos in enumerate(selected_positions)}

In [None]:
# Sanity check: embedding matrix shape next to the number of selected venues.
vec_mat.shape, len(full_to_sub)

In [None]:
# Sanity check: (authors, venue-year columns).
Xauth.shape

In [None]:
# Repeat each venue's indicator once per year so it lines up with the (venue, year)
# columns. The builtin float replaces the np.float alias, which NumPy has removed.
rep_mat = np.repeat(ind_mat, span_years).astype(float)
rep_mat.shape

In [None]:
# 0/1 matrix that sends every (venue, year) column of a selected venue to that venue's
# row in the embedding matrix, summing over years. Built through the public COO
# constructor rather than the private dok_matrix._update; CSR is used because the
# matrix only ever appears as the right operand of a sparse product.
kept_cols = np.flatnonzero(rep_mat)
sub_cols = np.array(
    [full_to_sub[int(col) // span_years] for col in kept_cols], dtype=int
)
xmap = scipy.sparse.coo_matrix(
    (np.ones(kept_cols.shape[0]), (kept_cols, sub_cols)),
    shape=(Xauth.shape[1], vec_mat.shape[0]),
).tocsr()
print(xmap.shape)

In [None]:
# Per-author credit aggregated onto the selected venues (the year axis is summed out by xmap).
mapped_all = Xauth.dot(xmap)

In [None]:
# Row sums of mapped_all: each author's total credit over the selected venues, used
# below as the normaliser for averaged embeddings.
mapped_all_mag = mapped_all.sum(1)

In [None]:
# Embedding-space location of each Robotics Institute faculty member: the credit-weighted
# mean of the venue embeddings they publish in.
# cmu_faculty.csv must provide `name` and `dept` columns.
cmu_uni = pd.read_csv("cmu_faculty.csv")
cmu_uni = cmu_uni.fillna("Other")
cmu_uni = cmu_uni[cmu_uni.dept == "RI"]
uni_names = list(cmu_uni.name)
uni_labels = list(cmu_uni.dept)
# Sorted so label indices (and therefore colours) do not depend on set/hash order.
uni_labels_unique = sorted(set(uni_labels))
cmu_scores = []
uni_colors = []
# De-duplicate (resolved name, dept) pairs; sorting keeps the row order reproducible.
for name, d in sorted(
    {(aliasdict.get(n, n), dept) for n, dept in zip(uni_names, uni_labels)}
):
    if name not in name_idx:
        continue
    loc = mapped_all[name_idx[name], :].dot(vec_mat)
    # Divide by the author's total credit, but never by less than 1.
    loc /= max(1, mapped_all_mag[name_idx[name]])
    cmu_scores.append((loc, name))
    uni_colors.append(uni_labels_unique.index(d))

In [None]:
ri_names = list(cmu_uni.name)
# TODO: this cell originally ended with an unfinished `for row in` statement, which is a
# syntax error. The intended loop is unknown, so it is omitted until it is written.

In [None]:
# Stack the per-person embedding vectors (first element of each cmu_scores tuple) into
# one 2-D array, one row per person.
embed = np.vstack([_[0] for _ in cmu_scores])

In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [None]:
# 2-D t-SNE of the faculty embedding vectors. Arguments are passed by keyword because
# current scikit-learn accepts only n_components positionally.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` -- adjust
# if the installed version warns or rejects it. No random_state is set, so the layout
# changes between runs.
embedder = TSNE(n_components=2, perplexity=2, init="random", n_iter=6500)
# embedder = TSNE(2,1)

ys = embedder.fit_transform(embed)

In [None]:
import matplotlib.colors
import matplotlib.patheffects as path_effects

# Scatter the 2-D faculty embedding and label every point with the person's name,
# slightly jittered; the figure is saved to cmu_faculty.pdf.
nc = len(uni_labels_unique)
cmap = plt.get_cmap("tab10_r")
plt.figure(figsize=(12, 12))
point_colors = cmap(np.array(uni_colors) / 10 + 0.2)
plt.scatter(ys[:, 0], ys[:, 1], s=20, lw=0.2, edgecolors="k", c=point_colors)
for i in range(ys.shape[0]):
    # Independent Gaussian offsets in x and y for the label position.
    xr, yr = 0.2 * np.random.randn(2)
    person = cmu_scores[i][1]
    label_color = cmap(uni_colors[i] / 10 + 0.2)
    text = plt.text(
        ys[i, 0] + xr,
        yr + ys[i, 1],
        person,
        size="12",
        color=label_color,
        horizontalalignment="center",
        verticalalignment="center",
        alpha=0.8,
        weight="demibold",
    )
plt.xticks([], [])
plt.yticks([], [])
plt.tight_layout()
plt.savefig("cmu_faculty.pdf")

# Venue clusters: t-SNE and k-means over the venue embeddings

In [None]:
from sklearn.cluster import MiniBatchKMeans

# Per-venue scores and names, index-aligned with the rows of the venue embedding.
# NOTE(security): pickle.load executes arbitrary code; only load files you trust.
with open("r1_scores.pkl", "rb") as fh:
    r1_scores = pickle.load(fh)
with open("r1_confs.pkl", "rb") as fh:
    r1_confs = pickle.load(fh)

In [None]:
# 2-D t-SNE of the venue embedding vectors using cosine distance.
# NOTE: this rebinds `embed` (previously the stacked faculty vectors) to the TSNE object
# and replaces `ys` (previously the faculty layout) with the venue layout; later cells
# that read `ys` get venue coordinates.
embed = TSNE(perplexity=15, n_iter=10000, metric="cosine")
ys = embed.fit_transform(vec_mat)

In [None]:
# Cluster the venue embeddings into 25 groups (100 initialisations).
# NOTE(review): some plotting cells below loop over range(26) and divide by 25 or 26;
# with 25 clusters the labels are 0..24 -- keep those constants in sync with this value.
km = MiniBatchKMeans(25, n_init=100)
km.fit(vec_mat)

In [None]:
# One weight per embedded venue, taken from a single year of `clf`.
clf_small = np.zeros(len(full_to_sub))
for old, new in full_to_sub.items():
    # [-2] selects the second-to-last year of this venue's span_years-long slice.
    clf_small[new] = clf[old * span_years : (old + 1) * span_years][-2]
    print(r1_confs[new], clf_small[new])
    # Debug aid: print the sub-index at which JCDL appears.
    if r1_confs[new] == "JCDL":
        print(new)

In [None]:
#
# conf_d = cdist(vec_mat[31].reshape((1,-1)),vec_mat,metric='cosine')[0]
# conf_s = np.argsort(conf_d)
# for i in range(10):
#    print(r1_confs[conf_s[i]],conf_d[conf_s[i]],clf_small[conf_s[i]])
# old = conf_idx['Scientometrics']
# print('Scientometrics',max(clf[old*span_years:(old+1)*span_years]))

In [None]:
import matplotlib.patheffects as path_effects
import matplotlib.pyplot as plt

# Venue map coloured by k-means cluster, with the k highest-scoring venues of every
# cluster labelled; saved to tsne1.pdf.
n_clusters = km.n_clusters
color_den = max(n_clusters - 1, 1)  # maps labels 0..n_clusters-1 onto [0, 1]
plt.figure(figsize=(25, 25))
cmap = plt.get_cmap("tab20c_r")
# vmin/vmax are fixed so a point and its cluster's labels resolve to the same colour.
plt.scatter(
    ys[:, 0],
    ys[:, 1],
    c=km.labels_ / color_den,
    cmap="tab20c_r",
    vmin=0,
    vmax=1,
    s=0.15 * r1_scores ** 2,
    lw=0.1,
    edgecolors="k",
)
k = 4
for cluster in range(n_clusters):
    members = np.flatnonzero(km.labels_ == cluster)
    score_idx = sorted(((r1_scores[m], m) for m in members), reverse=True)
    print(cluster)
    # Lowest of the top-k first, so j increases towards the best venue.
    for j, (_score, idx) in enumerate(reversed(score_idx[:k])):
        # Random horizontal jitter; vertically the labels are stacked by rank j.
        xr = 3 * np.random.randn()
        text = plt.text(
            xr + ys[idx, 0],
            2 * (j - (k - 1) / 2) / (k - 1) + ys[idx, 1],
            r1_confs[idx],
            size="18",
            color=np.array(cmap(cluster / color_den)),
            ha="center",
            va="center",
            alpha=0.9,
            weight="bold",
        )
        text.set_path_effects(
            [
                path_effects.Stroke(linewidth=1, foreground="black"),
                path_effects.Normal(),
            ]
        )
        print("\t", r1_confs[idx])
plt.xticks([], [])
plt.yticks([], [])
plt.tight_layout()
plt.savefig("tsne1.pdf")
In [None]:
import matplotlib.patheffects as path_effects
import matplotlib.pyplot as plt

# Venue map coloured by k-means cluster with faint labels for the k highest-scoring
# venues of each cluster.
n_clusters = km.n_clusters
color_den = max(n_clusters - 1, 1)  # maps labels 0..n_clusters-1 onto [0, 1]
plt.figure(figsize=(25, 25))
cmap = plt.get_cmap("tab20c_r")
# Points and labels use the same explicit normalisation, so each label has the colour
# of its own cluster's points.
plt.scatter(
    ys[:, 0],
    ys[:, 1],
    c=km.labels_ / color_den,
    cmap="tab20c_r",
    vmin=0,
    vmax=1,
    s=0.15 * r1_scores ** 2,
    lw=0.1,
    edgecolors="k",
)
k = 4
for cluster in range(n_clusters):
    members = np.flatnonzero(km.labels_ == cluster)
    score_idx = sorted(((r1_scores[m], m) for m in members), reverse=True)
    for j, (_score, idx) in enumerate(reversed(score_idx[:k])):
        # Random horizontal jitter; vertically the labels are stacked by rank j.
        xr = 3 * np.random.randn()
        text = plt.text(
            xr + ys[idx, 0],
            2 * (j - (k - 1) / 2) / (k - 1) + ys[idx, 1],
            r1_confs[idx],
            size="14",
            color=np.array(cmap(cluster / color_den)),
            ha="center",
            va="center",
            alpha=0.4,
            weight="bold",
        )
        text.set_path_effects(
            [
                path_effects.Stroke(linewidth=1, foreground="black"),
                path_effects.Normal(),
            ]
        )
plt.xticks([], [])
plt.yticks([], [])
plt.tight_layout()
plt.title("Clusters of Venues", size=25)

In [None]:
import matplotlib.patheffects as path_effects
import matplotlib.pyplot as plt

# Venue map coloured by k-means cluster, without labels. (The former per-cluster loop in
# this cell produced no output: all of its drawing and printing was commented out.)
plt.figure(figsize=(25, 25))
cmap = plt.get_cmap("tab20c_r")
plt.scatter(
    ys[:, 0],
    ys[:, 1],
    cmap=cmap,
    c=km.labels_ / 26,
    s=0.15 * r1_scores ** 2,
    lw=0.1,
    edgecolors="k",
)

In [None]:
import matplotlib.patheffects as path_effects
import matplotlib.pyplot as plt

# Venue map coloured by venue weight, followed by a printout of the (up to) 30
# highest-weighted venues of every cluster.
plt.figure(figsize=(25, 25))
cmap2 = plt.get_cmap("viridis")
# Shift weights to start at 0, then saturate the colour scale at half of the maximum.
cmap_small = clf_small - clf_small.min()
cmap_small = np.minimum(1, 2 * cmap_small / cmap_small.max())
plt.scatter(
    ys[:, 0],
    ys[:, 1],
    cmap=cmap2,
    c=cmap_small,
    s=0.15 * r1_scores ** 2,
    lw=0.1,
    edgecolors="k",
)
k = 30
for cluster in range(km.n_clusters):
    members = np.flatnonzero(km.labels_ == cluster)
    score_idx = sorted(((clf_small[m], m) for m in members), reverse=True)
    for s, idx in score_idx[:k]:
        print(cluster, "\t", r1_confs[idx], s)

In [None]:
# "Fingerprint" grid: for each of the top-ranked universities, a hexbin density of where
# its faculty fall in the 2-D venue map; saved to fingers.pdf.
ranks = pd.read_csv("../ranks.csv")
top_k = 36
# Grid side length; ceil guarantees enough panels when top_k is not a perfect square.
subplot = int(np.ceil(np.sqrt(top_k)))
min_v = ys.min(0)
max_v = ys.max(0)
plt.figure(figsize=(subplot * 4, subplot * 4))

# Never index past the end of the ranking table.
for i in range(min(top_k, len(ranks))):
    Uname = ranks.iloc[i, :].uni
    uni_faculty = faculty_affil[faculty_affil.affiliation == Uname]
    uni_names = list(uni_faculty.name)
    cmu_scores = []
    for name in {aliasdict.get(n, n) for n in uni_names}:
        if name in name_idx:
            # Credit-weighted mean position of the author's venues in the 2-D map.
            loc = mapped_all[name_idx[name], :].dot(ys)
            loc /= max(1, mapped_all_mag[name_idx[name]])
            cmu_scores.append(loc)
    # Always (n_authors, 2): np.squeeze would collapse to 1-D when exactly one author
    # matched, which breaks the column indexing below.
    cmu_scores = np.asarray(cmu_scores, dtype=float).reshape(-1, ys.shape[1])
    plt.subplot(subplot, subplot, i + 1)
    if cmu_scores.shape[0]:
        plt.hexbin(
            cmu_scores[:, 0],
            cmu_scores[:, 1],
            gridsize=13,
            extent=(min_v[0], max_v[0], min_v[1], max_v[1]),
            vmin=0,
            vmax=4,
        )
    plt.title(Uname, color="k", weight="demibold")
    plt.xticks([], [])
    plt.yticks([], [])

plt.tight_layout()
plt.savefig("fingers.pdf")

In [None]:
# Place CMU faculty and the candidate list in the 2-D venue map, coloured by department
# (candidates get the label "cand"), and keep the full-dimensional vectors for the
# nearest-neighbour analysis in the following cells.
good_names = prev_cand  # + curious_names

cmu_uni = pd.read_csv("cmu_faculty.csv")
cmu_uni = cmu_uni.fillna("Other")
# Candidates are appended after the faculty, so they end up as the last rows.
uni_names = list(cmu_uni.name) + good_names
uni_labels = list(cmu_uni.dept) + len(good_names) * ["cand"]
# Each label exactly once, in a fixed order, so colours are stable across runs.
uni_labels_unique = sorted(set(uni_labels) | {"cand"}, key=str)
cmu_scores = []
cmu_full_dim = []
cmu_names = []
uni_colors = []
cand_num = 0
seen_names = set()
for name, d in [(aliasdict.get(n, n), dept) for n, dept in zip(uni_names, uni_labels)]:
    # First occurrence wins; people without any indexed papers are skipped.
    if name in seen_names or name not in name_idx:
        continue
    seen_names.add(name)
    mag = max(1, mapped_all_mag[name_idx[name]])
    # Position in the 2-D venue map ...
    loc = mapped_all[name_idx[name], :].dot(ys)
    loc /= mag
    cmu_scores.append(loc)
    # ... and in the full venue-embedding space.
    loc = mapped_all[name_idx[name], :].dot(vec_mat)
    loc /= mag
    cmu_full_dim.append(loc)
    cmu_names.append(name)
    uni_colors.append(uni_labels_unique.index(d))
    if d == "cand":
        cand_num += 1
# Always (n_people, 2), including when only one person matched.
cmu_scores = np.asarray(cmu_scores, dtype=float).reshape(-1, ys.shape[1])
import matplotlib.colors

nc = len(uni_labels_unique)
cmap = plt.get_cmap("tab10")
plt.figure(figsize=(30, 30))
plt.scatter(
    cmu_scores[:, 0],
    cmu_scores[:, 1],
    s=120,
    lw=1,
    edgecolors="k",
    c=cmap(np.array(uni_colors) / nc - 0.0),
    alpha=0.5,
)
for i in range(cmu_scores.shape[0]):
    # Jitter the label position so overlapping names stay readable.
    xr, yr = 0.7 * np.random.randn(2)
    plt.text(
        cmu_scores[i, 0] + xr,
        yr + cmu_scores[i, 1],
        cmu_names[i],
        size="16",
        color=cmap(uni_colors[i] / nc),
        horizontalalignment="center",
        verticalalignment="center",
        alpha=0.9,
    )

In [None]:
# Split the full-dimensional embeddings into faculty rows and candidate rows.
cmu_full_dim = np.squeeze(np.array(cmu_full_dim))
# Candidates are the last `cand_num` rows. Use an explicit boundary index:
# with cand_num == 0 the slices `[:-cand_num]` / `[-cand_num:]` would make
# `faculty` empty and `cands` the whole array.
n_faculty = cmu_full_dim.shape[0] - cand_num
faculty = cmu_full_dim[:n_faculty, :]
cands = cmu_full_dim[n_faculty:, :]
# All-zero faculty rows are overwritten with random vectors (presumably so a
# cosine distance to them is defined -- TODO confirm). `faculty` is a view, so
# this also changes the corresponding rows of `cmu_full_dim`.
nullvec = np.linalg.norm(faculty, axis=1) == 0
faculty[nullvec] = np.random.randn(nullvec.sum(), faculty.shape[1])

In [None]:
from scipy.spatial.distance import cdist

# Pairwise cosine distance: rows index `faculty`, columns index `cands`.
dist = cdist(faculty, cands, metric="cosine")
# For every candidate column, the row index of the closest faculty member.
min_dist = np.argmin(dist, 0)
top_k = 3
# The block below is disabled; when enabled it prints, per candidate, the
# `top_k` closest faculty names with 100 * cosine distance.
if False:  # if included candidates in cmu_names, plot their 3 nearest neighbors

    for i, cand_name in enumerate(cmu_names[-cand_num:]):
        # Faculty row indices ordered from nearest to farthest for candidate i.
        nns = np.argsort(dist[:, i])
        print("{:20s}".format(cand_name), end="\t")
        for j in range(top_k):
            name_dist = "{} ({:.1f})".format(
                cmu_names[nns[j]][:25], 100 * dist[nns[j], i]
            )
            print("{:30s}".format(name_dist), end=" ")
        print("\n", end="")
        nns.shape

# Age Curve

In [None]:
from collections import Counter, defaultdict


def di():
    """Return a fresh ``defaultdict(float)`` (inner year -> value accumulator)."""
    return defaultdict(float)


# author -> {publication year -> accumulated share of paper value}.
# Each paper contributes its conference/year weight from `clf`, divided by the
# value in column 4 of the row, to every (alias-resolved) author.
author_by_year = defaultdict(di)
for paper in papers.itertuples():
    published, venue, divisor, author_list = paper[10], paper[2], paper[4], paper[3]
    for raw_author in author_list:
        canonical = aliasdict.get(raw_author, raw_author)
        author_by_year[canonical][published] += (
            clf[span_years * conf_idx[venue] + published - 1970] / divisor
        )

In [None]:
# Accumulate value and author counts by "years since first publication".
n_career_slots = 2019 - 1969
val_by_year = np.zeros(n_career_slots)
v_count = np.zeros(n_career_slots)
for years in author_by_year.values():
    first_year = min(years)
    career_span = max(years) - first_year
    if not career_span < 85:
        continue
    for y, v in years.items():
        offset = y - first_year
        val_by_year[offset] += v
        v_count[offset] += 1

In [None]:
# `scipy.ndimage.filters` is a deprecated alias namespace; the function lives
# directly in `scipy.ndimage`.
from scipy.ndimage import gaussian_filter1d

# Smoothed per-calendar-year value curves for a hand-picked set of authors.
plt.figure(figsize=(8, 8))
example_names = [
    "Takeo Kanade",
    "Martial Hebert",
    "Christopher G. Atkeson",
    "Howie Choset",
    "Deva Ramanan",
    "Matthew T. Mason",
    "Jessica K. Hodgins",
    "Abhinav Gupta",
]  # ,'Pieter Abbeel'
n_plot_years = max_year + 1 - min_year
for example_name in example_names:
    # .get() instead of indexing: indexing the defaultdict would insert an
    # empty record for an unknown author and break later min()/max() passes.
    years = author_by_year.get(example_name)
    if not years:
        print("no publication record for {}; skipping".format(example_name))
        continue
    example_value = np.zeros(n_plot_years)
    for y, v in years.items():
        example_value[y - min_year] += v

    # The last two years are dropped before smoothing, as before; the x axis
    # is derived from the data length instead of a fixed 1970..2017 range.
    smoothed = gaussian_filter1d(example_value[:-2], sigma=2)
    plt.plot(
        np.arange(min_year, min_year + smoothed.size),
        smoothed,
        label=example_name,
    )
    # plt.plot(gaussian_filter1d(example_value[:span], sigma=2),label=example_name )

# plt.plot((val_by_year/v_count),label='average author')
plt.ylabel("annual value (3yr avg)")
# plt.xlabel('year since first publication')
plt.xlabel("working year")

plt.legend()
plt.grid()
plt.savefig("cmu_careers.pdf")

In [None]:
# Three line plots over "years since first publication": total value,
# number of contributing authors, and their ratio.
career_panels = [
    (val_by_year, "author value by year", "total annual value generated"),
    (v_count, None, "number of authors"),
    (val_by_year / v_count, "author value by year", "average annual value generated"),
]
for panel_no, (series, panel_title, y_label) in enumerate(career_panels):
    # The first panel draws on the current figure; later ones open a new one.
    if panel_no > 0:
        plt.figure()
    plt.plot(series)
    if panel_title is not None:
        plt.title(panel_title)
    plt.xlabel("years since first publication")
    plt.ylabel(y_label)
    plt.grid(True)

In [None]:
# Average annual value by career year, restricted to authors whose career
# span is at least i years, for i = 0, 5, ..., 25 (one subplot each).
plt.figure(figsize=(15, 10))
for i in range(0, 26, 5):
    plt.subplot(2, 3, i // 5 + 1)
    val_by_year_surv = np.zeros(2019 - 1969)
    v_count_surv = np.zeros(2019 - 1969)
    for auth, years in author_by_year.items():
        start_year = min(years)
        span = max(years) - start_year
        if span >= i:
            # value_vec = np.array(list(years.values()))
            # min_v = value_vec.min()
            # total = (value_vec-min_v).sum()
            for y, v in years.items():
                val_by_year_surv[y - start_year] += v  # (v-min_v)/total
                v_count_surv[y - start_year] += 1
    # Career years with no authors stay NaN (not drawn) without emitting a
    # divide-by-zero RuntimeWarning.
    mean_value = np.divide(
        val_by_year_surv,
        v_count_surv,
        out=np.full_like(val_by_year_surv, np.nan),
        where=v_count_surv > 0,
    )
    plt.plot(mean_value)
    # Raw string: "\g" is an invalid escape sequence in a normal literal.
    plt.title(r"author value by year (career $\geq$ {} yrs)".format(i))
    plt.xlabel("years since first publication")
    plt.ylabel("annual value generated")
    # sorted_vals = sorted(val_by_year_surv)
    # plt.ylim(-250,1950)
    plt.grid(True)

# Adjusted Plus-Minus

In [None]:
# Build the adjusted-plus-minus design matrix and target:
#   Xapm[p, a] = 1 when author a appears on paper p (CoRR papers are skipped)
#   y[p]       = learned conference/year weight of paper p
# NOTE(review): row[0] (the DataFrame index label) is used as the row
# position, which assumes `papers` has a 0..n-1 integer index -- confirm.
n_papers = papers.shape[0]
xdict = {}
y = np.zeros(n_papers)
for row in papers.itertuples():
    paper_year = row[10]
    conf = row[2]
    authors = row[3]
    if conf == "CoRR":
        continue
    # if conf not in r1_confs:
    #    continue
    y[row[0]] = clf[span_years * conf_idx[conf] + paper_year - min_year]
    for a in authors:
        # Dict keys de-duplicate an author listed twice on the same paper.
        xdict[(row[0], name_idx[a])] = 1

# Assemble through the public COO constructor instead of the private
# dok_matrix._update(), which is not available in every SciPy release.
entries = np.array(list(xdict.keys()), dtype=np.int64).reshape(-1, 2)
Xapm = scipy.sparse.coo_matrix(
    (np.ones(entries.shape[0]), (entries[:, 0], entries[:, 1])),
    shape=(n_papers, len(unique_names)),
).todok()

In [None]:
# Histogram of the standardised regression target (50 bins).
_ = plt.hist((y - y.mean()) / y.std(), 50)
# Bare expression in the middle of a cell: evaluated but not displayed.
y.std(), y.mean()
# Second figure: histogram of the raw target values.
plt.figure()
_ = plt.hist(y, 50)

In [None]:
from sklearn.linear_model import SGDRegressor

# Fit one coefficient per author (column of Xapm) against the standardised
# paper value `y`; the coefficients are the "adjusted plus-minus" scores.
# huber is noise tolerant, squared is not, zeros weights conferences equally, otherwise using learned weightrs
X = scipy.sparse.csr_matrix(Xapm)
# good ones
# NOTE(review): "squared_loss" is the older scikit-learn spelling; newer
# releases call this loss "squared_error" -- confirm against the installed
# version before re-running.
clf2 = SGDRegressor(
    "squared_loss",
    alpha=1e-3,
    penalty="l2",
    tol=1e-6,
    max_iter=100,
    average=True,
    verbose=1,
    fit_intercept=True,
)  # ,fit_intercept=False
# clf2 = SGDRegressor('squared_loss',alpha=1e-3,penalty='l2',tol=1e-6,max_iter=1000,average=True,verbose=1)
# high reg?
# clf2 = SGDRegressor('huber',alpha=0,penalty='l2',tol=1e-6,max_iter=100,average=True,verbose=1)

# The target is z-scored before fitting.
clf2.fit(X, (y - y.mean()) / y.std())
# clf2.fit(X,y)

In [None]:
# Print the authors with the largest APM coefficient (scanning the top k),
# keeping only those whose normalised RI score is at least 20.
scores = np.argsort(clf2.coef_)[::-1]
k = 500
rs = ri_scores.std()
us = clf2.coef_.std()
ts = value_scores.std()
apm_row = "{}\t{:35}\t{:.1f}"
for i, idx in enumerate(scores[:k]):
    if ri_scores[idx] / rs < 20.0:
        continue
    print(apm_row.format(i + 1, unique_names[idx][:30], clf2.coef_[idx] / us))
_ = plt.hist(clf2.coef_ / us, 100)
pearl_idx = name_idx["Judea Pearl"]
clf2.coef_[pearl_idx] / us, value_scores[pearl_idx] / ts

In [None]:
# Print the authors with the largest value score (scanning the top k),
# keeping only those whose normalised RI score is at least 20.
# `rs` (std of ri_scores) is reused from the previous cell.
scores = np.argsort(value_scores)[::-1]
ts = value_scores.std()
k = 500
for i in range(k):
    idx = scores[i]
    if ri_scores[idx] / rs < 20.0:
        continue
    print(
        "{}\t{:35}\t{:.1f}".format(
            # Normalise by the std of value_scores (`ts`); dividing by `rs`
            # mixed in the scale of a different score and left `ts` unused.
            i + 1, unique_names[idx][:30], value_scores[idx] / ts
        )
    )

In [None]:
# Per-author paper counts: raw, and weighted by 1 / (column 4 of the row).
n_known_authors = len(unique_names)
count_papers = np.zeros(n_known_authors)
count_norm_papers = np.zeros(n_known_authors)
for paper in papers.itertuples():
    divisor = paper[4]
    for raw_author in paper[3]:
        canonical = aliasdict.get(raw_author, raw_author)
        if canonical not in name_idx:
            continue
        pos = name_idx[canonical]
        count_papers[pos] += 1
        count_norm_papers[pos] += 1 / divisor

In [None]:
# List Carnegie Mellon faculty by normalised APM coefficient, best first.
is_cmu = faculty_affil.affiliation == "Carnegie Mellon University"
uni_faculty = faculty_affil[is_cmu]
uni_names = list(np.array(uni_faculty.name))
# + ['Derek Hoiem','Nicholas Rhinehart','Jacob Walker','Lerrel Pinto','Brian Okorn','Leonid Keselman','Siddharth Ancha','Humphrey Hu']
# Only faculty that have an author index contribute a (score, name) pair.
cmu_scores = [
    (clf2.coef_[name_idx[member]] / us, unique_names[name_idx[member]])
    for member in uni_names
    if member in name_idx
]
for s, name in sorted(cmu_scores, reverse=True):
    print("{:40s}\t\t\t\t{:.1f}".format(name[:38], s))
Xapm.shape, len(unique_confs) * span_years

In [None]:
# Table of APM / total / RI scores and active years for `curious_names`,
# sorted by APM (descending).
fa_list = list(faculty_affil.name)
fa_a_list = list(faculty_affil.affiliation)
# name -> first listed affiliation (same result as fa_list.index(), without a
# linear scan of the faculty list for every name).
affil_by_name = {}
for fac_name, fac_affil in zip(fa_list, fa_a_list):
    affil_by_name.setdefault(fac_name, fac_affil)

z = []
ts = total_scores.std()
for name in curious_names:
    sidx = name_idx[name]
    uni = affil_by_name.get(unique_names[sidx], "unknown")
    # The rank lookup through the leftover `scores` array was removed here:
    # its result was never used and depended on which cell last set `scores`.
    z.append(
        (
            clf2.coef_[sidx] / us,
            unique_names[sidx][:20],
            uni[:20],
            total_scores[sidx] / ts,
            ri_scores[sidx] / rs,
            auth_years[sidx, 0],
            auth_years[sidx, 1],
        )
    )
print(
    "{}\t{:30s}\t{:25s}{}\t{}\t{} {}".format(
        "APM", "name", "uni", "TS", "RI-s", "start", "end"
    )
)
for entry in sorted(z, reverse=True):
    print("{:.2f}\t{:30s}\t{:25s}{:.1f}\t{:.1f}\t{:.0f} {:.0f}".format(*entry))

In [None]:
# Table for `prev_cand`: APM rank, affiliation, scores and active years,
# sorted by rank. Reuses fa_list / fa_a_list / ts / rs / us from earlier cells.
#
# The rank is computed here from clf2.coef_ (1 = largest coefficient).
# Previously it was derived from whatever `scores` array the last-run cell
# left behind, assuming ascending order; on a top-to-bottom run `scores` is a
# descending value_scores ordering, which reversed the ranks.
# NOTE(review): ranking by APM is inferred from the column header -- confirm.
apm_order = np.argsort(clf2.coef_)[::-1]
apm_rank = np.empty(apm_order.shape[0], dtype=np.int64)
apm_rank[apm_order] = np.arange(1, apm_order.shape[0] + 1)

z = []
for name in prev_cand:
    sidx = name_idx[name]
    uni = "unknown"
    if unique_names[sidx] in fa_list:
        uni = fa_a_list[fa_list.index(unique_names[sidx])]
    z.append(
        (
            apm_rank[sidx],
            unique_names[sidx][:20],
            uni[:20],
            clf2.coef_[sidx] / us,
            total_scores[sidx] / ts,
            ri_scores[sidx] / rs,
            auth_years[sidx, 0],
            auth_years[sidx, 1],
        )
    )
print(
    "{}\t{:30s}\t{:25s}{}\t{}\t{}\t{} {}".format(
        "rank", "name", "uni", "APM", "TS", "RI-s", "start", "end"
    )
)
for entry in sorted(z):
    print("{}\t{:30s}\t{:25s}{:.1f}\t{:.1f}\t{:.1f}\t{:.0f} {:.0f}".format(*entry))

In [None]:
from collections import Counter, defaultdict

# Total, count and mean of normalised APM coefficients, bucketed by career
# span (last active year minus first active year from `auth_years`).
# The helper `di` is not redefined here; it already exists from the
# "Age Curve" section and nothing in this cell uses it.
apm_by_year = np.zeros(2019 - 1969)
apm_cnt_by_year = np.zeros(2019 - 1969)
for idx in range(clf2.coef_.shape[0]):
    start_year = auth_years[idx, 0]
    end_year = auth_years[idx, 1]
    span = int(end_year - start_year)
    # Negative spans are skipped.
    if span >= 0:
        apm_by_year[span] += clf2.coef_[idx] / us
        apm_cnt_by_year[span] += 1
plt.plot(apm_by_year)
plt.title("plus minus by working year")  # was misspelled "wokring"
plt.xlabel("years since first publication")
plt.ylabel("total apm")
plt.grid(True)
plt.figure()
plt.plot(apm_cnt_by_year)
plt.xlabel("years since first publication")
plt.ylabel("number of authors")
plt.grid(True)
plt.figure()
# Empty buckets stay NaN (not drawn) without a divide-by-zero warning.
apm_mean_by_year = np.divide(
    apm_by_year,
    apm_cnt_by_year,
    out=np.full_like(apm_by_year, np.nan),
    where=apm_cnt_by_year > 0,
)
plt.plot(apm_mean_by_year)
plt.title("author value by year")
plt.xlabel("years since first publication")
plt.ylabel("average apm")
plt.grid(True)

In [None]:
# Mean normalised total score per career span (last year minus first year).
n_span_slots = 2019 - 1969
apm_by_year = np.zeros(n_span_slots)
apm_cnt_by_year = np.zeros(n_span_slots)
for idx in range(total_scores.shape[0]):
    start_year, end_year = auth_years[idx, 0], auth_years[idx, 1]
    span = int(end_year - start_year)
    if span < 0:
        continue
    apm_by_year[span] += total_scores[idx] / ts
    apm_cnt_by_year[span] += 1
plt.figure()
plt.plot(apm_by_year / apm_cnt_by_year)
plt.title("author value by year")
plt.xlabel("years since first publication")
plt.ylabel("average value")
plt.grid(True)

# using CLF scores

In [None]:
import ftfy
from unidecode import unidecode

# Walk authors from highest to lowest APM coefficient and print up to k who
# pass the filters (3-9 working years, active in 2016 or later, normalised RI
# score >= 1.4), together with their nearest CMU faculty member by cosine
# distance in the `vec_mat` projection.
k = 500
i = 0
scores = np.argsort(clf2.coef_)  # norm_scores (rate), total_scores (total), clf2.coef_
fa_list = list(faculty_affil.name)
fa_a_list = list(faculty_affil.affiliation)
rs = ri_scores.std()
ts = total_scores.std()
header_cells = (
    "name",
    "uni (if prof)",
    "score",
    "CMU nn            (nn dist)",
    "APM",
    "RI s",
    "start",
    "end",
)
print("rank\t{:20s}\t{:20s} {}\t{:27s} {}\t{}\t{} {}".format(*header_cells))

row_template = "{}\t{:20s}\t{:20s} {:.2f}\t{:25s} {:.1f}\t{:.1f}\t{:.0f} {:.0f}"
for sidx in scores[::-1]:
    if (
        years_working[sidx] < 3
        or years_working[sidx] > 9
        or auth_years[sidx, 1] < 2016
        or ri_scores[sidx] / rs < 1.4
    ):
        continue
    author_name = unique_names[sidx]
    uni = fa_a_list[fa_list.index(author_name)] if author_name in fa_list else "unknown"

    embedding = mapped_all[sidx, :].dot(vec_mat)
    embedding /= max(1, mapped_all_mag[sidx])
    # Distances from this author to every faculty row; keep the closest.
    nn_dists = cdist(embedding, faculty, metric="cosine")[0]
    nearest = np.argmin(nn_dists)
    name_dist = "{:20s} ({:.1f})".format(cmu_names[nearest][:20], nn_dists[nearest] * 100)

    print(
        row_template.format(
            i + 1,
            unidecode(ftfy.fix_encoding(author_name[:20])),
            uni[:20],
            total_scores[sidx] / ts,
            name_dist,
            clf2.coef_[sidx] / us,
            ri_scores[sidx] / rs,
            auth_years[sidx, 0],
            auth_years[sidx, 1],
        )
    )
    i += 1

    if i == k:
        break

In [None]:
import ftfy
from unidecode import unidecode

# Show the encoding-repaired, ASCII-transliterated names for a few hard-coded
# author indices.
[
    unidecode(ftfy.fix_encoding(unique_names[_]))
    for _ in [641932, 612947, 127117, 852686, 879945]
]

In [None]:
# Boolean mask of recent authors (first year after 2010, last year after
# 2016, at least 2 years apart, normalised RI score above 1.3), then a scatter
# of total value against APM credit for them.
# Note: this rebinds `cands`, which an earlier cell used for embedding rows.
first_year = auth_years[:, 0]
last_year = auth_years[:, 1]
started_recently = first_year > 2010
still_active = last_year > 2016
long_enough = last_year - first_year >= 2
strong_ri = ri_scores / rs > 1.3
cands = started_recently & still_active & long_enough & strong_ri
print(cands.sum())
plt.scatter(total_scores[cands] / ts, clf2.coef_[cands] / us, s=3)
plt.xlabel("total value")
plt.ylabel("credit assigned")

In [None]:
# Per-year total value vs per-year APM credit for everyone in `cmu_names`,
# with each point labelled by name.
# `np.bool` was a deprecated alias removed from NumPy; the builtin `bool`
# produces the same dtype on every version.
cands = np.zeros(len(unique_names), dtype=bool)
for i in cmu_names:
    cands[name_idx[i]] = True
# Career length in years, floored at 1 to avoid dividing by zero.
len_years = auth_years[:, 1] - auth_years[:, 0]
len_years[len_years < 1] = 1
xv = total_scores / (ts * len_years)
yv = clf2.coef_ / (us * len_years)
# xv[xv == 0] = 0
# yv[yv == 0] = 1
# xv,yv = np.log(xv),np.log(yv)
# xv[np.isnan(xv)] = 0
# yv[np.isnan(yv)] = 0
plt.figure(figsize=(24, 24))
plt.scatter(xv[cands], yv[cands], s=3)
plt.xlabel("total value per year")
plt.ylabel("credit assigned year")
plt.xlim(0, 3)
plt.ylim(0, 3)
for i in cmu_names:
    plt.text(xv[name_idx[i]], yv[name_idx[i]], i, ha="center", va="center")
    # Prints every plotted name containing "Lee" (looks like a debugging aid).
    if "Lee" in i:
        print(i)

# university rankings

In [None]:
# 0/1 mask over the flattened (conference, year) weight vector that keeps
# only the years 2010 and later, then author scores under that mask.
per_conf_mask = [1 if offset + 1970 >= 2010 else 0 for offset in range(span_years)]
recent_fil = len(conf_idx) * per_conf_mask
clf_fil = clf * np.array(recent_fil)
rec_scores = Xauth.dot(ri_filter_mat).dot(clf_fil)
rec_all_scores = Xauth.dot(clf_fil)
unis = faculty_affil.affiliation.unique()

In [None]:
def fd():
    """Return a fresh ``defaultdict(float)`` (author -> score accumulator)."""
    return defaultdict(float)


# university -> {author -> score}, one mapping per score type.
uni_fac_scores = defaultdict(fd)
uni_ts_scores = defaultdict(fd)
uni_rs_scores = defaultdict(fd)
uni_tsall_scores = defaultdict(fd)

for fac_row in faculty_affil.itertuples():
    auth = aliasdict.get(fac_row[1], fac_row[1])
    uni = fac_row[2]
    # Record each alias-resolved author once per university, and only when
    # they have an author index.
    if auth not in uni_fac_scores[uni] and auth in name_idx:
        author_pos = name_idx[auth]
        uni_fac_scores[uni][auth] = clf2.coef_[author_pos]
        uni_ts_scores[uni][auth] = ri_scores[author_pos]
        uni_rs_scores[uni][auth] = rec_scores[author_pos]
        uni_tsall_scores[uni][auth] = rec_all_scores[author_pos]

In [None]:
uni_fac_scores["Carnegie Mellon University"].values()
from scipy.stats import trim_mean, trimboth


def _trimmed_totals(per_uni_scores):
    """Sum each university's author scores after ``trimboth`` with a 0.0 cut."""
    totals = {}
    for uni_name, author_scores in per_uni_scores.items():
        totals[uni_name] = trimboth(list(author_scores.values()), 0.0).sum()
    return totals


uni_pm = _trimmed_totals(uni_fac_scores)
uni_ts = _trimmed_totals(uni_ts_scores)
uni_rs = _trimmed_totals(uni_rs_scores)
uni_tsall = _trimmed_totals(uni_tsall_scores)

In [None]:
def _ranked_desc(totals):
    """Return ``(total, university)`` pairs sorted from largest to smallest."""
    pairs = [(total, uni_name) for uni_name, total in totals.items()]
    pairs.sort(reverse=True)
    return pairs


uni_pm_scores = _ranked_desc(uni_pm)
uni_v_scores = _ranked_desc(uni_ts)
uni_r_scores = _ranked_desc(uni_rs)
uni_vall_scores = _ranked_desc(uni_tsall)

In [None]:
# Universities ordered by summed APM coefficient (clf2.coef_), descending.
uni_pm_scores

In [None]:
# Universities ordered by summed rec_all_scores, descending.
uni_vall_scores

In [None]:
# Universities ordered by summed ri_scores, descending.
uni_v_scores

In [None]:
# Universities ordered by summed rec_scores, descending.
uni_r_scores

In [None]:
# Same listing as shown two cells earlier (summed rec_all_scores, descending).
uni_vall_scores

In [None]:
# Shape check on the recent-years author score vector.
rec_scores.shape

In [None]:
import gc

# Force a garbage-collection pass before the network cells below.
gc.collect()

# Network Analysis (authors)

In [None]:
def pg(M, alpha=0.85, tol=1e-6, max_iter=1, verbose=False):
    """Compute PageRank of a weighted graph by power iteration.

    Parameters
    ----------
    M : square sparse matrix (or anything ``scipy.sparse.csr_matrix`` accepts)
        ``M[i, j]`` is the weight of the edge from node ``i`` to node ``j``.
        Rows are normalised to sum to 1 internally; the input is not modified.
    alpha : float
        Damping factor (probability of following an edge).
    tol : float
        Iteration stops once the L1 change of the rank vector is below
        ``N * tol``.
    max_iter : int
        Maximum number of power-iteration steps.
    verbose : bool
        When true, print the step number and L1 change after every step.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``N`` with the rank of every node.
    """
    M = scipy.sparse.csr_matrix(M)
    N = M.shape[0]

    # Row sums as floats: with an integer matrix the reciprocals below would
    # otherwise be truncated to 0 and every node would look dangling.
    S = np.asarray(M.sum(axis=1), dtype=float).flatten()
    has_out_edges = S != 0
    S[has_out_edges] = 1.0 / S[has_out_edges]
    Q = scipy.sparse.spdiags(S, 0, *M.shape, format="csr")
    # Row-stochastic transition matrix, stored transposed so that one step is
    # a plain matrix-vector product (transition_T @ x == x @ transition).
    transition_T = Q.dot(M).T.tocsr()

    # initial vector
    x = np.repeat(1.0 / N, N)

    # Personalization vector (uniform teleport)
    p = np.repeat(1.0 / N, N)

    # Dangling nodes (no outgoing weight) redistribute their mass via p
    dangling_weights = p
    is_dangling = np.where(S == 0)[0]

    # power iteration: make up to max_iter iterations
    for step in range(max_iter):
        xlast = x
        x = (
            alpha * (transition_T.dot(x) + x[is_dangling].sum() * dangling_weights)
            + (1 - alpha) * p
        )
        # check convergence, l1 norm
        err = np.absolute(x - xlast).sum()
        if verbose:
            print(step, err)
        if err < N * tol:
            return x

    return x

In [None]:
import itertools

# Weighted co-authorship graph: entry (a, b) counts the papers on which
# authors a and b both appear (self-pairs included), restricted to
# conferences that have at least one positive learned weight in `clf`.
g_auth = {}
conf_is_valued = {}  # conference -> bool, computed once per conference
for row in papers.itertuples():
    conf = row[2]
    authors = row[3]
    if conf not in conf_is_valued:
        first = span_years * conf_idx[conf]
        conf_is_valued[conf] = clf[first : first + span_years].max() > 0
    if not conf_is_valued[conf]:
        continue
    # Alias-resolved authors of this paper that have an author index.
    author_ids = []
    for a in authors:
        auth = aliasdict.get(a, a)
        if auth in name_idx:
            author_ids.append(name_idx[auth])
    for pair in itertools.product(author_ids, author_ids):
        g_auth[pair] = 1 + g_auth.get(pair, 0)

# Assemble through the public COO constructor instead of the private
# dok_matrix._update(), which is not available in every SciPy release.
pair_index = np.array(list(g_auth.keys()), dtype=np.int64).reshape(-1, 2)
pair_counts = np.fromiter(g_auth.values(), dtype=float, count=len(g_auth))
gauth_auth = scipy.sparse.coo_matrix(
    (pair_counts, (pair_index[:, 0], pair_index[:, 1])),
    shape=(len(unique_names), len(unique_names)),
).todok()

In [None]:
# Disabled experiment: PageRank of the co-author graph through igraph.
# NOTE(review): `g` is only created on the commented-out line below, so this
# block would raise NameError if it were enabled as written.
if False:
    import igraph as ig

    sources, targets = gauth_auth.nonzero()
    weights = gauth_auth[sources, targets]
    weights = np.array(
        weights
    )  # Need to convert Scipy's matrix format into a form appropriate for igraph
    # g = ig.Graph(zip(sources, targets), directed=True, edge_attrs={'weight': weights})
    weights.shape
    pr2 = ig.pagerank(g, niter=1)

In [None]:
# Convert the co-author graph to CSR and run up to 100 power-iteration steps.
gauth_auth = scipy.sparse.csr_matrix(gauth_auth)
pr = pg(gauth_auth, max_iter=100, verbose=True, tol=1e-12)
# Number of nodes (authors) in the graph.
print(gauth_auth.shape[0])

In [None]:
# Print the top_k authors by PageRank among those whose normalised RI score
# is at least 20.
pr_s = np.argsort(pr)[::-1]
top_k = 100
rs = ri_scores.std()
i = 0
# Iterate the ranking directly. The previous loop advanced its index before
# the first lookup, so the highest-ranked author (pr_s[0]) was never
# considered, and it indexed past the end of pr_s when fewer than top_k
# authors passed the filter.
for idx in pr_s:
    if i >= top_k:
        break
    if ri_scores[idx] / rs < 20.0:
        continue
    print(unique_names[idx], pr[idx], ri_scores[idx] / rs)
    i += 1

In [None]:
# pickle.dump(pr,open('pagerank_people.pkl','wb'))

# Network Analysis (confs)

In [None]:
# author -> set of conference indices the (alias-resolved) author has
# published in.
auth_confs = defaultdict(set)
for paper in papers.itertuples():
    venue = paper[2]
    # if clf[conf_idx[conf]] > 0:
    for raw_author in paper[3]:
        canonical = aliasdict.get(raw_author, raw_author)
        auth_confs[canonical].add(conf_idx[venue])

In [None]:
# Replace each author's set of conference indices with a list (and the
# defaultdict with a plain dict).
auth_confs = {k: list(v) for k, v in auth_confs.items()}

In [None]:
import itertools

# author -> lazy iterator over unordered conference pairs, including (c, c).
# These iterators are single-use: once consumed they yield nothing.
auth_confs_iter = {}
for author, venues in auth_confs.items():
    auth_confs_iter[author] = itertools.combinations_with_replacement(venues, 2)

In [None]:
import itertools

# Symmetric conference co-occurrence graph: entry (i, j) counts the authors
# who have published in both conference i and conference j (the diagonal
# counts authors per conference).
#
# The pairs are generated from `auth_confs` here instead of draining the
# single-use iterators in `auth_confs_iter`, so re-running this cell gives
# the same graph rather than an empty one.
dconf = {}
for venues in auth_confs.values():
    for i, j in itertools.combinations_with_replacement(venues, 2):
        tmp = 1 + dconf.get((i, j), 0)
        dconf[(i, j)] = tmp
        if i != j:
            # Mirror the count so the matrix stays symmetric.
            dconf[(j, i)] = tmp

# Assemble through the public COO constructor instead of the private
# dok_matrix._update(), which is not available in every SciPy release.
conf_pairs = np.array(list(dconf.keys()), dtype=np.int64).reshape(-1, 2)
conf_pair_counts = np.fromiter(dconf.values(), dtype=float, count=len(dconf))
gconf_conf = scipy.sparse.coo_matrix(
    (conf_pair_counts, (conf_pairs[:, 0], conf_pairs[:, 1])),
    shape=(len(conf_idx), len(conf_idx)),
).todok()
# n = len(v)
# for i in range(n):
#    new_row = scipy.sparse.dok_matrix((1,len(conf_idx)))
#    for j in range(i,n):
#        new_row[0,v[j]] = 1
#    new_row = scipy.sparse.csr_matrix(new_row)
#    gconf_conf[v[i]] += new_row
#        i1 = v[i]
#        i2 = v[j]
#        gconf_conf[i1,i2] += 1
# gconf_conf[i2,i1] += 1

In [None]:
# gconf_conf.setdiag(gconf_conf.diagonal()/2)
# gconf_conf =  gconf_conf + gconf_conf.T - scipy.sparse.diags(gconf_conf.diagonal(),format='dok')

In [None]:
# .diagonal() and .setdiag()
# Convert the conference graph to CSR and run up to 100 power-iteration steps.
gconf_conf = scipy.sparse.csr_matrix(gconf_conf)
prc = pg(gconf_conf, max_iter=100, verbose=True, tol=1e-12)

In [None]:
# Print the top_k conferences by PageRank, highest first.
prc_s = np.argsort(prc)[::-1]
top_k = 100
# Slicing caps the listing at the number of conferences available; the
# previous while-loop raised IndexError when there were fewer than top_k.
for idx in prc_s[:top_k]:
    print(unique_confs[idx], prc[idx])

In [None]:
# pickle.dump(prc,open('pagerank_conf.pkl','wb'))

In [None]:
import matplotlib.pyplot as plt

# Histogram (100 bins) of the log of the conference PageRank values.
_ = plt.hist(np.log(prc), 100)

In [None]:
# Z-score the log PageRank of every conference, then list the conferences in
# `conf_choice` from highest to lowest z-score.
log_prc = np.log(prc)
prcs = (log_prc - log_prc.mean()) / log_prc.std()
# + ['STOC','FOCS','SODA']:
scores = [(prcs[conf_idx[chosen]], chosen) for chosen in conf_choice]
for z_score, conf_name in sorted(scores, reverse=True):
    print("{:30}\t{:.1f}".format(conf_name[:25], z_score))
_ = plt.hist(prcs, 100)