In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime

# Helper functions for analytics

In [None]:
def pubs_clean(pubs, start_year=1900, has_journal=True, has_title=True,
               journal_blacklist=["arxiv", "chemrxiv", "biorxiv", "bulletin"],
               no_cites_grace=3, highly_cited_grace=20, clean_citation_record=True
              ):
    """ Cleans up a citation record to remove preprints, ancient publications,
        or miscellaneous probable parsing errors.

        Parameters:
          pubs: iterable of publication records (dicts). Each record must have
              a "bib" dict holding "year"; "cites_per_year" is read for every
              record that reaches the citation checks.
          start_year: records whose bib year is smaller than this are dropped.
          has_journal: if true, records without a "journal" entry are dropped.
          has_title: if true, records without a "title" entry are dropped.
          journal_blacklist: substrings compared against the lower-cased
              journal name; a record matching any of them is dropped.
              The list is only read, never modified.
          no_cites_grace: if >= 0, records at least this many years old whose
              "cites_per_year" is empty are dropped.
          highly_cited_grace: records whose "citedby" is at least this value
              are kept right after the start-year check, bypassing every
              later filter and the citation clean-up.
          clean_citation_record: if true, citation years more than one year
              before the publication year are removed. This modifies the
              "cites_per_year" mapping of the input record in place.

        Returns the list of records that were kept (the same objects as in
        the input, not copies).
    """
    clean = []
    now_year = datetime.now().year
    for v in pubs:
        bib = v["bib"]
        if bib["year"] < start_year:
            continue
        # make sure we don't drop a decently cited article only because
        # of some formatting quirks
        if "citedby" in v and v["citedby"] >= highly_cited_grace:
            clean.append(v)
            continue
        if has_journal and "journal" not in bib:
            continue
        if has_title and "title" not in bib:
            continue
        # drops if the journal name is blacklisted
        # (e.g. preprints, which I love but are sadly usually not counted).
        # The test must skip the *record*: a `continue` placed inside a loop
        # over the blacklist would only move on to the next blacklist entry.
        if "journal" in bib:
            journal = bib["journal"].lower()
            if any(j in journal for j in journal_blacklist):
                continue
        # old articles that collected no citations are either useless or
        # crap picked up by the searchbot
        if (no_cites_grace >= 0 and bib["year"] + no_cites_grace <= now_year and
                len(v["cites_per_year"]) == 0):
            continue
        if clean_citation_record:
            # removes citations that allegedly appeared before the paper was published,
            # allowing for a 1-year margin to account for preprints.
            # Iterate over a snapshot of the keys because entries are removed.
            for y in list(v["cites_per_year"].keys()):
                if int(y) < bib["year"] - 1:
                    v["cites_per_year"].pop(y)
        clean.append(v)
    return clean

In [None]:
# Performance indicators per year
def cites_per_year(pubs):
    """ Sums, over all publications, the citations received in each year.

        Returns a dict mapping the year (as int) to the citation total;
        the string/number year keys of "cites_per_year" are converted with int().
    """
    totals = {}
    for record in pubs:
        for raw_year, count in record["cites_per_year"].items():
            year = int(raw_year)
            try:
                totals[year] += count
            except KeyError:
                # first time this year shows up
                totals[year] = count
    return totals

def papers_per_year(pubs):
    """ Tallies how many publications carry each "bib" year.

        Returns a dict mapping the year to the number of papers.
    """
    tally = {}
    for record in pubs:
        year = record["bib"]["year"]
        tally[year] = tally.get(year, 0) + 1
    return tally

In [None]:
# Performance indicators per paper
def papers_cites(pubs):
    """ Returns the "citedby" count of every paper as an array sorted in
        decreasing order; papers without a "citedby" entry count as 0. """
    counts = np.asarray([record.get("citedby", 0) for record in pubs])
    # sorting the reversed view in place leaves `counts` in descending order
    reversed_view = counts[::-1]
    reversed_view.sort()
    return counts

def papers_cites_years_table(pubs):
    """ Builds a (paper x year) table of citation counts.

        Returns a tuple (years, table, papers):
          years:  sorted integer array of every year appearing in any
                  "cites_per_year" record;
          table:  float array with one row per paper and one column per year,
                  rows ordered by decreasing total citations;
          papers: the publication records in the same row order.
    """
    per_paper = []
    seen_years = set()
    for paper in pubs:
        tally = {}
        for raw_year, count in paper["cites_per_year"].items():
            year = int(raw_year)
            if year not in tally:
                tally[year] = count
            else:
                tally[year] += count
        seen_years.update(tally)
        per_paper.append(tally)

    years = sorted(seen_years)
    column_of = {year: col for col, year in enumerate(years)}

    table = np.zeros((len(pubs), len(years)))
    for row, tally in enumerate(per_paper):
        for year, count in tally.items():
            table[row, column_of[year]] = count

    # argsort is ascending, so flip it to rank the most cited paper first
    ranking = np.argsort(table.sum(axis=1))[::-1]
    return (np.asarray(years, int), table[ranking].copy(), [pubs[i] for i in ranking])
def h_index(pubs):
    """ Computes the h-index of a publication list: the largest h such that
        h papers have at least h citations each.

        Returns 0 for an empty list.
    """
    pc = papers_cites(pubs)  # citation counts, sorted in decreasing order
    h = 0
    # the bound check stops the scan when every paper has more citations than
    # its rank (or the list is empty), instead of indexing past the end
    while h < len(pc) and pc[h] > h:
        h += 1
    return h

# Play around with the data

In [None]:
# Load the raw publication records; the context manager makes sure the file
# handle is closed once the JSON has been parsed.
with open("citations.json", "r") as citations_file:
    pubs = json.load(citations_file)

In [None]:
# Keep only the records published from 1998 onwards (plus the default filters).
pubs = pubs_clean(pubs, start_year=1998)

In [None]:
# Total citations received per calendar year, as a {year: count} dict.
citesy = cites_per_year(pubs)

In [None]:
# Number of papers published per year, as a {year: count} dict.
papersy = papers_per_year(pubs)

In [None]:
# Bar chart of the citations collected in each year.
plt.bar(x=[*citesy], height=[*citesy.values()])
plt.xlabel("year")
plt.ylabel("n. of citations")

In [None]:
# Bar chart of the papers published in each year.
plt.bar(x=[*papersy], height=[*papersy.values()])
plt.xlabel("year")
plt.ylabel("n. of papers")

In [None]:
# Papers per year (blue, left axis) next to citations per year (red, right
# axis). The two bar series share each year tick: papers extend to the right
# of it, citations to the left (negative width with align="edge").
fig, ax1 = plt.subplots()
ax1.bar(x=[*papersy], height=[*papersy.values()], width=0.4, align="edge", color='b')
ax1.set_xlabel('year')
# Axis label and tick labels are coloured like the bars they refer to.
ax1.set_ylabel('n. of papers', color='b')
ax1.tick_params(axis='y', colors='b')

ax2 = ax1.twinx()
ax2.bar(x=[*citesy], height=[*citesy.values()], width=-0.4, align="edge", color='r')
ax2.set_ylabel('n. of citations', color='r')
ax2.tick_params(axis='y', colors='r')

fig1 = fig
fig.savefig("papercites.png", dpi=300)

In [None]:
# Per-paper citation counts, sorted in decreasing order.
papercites = papers_cites(pubs)

In [None]:
# Per-paper / per-year citation table (rows ranked by total citations),
# its running total along the year axis, and the publication year of each row.
years, ypcites, psorted = papers_cites_years_table(pubs)
ypcum = np.cumsum(ypcites, axis=1)
ysorted = np.array([p["bib"]["year"] for p in psorted])
# distinct publication years, in increasing order
ypubs = sorted(set(ysorted))

In [None]:
# Citation history of every paper: one bar layer per year, showing the
# cumulative citations up to that year. Layers are drawn from the latest year
# to the earliest so that the shorter (earlier) bars end up on top.
fig, ax1 = plt.subplots()
n_years = len(years)
for b in range(n_years-1,-1,-1):
    ax1.bar(range(len(pubs)),ypcum[:,b],width=1,label=str(years[b]), color=(b/n_years,0,1-b/n_years))
# Use a log-scaled axis instead of plotting np.log10 of the data: log10 turns
# zero counts into -inf (with a RuntimeWarning), makes single-citation papers
# invisible, and would not match the "number of citations" axis label.
ax1.set_yscale('log')
ax1.legend(ncol=2)
ax1.set_xlabel("paper index")
ax1.set_ylabel("number of citations")
ax1.set_xlim(-1,105)
fig2 = fig
# NOTE(review): a later cell also saves to "paperhistory.png", so this file is
# overwritten when the whole notebook is run.
fig.savefig("paperhistory.png",dpi=300)

In [None]:
# Total citations of every paper, coloured by publication year: each pass
# draws only the papers published in year `y` and zeroes all the others.
fig, ax1 = plt.subplots()
for y in ypubs:
    totb = np.where(ysorted == y, ypcum[:,-1], 0)
    # Colour fraction along the blue -> red gradient. The span of publication
    # years can exceed the number of citation years, which would push the
    # fraction above 1 and make matplotlib reject the RGB tuple, so clamp it.
    yf = min(1.0, (y-ypubs[0])/len(years))
    ax1.bar(range(len(pubs)),totb,width=1,label=str(y), color=(yf,0,1-yf))
ax1.legend(ncol=2)
ax1.set_xlabel("paper index")
ax1.set_ylabel("number of citations")
ax1.set_xlim(-1,105)
ax1.set_yscale('log')
fig2 = fig
fig.savefig("paperhistory.png",dpi=300)

In [None]:
# Two-panel summary figure.
combo, axs = plt.subplots(nrows=1, ncols=2)

# Left panel: papers per year (blue) and citations per year (red, twin axis).
ax1 = axs[0]
ax1.bar(x=[*papersy], height=[*papersy.values()], width=0.4, align="edge", color='b')
ax1.set_xlabel('year')
# Axis label and tick labels are coloured like the bars they refer to.
ax1.set_ylabel('n. of papers', color='b')
ax1.tick_params(axis='y', colors='b')

ax2 = ax1.twinx()
ax2.bar(x=[*citesy], height=[*citesy.values()], width=-0.4, align="edge", color='r')
ax2.set_ylabel('n. of citations', color='r')
ax2.tick_params(axis='y', colors='r')

# Right panel: cumulative citations of every paper, one bar layer per year,
# drawn from the latest year backwards so earlier totals stay visible.
ax1 = axs[1]
for b in reversed(range(len(years))):
    ax1.bar(range(len(pubs)), ypcum[:,b], width=1, label=str(years[b]), color=(b/len(years),0,1-b/len(years)))
ax1.legend(ncol=2)
ax1.set_xlabel("paper index")
ax1.set_ylabel("number of citations")
ax1.set_xlim(-1,105)

In [None]:
# Resize the two-panel figure, re-flow its layout and write it to disk.
combo.set_size_inches(9,3)
combo.tight_layout()
combo.savefig("citationdata.png",dpi=300)
# bare expression: shows the figure as the cell output
combo

# Automatic generation of standard citation analysis plot

In [None]:
def mk_citation_plot(pubs, lrange=(), rrange=()):
    """ Builds the standard two-panel citation analysis figure.

        Left panel: papers per year (blue bars, left axis) and citations per
        year (red bars, right axis). Right panel: total citations of each
        paper (ranked by decreasing total, log scale), coloured by publication
        year, with a dashed vertical line at the h-index.

        Parameters:
          pubs:   list of publication records.
          lrange: optional (min, max) x-limits for the left panel.
          rrange: optional (min, max) x-limits for the right panel.

        Returns the matplotlib figure.
    """
    citesy = cites_per_year(pubs)
    papersy = papers_per_year(pubs)
    years, ypcites, psorted = papers_cites_years_table(pubs)
    ypcum = ypcites.cumsum(axis=1)
    ysorted = np.asarray([p["bib"]["year"] for p in psorted])
    ypubs = list(set(ysorted)); ypubs.sort()

    combo, axs= plt.subplots(ncols=2, nrows=1)
    ax1 = axs[0]
    ax1.bar(x=list(papersy.keys()), height=list(papersy.values()),align="edge",width=-0.4,color='b')
    ax1.set_xlabel('year')
    # Make the y-axis label, ticks and tick labels match the line color.
    ax1.set_ylabel('n. of papers', color='b')
    ax1.tick_params('y', colors='b')
    if len(lrange)>0:
        ax1.set_xlim(*lrange)

    ax2 = ax1.twinx()
    ax2.bar(x=list(citesy.keys()), height=list(citesy.values()),align="edge",width=0.4,color='r')
    ax2.set_ylabel('n. of citations', color='r')
    ax2.tick_params('y', colors='r')

    ax1 = axs[1]
    for y in ypubs:
        # keep only the papers published in year y, zero out the rest
        totb = np.where(ysorted == y, ypcum[:,-1], 0)
        # Colour fraction along the blue -> red gradient. The span of
        # publication years can exceed the number of citation years, which
        # would push the fraction above 1 and make matplotlib reject the RGB
        # tuple, so clamp it.
        yf = min(1.0, (y-ypubs[0])/len(years))
        ax1.bar(range(len(pubs)),totb,width=1,label=str(y), color=(yf,0,1-yf))
    ax1.legend(ncol=2)
    ax1.set_xlabel("paper index")
    ax1.set_ylabel("number of citations")
    if len(rrange)>0:
        ax1.set_xlim(*rrange)
    ax1.axvline(h_index(pubs),ls='--',c='k')
    ax1.set_yscale('log')

    combo.set_size_inches(9,3)
    combo.tight_layout()

    return combo

In [None]:
# Reload and clean the records with the default filters; the context manager
# closes the file handle once the JSON has been parsed.
with open("citations.json", "r") as citations_file:
    pubs = pubs_clean(json.load(citations_file))
plots = mk_citation_plot(pubs, lrange=(2005,2019.5))

In [None]:
def mk_comparison_plot(pubs1, pubs2, ygrad=(0,0), names=("",""), lrange=(), rrange=()):
    """ Builds a two-panel figure comparing two publication lists.

        Left panel: papers per year (blue, left axis) and citations per year
        (red, right axis). The first list is drawn as bars, the second as
        markers joined by a black line. Years are shifted by ygrad[0] and
        ygrad[1] respectively, so the x axis reads "years since graduation"
        when either shift is non-zero.
        Right panel: total citations per paper (ranked by decreasing total,
        log scale) for both lists, with a dashed vertical line next to each
        h-index (offset by +/-0.25 so that equal values do not overlap).

        Parameters:
          pubs1, pubs2: lists of publication records.
          ygrad:  pair of year offsets subtracted from the years of each list.
          names:  pair of legend labels for the two lists.
          lrange: optional (min, max) x-limits for the left panel.
          rrange: optional (min, max) x-limits for the right panel.

        Returns the matplotlib figure.
    """
    citesy1 = cites_per_year(pubs1)
    papersy1 = papers_per_year(pubs1)
    _, ypcites1, _ = papers_cites_years_table(pubs1)
    ypcum1 = ypcites1.cumsum(axis=1)

    citesy2 = cites_per_year(pubs2)
    papersy2 = papers_per_year(pubs2)
    _, ypcites2, _ = papers_cites_years_table(pubs2)
    ypcum2 = ypcites2.cumsum(axis=1)

    combo, axs= plt.subplots(ncols=2, nrows=1)
    ax1 = axs[0]
    ax1.bar(x=(np.asarray(list(papersy1.keys()))-ygrad[0]), height=list(papersy1.values()),align="edge",width=-0.4,color='b')
    if ygrad[0]==0 and ygrad[1]==0:
        ax1.set_xlabel('year')
    else:
        ax1.set_xlabel('years since graduation')
    # Make the y-axis label, ticks and tick labels match the line color.
    ax1.set_ylabel('n. of papers', color='b')
    ax1.tick_params('y', colors='b')
    if len(lrange)>0:
        ax1.set_xlim(*lrange)

    ax2 = ax1.twinx()
    ax2.bar(x=(np.asarray(list(citesy1.keys()))-ygrad[0]), height=list(citesy1.values()),align="edge",width=0.4,color='r')
    ax2.set_ylabel('n. of citations', color='r')
    ax2.tick_params('y', colors='r')

    # Second list: the dicts are not ordered by year, so sort the points
    # before joining them with a line.
    x = np.asarray(list(papersy2.keys()))-ygrad[1]
    order = np.argsort(x)
    papers2 = np.asarray(list(papersy2.values()))[order]
    ax1.plot(x[order], papers2, 'bo')
    ax1.plot(x[order], papers2, 'k-')

    x = np.asarray(list(citesy2.keys()))-ygrad[1]
    order = np.argsort(x)
    cites2 = np.asarray(list(citesy2.values()))[order]
    ax2.plot(x[order], cites2, 'ro')
    ax2.plot(x[order], cites2, 'k-')

    ax1 = axs[1]
    # the last cumulative column is the total citation count of each paper
    ax1.plot(range(len(pubs1)), ypcum1[:,-1], 'r-', label=names[0])
    ax1.plot(range(len(pubs2)), ypcum2[:,-1], 'k-', label=names[1])
    ax1.legend(ncol=2)
    ax1.set_xlabel("paper index")
    ax1.set_ylabel("cit./paper")
    if len(rrange)>0:
        ax1.set_xlim(*rrange)

    ax1.axvline(h_index(pubs1)+0.25,ls='--',c='r')
    ax1.axvline(h_index(pubs2)-0.25,ls='--',c='k')
    ax1.set_yscale('log')

    combo.set_size_inches(9,3)
    combo.tight_layout()

    return combo

In [None]:
# Reload and clean the records with the default filters; the context manager
# closes the file handle once the JSON has been parsed.
with open("citations.json", "r") as citations_file:
    pubs = pubs_clean(json.load(citations_file))
# Demo comparison: the full list against every second record of the same list.
plots = mk_comparison_plot(pubs, pubs[::2], ygrad=(2002,2007), lrange=(-7,18), rrange=(0,100), names=("john", "doe"))

In [None]:
# Summary statistics of the cleaned record: h-index, mean and median
# citations per paper.
print("h-index: ", h_index(pubs))
sorted_cites = papers_cites(pubs)
print("<cites>: ", np.mean(sorted_cites))
print("median:  ", np.median(sorted_cites))