In [1]:
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from seaborn import color_palette

import pandas as pd 
import numpy as np
from dateutil.parser import parse

import bokeh
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure, output_file, show, curdoc
from bokeh.models import Circle,BoxSelectTool, BoxZoomTool, LassoSelectTool,Text,HoverTool,ColumnDataSource, Range1d, Axis

In [2]:
# Hard-coded yearly series: 66 values each, indexed by year start ('AS') from
# 1950-01-01 through 2015-01-01. They are used further down as denominators
# when counts are normalised.
# NOTE(review): presumably overall publication / reference counts per year;
# the provenance of these numbers is not recorded in the notebook.
totals_pubs = pd.Series(index=pd.date_range('1950-01-01','2015-12-31',freq='AS',),data=[97529,102475,106673,108515,113949,100346,115704,122845,133961,137311,152292,179267,188095,196106,221271,288753,327167,349344,364851,399384,405709,420089,448565,459029,495345,566369,606671,683684,703905,728908,749425,775628,810977,875161,893747,893536,909200,920189,901468,856502,880419,901808,922010,963477,1015818,1080336,1130853,1159929,1161957,1187808,1205276,1189963,1234480,1270516,1356955,1434957,1495934,1567550,1655220,1716898,1750645,1836125,1913473,2005909,2027312,1809485,])
totals_refs = pd.Series(index=pd.date_range('1950-01-01','2015-12-31',freq='AS',),data=[862276,913941,991301,1054144,1082894,993625,1132516,1195067,1291857,1390632,1496650,1927193,2030321,2167335,2530307,3071105,3556047,3822576,4197118,4439659,4483020,4909395,5406217,5614448,5994449,6801509,7391424,7858516,8097995,8733158,9155748,9680861,10507381,11104584,11725145,12482330,12384351,12773481,13165658,13524406,14225948,15057108,15906821,16710320,17930450,19381008,21289915,22082648,22865217,23589060,24585765,25072234,26210739,27773579,29654550,31816766,34430852,36991583,40692695,43594503,46666951,50946087,55088628,59408651,62668029,59436898,])

In [3]:
# Load the pre-computed input frames from pickle files.
# NOTE(review): the paths are absolute and user-specific, so this cell only
# runs on the original author's machine; a configurable data directory would
# make the notebook portable.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
# The references frame is currently disabled, so `df_refs` does not exist.
# df_refs = pd.read_pickle('/Users/jaredlorince/Desktop/d_pop_refs.pkl')
df_pubs = pd.read_pickle('/Users/jaredlorince/Desktop/d_pop.pkl')
df_keywords = pd.read_pickle('/Users/jaredlorince/Desktop/d_pop_keywords_lem.pkl')

In [4]:
# Sum `freq` per keyword over all rows of the keyword frame.
vc = df_keywords.groupby('keyword')['freq'].sum()

# Keep only the keywords whose summed frequency reaches 100.
frequent_mask = vc>=100
vc_df = vc[frequent_mask].reset_index()

# The inner join drops rows of infrequent keywords; the joined total lands in
# a second column that receives the '_' suffix.
frequent_totals = vc_df.set_index('keyword')
df_keywords = df_keywords.join(
    frequent_totals,
    on='keyword',
    how='inner',
    rsuffix='_',
)

In [5]:
# Reshape the long-format frames into wide date x column tables.
# `.loc` replaces the deprecated `.ix` indexer; on a DatetimeIndex the string
# slices select the same label range.
# NOTE(review): this cell overwrites its own inputs, so it cannot be re-run
# without first re-running the loading cells.
# NOTE(review): no `aggfunc` is passed, so pivot_table uses its default (mean)
# when several rows share a date/column pair - confirm that is intended,
# especially for `uid`.

# Publications: one column per category, missing cells as 0, summed per
# calendar year, everything up to and including 2015.
df_pubs = (
    df_pubs.reset_index()
    .pivot_table(index='date',columns='category',values='uid')
    .fillna(0)
    .resample('AS').sum()
    .loc[:'2015']
)
#df_refs = df_refs.reset_index().pivot_table(index='date',columns='category',values='cnt').fillna(0).ix['1991':'2015']

# Keywords: one column per keyword, missing cells as 0, limited to 1991-2015.
df_keywords = (
    df_keywords
    .pivot_table(index='date',columns='keyword',values='freq')
    .fillna(0)
    .loc['1991':'2015']
)

In [6]:
#totals = df.sum(1)
# Running (cumulative) sums of the hard-coded yearly totals, accumulated from
# the first entry of each series (1950). They serve as denominators for the
# cumulative, normalised view in `update`.
cum_totals_pubs = totals_pubs.cumsum()#.ix['1991':'2015']
cum_totals_refs = totals_refs.cumsum()#.ix['1991':'2015']

In [7]:
# Second name bound to the very same Series object as `totals_pubs` (no copy).
# NOTE(review): `total_pubs` is not referenced in any other cell shown here.
total_pubs = totals_pubs#.ix['1991':'2015']

In [8]:
# Configure Bokeh to render figures inline in the notebook output area.
output_notebook()



def update(rawcount=True,cumulative=False,discipline='genomics',data_type='pubs'): #resample='yearly'
    """Draw a Bokeh line chart of yearly counts for the requested columns.

    Parameters
    ----------
    rawcount : bool
        True plots the counts themselves; False divides every value by the
        matching year of ``totals_pubs`` (``cum_totals_pubs`` when
        ``cumulative`` is set) and formats the hover label as a percentage.
    cumulative : bool
        True plots running sums instead of per-year values.
    discipline : str
        Column names of the selected frame separated by ``|``, or ``'all'``
        for every column of that frame.
    data_type : str
        Selects the module-level frame ``df_<data_type>``. ``'both'`` draws
        ``'pubs'`` followed by ``'refs'`` on the same figure.

    Raises
    ------
    KeyError
        If ``df_<data_type>`` does not exist at module level, or a requested
        column is not present in it.

    Notes
    -----
    Normalisation always uses the publication totals, whatever ``data_type``
    is. NOTE(review): confirm that this is intended for 'refs'.
    """
    # Inclusive plotting window (year labels on the frames' DatetimeIndex).
    window_start, window_end = '1991', '2015'

    p = figure(width=800,height=350,x_axis_type='datetime',tools=['pan,box_zoom,reset'])
    # Show plain numbers on the y axis instead of scientific notation.
    p.select(dict(type=Axis, layout="left"))[0].formatter.use_scientific = False

    if rawcount:
        def make_label(x):
            return "{:,.0f}".format(x)
    else:
        def make_label(x):
            return "{:.2f}%".format(100*x)

    def do_plot(kind):
        """Add one line and one hover tool per requested column of ``df_<kind>``."""
        df = globals()['df_'+kind]
        # 'all' has to be resolved against the frame actually being plotted.
        if discipline == 'all':
            names = list(df.columns)
        else:
            names = discipline.strip().split('|')
        rgb = np.round(255*np.array(color_palette("coolwarm",n_colors=len(names)))).astype(int)
        colors = ['#%02x%02x%02x' % (a, b, c) for a,b,c in rgb]
        denominators = cum_totals_pubs if cumulative else totals_pubs

        for color,disc in zip(colors,names):
            series = df[disc]
            if cumulative:
                # Accumulate over the full history before windowing.
                series = series.cumsum()
            if not rawcount:
                # Index-aligned division; years without a total become NaN.
                series = series/denominators
            # Restrict to the window and derive x/date from the result so
            # every column of the data source has the same length.
            values = series.loc[window_start:window_end]
            source = ColumnDataSource({
                'x': values.index,
                'value': values,
                'date': [stamp.strftime('%Y-%m-%d') for stamp in values.index],
                'percent': values.apply(make_label),
            })
            line = p.line('x','value',source=source,color=color,line_width=2)
            p.add_tools(HoverTool(renderers=[line],tooltips=[('Discipline', disc),('date','@date'),('{} this period'.format(kind),'@percent')]))

    kinds = ('pubs','refs') if data_type == 'both' else (data_type,)
    for kind in kinds:
        do_plot(kind)

    if data_type == 'keywords':
        # Pin the x axis to the keyword coverage period.
        p.x_range = Range1d(parse('1991-01-01'), parse('2016-01-01'))
    show(p)
    




In [9]:
from ipywidgets import Dropdown,Text,Checkbox,interact

# Widgets driving `update`. Every keyword passed to `interact` must match a
# parameter name of `update`. The text box used to be passed as `disc=`, a name
# `update` does not have, so it was apparently never connected and the
# function default ('genomics') was used instead. The initial value is set to
# that previously effective default so the first render with the 'keywords'
# data set is unchanged.
disc_select = Text(description='Discipline',value='genomics')
rawcount_select = Checkbox(description = 'rawcount',value=True)
cumulative_select = Checkbox(description='cumulativ',value=False)
#resample_select = Dropdown(description="Resample",options=["Daily","Weekly","Monthly","Yearly"],value="Yearly")
data_select = Dropdown(description="Data",options=["pubs","refs","keywords","both"],value="keywords")

# Trailing semicolon suppresses the repr of interact's return value.
interact(update,rawcount=rawcount_select,cumulative=cumulative_select,
         discipline=disc_select,data_type=data_select); # resample=resample_select,


In [10]:
# Preview the first rows of the pivoted keyword frame (dates x keywords).
df_keywords.head()

keyword,Unnamed: 1_level_0,0,0 05 percent,0 1 matrix,0 1 percent,0 18 mu m cmos,0 2 percent,0 5 m h2so4,0 5 percent,0 5 percent bupivacaine,...,zymodemes,zymogen,zymogen activation,zymogen granule,zymogram,zymography,zymomonas mobilis,zymosan,zyxin,zz ceti star
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1991-01-01,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,7.0,...,11.0,13.0,3.0,7.0,1.0,2.0,33.0,16.0,0.0,3.0
1992-01-01,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,4.0,6.0,...,14.0,13.0,3.0,11.0,3.0,4.0,34.0,21.0,0.0,3.0
1993-01-01,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,7.0,10.0,...,11.0,13.0,3.0,10.0,5.0,3.0,42.0,22.0,1.0,0.0
1994-01-01,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,7.0,...,7.0,5.0,2.0,11.0,7.0,6.0,48.0,24.0,4.0,3.0
1995-01-01,20.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,7.0,5.0,...,6.0,11.0,8.0,11.0,6.0,8.0,57.0,26.0,1.0,4.0


In [11]:
# Build the keyword x year matrices used for clustering.
# `.loc` replaces the deprecated `.ix` indexer (same label-based slice).
new_df = df_keywords.loc['1991':]

# Raw counts, one row per keyword.
arr_raw = new_df.T.values

# Counts divided by the publication total of the same year, one row per keyword.
arr = new_df.divide(totals_pubs.reindex(new_df.index.values),axis=0).T.values

# Scale every keyword row by its own maximum. Rows whose maximum is exactly 0
# are left as zeros instead of producing 0/0 = NaN; NaN maxima still propagate.
row_max = arr.max(1,keepdims=True)
arr_norm = np.divide(arr, row_max, out=np.zeros_like(arr, dtype=float), where=(row_max != 0))

In [None]:
from scipy.spatial.distance import euclidean

def db_index(k,labels,features,centroids):
    """Compute the Davies-Bouldin index of a clustering.

    For every cluster the dispersion is the mean Euclidean distance of its
    members to its centroid. For each cluster ``i`` the largest value of
    ``(dispersion_i + dispersion_j) / distance(centroid_i, centroid_j)`` over
    all other clusters ``j`` is taken, and the mean of these maxima is
    returned.

    Parameters
    ----------
    k : int
        Number of clusters; labels are expected to be ``0 .. k-1``.
    labels : array-like of int
        Cluster label of every row of ``features``.
    features : array-like, shape (n_samples, n_features)
        The clustered observations.
    centroids : array-like, shape (k, n_features)
        Cluster centres, indexed by label.

    Returns
    -------
    float
        The index. An empty cluster has a NaN dispersion and therefore never
        wins a maximum; with ``k == 1`` the result is 0.
    """
    labels = np.asarray(labels)
    features = np.asarray(features)
    centroids = np.asarray(centroids, dtype=float)

    # Each cluster's dispersion is independent of the pairing, so compute it
    # once per cluster instead of once per (i, j) pair.
    dispersions = [
        np.mean(np.linalg.norm(features[labels == i] - centroids[i], axis=1))
        for i in range(k)
    ]

    maxes = []
    for i in range(k):
        mx = 0
        for j in range(k):
            if i == j:
                continue
            separation = np.linalg.norm(centroids[i] - centroids[j])
            current = (dispersions[i] + dispersions[j]) / separation
            if current > mx:
                mx = current
        maxes.append(mx)
    return np.mean(maxes)

# Example call: requires a fitted scikit-learn KMeans model named `kmeans` and
# the array it was fitted on. In this notebook `kmeans` is only created by the
# cluster-count sweep further down (fitted on `arr_norm`), so run that cell first.
# The previous version referenced names that are not defined anywhere in the
# notebook (`n_clusters`, `kmn.labels`, `features`).
db_index(kmeans.n_clusters,kmeans.labels_,arr_norm,kmeans.cluster_centers_)

In [None]:
import math
from jqmcvi.base import davisbouldin,dunn_fast
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabaz_score,silhouette_score
# Fit k-means on the max-normalised keyword trajectories for 2..8 clusters and
# record three cluster-quality scores per cluster count.
bins = np.linspace(arr_raw.sum(1).min(),arr_raw.sum(1).max(),1000)
dbs = []   # Davies-Bouldin score per cluster count
chs = []   # Calinski-Harabasz score per cluster count
ds = []    # reserved for the Dunn index, which is currently not computed
ss = []    # silhouette score per cluster count
plotit = False
seed = 0   # fixed seed so the fits and the sampled silhouette are repeatable

for n in range(2,9):
    kmeans = KMeans(n_clusters=n, n_jobs=-1, random_state=seed).fit(arr_norm)
    cluster_labels = kmeans.labels_

    # davisbouldin expects one array of member rows per cluster.
    members = [arr_norm[cluster_labels == i] for i in range(kmeans.n_clusters)]
    db = davisbouldin(members,kmeans.cluster_centers_)
    dbs.append(db)
    ch = calinski_harabaz_score(arr_norm,cluster_labels)
    chs.append(ch)
    # The silhouette is estimated on a random sample of 1000 rows.
    s = silhouette_score(arr_norm, cluster_labels, metric='euclidean',sample_size=1000,random_state=seed)
    ss.append(s)
    #dunn = dunn_fast(arr_norm,kmeans.labels_)

    # `dunn` is not computed above, so it is no longer printed (printing it
    # raised a NameError). The call form works on Python 2 and 3.
    print('{} {} {} {}'.format(n, ch, db, s))

    if plotit:
        # Smallest square grid with room for n panels.
        rows = int(math.ceil(np.sqrt(n)))
        fig,axes = plt.subplots(rows,rows)
        for i,(ax,center) in enumerate(zip(axes.flat,kmeans.cluster_centers_)):
            ax.plot(center,lw=3)
            ax.set_xticklabels([])
            ax.set_yticklabels([])
            # Cluster size, taken from this iteration's own label vector.
            ax.set_title('N = {}'.format(int(np.sum(cluster_labels == i))))

In [None]:
# Print the ten keywords with the largest total count in every cluster.
# NOTE(review): `labels` was not defined anywhere in the notebook and the
# cluster count was hard-coded to 16. Both are now taken from `kmeans`, the
# model left over from the sweep above (its last fit) - confirm this is the
# clustering that was meant.
labels = kmeans.labels_
for i in range(kmeans.n_clusters):
    cluster_rows = df_keywords.T.iloc[np.where(labels==i)[0]]
    top_keywords = cluster_rows.sum(1).sort_values(ascending=False).index[:10]
    # The call form of print works on Python 2 and 3.
    print('%d %s' % (i, '|'.join(top_keywords)))
    print('---')

In [None]:
# Yearly publication counts for every category (too many lines for a legend).
# `.loc` replaces the deprecated `.ix` indexer throughout this cell.
fig,ax = plt.subplots(1,1,figsize=(24,6))
df_pubs.loc[:'2015'].resample('AS').sum().plot(ax=ax,legend=False)

# Categories with a count of 0 in 1976, plotted on their own with a legend.
# This block used to appear twice verbatim and drew the same figure twice.
pubs_1976 = df_pubs.loc['1976']
zero = pubs_1976.columns[(pubs_1976 == 0).any(axis=0)]
fig,ax = plt.subplots(1,1,figsize=(24,6))
df_pubs.loc[:'2015', zero].resample('AS').sum().plot(ax=ax,legend=True)

In [None]:
"""
Conference Proceedings Citation Index covers more than 160,000 conference titles in the Sciences starting from 1990 to the present day
Science Citation Index Expanded covers more than 8,500 notable journals encompassing 150 disciplines. Coverage is from the year 1900 to the present day.
Social Sciences Citation Index covers more than 3,000 journals in social science disciplines. Range of coverage is from the year 1900 to the present day.
Arts & Humanities Citation Index covers more than 1,700 arts and humanities journals starting from 1975. In addition, 250 major scientific and social sciences journals are also covered.
"""
# For each year, count the categories that have at least one publication.
fig, ax = plt.subplots(figsize=(12, 6))
extant_per_year = (df_pubs > 0).sum(axis=1)
extant_per_year.plot(title='Number of extant disciplines by year', ax=ax)

# Label the jump in the curve with the index named in the note above.
arrow_style = {'arrowstyle': "-|>"}
ax.annotate(
    "Arts & Humanities Citation Index",
    xy=('1974', 234),
    xytext=('1955', 250),
    arrowprops=arrow_style,
    fontsize=12,
)
ax.grid()

In [None]:
# Display the pivoted publication frame (dates x categories) as the cell output.
# NOTE(review): this shows the whole frame; `df_pubs.head()` would be a lighter preview.
df_pubs