In [228]:
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from seaborn import color_palette

import pandas as pd 
import numpy as np

import bokeh
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure, output_file, show, curdoc
from bokeh.models import Circle,BoxSelectTool, BoxZoomTool, LassoSelectTool,Text,HoverTool,ColumnDataSource, Range1d, Axis

In [234]:
# Hard-coded yearly totals, one value per year start ('AS') from 1950 through 2015.
# NOTE(review): presumably the overall publication counts across all categories;
# the source of these numbers is not shown in this notebook -- confirm provenance.
totals_pubs = pd.Series(index=pd.date_range('1950-01-01','2015-12-31',freq='AS',),data=[97529,102475,106673,108515,113949,100346,115704,122845,133961,137311,152292,179267,188095,196106,221271,288753,327167,349344,364851,399384,405709,420089,448565,459029,495345,566369,606671,683684,703905,728908,749425,775628,810977,875161,893747,893536,909200,920189,901468,856502,880419,901808,922010,963477,1015818,1080336,1130853,1159929,1161957,1187808,1205276,1189963,1234480,1270516,1356955,1434957,1495934,1567550,1655220,1716898,1750645,1836125,1913473,2005909,2027312,1809485,])
# Same yearly index as totals_pubs; presumably the overall reference counts -- confirm provenance.
totals_refs = pd.Series(index=pd.date_range('1950-01-01','2015-12-31',freq='AS',),data=[862276,913941,991301,1054144,1082894,993625,1132516,1195067,1291857,1390632,1496650,1927193,2030321,2167335,2530307,3071105,3556047,3822576,4197118,4439659,4483020,4909395,5406217,5614448,5994449,6801509,7391424,7858516,8097995,8733158,9155748,9680861,10507381,11104584,11725145,12482330,12384351,12773481,13165658,13524406,14225948,15057108,15906821,16710320,17930450,19381008,21289915,22082648,22865217,23589060,24585765,25072234,26210739,27773579,29654550,31816766,34430852,36991583,40692695,43594503,46666951,50946087,55088628,59408651,62668029,59436898,])

In [244]:
# Load the pickled per-category tables for references and publications.
# NOTE(review): both paths are absolute and specific to one user's machine, so this
# cell fails anywhere else; consider a configurable data directory.
# NOTE(review): unpickling can execute arbitrary code -- only load files you trust.
df_refs = pd.read_pickle('/Users/jaredlorince/Desktop/d_pop_refs.pkl')
df_pubs = pd.read_pickle('/Users/jaredlorince/Desktop/d_pop.pkl')

In [245]:
# Reshape each table to one row per date and one column per category; combinations
# with no data become 0, and rows are kept up to and including the label '2015'.
# `.loc` is used for the label slice because the `.ix` indexer is deprecated and
# absent from current pandas; for a label slice the two behave the same.
# NOTE(review): assumes 'date' is datetime-like so that '2015' selects through the
# end of that year -- confirm against the pickled data.
# NOTE: this cell rebinds df_pubs/df_refs, so it can only be run once after loading;
# a second run fails because the 'category' column no longer exists.
df_pubs = df_pubs.reset_index().pivot_table(index='date',columns='category',values='uid').fillna(0).loc[:'2015']
df_refs = df_refs.reset_index().pivot_table(index='date',columns='category',values='cnt').fillna(0).loc[:'2015']

In [273]:
# Print every column name on one '|'-separated line, followed by the column count.
# Written as a single-argument call so it parses on both Python 2 and Python 3 and
# prints the same text as the former two-item print statement.
# NOTE(review): `df` is not assigned in any visible cell; presumably it is one of the
# pivoted tables (df_pubs or df_refs) left over from an earlier session -- confirm.
print('|'.join(df.columns) + ' ' + str(len(df.columns)))

|Acoustics|Agricultural Economics & Policy|Agricultural Engineering|Agriculture, Dairy & Animal Science|Agriculture, Multidisciplinary|Agronomy|Allergy|Anatomy & Morphology|Andrology|Anesthesiology|Anthropology|Archaeology|Architecture|Area Studies|Art|Asian Studies|Astronomy & Astrophysics|Audiology & Speech-Language Pathology|Automation & Control Systems|Behavioral Sciences|Biochemical Research Methods|Biochemistry & Molecular Biology|Biodiversity Conservation|Biology|Biophysics|Biotechnology & Applied Microbiology|Business|Business, Finance|Cardiac & Cardiovascular Systems|Cell & Tissue Engineering|Cell Biology|Chemistry, Analytical|Chemistry, Applied|Chemistry, Inorganic & Nuclear|Chemistry, Medicinal|Chemistry, Multidisciplinary|Chemistry, Organic|Chemistry, Physical|Classics|Clinical Neurology|Communication|Computer Science, Artificial Intelligence|Computer Science, Cybernetics|Computer Science, Hardware & Architecture|Computer Science, Information Systems|Computer Science, Inter

In [None]:
# fig,ax = plt.subplots(1,1,figsize=(24,6))
# (df.resample('A').sum()).divide(df.resample('A').sum().sum(1),axis=0).plot(legend=False,ax=ax)

In [240]:
#totals = df.sum(1)
# Running sums of the yearly totals, in the same order as the inputs.
# A generator expression is used so that no helper name is left in the namespace.
cum_totals_pubs, cum_totals_refs = (
    yearly.cumsum() for yearly in (totals_pubs, totals_refs)
)

In [None]:
#top = df.sum().sort_values(ascending=False).index

In [271]:
# Configure bokeh to render figures inline in the notebook; must run before show().
output_notebook()



def update(rawcount=True,cumulative=False,resample='yearly',discipline='Psychology',data_type='pubs'):
    """Draw one bokeh line per requested discipline and show the figure.

    The data are looked up by name in the module namespace: ``df_<kind>``,
    ``totals_<kind>`` and ``cum_totals_<kind>`` must exist for every plotted kind.

    Parameters
    ----------
    rawcount : bool
        True plots the resampled sums themselves; False divides them by the
        resampled totals series (hover text is then formatted as a percentage).
    cumulative : bool
        True plots running sums; combined with ``rawcount=False`` the running sum
        is divided by the resampled cumulative totals.
    resample : str
        'Daily', 'Weekly', 'Monthly' or 'Yearly'; letter case is ignored.
    discipline : str
        Column names separated by '|', or 'all' for every column of the table.
    data_type : str
        'pubs', 'refs', or 'both' to draw the two kinds on the same figure.

    Raises
    ------
    KeyError
        Unknown ``resample`` label, unknown data kind, or unknown discipline column.

    NOTE(review): totals_pubs/totals_refs are built on a yearly index in this
    notebook, so ratios (``rawcount=False``) at sub-yearly resolution divide by bins
    that only carry a total on the first period of each year -- confirm intent.
    """
    # The label is matched case-insensitively: the declared default is 'yearly'
    # whereas the widget offers capitalised labels.
    freq_by_label = {'daily':'D','weekly':'W','monthly':'MS','yearly':'AS'}
    label = resample.strip().lower()
    if label not in freq_by_label:
        raise KeyError(resample)
    # One start-anchored frequency is used both for the x axis and for resampling,
    # so every value is labelled with exactly the timestamp it is plotted at.
    freq = freq_by_label[label]
    idx = pd.date_range('1950-01-01','2015-12-31',freq=freq)
    datestrings = idx.map(lambda x: x.strftime('%Y-%m-%d'))

    kinds = ('pubs','refs') if data_type == 'both' else (data_type,)

    if discipline == 'all':
        # Take the column list from the first table that will be plotted.
        disciplines = list(globals()['df_'+kinds[0]].columns)
    else:
        disciplines = discipline.strip().split('|')

    # One hex colour per discipline, sampled from seaborn's "coolwarm" palette.
    colors = ['#%02x%02x%02x' % (a, b, c) for a,b,c in np.round(255*np.array(color_palette("coolwarm",n_colors=len(disciplines)))).astype(int)]

    p = figure(width=800,height=350,x_axis_type='datetime',tools='pan,box_zoom,reset')
    # Show plain numbers instead of scientific notation on the left axis.
    p.select(dict(type=Axis, layout="left"))[0].formatter.use_scientific = False

    def do_plot(kind):
        """Add a line and a hover tool to ``p`` for each discipline of one data kind."""
        df = globals()['df_'+kind]
        totals = globals()['totals_'+kind]
        cum_totals = globals()['cum_totals_'+kind]
        # The denominator does not depend on the discipline, so compute it once.
        if cumulative:
            denominator = cum_totals.resample(freq).sum()
        else:
            denominator = totals.resample(freq).sum()
        for color,disc in zip(colors,disciplines):
            current = df[disc].resample(freq).sum()
            values = current.cumsum() if cumulative else current
            if not rawcount:
                values = values/denominator
            # Align to the plotted x values so every column handed to
            # ColumnDataSource has the same length (missing periods become NaN).
            values = values.reindex(idx)
            if rawcount:
                hover_text = values.apply(lambda x: "{:,.0f}".format(x))
            else:
                hover_text = values.apply(lambda x: "{:.2f}%".format(100*x))
            source = ColumnDataSource({'x':idx,'value':values,'date':datestrings,'percent':hover_text})
            line = p.line('x','value',source=source,color=color,line_width=2)
            p.add_tools(HoverTool(renderers=[line],tooltips=[('Discipline', disc),('date','@date'),('{} this period'.format(kind),'@percent')]))

    for kind in kinds:
        do_plot(kind)
    show(p)
    




In [272]:
# NOTE: this import rebinds the name `Text`, which the first cell imported from
# bokeh.models; after this cell `Text` refers to the ipywidgets text box.
from ipywidgets import Dropdown,Text,Checkbox,interact

# One control per argument of update().
disc_select = Text(description='Discipline',value='Psychology')
rawcount_select = Checkbox(description = 'rawcount',value=True)
cumulative_select = Checkbox(description='cumulativ',value=False)
resample_select = Dropdown(description="Resample",options=["Daily","Weekly","Monthly","Yearly"],value="Yearly")
data_select = Dropdown(description="Data",options=["pubs","refs","both"],value="pubs")



# Each keyword must match a parameter name of update(); the text box is therefore
# passed as `discipline` (it was previously passed under the non-existent name `disc`,
# which left the Discipline box unconnected to the function).
interact(update,rawcount=rawcount_select,cumulative=cumulative_select,
         discipline=disc_select,resample=resample_select,data_type=data_select);


In [275]:
# Build a '|'-separated string of the column names that contain "psych"
# (case-insensitive), in the form accepted by the Discipline text box.
'|'.join(name for name in df.columns if 'psych' in name.lower())

'Psychiatry|Psychology|Psychology, Applied|Psychology, Biological|Psychology, Clinical|Psychology, Developmental|Psychology, Educational|Psychology, Experimental|Psychology, Mathematical|Psychology, Multidisciplinary|Psychology, Psychoanalysis|Psychology, Social'

In [None]:
# Cumulative yearly sums, transposed so that each row is a discipline and each
# column is a year start.
x = df.resample('AS').sum().cumsum().T
# Join the names of the disciplines whose cumulative count at the first column
# selected by '2000' is below 10000. The positional `.iloc[:,0]` replaces the
# deprecated `.ix[:,0]`, which fell back to positional lookup here because the
# column labels are timestamps rather than integers.
'|'.join(x.index[(x['2000']<10000).iloc[:,0]])

In [222]:
#Genetics & Heredity|Mathematical & Computational Biology|Medical Informatics
# Grand total over the whole `totals` series.
# NOTE(review): `totals` is not assigned in any visible cell (the only assignment in
# view is commented out), so this cell depends on leftover kernel state -- confirm
# which series it was and define it explicitly.
totals.sum()

1066017645

In [248]:
# Preview the first five rows of the publications table.
df_pubs.head()

category,Unnamed: 1_level_0,Acoustics,Agricultural Economics & Policy,Agricultural Engineering,"Agriculture, Dairy & Animal Science","Agriculture, Multidisciplinary",Agronomy,Allergy,Anatomy & Morphology,Andrology,...,Transportation,Transportation Science & Technology,Tropical Medicine,Urban Studies,Urology & Nephrology,Veterinary Sciences,Virology,Water Resources,Women's Studies,Zoology
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1950-01-01,0.0,304.0,0.0,0.0,828.0,473.0,171.0,135.0,889.0,0.0,...,0.0,0.0,246.0,0.0,256.0,427.0,0.0,0.0,0.0,566.0
1950-01-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1950-01-14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1950-01-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1950-01-18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
