In [1]:
import numpy as np
import pandas as pd
import bokeh.plotting as bk
bk.output_notebook()

In [2]:
# Column labels to apply to the yearly files, and the folder that holds them.
columnNames = ['name', 'sex', 'births']
dataFolder = 'temp/baby_names/'

# Peek at a single year (1880) before loading the whole range.
firstYearPath = '{0}yob{1}.txt'.format(dataFolder, 1880)
names1880 = pd.read_csv(firstYearPath, names=columnNames)
names1880.head()

Unnamed: 0,name,sex,births
0,Mary,F,7065
1,Anna,F,2604
2,Emma,F,2003
3,Elizabeth,F,1939
4,Minnie,F,1746


In [3]:
# One DataFrame per yearly file (1880 through 2011 inclusive), each tagged
# with a 'year' column holding the year it was read from.
years = range(1880, 2012)
parts = [
    pd.read_csv('{0}yob{1}.txt'.format(dataFolder, year),
                names=columnNames).assign(year=year)
    for year in years
]

In [4]:
# Stack the yearly frames into a single table with a fresh running index,
# then display every 100,000th row as a quick spot check.
names = pd.concat(objs=parts, ignore_index=True)
names.iloc[::100000]

Unnamed: 0,name,sex,births,year
0,Mary,F,7065,1880
100000,Ernie,F,13,1912
200000,Lemoyne,M,9,1922
300000,Derrell,M,43,1932
400000,Valentine,M,56,1943
500000,Neal,M,968,1953
600000,Konni,F,14,1962
700000,Howard,F,18,1970
800000,Tomecca,F,6,1976
900000,Martrell,M,8,1981


##  Pivoting

In [9]:
from bokeh.models.ranges import Range1d
# Total births per year, one column per sex.
# aggfunc is given by name: pandas >= 2.1 emits a FutureWarning when the
# builtin `sum` is passed instead of the string 'sum'.
totalBirths = names.pivot_table('births', index='year', columns='sex',
                                aggfunc='sum', margins=False)

# NOTE(review): plot_width/plot_height and legend= are the keyword spellings
# this notebook already uses; newer Bokeh releases renamed them -- confirm
# against the installed Bokeh version before changing.
fig = bk.figure(plot_width=750, plot_height=300, title=None)
for sex, color in [('F', 'magenta'), ('M', 'royalblue')]:
    fig.line(x=totalBirths.index, y=totalBirths[sex], legend=sex, line_color=color)
fig.legend.orientation = 'horizontal'
fig.xaxis.axis_label = 'Year'
fig.yaxis.axis_label = 'Total births'
# Show plain numbers on the y axis instead of scientific notation.
fig.yaxis[0].formatter.use_scientific = False
bk.show(fig)

# Or directly using Pandas (which uses Matplotlib, not Bokeh):
#totalBirths[['F', 'M']].plot(title='Total births by sex and year')

## Splitting

In [10]:
# Same totals as the pivot table, but in long form: one value per (year, sex).
birthTotals = names.groupby(['year', 'sex'])['births'].sum()
birthTotals.head()

year  sex
1880  F       90994
      M      110492
1881  F       91955
      M      100747
1882  F      107851
Name: births, dtype: int64

In [12]:
# Split the full table by sex; each subset keeps its original row labels.
isMale = names['sex'] == 'M'
isFemale = names['sex'] == 'F'
boys = names.loc[isMale]
girls = names.loc[isFemale]
# Every 100th row among the first 2000 rows of the boys' table.
boys.iloc[:2000:100]

Unnamed: 0,name,sex,births,year
942,John,M,9655,1880
1042,Perry,M,134,1880
1142,Clayton,M,60,1880
1242,Judson,M,31,1880
1342,Wilmer,M,19,1880
1442,Rubin,M,14,1880
1542,Alois,M,10,1880
1642,Fayette,M,8,1880
1742,Toney,M,7,1880
1842,Titus,M,6,1880


In [13]:
# All yearly records for the boys' name "Jayden".
boys.loc[boys['name'].eq('Jayden')]

Unnamed: 0,name,sex,births,year
824344,Jayden,M,7,1977
843205,Jayden,M,6,1978
899512,Jayden,M,9,1981
940021,Jayden,M,6,1983
960469,Jayden,M,5,1984
977725,Jayden,M,10,1985
997266,Jayden,M,14,1986
1019074,Jayden,M,11,1987
1040265,Jayden,M,16,1988
1062845,Jayden,M,22,1989


In [14]:
# Births per year for every boys' name (rows: year, columns: name).
# aggfunc is given by name: pandas >= 2.1 emits a FutureWarning when the
# builtin `sum` is passed instead of the string 'sum'.
bBirths = boys.pivot_table('births', index='year', columns='name',
                           aggfunc='sum', margins=False)
subset = bBirths[['Ray', 'Elvis', 'Sam', 'John', 'Marvin', 'Bob']]

# One chart per selected name. Each figure is wrapped in its own one-element
# list so that gridplot receives one figure per row.
plots = []
for name in subset.columns:
    fig = bk.figure(plot_height=200, plot_width=700, title=None)
    fig.line(x=np.asarray(subset.index), y=np.asarray(subset[name]),
             line_color='black', legend=name)
    plots.append([fig])
bk.show(bk.gridplot(plots))

# Or directly using Pandas (which uses Matplotlib, not Bokeh): 
#subset.plot(subplots=True, figsize=(12, 10), grid=False,
#            title="Number of births per year")

## groupby

In [15]:
def add_prop(group):
    """Return a copy of `group` with a 'prop' column added.

    'prop' is each row's 'births' value divided by the sum of the group's
    'births' column, i.e. the row's share of the group's total births.

    The frame passed in is not modified: pandas documents that functions
    handed to ``groupby(...).apply`` should not mutate the group they receive.
    """
    births = group['births']
    # float() on the total keeps the division a true (non-integer) division.
    return group.assign(prop=births / float(births.sum()))

In [17]:
# group_keys=False keeps the original flat row index on every pandas version.
# Without it, pandas >= 2.0 prepends (year, sex) index levels to the result of
# apply, and those levels then clash with the 'year'/'sex' columns when the
# table is grouped again further down.
names = names.groupby(['year', 'sex'], group_keys=False).apply(add_prop)
names.head()

Unnamed: 0,name,sex,births,year,prop
0,Mary,F,7065,1880,0.077642
1,Anna,F,2604,1880,0.028617
2,Emma,F,2003,1880,0.022012
3,Elizabeth,F,1939,1880,0.021309
4,Minnie,F,1746,1880,0.019188


In [18]:
# Sanity check: within every (year, sex) group the proportions should sum to 1.
propTotals = names.groupby(['year', 'sex'])['prop'].sum()
np.allclose(propTotals, 1)

True

In [19]:
def get_top(group, topNumber):
    """Return the first `topNumber` rows of `group` after sorting by 'births', largest first."""
    ranked = group.sort_values(by='births', ascending=False)
    return ranked.iloc[:topNumber]

# Keep the ten highest-'births' rows of every (year, sex) group,
# then display the first 50 rows of the result.
grouped = names.groupby(by=['year', 'sex'])
topNames = grouped.apply(lambda group: get_top(group, topNumber=10))
topNames.head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,name,sex,births,year,prop
year,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1880,F,0,Mary,F,7065,1880,0.077642
1880,F,1,Anna,F,2604,1880,0.028617
1880,F,2,Emma,F,2003,1880,0.022012
1880,F,3,Elizabeth,F,1939,1880,0.021309
1880,F,4,Minnie,F,1746,1880,0.019188
1880,F,5,Margaret,F,1578,1880,0.017342
1880,F,6,Ida,F,1472,1880,0.016177
1880,F,7,Alice,F,1414,1880,0.015539
1880,F,8,Bertha,F,1320,1880,0.014506
1880,F,9,Sarah,F,1288,1880,0.014155


In [20]:
from bokeh.models.ranges import Range1d

# Summed 'prop' of the rows kept in topNames, per year and sex.
# reset_index(drop=True) discards the (year, sex) index levels that
# groupby.apply added to topNames, so 'year' and 'sex' refer unambiguously to
# the columns (recent pandas raises when a key is both a level and a column).
# aggfunc is given by name: pandas >= 2.1 warns when passed the builtin `sum`.
diversity = topNames.reset_index(drop=True).pivot_table(
    'prop', index='year', columns='sex', aggfunc='sum')

fig = bk.figure(plot_width=750, plot_height=300, title=None)
for sex, color in [('F', 'green'), ('M', 'blue')]:
    fig.line(x=diversity.index, y=diversity[sex], line_color=color, legend=sex)
# Axis labels so the figure can be read on its own, as in the total-births plot.
fig.xaxis.axis_label = 'Year'
fig.yaxis.axis_label = 'Proportion of births in top 10 names'
# Pin the y axis to a fixed 0..1.2 range.
fig.y_range = Range1d(0, 1.2)
bk.show(fig)

# Or, directly using Pandas' "plot" method (which calls Matplotlib, not Bokeh):
# diversity.plot(title='Sum of diversity.prop by year and sex',
#                yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))