In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

%matplotlib inline
plt.style.use('ggplot')

##  Names to scrape for details

In [None]:
# Load the names table from disk; the next cell queries it by `name`.
name_list = pd.read_json('data/names_list.json')

In [None]:
# Spot-check: show the row(s) of the names table for 'Anne'.
name_list[name_list.name == 'Anne']

In [None]:
# scrapeable_females = name_list.ix[(name_list.total_female > 4), 'name']
# scrapeable_males = name_list.ix[(name_list.total_male > 4), 'name']
# scrapeable_females.to_csv('data/scrapeable_females.csv', index=False, encoding='utf-8')
# scrapeable_males.to_csv('data/scrapeable_males.csv', index=False, encoding='utf-8')

## 1. Birth rates

In [None]:
MIN_YEAR = 1880
MAX_YEAR = 2014
PERIOD_SIZE = 5


def lower_boundary_fcn(x):
    """
    Return the first year of the PERIOD_SIZE-year period that contains x.

    Periods start at years of the form k * PERIOD_SIZE + 1. Floor division
    keeps the result an integer on Python 2 and Python 3 alike; with true
    division the boundaries turn into floats and cannot be fed to range().

    :param x: year (int)
    :returns first year of the period containing x
    """
    return ((x - 1) // PERIOD_SIZE) * PERIOD_SIZE + 1


min_period = lower_boundary_fcn(MIN_YEAR)
# Two extra periods: range() excludes max_period itself, and the last bin
# edge that is generated has to lie past MAX_YEAR.
max_period = lower_boundary_fcn(MAX_YEAR + 2 * PERIOD_SIZE)

Load the file with detailed stats per name.

In [None]:
# Read the scraped per-name statistics and keep the rows flagged as having details.
details = pd.read_json('data/details.json')
has_details_mask = details['has_details']
names_with_details = details[has_details_mask]
names_with_details.head(2)

In [None]:
# Spot-check: show the detail record(s) for 'Anne'.
names_with_details[names_with_details.name == 'Anne']

We're mostly interested in: 
<li> <code>name</code> </li>
<li> <code>gender</code> </li>
<li> <code>name_type</code>: whether it's a first name or a follow name </li>
<li> <code>value</code>: list with # births per year </li>
<li> <code>approximation</code>: list with approximated # births per year </li>
<li> <code>year</code>: array indexing value and approximation </li>

<p>
Unfortunately, value, approximation and year are lists contained in cells in the DataFrame, so we have to do some pandas magic (hidden in the function <code>get_births</code>) to unpack the data.

In [None]:
def get_births(df, id_vars, cols, var_name='ii'):
    """
    Unpack the list-valued columns of the scraped data into one long table.

    Each column in cols is unpacked separately by convert_and_index and the
    resulting frames are joined side by side on their shared index.

    :param df: DataFrame with scraped data
    :param id_vars: columns to use as identifiers
    :param cols: columns with values to unpack
    :param var_name: name of the temporary helper column
    :returns DataFrame with unpacked data
    """
    unpacked = [convert_and_index(df, id_vars, var_name, col) for col in cols]
    side_by_side = pd.concat(unpacked, axis=1)
    # The helper column only served to align the pieces; remove it again.
    return side_by_side.reset_index().drop(var_name, axis=1)


def convert_and_index(df, id_vars, var_name, value_name):
    """
    Unpack one nested column with convert_nested_to_long and move
    id_vars plus var_name into the index of the result.

    :param df: DataFrame
    :param id_vars: List to use as identifier variables
    :param var_name: Name to use for the variable column
    :param value_name: Column name to unpack.
    :returns DataFrame in long format, indexed on id_vars + [var_name]
    """
    long_format = convert_nested_to_long(df, id_vars, var_name, value_name)
    index_cols = id_vars + [var_name]
    return long_format.set_index(index_cols)


def convert_nested_to_long(df, id_vars, var_name, value_name):
    """
    Turn a column whose cells hold lists into a long-format DataFrame.

    :param df: DataFrame
    :param id_vars: List to use as identifier variables
    :param var_name: Name to use for the variable column
    :param value_name: Column name to unpack.
    :returns DataFrame with id_vars, var_name and value_name as columns
    """
    # One row per identifier combination, one column per list position.
    wide = df.set_index(id_vars)[value_name].apply(pd.Series)
    return pd.melt(wide.reset_index(), id_vars=id_vars,
                   var_name=var_name, value_name=value_name)

In [None]:
# Unpack the yearly lists for every (name, gender, name_type) combination.
births = get_births(names_with_details,
                    id_vars=['name', 'gender', 'name_type'],
                    cols=['year', 'value', 'approximation'])

The function <code>convert_nested_to_long</code> converts the columns <code>year</code>, <code>value</code> and <code>approximation</code> individually. The lists in the cells of the column are converted to a <code>Series</code>, effectively turning the 1D column with lists into a 2D <code>DataFrame</code>. The data is then transformed from wide to long format by melting the <code>DataFrame</code>. Setting the index with the name, gender and name type preserves these index values during the transformation.

In [None]:
# Worked example: select the record(s) for 'Aad' and show the first one.
temp_name = names_with_details[names_with_details.name == 'Aad']
temp_name.iloc[0]

In [None]:
# Step 1: expand the list in each `approximation` cell into one column per list position.
wide_name = temp_name.set_index(['name', 'gender', 'name_type'])['approximation'].apply(pd.Series)
wide_name

In [None]:
# Step 2: melt the wide frame into long format; `ii` receives the former column labels.
pd.melt(wide_name.reset_index(), id_vars=['name', 'gender', 'name_type'], 
        var_name='ii', value_name='approximation').head()

The processed result looks like:

In [None]:
# Preview the first rows of the unpacked births table.
births.head()

Approximate births are given for years that don't have exact data, so we would like to use approximations if no real values are given. 

In [None]:
# Combine approximate & exact stats: start from the approximation and
# overwrite it with the exact value wherever a positive value is present.
births['n_born'] = births['approximation']
has_value = births['value'] > 0
# .loc replaces the .ix indexer, which has been removed from pandas.
births.loc[has_value, 'n_born'] = births.loc[has_value, 'value']
# Tidy up
births.drop(['value', 'approximation'], axis=1, inplace=True)
births['year'] = births['year'].astype(int)
births.rename(columns={'year': 'birth_year'}, inplace=True)

To compute the sums of the births for both name types or both genders, group over all the other identifying columns and sum the number of births. Append the resulting <code>DataFrame</code> to the original one.

In [None]:
def append_sums(df, id_vars, value_vars, id_name, id_value):
    """
    Append sum aggregates.

    Rows are grouped by id_vars and value_vars is summed per group. The sums
    are appended as extra rows whose id_name column is set to id_value.
    Rows that already carry id_value are excluded from both the sums and the
    result, so calling this twice neither double-counts nor duplicates totals.

    :param df: DataFrame
    :param id_vars: Column(s) to use as identifier variables
    :param value_vars: Column(s) to sum
    :param id_name: Column that is collapsed by the aggregation
    :param id_value: Value to store in id_name for the aggregated rows
    :returns DataFrame with aggregated sums appended
    """
    # Leave out totals from an earlier call before summing.
    base = df[df[id_name] != id_value]
    totals = base.groupby(id_vars, as_index=False)[value_vars].sum()
    totals[id_name] = id_value
    # pd.concat works on old and new pandas; DataFrame.append was removed.
    return pd.concat([base, totals], ignore_index=True)

In [None]:
# Add 'total' rows: births summed over the name types per name, gender and year.
births = append_sums(births, id_vars=['name', 'gender', 'birth_year'],
                     value_vars='n_born', id_name='name_type',
                     id_value='total')
# Add 'both' rows: births summed over the genders per name, name type and year.
births = append_sums(births, id_vars=['name', 'name_type', 'birth_year'],
                     value_vars='n_born', id_name='gender',
                     id_value='both')

Mortality rates are given for bins of 5 years, so use <code>pd.cut</code> to assign each birth year to its bin for later use. (TODO: verify that this is true.)

In [None]:
# Bin the years in bins ofo 5 years
# Bin the years in bins of 5 years; every bin is labelled with its left edge.
period_bins = range(min_period, max_period, PERIOD_SIZE)
births['begin_period'] = pd.cut(births['birth_year'], bins=period_bins,
                                labels=period_bins[:-1], right=False)

The end result looks like:

In [None]:
# Preview the births table including the totals and the period bin.
births.head()

And we can already plot the yearly birth rates for a name:

In [None]:
NAME = 'Anne'
id_cols = ['name_type', 'gender']
current_births = births[births['name'] == NAME]

# One panel per (name_type, gender) combination on a shared 3x3 grid.
fig, ax = plt.subplots(nrows=3, ncols=3, figsize=(10, 10),
                       sharex=True, sharey=True)
ax = ax.ravel()  # Flatten the grid of axes for easy iteration
for i_plot, (key, group) in enumerate(current_births.groupby(id_cols)):
    panel = ax[i_plot]
    group.plot(x='birth_year', y='n_born', ax=panel,
               title=' '.join(key), rot=45)
    panel.legend([])
    panel.set_ylabel('# births')
fig.tight_layout()

## 2. Mortality rates 

In [None]:
mortality = pd.read_csv('data/levensverwachting_cbs.csv', delimiter=';',
                        skiprows=2, encoding='utf-8')
mortality.columns = ['period', 'gender', 'age', 'proba']
# Age cells use a decimal comma; only the part before the first space is kept
# (presumably the rest is a unit label -- verify against the CSV).
age_tokens = mortality['age'].str.replace(',', '.').str.split(' ')
mortality['age'] = age_tokens.apply(lambda tokens: float(tokens[0]))
# Translate the Dutch gender labels.
mortality['gender'] = mortality['gender'].map({'Mannen': 'male', 'Vrouwen': 'female'})
# The first four characters of the period label are taken as its first year.
mortality['begin_period'] = mortality['period'].apply(lambda label: label[:4]).astype(int)
mortality = mortality.drop('period', axis=1)

In [None]:
def interpolate_age_proba(df):
    """
    Interpolate the columns of df along age, per gender and period.

    Every (gender, begin_period) group is handed to interpolate_col with a
    half-year age grid from 0 up to the maximum age in df; every second grid
    point (starting at age 0) is kept in the result.

    :param df: DataFrame with at least the columns gender, begin_period, age
    :returns DataFrame with gender, begin_period and age restored as columns
    """
    groupby_cols = ['gender', 'begin_period']
    half_year_grid = np.arange(0, df['age'].max() + 0.5, 0.5)
    kept_ages = half_year_grid[::2]

    def interpolate_group(group):
        return interpolate_col(group, 'age', half_year_grid, kept_ages)

    per_group = df.groupby(groupby_cols).apply(interpolate_group)
    # Drop the column copies of the group keys before the index levels are
    # turned back into columns by reset_index.
    return per_group.drop(groupby_cols, axis=1).reset_index()


def interpolate_col(g, col, interpolate_axis, new_axis):
    """
    Interpolate the columns of g along col and return selected grid points.

    :param g: DataFrame (one group)
    :param col: column to use as the interpolation axis
    :param interpolate_axis: grid on which g is reindexed and interpolated
    :param new_axis: labels from interpolate_axis to keep in the result
    :returns DataFrame indexed by col, restricted to new_axis
    """
    reindex_g = g.set_index(col).reindex(interpolate_axis)
    # Interpolate column by column over the rows that reindex introduced.
    interpolated_g = reindex_g.apply(pd.Series.interpolate)
    # Label-based selection; .loc replaces the removed .ix indexer.
    return interpolated_g.loc[new_axis]


def fill_missing_periods(df, new_axis):
    """
    Apply fill_group to every (gender, age) group of df.

    :param df: DataFrame with at least the columns gender, age, begin_period
    :param new_axis: period labels every group is reindexed on
    :returns DataFrame with the filled groups stacked under a fresh index
    """
    groupby_cols = ['gender', 'age']

    def fill_one(group):
        return fill_group(group, new_axis)

    filled = df.groupby(groupby_cols).apply(fill_one)
    return filled.reset_index(drop=True)
    
def fill_group(g, new_axis):
    """
    Reindex g on new_axis by begin_period and forward-fill the gaps.

    Labels in new_axis that precede the first available period have no
    earlier row to copy from and therefore stay NaN.

    :param g: DataFrame (one group) with a begin_period column
    :param new_axis: period labels to reindex on
    :returns DataFrame with begin_period restored as a column
    """
    reindex_g = g.set_index('begin_period').reindex(new_axis)
    # .ffill() is the supported spelling of the deprecated fillna(method='ffill').
    filled = reindex_g.ffill().reset_index()
    return filled

In [None]:
# Interpolate the mortality table over age, then extend it to every period bin.
interpolated_mortality = interpolate_age_proba(mortality)
filled_mortality = fill_missing_periods(interpolated_mortality, period_bins)
# Expand every period to the individual years it contains. The merge uses the
# columns both frames share (presumably only begin_period -- verify), so in
# yearly_mortality the `birth_year` column denotes a year within the period.
years_in_period = births[['birth_year', 'begin_period']].drop_duplicates()
yearly_mortality = pd.merge(filled_mortality, years_in_period)

In [None]:
def compute_survival_rate(df, birth_year):
    """
    Compute the survival rate of the cohort born in birth_year.

    The birth_year column of df is used as the year of observation: a row
    belongs to the cohort when that year minus age equals birth_year. The
    rate is the product of (1 - proba) over all rows of the cohort; it is
    1.0 when df holds no rows for the cohort.

    :param df: DataFrame with the columns birth_year, age and proba
    :param birth_year: birth year of the cohort
    :returns survival rate as a float
    """
    is_born = (df['birth_year'] - df['age'] == birth_year)
    # .loc replaces the removed .ix indexer.
    survival_rate = (1 - df.loc[is_born, 'proba']).prod()
    return survival_rate

def get_cohort_survival(df, year):
    """
    Compute, for the reference year, the survival rate of every cohort
    born in the 100 years up to and including that year.

    Only rows observed up to and including year are taken into account, so
    the rate describes survival until year instead of until the end of the
    data.

    :param df: DataFrame with the columns birth_year, age and proba
    :param year: reference year
    :returns DataFrame with the columns birth_year, survival_rate and year
    """
    # Mortality after the reference year must not lower the survival rate.
    observed = df[df['birth_year'] <= year]
    survival = [(birth_year, compute_survival_rate(observed, birth_year))
                for birth_year in range(year - 100, year + 1)]
    cohort_survival = pd.DataFrame(survival, columns=['birth_year', 'survival_rate'])
    cohort_survival['year'] = year
    return cohort_survival

def get_historical_survival(df, valid_years):
    """
    Stack the cohort survival tables of all reference years.

    :param df: DataFrame passed on to get_cohort_survival
    :param valid_years: iterable of reference years
    :returns DataFrame with the per-year tables concatenated row-wise
    """
    per_year = [get_cohort_survival(df, year) for year in valid_years]
    return pd.concat(per_year, axis=0)

In [None]:
# Sanity check: column-wise maxima of the yearly mortality table.
yearly_mortality.max()

In [None]:
# Reference years: from 100 years after the first mortality period up to
# (but excluding) 5 years after the start of the last period.
first_valid_year = yearly_mortality.begin_period.min() + 100
last_valid_year = yearly_mortality.begin_period.max() + 5
valid_years = range(first_valid_year, last_valid_year)


def historical_survival_fcn(x):
    return get_historical_survival(x, valid_years)


# Compute the survival tables per gender and turn the gender index level
# back into a column.
survival_rates = (yearly_mortality.groupby('gender')
                  .apply(historical_survival_fcn)
                  .reset_index(level=0))

In [None]:
# Reference year whose survival rates are used in the rest of the notebook.
YEAR = 2014
rates = survival_rates[survival_rates.year == YEAR]

In [None]:
# Attach the survival rate to the births rows (merge on the shared columns),
# then split the births into people still alive and people who have died.
name_survival = births.merge(rates)
name_survival['n_born_and_alive'] = name_survival['n_born'] * name_survival['survival_rate']
name_survival['n_dead'] = name_survival['n_born'] - name_survival['n_born_and_alive']

In [None]:
# Preview births combined with survival rates.
name_survival.head()

In [None]:
value_cols = ['birth_year', 'n_born', 'n_born_and_alive', 'n_dead']
index_cols = ['name', 'name_type', 'gender']
# Collapse every (name, name_type, gender) group to a single row whose cells
# hold the group's values as lists.
grouped_survival = name_survival.groupby(index_cols)
listed_cols = {col: grouped_survival[col].apply(list) for col in value_cols}
export = pd.DataFrame(listed_cols).reset_index()

In [None]:
# Preview the list-valued export table.
export.head()

In [None]:
#export.to_csv('out/name_survival.csv', index=False, encoding='utf-8')

In [None]:
#name_survival.to_csv('out/name_survival.csv', index=False, encoding='utf-8')

In [None]:
# See https://stackoverflow.com/questions/23433237/pandas-long-form-table-to-nested-json?rq=1

class NestedDict(dict):
    """Dictionary that creates a nested NestedDict for every missing key."""

    def __missing__(self, key):
        # Store the new child first so later lookups return the same object.
        child = NestedDict()
        self[key] = child
        return child

In [None]:
# Round to whole numbers and store as integers. DataFrame.round is the
# vectorized replacement for the deprecated element-wise applymap(np.round).
count_cols = ['birth_year', 'n_born']
name_survival[count_cols] = name_survival[count_cols].round().astype(int)

In [None]:
# Nest the records as d[name][gender][name_type]['data'] = list of row dicts.
d = NestedDict()
record_cols = ['birth_year', 'n_born', 'n_born_and_alive']
for key, group in name_survival.groupby(['name', 'gender', 'name_type']):
    leaf = d[key[0]][key[1]][key[2]]
    leaf['data'] = group[record_cols].to_dict(orient='records')


In [None]:
import json


In [None]:
# Write the nested records of a single name as a small sample file.
with open('out/anne.json', 'w') as f:
    json.dump(d['Anne'], f)

In [None]:
# Write the complete nested dictionary (all names) to disk.
with open('out/names_export.json', 'w') as f:
    json.dump(d, f)


<code>
[
    {
    'name': 'Aaf',
    'gender': 'female',
    'data':
        {
        'first': 
            [
                {
                'year': 2015,
                'n_born': 24,
                'n_born_and_alive': 20,
                },
                {
                'year': 2010,
                'n_born': 10,
                'n_born_and_alive': 2,
                },
            ],
        'follow': 
            [
                {
                },
                {
                }
            ]
        }
    },
    {
    'name': 'Anne' 
    },
]
</code>

In [None]:
# Scratch: view the nested dictionary as a DataFrame.
pd.DataFrame(d)

In [None]:
# Scratch check of the records format.
# NOTE(review): `group` is whatever the last executed groupby loop left behind.
group[['birth_year', 'n_born']].to_dict(orient='records')
#pd.DataFrame.to_dict()

In [None]:
# FIXME: leftover scratch expression. `aard` is not defined in the visible
# notebook, so the line is disabled to keep Restart & Run All working.
# aard.set_in

In [None]:
# Sketch of the intended export layout (a note, not executable code):
# {name: {male: {first: data, follow: data}}, female: {first: data, follow: data}}

In [None]:
# Inspect the rows for male names (the original expression had unbalanced brackets).
name_survival[name_survival.gender == 'male'].head()

In [None]:
# Write the first row of the export table to a JSON file as a sample record.
export.iloc[0].to_json('out/aad_male_first.json')

<code>mongoimport -d names -c data --type csv --file names/out/name_survival.csv --headerline</code>

<code>mongod --dbpath ~/temp/mongo_data/</code>


In [None]:
# Plot births and survivors per birth year for two male first names.
# `henk` keeps the rows of the last name after the loop.
for first_name in (u'Daniël', 'Hendrikus'):
    henk = name_survival[(name_survival.name == first_name) &
                         (name_survival.gender == 'male') &
                         (name_survival.name_type == 'first')]
    henk.set_index('birth_year')[['n_born', 'n_born_and_alive']].plot()

In [None]:
# Plot births, survivors and deaths against the row index for two male first
# names. `henk` keeps the rows of the last name, which the next cell sums.
for first_name in (u'Daniël', u'Willem'):
    henk = name_survival[(name_survival.name == first_name) &
                         (name_survival.gender == 'male') &
                         (name_survival.name_type == 'first')]
    henk[['n_born', 'n_born_and_alive', 'n_dead']].plot()

In [None]:
# Totals born and still alive for the name currently held in `henk`.
henk[['n_born', 'n_born_and_alive']].sum()

In [None]:
# Write the unique names in the export table as a flat JSON array.
pd.Series(export.name.unique()).to_json('out/names.json', orient='values')

In [None]:
# Leftover scratch call, disabled: to_json needs a DataFrame instance, so the
# unbound call below raises a TypeError.
# pd.DataFrame.to_json()

### Ideas:
<li> Total deaths per name
<li> Cumulative deaths vs. cumulative births