In [None]:
%%javascript
var command = "nb_name = '" + IPython.notebook.notebook_path + "'; nb_name = nb_name.split('/')[-1]";
IPython.notebook.kernel.execute(command);

In [None]:
import requests
import datetime
from lxml import etree
import csv
import lore
import os
import pandas
# `nb_name` is set by the %%javascript cell above. Remember the directory the
# kernel was in the first time this cell ran, so that re-running the cell
# resolves the same directory instead of climbing one level higher each time.
_initial_cwd = globals().setdefault('_initial_cwd', os.getcwd())
lore_dir = os.path.abspath(os.path.join(_initial_cwd, os.path.dirname(nb_name), '..'))
os.chdir(lore_dir)

Download the most popular baby names for each year from the Social Security Administration

In [None]:
# Scrape one table of names per year and cache each year as a CSV file, then
# load every year into the `years` dict (year -> DataFrame).
url = 'https://www.ssa.gov/cgi-bin/popularnames.cgi'
first_available = 1880
most_recent = datetime.datetime.now().year - 1

# NOTE: `dir` shadows the builtin of the same name, but later cells read this
# variable, so the name is kept.
dir = os.path.join(lore.env.data_dir, 'usa_names')
os.makedirs(dir, exist_ok=True)

years = {}
for year in range(first_available, most_recent + 1):
    path = os.path.join(dir, str(year) + '.csv')
    # Only hit the network for years that are not cached on disk yet.
    if not os.path.exists(path):
        response = requests.post(url, data={'year': year, 'top': 1000, 'number': 'n'})
        # Fail loudly on an HTTP error instead of caching a CSV scraped from
        # an error page (the cache would never be refreshed afterwards).
        response.raise_for_status()
        html = response.text
        tree = etree.HTML(html)
        # newline='' lets the csv module control line endings itself.
        with open(path, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['rank', 'male_name', 'male_count', 'female_name', 'female_count'])
            for row in tree.xpath('body/table[2]/tr/td[2]/table/tr'):
                tds = row.xpath('td')
                # Rows without <td> cells are skipped.
                if tds:
                    # Strip thousands separators; cells without text are dropped.
                    writer.writerow([td.text.replace(',', '') for td in tds if td.text])
    # DataFrame.from_csv was removed from pandas; read_csv with index_col=0
    # keeps the first column ('rank') as the index, as from_csv did.
    years[year] = pandas.read_csv(path, index_col=0)


In [None]:
# Exploratory cell: fetch the table page and display the rows matched by the
# XPath expression that the next code cell iterates over.
url = 'https://www.ssa.gov/oact/STATS/table4c6.html'
response = requests.get(url)
# Stop here on an HTTP error rather than inspecting a parsed error page.
response.raise_for_status()
html = response.text
tree = etree.HTML(html)
path = os.path.join(lore.env.data_dir, 'actuary.csv')  # not used in this cell
tree.xpath('//*[@id="content"]/div/div[2]/div/table[1]/tbody/tr[2]/td/table/tbody/tr')

Get life expectancy by sex

In [None]:
# Scrape the table at `url` into actuary.csv, then load the actuarial table and
# derive cumulative survival curves for each sex.
url = 'https://www.ssa.gov/oact/STATS/table4c6.html'
response = requests.get(url)
# Fail loudly on an HTTP error instead of writing a CSV scraped from an error page.
response.raise_for_status()
html = response.text
tree = etree.HTML(html)
# Fixed: this previously read `lore.env.data_data`, an attribute typo for `data_dir`.
path = os.path.join(lore.env.data_dir, 'actuary.csv')
# newline='' lets the csv module control line endings itself.
with open(path, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow([
        'age',
        'male death probability',
        'male lives',
        'male life expectancy',
        'female death probability',
        'female lives',
        'female life expectency',
    ])
    for row in tree.xpath('//*[@id="content"]/div/div[2]/div/table[1]/tbody/tr[2]/td/table/tbody/tr'):
        tds = row.xpath('td')
        # Rows without <td> cells are skipped.
        if tds:
            # Strip thousands separators; cells without text are dropped.
            writer.writerow([td.text.replace(',', '') for td in tds if td.text])

# NOTE(review): this loads 'actuarial/table.csv', not the 'actuary.csv' written
# above -- confirm which file is intended before changing either path.
# DataFrame.from_csv was removed from pandas; read_csv with index_col=0 keeps
# the first column as the index, as from_csv did.
actuary = pandas.read_csv(os.path.join(lore.env.data_dir, 'actuarial', 'table.csv'), index_col=0)
# Despite the names, these hold the running product of (1 - death probability),
# i.e. a cumulative survival fraction (presumably one row per age -- confirm).
male_deaths = (1 - actuary['male death probability']).cumprod()
female_deaths = (1 - actuary['female death probability']).cumprod()

In [None]:
# Reload every cached per-year CSV into a dict keyed by year.
year_data = {}
for year in years:
    path = os.path.join(dir, str(year) + '.csv')
    # DataFrame.from_csv was removed from pandas; read_csv with index_col=0
    # keeps the first column as the index, as from_csv did.
    year_data[year] = pandas.read_csv(path, index_col=0)

In [None]:
from datetime import date, timedelta
from collections import defaultdict
# Build two nested lookups, name -> year -> sex -> count:
#   name_years         counts as recorded for that year
#   living_name_years  counts scaled by the cumulative survival fraction
this_year = date(date.today().year, 1, 1)
# Kept because the name-statistics cell that follows divides by it.
year_delta = timedelta(days=365.24)
name_years = defaultdict(lambda: defaultdict(dict))
living_name_years = defaultdict(lambda: defaultdict(dict))

# Ages beyond the end of the survival tables are treated as having no survivors.
max_age = min(len(male_deaths), len(female_deaths))

for year, data in year_data.items():
    # Whole calendar years between the cohort's year and the current year.
    # Floor-dividing a day count by a 365.24-day year came out one year short
    # whenever the span held too few leap days (e.g. 365 days // 365.24 == 0),
    # so neighbouring cohorts could be assigned the same age.
    age = this_year.year - year
    for row in data.itertuples():
        # row[0] is the frame index; the remaining fields follow column order.
        name_years[row[1]][year]['male'] = row[2]
        name_years[row[3]][year]['female'] = row[4]
        if age < max_age:
            living_name_years[row[1]][year]['male'] = row[2] * male_deaths.iat[age]
            living_name_years[row[3]][year]['female'] = row[4] * female_deaths.iat[age]
        else:
            living_name_years[row[1]][year]['male'] = 0
            living_name_years[row[3]][year]['female'] = 0

In [None]:
# For every name, total the estimated living counts per sex and compute the
# count-weighted mean age plus the male share ('sex').
name_stats = defaultdict(lambda: dict(total=0, male=0, female=0, mean_age=0))
for name in name_years:
    summary = name_stats[name]
    for year, by_sex in living_name_years[name].items():
        years_old = (this_year - date(year, 1, 1)) / year_delta
        boys = by_sex.get('male', 0)
        girls = by_sex.get('female', 0)
        summary['male'] += boys
        summary['female'] += girls
        summary['total'] += (boys + girls)
        # Accumulate the weighted sum here; it is normalised after the loop.
        summary['mean_age'] += (boys + girls) * years_old
    if summary['total'] > 0:
        summary['mean_age'] = summary['mean_age'] / summary['total']
        summary['sex'] = summary['male'] / summary['total']
    else:
        summary['mean_age'] = summary['sex'] = 0
# Show one record as a spot check.
name_stats['Piper']

In [None]:
from matplotlib import pyplot
from matplotlib.patches import Patch

def plot_sexes(name):
    """Plot recorded and estimated-living counts per year for `name`, by sex.

    Draws four lines (male, female, living male, living female) over the range
    first_available..most_recent, plus a vertical marker at the year that
    corresponds to the name's mean age.

    Reads the notebook globals `name_years`, `living_name_years`, `name_stats`,
    `first_available`, `most_recent` and `this_year`. A name with no data is
    plotted as all zeros.
    """
    birth_years = range(first_available, most_recent + 1)
    # Plain .get() lookups: indexing the nested defaultdicts would insert empty
    # entries for every missing name/year as a side effect of plotting.
    recorded = name_years.get(name, {})
    living = living_name_years.get(name, {})
    mean_age = name_stats.get(name, {}).get('mean_age', 0)

    males = [float(recorded.get(year, {}).get('male', 0)) for year in birth_years]
    females = [float(recorded.get(year, {}).get('female', 0)) for year in birth_years]
    living_males = [float(living.get(year, {}).get('male', 0)) for year in birth_years]
    living_females = [float(living.get(year, {}).get('female', 0)) for year in birth_years]

    pyplot.figure(dpi=200)
    pyplot.plot(birth_years, males, label="male", color='#ADD8E6')
    pyplot.plot(birth_years, females, label="female", color='#FFC0CB')
    pyplot.plot(birth_years, living_males, label="living male", color='#6666FF')
    pyplot.plot(birth_years, living_females, label="living female", color='#FF6666')
    # Ages are measured back from `this_year`, so the marker is anchored there.
    # It used to read a global `year` left over from whichever loop ran last.
    pyplot.axvline(x=(this_year.year - mean_age), label=('mean age: %3.1f' % mean_age), color='green')
    pyplot.xlabel('year')
    pyplot.ylabel('births')
    pyplot.title('Babies Named ' + name)
    pyplot.grid(True)
    pyplot.legend(loc='best')
    pyplot.show()

In [None]:
# Chart recorded and estimated-living counts by sex for the name 'Montana'.
plot_sexes('Montana')

In [None]:
# Chart recorded and estimated-living counts by sex for the name 'Natalie'.
plot_sexes('Natalie')

In [None]:
# Chart recorded and estimated-living counts by sex for the name 'Josh'.
plot_sexes('Josh')

In [None]:
# Export one row per name that has a positive living count: lower-cased name,
# male share, mean age and total living count.
# newline='' lets the csv module control line endings itself; without it,
# platforms that translate '\n' on write end up with blank lines between rows.
with open(os.path.join(dir, 'names.csv'), 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(('name', 'male', 'mean_age', 'sample_size'))
    for name, stats in name_stats.items():
        living = stats['male'] + stats['female']
        # Names with no living count are skipped (this also avoids dividing by zero).
        if living > 0:
            writer.writerow((name.lower(), stats['male'] / living, stats['mean_age'], stats['total']))


In [None]:
# Read the exported file back and print each parsed row as a quick check.
names_path = os.path.join(dir, 'names.csv')
with open(names_path, 'r') as handle:
    for record in csv.reader(handle):
        print(record)
