In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
n1988 = pd.read_csv("~/Downloads/birth/NATL1988.txt")

In [None]:
# From the documentation of the detail natality file for 1990 data:
# NCHS has adopted a new policy on release of vital statistics unit record
# data files.  This new policy was implemented for the 1989 vital event
# files to prevent the inadvertent disclosure of individuals and institutions.
# As a result, the files for 1989 and later years do not contain the
# actual day of the birth or the dates of birth of the mother or father.
# The geographic detail is also restricted...

# These datasets are currently distributed by the CDC Vital Statistics Online Portal
# https://www.cdc.gov/nchs/data_access/vitalstatsonline.htm

# and the NATL1988.zip dataset is 
# https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/DVS/natality/Nat1988.zip
# with the data dictionary https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Dataset_Documentation/DVS/natality/Nat1988doc.pdf


In [None]:
n1988.head()

Pandas thought the first row was a header definition row (which isn't going to work with fixed-width fields like we have here).  I do not want to lose the first row, or to refer to the column by that row's contents, so turn off header parsing:

In [None]:
n1988 = pd.read_csv("~/Downloads/birth/NATL1988.txt", header=None)

In [6]:
n1988.columns

Int64Index([0], dtype='int64')

Now I have a single column, creatively named "0"

In [None]:
n1988[0]

And you can see I have an index running from 0 to 3,913,792

In [None]:
n1988[0][0]

Accessing the rows with square brackets works, but I should probably use .loc

In [None]:
n1988[0].loc[1000]

This is a row.. How long is it?

In [None]:
len(n1988[0].loc[0])

In [None]:
# This looks like a row.. can I get columns?

In [None]:
n1988[0].loc[0][105]

It looks like I can access columns with square brackets and a column number.

In [None]:
# Loop over the data, get the 60th column "birth order" and put it in a dataframe.
# I'll create an empty list, fill it with strings, and then 
# convert the list of strings into a dataframe.

# Take the character at Python index 60 of every record, then wrap the
# resulting list of strings in a dataframe.
p = [n1988[0].loc[i][60] for i in n1988.index]
pf = pd.DataFrame(p)
pf.head()

Note the data formats: I created an empty list, filled it with strings, and then converted the list of strings into a dataframe.

In [None]:
pf.head()

In [None]:
pf.value_counts()

In [None]:
# This does not look right.  
# This does not look like column 60, birth order, but
# Column 61, the tens place of "detail live birth order"
#  OK.  This is the python-starts-at-zero property.

In [None]:
# Loop over the data, get column 60 (total birth order recode 9) and 
# 41-42  (age of mother single years recode 36)  and put it in a dataframe.
# Documentation positions are 1-based and inclusive; Python slices are
# 0-based and half-open, hence the "- 1" on the starting position.
p = [n1988[0][i][60 - 1] for i in n1988.index]          # birth order, column 60
q = [n1988[0][i][(41 - 1):42] for i in n1988.index]     # maternal age, columns 41-42

pf = pd.DataFrame({"BIRTHORDER": p, "MATERNALAGE": q})

In [None]:
pf.head()

In [None]:
pf["BIRTHORDER"].value_counts()

In [None]:
pf.MATERNALAGE.value_counts()

In [None]:
dir(pf.MATERNALAGE.value_counts())

In [None]:
pf.MATERNALAGE.value_counts().values

In [None]:
plt.bar(pf["BIRTHORDER"].value_counts().index, pf["BIRTHORDER"].value_counts().values)

In [None]:
plt.xlabel("Birth order")

In [None]:
# Documentation positions are 1-based and inclusive; Python slices are
# 0-based and half-open, hence the "- 1" on the starting position.
p = [n1988[0][i][60 - 1] for i in n1988.index]          # birth order, column 60
q = [n1988[0][i][(41 - 1):42] for i in n1988.index]     # maternal age, columns 41-42

pf = pd.DataFrame({"BIRTHORDER": p, "MATERNALAGE": q})

In [None]:
# Loop over the data, get columns 84-87 (month and date of birth)
# and put it in a dataframe.
# Positions are 1-based and inclusive in the documentation, so the field
# covering columns a-b is the Python slice [(a - 1):b].
p = [n1988[0][i][(84 - 1):85] for i in n1988.index]  # MONTH
q = [n1988[0][i][(86 - 1):87] for i in n1988.index]  # DAY
birthdate = pd.DataFrame({"MONTH": p, "DAY": q})

In [None]:
birthdate.MONTH.value_counts()

In [None]:
months = birthdate.MONTH.value_counts().sort_values()
months

In [None]:
months = birthdate.MONTH.value_counts().sort_values("index")
months

In [None]:
months = birthdate.MONTH.value_counts().sort_index()
months

In [None]:
plt.bar(months.index, months.values)

In [None]:
days = birthdate.DAY.value_counts().sort_index()
days

In [None]:
plt.bar(days.index, days)

In [None]:
plt.figure(figsize=(15,10))
plt.bar(days.index, days)

In [None]:
pd.to_datetime("1988-01-01")

In [None]:
birthdate.head()

I'm pretty sure I'm going to need to change these into a time format to make good use of them.  Let us construct an ISO-8601-compliant string like 1988-01-02 and add it as a new column.

In [None]:
birthdate["ISO8601"] = pd.to_datetime("1988" + birthdate["MONTH"]+ birthdate["DAY"])

In [None]:
# ParserError: day is out of range for month: 19881099

# Believe it or not, this is good news.  October 99th is not being coded as a date.
# https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html
# we have options errors="ignore" which will set the date to the input or
# errors="coerce" which will set the date to Not a Time.
birthdate["DATE"] = pd.to_datetime("1988" + birthdate["MONTH"]+ birthdate["DAY"], errors="coerce")

In [None]:
# It completes this time.  Look at the output 
birthdate.head()

In [None]:
# This is the example code from a python module for making calendars:
# https://github.com/tomkwok/calplot
import calplot
import numpy as np; np.random.seed(sum(map(ord, 'calplot')))
import pandas as pd
all_days = pd.date_range('1/1/2019', periods=730, freq='D')
days = np.random.choice(all_days, 500)
events = pd.Series(np.random.randn(len(days)), index=days)
calplot.calplot(events)


In [None]:
all_days = pd.date_range('1/1/1988', periods=365, freq='D')
days = np.random.choice(all_days, 500)
events = pd.Series(np.random.randn(len(days)), index=days)
calplot.calplot(events)
events.head()

In [None]:
datehist = birthdate["DATE"].value_counts()
datehist.head()

In [None]:
# This looks promising.  Let us try making a calendar plot.
calplot.calplot(datehist)


In [None]:
plt.bar(datehist)

In [None]:
plt.bar(range(len(datehist)), datehist)

In [None]:
#  Aaaargghh.  When I created the histogram, it sorted by value, which is bad.
#  Why do we have a bimodal distribution here?

In [None]:
plt.bar(range(len(datehist)), datehist.sort_index())

In [None]:
# Make this wider, more pleasant to look at
plt.figure(figsize=(16,8))
plt.bar(range(1,len(datehist)+1), datehist.sort_index(), width=1)
plt.xlabel ("Day in 1988")

In [None]:
# This is not presentation-ready, but this gives us some pretty good insight into
# three drivers of birth timing, weekends, holidays, and a seasonal modulation.

In [None]:
# Let us look back at our data parser, though.  
# It's slow, and I don't want to edit it every time
# a new column comes to my attention.

# Loop over the data, get columns 84-87 (month and date of birth)
# and put it in a dataframe.
# Positions are 1-based and inclusive in the documentation, so the field
# covering columns a-b is the Python slice [(a - 1):b].
p = [n1988[0][i][(84 - 1):85] for i in n1988.index]  # MONTH
q = [n1988[0][i][(86 - 1):87] for i in n1988.index]  # DAY
birthdate = pd.DataFrame({"MONTH": p, "DAY": q})

Looking at the description...

![Data description](FIELDS.jpg "Excerpt from data description showing Date and month fields")

To get the data into our computing environment, we have three tasks:
1. extract columns (with the right field width)
2. associate columns with human-readable names 
3. decode symbols 

If we can put the layout of the database *into data, instead of code*, we can expand the scope of our investigation by modifying data, *not code*.  This will work well for us.

In [None]:
# let's start with trying to use data to specify the layout,
# write a parser that generates a dataframe with column names,
# and we can expand to more columns (and decode the symbols) later.


In [None]:
# Let us start with something like these fields (which do not require decoding)
# This is a (pure python) list of lists
#  [ [startcolumn, stopcolumn, fieldname] ] 
dict1988 =[
    [84, 85, "MONTHOFBIRTH"],
    [86, 87, "DAYOFMONTH"],
]
# Since this is our data structure for our use, we can use any data type that works;
# there is no requirement that we use pandas or numpy here.


# Now, to organize my thoughts, I'm going to write a function prototype with a docstring that tells me what I need and what I promise to deliver.

In [None]:
def parse_natality(natalitydata, datadictionary):
    '''Prototype: turn raw natality records into a dataframe of named fields.

    Inputs
        natalitydata    pandas dataframe holding the raw records
        datadictionary  list of lists, one [start, stop, column_name] entry
                        per field, for example [[41, 42, "MATERNALAGE"]]
    Promise
        a pandas dataframe with the same number of rows as natalitydata and
        one labeled column per datadictionary entry

    Not implemented yet -- this stub only records the contract.
    '''

In [None]:
#  Where to start?  For loop over data dictionary first, or for loop over data?
#  Hint: which one is more expensive? This is the one we will want to do once.
# 

In [None]:
print(type(n1988[0]))
n1988[0]

In [None]:
print(type(n1988[0].loc[42]))
n1988[0].loc[42]

In [None]:
print(type(n1988[0].loc[42][83:84+1]))
n1988[0].loc[42][83:84+1]

In [None]:
# So I can access the cth column and ith row with df[0].loc[i][c]

In [None]:
# First I'm going to make certain I have the data I think I do
def parse_natality(natalitydata, datadictionary):
    '''Dry run: echo the layout entries that would be applied to the first rows.

    natalitydata is a pandas dataframe of raw records and datadictionary is a
    list of [start, stop, column_name] entries (for example
    [[41, 42, "MATERNALAGE"]]).  Nothing is extracted or returned yet; for
    every row whose index label is below 3, each layout entry is printed so
    the loop structure can be checked by eye.
    '''
    for row_label in natalitydata.index:
        preview = row_label < 3
        for first_col, last_col, field_name in datadictionary:
            if preview:
                print(row_label, first_col, last_col, field_name)

In [None]:
parse_natality(n1988, dict1988)

In [None]:
# Above, we had hard-coded lists p and q that took all our intermediate data.  
# If we are going to automate column-extraction, we need a more flexible data structure.
# Dictionary of lists.
# So let's make a dictionary of empty lists first.
def parse_natality(natalitydata, datadictionary):
    '''Scaffold for the parser: set up per-column storage and echo the layout.

    natalitydata is a pandas dataframe of raw records and datadictionary is a
    list of [start, stop, column_name] entries (for example
    [[41, 42, "MATERNALAGE"]]).  No fields are extracted yet: the return
    value is a dictionary mapping each column_name to an empty list, and the
    layout entries are printed for rows whose index label is below 3.
    '''
    # One empty list per requested column.
    intermediate_data = {name: [] for _, _, name in datadictionary}
    for row_label in natalitydata.index:
        preview = row_label < 3
        for first_col, last_col, name in datadictionary:
            if preview:
                print(row_label, first_col, last_col, name)
    return intermediate_data

In [None]:
parse_natality(n1988, dict1988)

In [None]:
def parse_natality(natalitydata, datadictionary):
    '''First attempt at slicing fields out of the raw records.

    natalitydata is a pandas dataframe whose column 0 holds the raw record
    strings; datadictionary is a list of [start, stop, column_name] entries
    (for example [[41, 42, "MATERNALAGE"]]).  For rows whose index label is
    below 3, the slice record[start:stop + 1] is printed next to its layout
    entry.  The returned dictionary maps each column_name to an empty list;
    nothing is stored in it yet.
    '''
    intermediate_data = {name: [] for _, _, name in datadictionary}
    for row_label in natalitydata.index:
        for first_col, last_col, name in datadictionary:
            if not row_label < 3:
                continue
            # start/stop are used directly as Python slice bounds here.
            window = slice(first_col, last_col + 1)
            datafield = natalitydata.loc[row_label, 0][window]
            print(row_label, first_col, last_col, name, datafield)
    return intermediate_data

In [None]:
parse_natality(n1988, dict1988)

In [None]:
# Well, that's not right.  Day of month 80?  Birth order 0?  Off-by-one again
def parse_natality(natalitydata, datadictionary):
    '''Preview field extraction using 1-based, inclusive column positions.

    natalitydata is a pandas dataframe whose column 0 holds the raw record
    strings; datadictionary is a list of [start, stop, column_name] entries
    (for example [[41, 42, "MATERNALAGE"]]).  For rows whose index label is
    below 3, the characters start..stop (counted from 1, both ends included)
    are printed next to the layout entry.  The returned dictionary maps each
    column_name to an empty list; nothing is stored in it yet.
    '''
    intermediate_data = {name: [] for _, _, name in datadictionary}
    for row_label in natalitydata.index:
        for first_col, last_col, name in datadictionary:
            if not row_label < 3:
                continue
            # 1-based inclusive [first_col, last_col] == 0-based half-open [first_col - 1, last_col)
            window = slice(first_col - 1, last_col)
            datafield = natalitydata.loc[row_label, 0][window]
            print(row_label, first_col, last_col, name, datafield)
    return intermediate_data

In [None]:
parse_natality(n1988, dict1988)

In [None]:
# These values look sane now, so let us put the data in lists.
def parse_natality(natalitydata, datadictionary):
    '''Extract every requested field from every record into plain lists.

    natalitydata is a pandas dataframe whose column 0 holds the raw record
    strings; datadictionary is a list of [start, stop, column_name] entries
    (for example [[41, 42, "MATERNALAGE"]]) with 1-based, inclusive
    positions.  Returns a dictionary mapping each column_name to a list
    holding that field's text for every row, in row order.
    '''
    intermediate_data = {name: [] for _, _, name in datadictionary}
    for row_label in natalitydata.index:
        record = natalitydata.loc[row_label, 0]
        for first_col, last_col, name in datadictionary:
            # 1-based inclusive positions -> 0-based half-open slice
            intermediate_data[name].append(record[first_col - 1:last_col])
    return intermediate_data

In [None]:
parse_natality(n1988, dict1988)

In [None]:
# Takes a while this time.  And it returns a hash of lists.  I'd prefer pandas data frames..
import time
starttime = time.time()
parse_natality(n1988, dict1988)
stoptime = time.time()
print(stoptime-starttime)   # Agh.   12 minutes 

In [None]:
# My code doesn't even give me a dataframe yet and it's taking forever.  
# Faster feedback will take me to my goal faster.. so create a dataframe 
# that only has 1% of the data: 
n1988sample = n1988[::100]
len(n1988sample)

In [None]:
# transfer data from intermediate_data hash to pandas dataframe at the end
def parse_natality(natalitydata, datadictionary):
    '''Row-by-row parser that hands back a dataframe instead of a dictionary.

    natalitydata is a pandas dataframe whose column 0 holds the raw record
    strings; datadictionary is a list of [start, stop, column_name] entries
    (for example [[41, 42, "MATERNALAGE"]]) with 1-based, inclusive
    positions.  Each field is collected into a list, one row at a time, and
    the lists are then copied into a new dataframe column by column (each
    column name is printed as it is transferred).  The result has one row
    per input row, numbered from 0 rather than carrying the input's index.
    '''
    collected = {name: [] for _, _, name in datadictionary}
    for row_label in natalitydata.index:
        record = natalitydata.loc[row_label, 0]
        for first_col, last_col, name in datadictionary:
            collected[name].append(record[first_col - 1:last_col])
    # Move the finished lists into a dataframe, one labeled column at a time.
    df = pd.DataFrame()
    for _, _, name in datadictionary:
        print(name)
        df[name] = pd.Series(collected[name])
    return df

In [None]:
starttime=time.time()
df = parse_natality(n1988sample, dict1988)
stoptime=time.time()
print(stoptime-starttime)

In [None]:
df

In [None]:
#  This puts my data in a reasonably good place, but now I worry about efficiency.
#  This didn't take long, but it was 1% of the whole dataset.
#  I only have to run this when I want to get new columns, but still, I want to 
#  lower my personal cost of accessing information.

# Why am I looping through all 4M records?  Because I am accessing 
# the columns by .loc[i][colnumber].  Can pandas help me?
# https://pandas.pydata.org/pandas-docs/version/1.3/user_guide/text.html 

# Series.str.get() will extract a single character position *as a pandas Series*,
# so I can get all the rows with a single operation. 

In [None]:
n1988sample[0].str.get(80)

In [None]:
def parse_natality(natalitydata, datadictionary):
    '''Vectorized first pass: one character per field, all rows at once.

    natalitydata is a pandas dataframe whose column 0 holds the raw record
    strings; datadictionary is a list of [start, stop, column_name] entries
    (for example [[41, 42, "MATERNALAGE"]]) with 1-based positions.  Only the
    character at position start is extracted for each entry (stop is not
    used yet).  Returns a dataframe with one labeled column per entry, on
    the same index as natalitydata.
    '''
    records = natalitydata[0]
    df = pd.DataFrame()
    for first_col, _last_col, name in datadictionary:
        # .str[k] pulls character k (0-based) out of every record in one call.
        df[name] = records.str[first_col - 1]
    return df


In [None]:
starttime=time.time()
df = parse_natality(n1988sample, dict1988) 
stoptime=time.time()
print(stoptime-starttime)

In [None]:
df

In [None]:
# almost there, but pd.str.get()  only gets one column at a time.


In [None]:
def parse_natality(natalitydata, datadictionary):
    '''Vectorized parser: assemble each field one character position at a time.

    natalitydata is a pandas dataframe whose column 0 holds the raw record
    strings; datadictionary is a list of [start, stop, column_name] entries
    (for example [[41, 42, "MATERNALAGE"]]) with 1-based, inclusive
    positions.  Returns a dataframe with one labeled string column per
    entry, on the same index as natalitydata.
    '''
    records = natalitydata[0]
    df = pd.DataFrame()
    for first_col, last_col, name in datadictionary:
        # Seed the column with the field's first character ...
        df[name] = records.str.get(first_col - 1)
        # ... then glue on the following characters, left to right.
        for offset in range(1, last_col - first_col + 1):
            next_char = records.str.get(first_col - 1 + offset)
            df[name] = df[name].str.cat(next_char)
    return df

In [None]:
starttime=time.time()
df = parse_natality(n1988sample, dict1988) 
stoptime=time.time()
print(stoptime-starttime)
df.head()

In [None]:
# Looks reasonable.  We can expand to the entire dataset and
# make sure it completes in a reasonable time:
starttime=time.time()
df = parse_natality(n1988, dict1988) 
stoptime=time.time()
print(stoptime-starttime)
df.head()

In [None]:
type(df.MONTHOFBIRTH)

In [None]:
# That doesn't answer my question
type(df.MONTHOFBIRTH[0])
# One more picky little request.. numeric data types?


In [None]:
def parse_natality(natalitydata, datadictionary, numeric=False):
    '''Extract fixed-width fields from raw natality records into named columns.

    Parameters
    ----------
    natalitydata : pandas.DataFrame
        Frame whose column ``0`` holds one fixed-width record (a string) per row.
    datadictionary : list of [start, stop, column_name]
        Field layout using 1-based, inclusive character positions, for
        example ``[[41, 42, "MATERNALAGE"]]``.
    numeric : bool, optional
        If True, every extracted column is passed through ``pd.to_numeric``
        before it is returned.  The default (False) returns the fields as the
        strings found in the records, leading zeros included.

    Returns
    -------
    pandas.DataFrame
        One labeled column per layout entry, on the same index as
        ``natalitydata``.  A record too short to contain a whole field gets
        NaN for that field.
    '''
    records = natalitydata[0]
    record_lengths = records.str.len()
    df = pd.DataFrame()
    for start, stop, column_name in datadictionary:
        # An entry with stop < start is treated as the single character at start.
        last = max(start, stop)
        # 1-based inclusive [start, last] is the 0-based half-open slice [start - 1, last).
        field = records.str.slice(start - 1, last)
        # Fields that run off the end of a short record become NaN rather
        # than a silently truncated value.
        df[column_name] = field.where(record_lengths >= last)
    if numeric:
        # The conversion has to happen before the return, and on the result
        # frame (df), not on the pandas module.
        for start, stop, column_name in datadictionary:
            df[column_name] = pd.to_numeric(df[column_name])
    return df

In [None]:
df = parse_natality(n1988, dict1988) 
type(df.MONTHOFBIRTH[0])
df.head()

In [None]:
# I am now satisfied that it is working.
dict1988 =[
    [84, 85, "MONTHOFBIRTH"],
    [86, 87, "DAYOFMONTH"],
    [60, 60, "BIRTHODRDER"],
    [41, 42, "MATERNALAGE"],
    [28, 29, "STATE"], 
    [69, 70, "PATERNALAGE"],
]

In [None]:
df = parse_natality(n1988, dict1988) 


In [None]:
age = df.MATERNALAGE.value_counts()

In [None]:
df.MATERNALAGE.value_counts().sort_index()

In [None]:
age = df.MATERNALAGE.value_counts()

In [None]:
df.MATERNALAGE.value_counts().sort_index()

In [None]:
plt.bar(x=df.MATERNALAGE.value_counts().sort_index().index, height=df.MATERNALAGE.value_counts().sort_index().values)

In [None]:
plt.figure(figsize=(15,8))
plt.bar(x=df.MATERNALAGE.value_counts().sort_index().index, height=df.MATERNALAGE.value_counts().sort_index().values)


In [None]:
plt.hist2d(pd.to_numeric(df["MATERNALAGE"]), pd.to_numeric(df["PATERNALAGE"]))

In [None]:
# More bins?
plt.hist2d(pd.to_numeric(df["MATERNALAGE"]), pd.to_numeric(df["PATERNALAGE"]), bins=30)

In [None]:
# And here we have an example of the histogrammer's migraine.
# The bin boundaries for both MATERNALAGE and PATERNALAGE
# sometimes encompass different numbers of integer values of the
# underlying distribution over nonnegative integers.
#
# We can always fix this by explicitly defining the bins so that
# each bin covers a whole number of integers.

In [None]:
bins1 = np.arange(10,50) +.5
bins2 = np.arange(10,100)+.5

In [None]:
plt.hist2d(pd.to_numeric(df["MATERNALAGE"]), pd.to_numeric(df["PATERNALAGE"]), bins=(bins1, bins2))
plt.xlabel("Maternal age"); plt.ylabel("Paternal age")

In [None]:
# Now we see something peculiar: the catch-all category
# PATERNALAGE=99 is a symbol for missing values.
# And just glancing at the heatmap shows that the
# distribution of MATERNALAGE given PATERNALAGE=99 is
# peaked around 18.

In [None]:
paternalage = df.PATERNALAGE.value_counts().sort_index()
plt.bar(paternalage.index, paternalage.values)
maternalage = df.MATERNALAGE.value_counts().sort_index()

In [None]:
paternalage = df.PATERNALAGE.value_counts().sort_index()[:-1]

plt.subplot(211)
plt.bar(paternalage.index, paternalage.values, alpha=0.5)
plt.subplot(211)
plt.bar(maternalage.index, maternalage.values, alpha=0.5)



In [None]:

plt.subplot(211)
plt.bar(pd.to_numeric(paternalage.index), paternalage.values, alpha=0.5)
plt.subplot(212)
plt.bar(pd.to_numeric(maternalage.index), maternalage.values, alpha=0.5)


In [None]:
plt.figure(figsize=(15,8))
plt.bar(pd.to_numeric(paternalage.index), paternalage.values, alpha=0.5, label="paternal age (where stated)")
plt.bar(pd.to_numeric(maternalage.index), maternalage.values, alpha=0.5, label="maternal age")
plt.legend()

In [None]:
df["MATERNALAGE"]=pd.to_numeric(df.MATERNALAGE)

In [None]:
# Since I have a list of individual rows.. and I want something like a histogram.. I look
# into the pandas documentation for methods that aggregate dataframes:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html
#
gb = df.groupby(["MATERNALAGE", "STATE"])

In [None]:
gb.MATERNALAGE.median()

In [None]:
gb = df.groupby(["STATE", "MATERNALAGE"])

In [None]:
gb.MATERNALAGE.count()

In [None]:
gb.STATE.count()

In [None]:
df.groupby("STATE")["MATERNALAGE"].median()

In [None]:
#If only we could decode the states, we would see which states have on average older mothers.

* Fail early and often
* Invest in faster feedback
  * Hack at code with a data subset
* Specify what you want before you do it
* Get it right first, make it fast later
* Usually better to use p

In [None]:
# Cleaning up the numbers which are symbols is a little messier.  It is left as an exercise for the student.



In [None]:
# Build the STATE decoder: the first whitespace-separated token on each line
# is the key and the second token is the value it decodes to.
# NOTE(review): the file is named .csv but is split on whitespace, not commas
# -- confirm that matches the file's actual layout.
h = {}
with open("/Users/wltrimbl/Downloads/birth/STATE2829decoder.csv") as decoder_file:
    for line in decoder_file:
        fields = line.strip().split()
        if len(fields) < 2:
            # Skip blank or incomplete lines instead of failing with IndexError.
            continue
        h[fields[0]] = fields[1]
h

In [None]:
# Decode every STATE code with one vectorized lookup.  This replaces a
# row-by-row loop that wrote through chained indexing
# (df["STATEA"].loc[i] = ...), which pandas does not guarantee will update df.
df["STATEA"] = df["STATE"].map(h)

# A code missing from the decoder used to raise KeyError; keep failing loudly
# rather than leaving silent NaN values behind.
unknown_codes = df.loc[df["STATEA"].isna(), "STATE"].unique()
if len(unknown_codes) > 0:
    raise KeyError("STATE codes missing from decoder: {}".format(list(unknown_codes)))

# Spot-check every 100th row instead of printing progress for the whole file.
df[["STATE", "STATEA"]].iloc[::100].head()