# USA Births

## Load data

In [1]:
csv_list = open("US_births_1994-2003_CDC_NCHS.csv").read().split("\n")

In [6]:
csv_list[0:10]

['year,month,date_of_month,day_of_week,births',
 '1994,1,1,6,8096',
 '1994,1,2,7,7772',
 '1994,1,3,1,10142',
 '1994,1,4,2,11248',
 '1994,1,5,3,11053',
 '1994,1,6,4,11406',
 '1994,1,7,5,11251',
 '1994,1,8,6,8653',
 '1994,1,9,7,7910']

In [7]:
def read_csv(csv_name) :
    string_list = open(csv_name).read().split("\n")
    firstLine = string_list.pop(0) 
    final_list = []
    for x in string_list :
        string_fields = x.split(",")
        final_list.append(list(map(int, string_fields)))
    return(final_list)

In [8]:
cdc_list = read_csv("US_births_1994-2003_CDC_NCHS.csv")

In [9]:
cdc_list[0:10]

[[1994, 1, 1, 6, 8096],
 [1994, 1, 2, 7, 7772],
 [1994, 1, 3, 1, 10142],
 [1994, 1, 4, 2, 11248],
 [1994, 1, 5, 3, 11053],
 [1994, 1, 6, 4, 11406],
 [1994, 1, 7, 5, 11251],
 [1994, 1, 8, 6, 8653],
 [1994, 1, 9, 7, 7910],
 [1994, 1, 10, 1, 10498]]

# Births by Month

In [10]:
def month_births(input_list) :
    births_per_month = {}
    for x in input_list:
        month = x[1]
        births = x[4]
        if month in births_per_month:
            births_per_month[month] += births
        else:
            births_per_month[month] = births
    return(births_per_month)

In [11]:
cdc_month_births = month_births(cdc_list)

In [12]:
cdc_month_births

{1: 3232517,
 2: 3018140,
 3: 3322069,
 4: 3185314,
 5: 3350907,
 6: 3296530,
 7: 3498783,
 8: 3525858,
 9: 3439698,
 10: 3378814,
 11: 3171647,
 12: 3301860}

Births appear to be evenly spread across all months, which the highest being August and the lowest being Feburary. Feburary has fewer days than all the other months which could account for this difference. 

## Average Births per month

In [13]:
days_per_month = {
    1:31, 
    2:28.25, 
    3:31, 
    4:30, 
    5:31, 
    6:30,
    7:31, 
    8:31, 
    9:30, 
    10:31, 
    11:30, 
    12:31
}

cdc_month_births_rate = {}

for key in cdc_month_births :
    if key not in cdc_month_births_rate :
        cdc_month_births_rate[key] = cdc_month_births[key] // days_per_month[key]

cdc_month_births_rate

{1: 104274,
 2: 106836.0,
 3: 107163,
 4: 106177,
 5: 108093,
 6: 109884,
 7: 112863,
 8: 113737,
 9: 114656,
 10: 108994,
 11: 105721,
 12: 106511}

This is now showing the average births per day in each Month, where there does appear to be an increase of births in July, August and September compared to the rest of the year. This could align with babies being concieved around the holiday season (Thanksgiving/Christmas/New Years). 

## Births by Day of the Week

In [14]:
def dow_births(input_list) :
    day_of_week = {}
    for x in input_list:
        dow = x[3]
        births = x[4]
        if dow in day_of_week:
            day_of_week[dow] += births
        else:
            day_of_week[dow] = births
    return(day_of_week)

In [15]:
cdc_day_births = dow_births(cdc_list)

In [16]:
cdc_day_births

{1: 5789166,
 2: 6446196,
 3: 6322855,
 4: 6288429,
 5: 6233657,
 6: 4562111,
 7: 4079723}

In [17]:
cdc_list[0:5]

[[1994, 1, 1, 6, 8096],
 [1994, 1, 2, 7, 7772],
 [1994, 1, 3, 1, 10142],
 [1994, 1, 4, 2, 11248],
 [1994, 1, 5, 3, 11053]]

## Create function to calculate births by factor desired 

In [21]:
def calc_counts(data, column) :
    results = {}
    for x in data:
        interest = x[column-1]
        births = x[4]
        if interest in results:
            results[interest] += births
        else:
            results[interest] = births
    return(results)

In [22]:
cdc_year_births = calc_counts(cdc_list, 1)
cdc_month_births = calc_counts(cdc_list, 2)
cdc_dom_births = calc_counts(cdc_list, 3)
cdc_dow_births = calc_counts(cdc_list, 4)

In [23]:
cdc_year_births

{1994: 3952767,
 1995: 3899589,
 1996: 3891494,
 1997: 3880894,
 1998: 3941553,
 1999: 3959417,
 2000: 4058814,
 2001: 4025933,
 2002: 4021726,
 2003: 4089950}

In [24]:
cdc_month_births

{1: 3232517,
 2: 3018140,
 3: 3322069,
 4: 3185314,
 5: 3350907,
 6: 3296530,
 7: 3498783,
 8: 3525858,
 9: 3439698,
 10: 3378814,
 11: 3171647,
 12: 3301860}

In [25]:
cdc_dom_births

{1: 1276557,
 2: 1288739,
 3: 1304499,
 4: 1288154,
 5: 1299953,
 6: 1304474,
 7: 1310459,
 8: 1312297,
 9: 1303292,
 10: 1320764,
 11: 1314361,
 12: 1318437,
 13: 1277684,
 14: 1320153,
 15: 1319171,
 16: 1315192,
 17: 1324953,
 18: 1326855,
 19: 1318727,
 20: 1324821,
 21: 1322897,
 22: 1317381,
 23: 1293290,
 24: 1288083,
 25: 1272116,
 26: 1284796,
 27: 1294395,
 28: 1307685,
 29: 1223161,
 30: 1202095,
 31: 746696}

In [26]:
cdc_dow_births

{1: 5789166,
 2: 6446196,
 3: 6322855,
 4: 6288429,
 5: 6233657,
 6: 4562111,
 7: 4079723}

### Get Min and Max number of Births for days of the week

In [27]:
def min_and_max(input_dict) : 
    min_value = min(input_dict.items())
    max_value = max(input_dict.items())
    return(min_value, max_value)

In [28]:
min_and_max(cdc_dow_births)

((1, 5789166), (7, 4079723))

Sunday (1) appears to have the most births whereas Saturday (7) has the fewest. 

## Calculate differences by year

In [55]:
def per_year(data, column, column_value) :
    results = {}
    years = list(set([item[0] for item in data]))
    for x in years:
        for y in data:
            if y[0] == x and y[column-1] == column_value :
                key = str(y[0]) + "_" + str(y[column-1])
                interest = y[column-1]
                births = y[4]     
                if key in results:
                    results[key] += births
                else:
                    results[key] = births
    return(results)       

In [71]:
for i in range(1,13) : 
    print(per_year(cdc_list, 2, i))

{'1994_1': 320705, '1995_1': 316013, '1996_1': 314283, '1997_1': 317211, '1998_1': 319340, '1999_1': 319182, '2000_1': 330108, '2001_1': 335198, '2002_1': 330674, '2003_1': 329803}
{'1994_2': 301327, '1995_2': 295094, '1996_2': 301763, '1997_2': 291541, '1998_2': 298711, '1999_2': 297568, '2000_2': 317377, '2001_2': 303534, '2002_2': 303977, '2003_2': 307248}
{'1994_3': 339736, '1995_3': 328503, '1996_3': 322581, '1997_3': 321212, '1998_3': 329436, '1999_3': 332939, '2000_3': 340553, '2001_3': 338684, '2002_3': 331505, '2003_3': 336920}
{'1994_4': 317392, '1995_4': 309119, '1996_4': 312595, '1997_4': 314230, '1998_4': 319758, '1999_4': 316889, '2000_4': 317180, '2001_4': 323613, '2002_4': 324432, '2003_4': 330106}
{'1994_5': 330295, '1995_5': 334543, '1996_5': 325708, '1997_5': 330331, '1998_5': 330519, '1999_5': 328526, '2000_5': 341207, '2001_5': 344017, '2002_5': 339007, '2003_5': 346754}
{'1994_6': 329737, '1995_6': 329805, '1996_6': 318525, '1997_6': 321867, '1998_6': 327091, '199

In [72]:
cdc_per_year = per_year(cdc_list, 2, 12)

In [73]:
def diff_per_year(input_dict) :
    results = {}
    keyList=sorted(input_dict.keys())
    for i,v in enumerate(keyList):
        date_check = v.split("_")[0]
        if date_check == 2003 :
            continue
        if v not in results:
            results[v] = input_dict[keyList[i]] - input_dict[keyList[i-1]]
    return(results)


In [74]:
cdc_diff_per_year = diff_per_year(cdc_per_year)
cdc_diff_per_year

{'1994_12': -16831,
 '1995_12': -11778,
 '1996_12': 7499,
 '1997_12': 6866,
 '1998_12': 4561,
 '1999_12': -645,
 '2000_12': 3536,
 '2001_12': -10218,
 '2002_12': 7687,
 '2003_12': 9323}

Examine the difference in the number of births per year for a particular column of interest and particular variable. Here being births in December. 