In [2]:
# Peek at the raw file as a list of text lines, so the parsed output of
# read_csv can be compared against the unparsed rows.
# The with-block closes the file handle as soon as the contents are read.
with open("US_births_1994-2003_CDC_NCHS.csv") as births_file:
    births_list = births_file.read().split("\n")
print(births_list[:10])

['year,month,date_of_month,day_of_week,births', '1994,1,1,6,8096', '1994,1,2,7,7772', '1994,1,3,1,10142', '1994,1,4,2,11248', '1994,1,5,3,11053', '1994,1,6,4,11406', '1994,1,7,5,11251', '1994,1,8,6,8653', '1994,1,9,7,7910']


In [3]:
def read_csv(filename):
    '''Reads a .csv whose fields are all integers and returns its data rows.

    The first line is treated as a header and discarded. Every remaining
    non-blank line is split on commas and each field is converted with int().

    Args:
        filename: path of the .csv file to read.

    Returns:
        A list of lists of integers, one inner list per data row, in file order.

    Raises:
        ValueError: if a field on a non-blank data line is not a valid integer.
    '''
    # the with-block guarantees the file handle is closed, even if parsing fails later
    with open(filename) as csv_file:
        lines = csv_file.read().split("\n")
    final_list = []
    # lines[1:] skips the header row (and is simply empty for an empty file)
    for line in lines[1:]:
        # a trailing newline leaves an empty string behind; int('') would raise,
        # so blank lines are skipped instead of being parsed
        if not line.strip():
            continue
        final_list.append([int(field) for field in line.split(',')])
    return final_list

# Parse the births file with read_csv: the header row is dropped and every
# remaining row becomes a list of integers.
cdc_list = read_csv("US_births_1994-2003_CDC_NCHS.csv")
# Sanity check: show the first ten parsed rows.
print(cdc_list[:10])

[[1994, 1, 1, 6, 8096], [1994, 1, 2, 7, 7772], [1994, 1, 3, 1, 10142], [1994, 1, 4, 2, 11248], [1994, 1, 5, 3, 11053], [1994, 1, 6, 4, 11406], [1994, 1, 7, 5, 11251], [1994, 1, 8, 6, 8653], [1994, 1, 9, 7, 7910], [1994, 1, 10, 1, 10498]]


In [4]:
def month_births(list_data):
    '''Sums births per month.

    Expects rows shaped like the output of read_csv, where index 1 holds the
    month and index 4 holds the number of births. Returns a dictionary that
    maps each month to the total births recorded for it.
    '''
    totals = {}
    for row in list_data:
        month = row[1]
        births = row[4]
        if month in totals:
            # month already seen: accumulate
            totals[month] += births
        else:
            # first row for this month: start its running total
            totals[month] = births
    return totals


# Total births for each month number, summed over every row in cdc_list.
cdc_months_births = month_births(cdc_list)
print(cdc_months_births)

{1: 3232517, 2: 3018140, 3: 3322069, 4: 3185314, 5: 3350907, 6: 3296530, 7: 3498783, 8: 3525858, 9: 3439698, 10: 3378814, 11: 3171647, 12: 3301860}


In [5]:
def dow_births(list_data):
    '''Sums births per day of the week.

    Expects rows shaped like the output of read_csv, where index 3 holds the
    day of the week and index 4 holds the number of births. Returns a
    dictionary that maps each weekday to the total births recorded for it.
    '''
    weekday_totals = {}
    for record in list_data:
        weekday = record[3]
        births = record[4]
        if weekday not in weekday_totals:
            # first record for this weekday: seed the total and move on
            weekday_totals[weekday] = births
            continue
        weekday_totals[weekday] += births
    return weekday_totals

# Total births for each day-of-week number, summed over every row in cdc_list.
cdc_day_births = dow_births(cdc_list)
print(cdc_day_births)

{1: 5789166, 2: 6446196, 3: 6322855, 4: 6288429, 5: 6233657, 6: 4562111, 7: 4079723}


In [6]:
def calc_counts(data, column, value_column=4):
    '''Groups rows by one column and sums another column within each group.

    With the default value_column the result is total births per distinct
    value of `column`, for data shaped like the output of read_csv:
    Column 0: Year
    Column 1: Month
    Column 2: Date
    Column 3: Day of week
    Column 4: Births (For reference)

    Args:
        data: iterable of indexable rows (e.g. the list returned by read_csv).
        column: index of the column whose distinct values become the keys.
        value_column: index of the column that is summed; defaults to 4
            (births), so existing two-argument calls behave as before.

    Returns:
        A dictionary mapping each distinct value found in `column` to the sum
        of `value_column` over the rows that carry that value.
    '''
    final_dict = {}
    for row in data:
        group_key = row[column]
        amount = row[value_column]
        if group_key in final_dict:
            final_dict[group_key] += amount
        else:
            # first row for this key: start its running total
            final_dict[group_key] = amount
    return final_dict

# Births summed per year (column 0).
cdc_year_births = calc_counts(cdc_list, 0)
print(cdc_year_births)
# Printing a single space puts a visually blank separator line between dictionaries.
print(" ")
# Births summed per month (column 1).
cdc_month_births = calc_counts(cdc_list, 1)
print(cdc_month_births)
print(" ")
# Births summed per day of the month (column 2).
cdc_dom_births = calc_counts(cdc_list, 2)
print(cdc_dom_births)
print(" ")
# Births summed per day of the week (column 3).
cdc_dow_births = calc_counts(cdc_list, 3)
print(cdc_dow_births)

{2000: 4058814, 2001: 4025933, 2002: 4021726, 2003: 4089950, 1994: 3952767, 1995: 3899589, 1996: 3891494, 1997: 3880894, 1998: 3941553, 1999: 3959417}
 
{1: 3232517, 2: 3018140, 3: 3322069, 4: 3185314, 5: 3350907, 6: 3296530, 7: 3498783, 8: 3525858, 9: 3439698, 10: 3378814, 11: 3171647, 12: 3301860}
 
{1: 1276557, 2: 1288739, 3: 1304499, 4: 1288154, 5: 1299953, 6: 1304474, 7: 1310459, 8: 1312297, 9: 1303292, 10: 1320764, 11: 1314361, 12: 1318437, 13: 1277684, 14: 1320153, 15: 1319171, 16: 1315192, 17: 1324953, 18: 1326855, 19: 1318727, 20: 1324821, 21: 1322897, 22: 1317381, 23: 1293290, 24: 1288083, 25: 1272116, 26: 1284796, 27: 1294395, 28: 1307685, 29: 1223161, 30: 1202095, 31: 746696}
 
{1: 5789166, 2: 6446196, 3: 6322855, 4: 6288429, 5: 6233657, 6: 4562111, 7: 4079723}


In [7]:
def min_max(data_dict):
    '''Prints the smallest and largest values of a dictionary with their keys.

    The result is printed as "Min:<value> at <key> - Max:<value> at <key>";
    nothing is returned. When several keys share the extreme value, the key
    encountered last during iteration is the one reported.

    Args:
        data_dict: non-empty dictionary whose values can be compared with
            each other.

    Raises:
        ValueError: if data_dict is empty, since no minimum or maximum exists.
    '''
    if not data_dict:
        raise ValueError("min_max() requires a non-empty dictionary")
    # Seed both extremes from the data itself (None = nothing seen yet) rather
    # than from fixed sentinel numbers, so negative or very large values are
    # handled correctly.
    minimum_val = maximum_val = None
    min_key = max_key = None
    for key, value in data_dict.items():
        # >= and <= let a later key replace an earlier one on ties
        if maximum_val is None or value >= maximum_val:
            maximum_val = value
            max_key = key
        if minimum_val is None or value <= minimum_val:
            minimum_val = value
            min_key = key
    print("Min:{0} at {1} - Max:{2} at {3}".format(minimum_val, min_key, maximum_val, max_key))
        
# Report the lowest and highest birth totals per year, per month,
# per day of the month and per day of the week, in that order.
min_max(cdc_year_births)
min_max(cdc_month_births)
min_max(cdc_dom_births)
min_max(cdc_dow_births)

Min:3880894 at 1997 - Max:4089950 at 2003
Min:3018140 at 2 - Max:3525858 at 8
Min:746696 at 31 - Max:1326855 at 18
Min:4079723 at 7 - Max:6446196 at 2


In [26]:
def year_comp(data_list, column, spec_value):
    '''Totals births per year over the rows whose `column` equals spec_value.

    Useful for comparing years against each other for one month, one date or
    one day of the week.
    Column 0: Year
    Column 1: Month - spec_value: 1-12
    Column 2: Date - spec_value: 1-31
    Column 3: Day of week - spec_value: 1-7
    Column 4: Births (For reference)

    Returns a list of (year, total births) tuples sorted by year. Years with
    no matching row do not appear in the list.
    '''
    yearly_totals = {}
    for row in data_list:
        year = row[0]
        # only rows matching the requested value contribute
        if row[column] == spec_value:
            if year in yearly_totals:
                yearly_totals[year] += row[4]
            else:
                yearly_totals[year] = row[4]
    # dictionary items are already (year, total) pairs; order them by year
    return sorted(yearly_totals.items(), key=lambda pair: pair[0])

# Yearly births in March (column 1 == 3).
print(year_comp(cdc_list, 1, 3))
# Printing a single space puts a visually blank separator line between results.
print(" ")
# Yearly births on the 3rd day of the month (column 2 == 3).
print(year_comp(cdc_list, 2, 3))
print(" ")
# Yearly births where day_of_week == 5 (Friday if 1 = Monday; the encoding
# is not stated in this file -- confirm against the dataset documentation).
print(year_comp(cdc_list, 3, 5))

[(1994, 339736), (1995, 328503), (1996, 322581), (1997, 321212), (1998, 329436), (1999, 332939), (2000, 340553), (2001, 338684), (2002, 331505), (2003, 336920)]
 
[(1994, 126213), (1995, 126834), (1996, 126145), (1997, 132879), (1998, 131142), (1999, 128608), (2000, 132060), (2001, 128107), (2002, 130009), (2003, 142502)]
 
[(1994, 614641), (1995, 611410), (1996, 607883), (1997, 605274), (1998, 608402), (1999, 624604), (2000, 637657), (2001, 640380), (2002, 638827), (2003, 644579)]
