In [1]:
# Read the raw CSV text. The `with` block guarantees the file handle is closed
# as soon as the contents are in memory, even if reading raises.
with open("US_births_1994-2003_CDC_NCHS.csv", "r") as f: #open the file
    data_str = f.read() #read the csv file into a string
data_lst = data_str.split("\n") #split the data on the new-line character
print("first ten string rows with header:")
print(data_lst[:10]) #display the first ten rows

first ten string rows with header:
['year,month,date_of_month,day_of_week,births', '1994,1,1,6,8096', '1994,1,2,7,7772', '1994,1,3,1,10142', '1994,1,4,2,11248', '1994,1,5,3,11053', '1994,1,6,4,11406', '1994,1,7,5,11251', '1994,1,8,6,8653', '1994,1,9,7,7910']


# Converting Data into a List of Lists

In [2]:
#read the csv file and convert the dataset into a list of lists
def read_csv(file_name, header_row=True):
    """Read a comma-separated file of integers into a list of lists.

    Args:
        file_name: Path of the CSV file to read.
        header_row: When truthy (the default), the first line of the file is
            treated as a header and dropped.

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        that line's comma-separated fields converted with ``int``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field on a non-blank line is not a valid integer.
    """
    # The context manager closes the handle even if reading fails.
    with open(file_name, "r") as f:
        data_lst = f.read().split("\n")

    if header_row:
        data_lst = data_lst[1:] #remove header row

    final_lst = []
    for row in data_lst:
        # A trailing newline (or any blank line) yields an empty string here;
        # skip it instead of letting int("") raise ValueError.
        if not row.strip():
            continue
        final_lst.append([int(field) for field in row.split(",")])

    return final_lst

# Load the CDC dataset and preview its first ten parsed rows.
cdc_lst = read_csv("US_births_1994-2003_CDC_NCHS.csv")
print(
    "first ten rows of the dataset as a list of lists with integer values, without header:",
    cdc_lst[:10],
    sep="\n",
)

first ten rows of the dataset as a list of lists with integer values, without header:
[[1994, 1, 1, 6, 8096], [1994, 1, 2, 7, 7772], [1994, 1, 3, 1, 10142], [1994, 1, 4, 2, 11248], [1994, 1, 5, 3, 11053], [1994, 1, 6, 4, 11406], [1994, 1, 7, 5, 11251], [1994, 1, 8, 6, 8653], [1994, 1, 9, 7, 7910], [1994, 1, 10, 1, 10498]]


# Calculating Number of Births Each Month

In [3]:
#count the total number of births that occurred in each month, across all years in the dataset (list of lists)
def count_births_per_month(data):
    """Sum births by month over every row of the dataset.

    Each row is indexed positionally: index 1 is used as the month key and
    index 4 as the number of births to add.

    Returns:
        A dict mapping each month value seen to its total births, with keys
        in order of first appearance.
    """
    totals = {}
    for record in data:
        month_key = record[1]
        count = record[4]
        if month_key not in totals:
            # First time this month is seen: start its running total.
            totals[month_key] = count
            continue
        totals[month_key] += count

    return totals

# Aggregate the CDC rows by month and show the totals.
cdc_month_births = count_births_per_month(cdc_lst)
print("births per month:", cdc_month_births, sep="\n")

births per month:
{1: 3232517, 2: 3018140, 3: 3322069, 4: 3185314, 5: 3350907, 6: 3296530, 7: 3498783, 8: 3525858, 9: 3439698, 10: 3378814, 11: 3171647, 12: 3301860}


# Calculating Number of Births Each Day of Week

In [4]:
#count the total number of births that occurred on each day of the week, across all years in the dataset (list of lists)
def count_births_per_day_of_week(data):
    """Sum births by day of week over every row of the dataset.

    Each row is indexed positionally: index 3 is used as the day-of-week key
    and index 4 as the number of births to add.

    Returns:
        A dict mapping each day-of-week value seen to its total births, with
        keys in order of first appearance.
    """
    totals = {}
    for record in data:
        weekday = record[3]
        count = record[4]
        try:
            totals[weekday] += count
        except KeyError:
            # Day not seen before: start its running total.
            totals[weekday] = count
    return totals

# Aggregate the CDC rows by day of week and show the totals.
cdc_dow_births = count_births_per_day_of_week(cdc_lst)
print("births per day of week:", cdc_dow_births, sep="\n")

births per day of week:
{6: 4562111, 7: 4079723, 1: 5789166, 2: 6446196, 3: 6322855, 4: 6288429, 5: 6233657}


# Creating a More General Function

In [5]:
#count the total number of births for each unique value in a given column of the dataset (list of lists)
def count_births_per_column_value(data, col_int, births_col=4):
    """Sum births for each distinct value found in one column of the dataset.

    Args:
        data: The dataset as a list of rows, each row a list of values.
        col_int: Index of the column whose values become the grouping keys
            (e.g. 0 for year, 1 for month in the CDC file layout).
        births_col: Index of the column holding the number of births to add.
            Defaults to 4, the position of ``births`` in the CDC file layout,
            so existing two-argument calls behave exactly as before.

    Returns:
        A dict mapping each distinct value of column ``col_int`` to the sum
        of column ``births_col`` over the rows carrying that value. Keys
        appear in order of first appearance in ``data``.
    """
    births_per_col_value = {}
    for row in data:
        col_value = row[col_int]
        births = row[births_col]
        if col_value in births_per_col_value:
            births_per_col_value[col_value] += births
        else:
            births_per_col_value[col_value] = births
    return births_per_col_value

# Column indexes follow the file header: 0=year, 1=month, 2=date_of_month, 3=day_of_week.
# Every report except the last is followed by two blank lines.
cdc_year_births = count_births_per_column_value(cdc_lst, 0)
print("births per year:", cdc_year_births, sep="\n", end="\n\n\n")

cdc_month_births = count_births_per_column_value(cdc_lst, 1)
print("births per month:", cdc_month_births, sep="\n", end="\n\n\n")

cdc_dom_births = count_births_per_column_value(cdc_lst, 2)
print("births per day of month:", cdc_dom_births, sep="\n", end="\n\n\n")

cdc_dow_births = count_births_per_column_value(cdc_lst, 3)
print("births per day of week:", cdc_dow_births, sep="\n")

births per year:
{1994: 3952767, 1995: 3899589, 1996: 3891494, 1997: 3880894, 1998: 3941553, 1999: 3959417, 2000: 4058814, 2001: 4025933, 2002: 4021726, 2003: 4089950}


births per month:
{1: 3232517, 2: 3018140, 3: 3322069, 4: 3185314, 5: 3350907, 6: 3296530, 7: 3498783, 8: 3525858, 9: 3439698, 10: 3378814, 11: 3171647, 12: 3301860}


births per day of month:
{1: 1276557, 2: 1288739, 3: 1304499, 4: 1288154, 5: 1299953, 6: 1304474, 7: 1310459, 8: 1312297, 9: 1303292, 10: 1320764, 11: 1314361, 12: 1318437, 13: 1277684, 14: 1320153, 15: 1319171, 16: 1315192, 17: 1324953, 18: 1326855, 19: 1318727, 20: 1324821, 21: 1322897, 22: 1317381, 23: 1293290, 24: 1288083, 25: 1272116, 26: 1284796, 27: 1294395, 28: 1307685, 29: 1223161, 30: 1202095, 31: 746696}


births per day of week:
{6: 4562111, 7: 4079723, 1: 5789166, 2: 6446196, 3: 6322855, 4: 6288429, 5: 6233657}


# Calculating min and max values

In [6]:
#calculate the min and max values for any dictionary
def calculate_min_max_values(dictionary):
    """Return the smallest and largest values stored in a dictionary.

    Args:
        dictionary: Any mapping whose values can be compared with ``<``
            and ``>``. Keys are ignored.

    Returns:
        A dict with exactly two keys, ``"min"`` and ``"max"``. Both are
        ``None`` when ``dictionary`` is empty.
    """
    values = list(dictionary.values())
    if not values:
        # Nothing to compare: keep the None/None result for empty input.
        return {"min": None, "max": None}

    return {"min": min(values), "max": max(values)}

# Extremes of the per-day-of-week totals, followed by two blank lines.
min_max_dow = calculate_min_max_values(cdc_dow_births)
print("min and max values per day of week:", min_max_dow, sep="\n", end="\n\n\n")

# Extremes of the per-month totals.
min_max_month = calculate_min_max_values(cdc_month_births)
print("min and max values per month:", min_max_month, sep="\n")

min and max values per day of week:
{'min': 4079723, 'max': 6446196}


min and max values per month:
{'min': 3018140, 'max': 3525858}


# Calculating Changes in Births Across Years

In [7]:
#extracts values for the same period (e.g. January or Monday) across years 
#and calculates the differences between consecutive values to show if number of births is increasing or decreasing
#for example, how did the number of births in January change each year between 1994 and 2003?
def count_column_value_change_across_years(data, col_int, period_int):
    """Compute year-over-year changes in births for one fixed period.

    Rows whose column ``col_int`` equals ``period_int`` are selected and
    their births (index 4) are summed per year (index 0). For every year
    that has a total and whose following year also has a total, the
    difference "following year minus this year" is recorded.

    Args:
        data: The dataset as a list of rows, each row a list of values.
        col_int: Index of the column that identifies the period.
        period_int: The value of that column to keep.

    Returns:
        A dict keyed by the later year of each consecutive pair, holding
        that year's total minus the previous year's total. Years whose
        predecessor has no matching rows are absent from the result.
    """
    yearly_totals = {}
    for record in data:
        count, yr, bucket = record[4], record[0], record[col_int]
        if bucket != period_int:
            continue
        if yr not in yearly_totals:
            yearly_totals[yr] = count
        else:
            yearly_totals[yr] += count

    # Only pairs of directly consecutive years produce an entry.
    return {
        yr + 1: yearly_totals[yr + 1] - yearly_totals[yr]
        for yr in yearly_totals
        if yr + 1 in yearly_totals
    }

# Column 1 is month, so (1, 1) selects January; the report is followed by two blank lines.
january_across_years = count_column_value_change_across_years(cdc_lst, 1, 1)
print(
    "change of values of births in consecutive years for January:",
    january_across_years,
    sep="\n",
    end="\n\n\n",
)

# Column 3 is day of week, so (3, 6) selects Saturday.
saturday_across_years = count_column_value_change_across_years(cdc_lst, 3, 6)
print(
    "change of values of births in consecutive years for Saturday:",
    saturday_across_years,
    sep="\n",
)

change of values of births in consecutive years for January:
{1995: -4692, 1996: -1730, 1997: 2928, 1998: 2129, 1999: -158, 2000: 10926, 2001: 5090, 2002: -4524, 2003: -871}


change of values of births in consecutive years for Saturday:
{1995: -15152, 1996: -3319, 1997: -5421, 1998: 2936, 1999: -3791, 2000: 19809, 2001: -15866, 2002: -8158, 2003: 1675}


# Combining datasets

In [8]:
#combine data from two datasets, removing overlapping years
ssa_lst = read_csv("US_births_2000-2014_SSA.csv") #second dataset
print("rows count in first dataset only:")
print(len(cdc_lst))

# The CDC file name covers 1994-2003, so only later SSA years are taken to avoid overlap.
CDC_LAST_YEAR = 2003

# Build a NEW list instead of aliasing cdc_lst: appending to an alias would
# also grow cdc_lst, so re-running this cell (or any earlier cell that reads
# cdc_lst afterwards) would see duplicated or unexpected rows.
combined_lst = cdc_lst + [row for row in ssa_lst if row[0] > CDC_LAST_YEAR]
print("rows count in combined dataset:")
print(len(combined_lst))

rows count in first dataset only:
3652
rows count in combined dataset:
7670
