In [1]:
# demographic_figures.py
# This program reads in information concerning death rates for Coronavirus. It
# turns the raw data into dataframes for age, gender, and pre-existing conditions.
# It also stores the breakdown of age groupings in the population and uses these
# figures to estimate the total mortality by gender, age, and relevant diseases.
# It gives the user the choice to see a bar graph displaying the death rates either for
# age, gender, or pre-existing conditions. Furthermore, it creates a sunburst visualization
# that gives a qualitative representation of the user's risk given input conditions.
# 
# Author: Kristen Friday
# Date: April 21, 2020

from bs4 import BeautifulSoup
import requests
import pandas as pd
import plotly.express as px
from ipywidgets import interact, fixed
import csv


# Load the saved death-rate page and collect every <strong> element inside the
# 'content-inner' container; get_data() walks this list to find the tables.
# BeautifulSoup is built inside the with-block, so the file handle is closed
# as soon as parsing is done instead of staying open for the whole session.
with open('Demographics_HTML.html') as html_raw:
    soup = BeautifulSoup(html_raw, 'lxml')
subsoup = soup.find('div', class_='content-inner')
table_values = subsoup.find_all('strong')

def get_data():
    """Collect the raw table entries for each demographic category.

    Scans the module-level ``table_values`` list for the label that opens each
    table ('80+ years old', 'Male', 'Cardiovascular disease') and for the
    'Death Rat' entries used as table boundaries, then gathers the stripped
    text of every entry between a start label and the boundary paired with it.

    Returns a list holding one list of strings per start label found; an
    entry reading 'no fatalities' is reported as '0.0%'.
    """
    opening_labels = ('80+ years old', 'Male', 'Cardiovascular disease')
    starts = []
    stops = []

    # record where each table begins and where each boundary marker sits
    for position, tag in enumerate(table_values):
        label = tag.text.strip()
        if label in opening_labels:
            starts.append(position)
        elif label == 'Death Rat':
            stops.append(position)

    # discard a boundary marker that comes before the start it is paired with
    # NOTE(review): deleting by position shifts the later markers left and the
    # same slot is not re-checked -- confirm this pairing suits the saved page
    for slot in range(len(starts)):
        if stops[slot] < starts[slot]:
            del stops[slot]

    tables = []
    for slot, first in enumerate(starts):
        # position 13 holds an entry that is not table data, so it is left out
        wanted = [pos for pos in range(first, stops[slot]) if pos != 13]

        entries = []
        for pos in wanted:
            text = table_values[pos].text.strip()
            entries.append('0.0%' if text == 'no fatalities' else text)
        tables.append(entries)

    return tables


def data_to_dataframe(values):
    """Turn the three raw value lists into death-rate dataframes.

    ``values`` holds the age, gender and pre-existing-condition lists, in
    that order. Age entries are read in steps of two and gender/condition
    entries in steps of three; in every step the first entry is the label and
    the entry right after it is the rate that is kept.

    Returns the tuple (age_df, gender_df, disease_df). Each frame is indexed
    by label and has a single 'Death Rate' column.
    """
    def label_rate_frame(entries, stride):
        # stopping at len - 1 guarantees entries[start + 1] always exists
        pairs = {entries[start]: entries[start + 1]
                 for start in range(0, len(entries) - 1, stride)}
        return pd.DataFrame.from_dict(pairs, columns=['Death Rate'], orient='index')

    raw_age, raw_gender, raw_disease = values[0], values[1], values[2]

    return (label_rate_frame(raw_age, 2),
            label_rate_frame(raw_gender, 3),
            label_rate_frame(raw_disease, 3))


def compare_mortality(age_df, gender_df, disease_df):
    """Build one horizontal bar chart of death rates per demographic.

    Each argument is a dataframe indexed by demographic label with a
    'Death Rate' column of percentage strings; the '%' sign is stripped and
    the remainder is plotted as a float.

    Returns the tuple (age_viz, gender_viz, disease_viz) of figures.
    """
    def rate_chart(frame, heading):
        # convert the string rates to floats for the x axis
        percentages = [float(text.strip('%')) for text in frame['Death Rate']]
        return px.bar(frame, y=frame.index, x=percentages,
                      labels={'x':'Death Rate (%)', 'y':'Demographic'},
                      title=heading, orientation='h')

    age_viz = rate_chart(age_df, 'Death Rates By Age')
    gender_viz = rate_chart(gender_df, 'Death Rates By Gender')
    disease_viz = rate_chart(disease_df, 'Death Rates By Pre-Existing Conditions')

    return age_viz, gender_viz, disease_viz


def show_visualization(age_df, gender_df, disease_df, demographic):
    """Display the death-rate bar chart for the chosen demographic.

    All three charts are built via compare_mortality(); 'Age' shows the age
    chart, 'Gender' shows the gender chart, and any other value shows the
    pre-existing-conditions chart.
    """
    age_fig, gender_fig, disease_fig = compare_mortality(age_df, gender_df, disease_df)

    # start from the fallback and override it for the two named choices
    chosen = disease_fig
    if demographic == 'Gender':
        chosen = gender_fig
    if demographic == 'Age':
        chosen = age_fig

    chosen.show()

        
def age_to_df():
    """Load the age-group breakdown from 'Age_Breakdown.csv' into a dataframe.

    The first CSV row supplies the column names and the second row supplies
    the values; any further rows are ignored.

    Returns a one-row dataframe (index 0) whose cells are the raw CSV strings.
    Raises IndexError if the file has fewer than two rows or if the second
    row is shorter than the header row.
    """
    # the with-block guarantees the handle is closed; newline='' is the mode
    # the csv module asks for so that it can handle line endings itself
    with open('Age_Breakdown.csv', newline='') as csv_file:
        pop_data = list(csv.reader(csv_file))

    header, first_row = pop_data[0], pop_data[1]

    # pair every header name with the value in the same column
    data = {name: first_row[column] for column, name in enumerate(header)}

    return pd.DataFrame(data, index=[0])


def show_age_floats(age_df):
    """Convert the age breakdown into head counts per age bracket.

    ``age_df`` must carry the percentage columns 'Age 0-14', 'Age 15-24',
    'Age 25-64' and 'Age 65+' (strings such as '26%') plus a
    '2013 Population' column; only its first row is read.

    Returns the tuple (pop_breakdown, population): ``pop_breakdown`` maps each
    bracket ('0-14', '15-24', '25-64', '65+') to the number of people in it,
    truncated to an int, and ``population`` is the total population as an int.
    """
    age_brackets = ['0-14', '15-24', '25-64', '65+']
    first_row = age_df.iloc[0]

    # take the scalar from the first row, like the percentage columns below;
    # calling int() on the whole column depends on single-element Series
    # coercion, which pandas has deprecated
    population = int(first_row['2013 Population'])

    pop_breakdown = {}
    for bracket in age_brackets:
        # convert the percent string into a decimal share of the population
        share = float(first_row['Age ' + bracket].strip('%')) / 100
        pop_breakdown[bracket] = int(population * share)

    return pop_breakdown, population
    
    
def get_age_dict(death_rates, pop_breakdown):
    """Re-bucket the scraped age death rates into the population age brackets.

    ``death_rates`` is a dataframe with a 'Death Rate' column of percentage
    strings; the rates are converted to fractions and put in reverse row
    order before being averaged. ``pop_breakdown`` supplies the bracket names;
    keys other than '0-14', '15-24', '25-64' and '65+' are skipped.

    Returns a dict mapping each recognised bracket to its averaged rate.
    """
    fractions = [float(text.strip('%')) / 100 for text in death_rates['Death Rate']]
    # presumably the source table lists the oldest group first, so reversing
    # puts the youngest group at position 0 -- verify against the scraped page
    rates = fractions[::-1]
    last = len(rates) - 1

    age_dict = {}
    for bracket in pop_breakdown:
        if bracket == '0-14':
            averaged = (rates[0] + rates[1]) / 2
        elif bracket == '15-24':
            averaged = (rates[1] + rates[2]) / 2
        elif bracket == '25-64':
            averaged = (rates[2] + rates[3] + rates[4] + rates[5] + rates[6]) / 5
        elif bracket == '65+':
            # the two entries at the end of the reversed list
            averaged = (rates[last] + rates[last - 1]) / 2
        else:
            continue
        age_dict[bracket] = averaged

    return age_dict


def get_gender_dict(gender_df):
    """Map each gender label to its death rate expressed as a fraction.

    ``gender_df`` is indexed by gender and has a 'Death Rate' column of
    percentage strings; the '%' sign is stripped and the number divided by 100.
    """
    return {
        label: float(gender_df.at[label, 'Death Rate'].strip('%')) / 100
        for label in gender_df.index
    }


def get_disease_dict(disease_df):
    """Map each pre-existing condition to its death rate as a fraction.

    ``disease_df`` is indexed by condition name and has a 'Death Rate' column
    of percentage strings such as '10.5%'.
    """
    def as_fraction(condition):
        # drop the percent sign, then scale from percent to a fraction
        percent_text = disease_df.at[condition, 'Death Rate']
        return float(percent_text.strip('%')) / 100

    return {condition: as_fraction(condition) for condition in disease_df.index}


def mortality_to_df(age_dict, gender_dict, pop_dict, population):
    """Estimate deaths for every age-group / gender combination.

    Each age group's head count is split evenly between the genders (a 50/50
    split is assumed) and multiplied by that gender's death rate.

    Parameters:
        age_dict: mapping whose keys name the age groups to report.
        gender_dict: mapping of gender to death rate as a fraction.
        pop_dict: mapping of age group to number of people; it must contain
            every key of ``age_dict``.
        population: total population; accepted for interface compatibility
            but not used in the calculation.

    Returns a dataframe with the columns 'Gender', 'Age Group',
    'Estimated Deaths' (truncated to int) and '% of Dead' (each row's share of
    all estimated deaths, formatted with four decimals and a percent sign).
    """
    # NOTE(review): only the keys of age_dict are read -- the age-specific
    # death rates never enter the estimate, so the rows differ by group size
    # and gender rate alone. Confirm whether the age rate should be a factor.
    age_groups = []
    genders = []
    total_deaths = []

    for group in age_dict:
        for gender, gender_rate in gender_dict.items():
            # assume 50% of each age group belongs to each gender
            total_deaths.append((pop_dict[group] / 2) * gender_rate)
            age_groups.append(group)
            genders.append(gender)

    estimate_deaths = [int(deaths) for deaths in total_deaths]
    total_sum = sum(total_deaths)

    # when no deaths are estimated at all there is nothing to apportion, so
    # every share is reported as zero instead of dividing by zero
    death_rates = []
    for deaths in total_deaths:
        overall_death_rate = deaths / total_sum if total_sum else 0.0
        death_rates.append('{:.4f}%'.format(overall_death_rate * 100))

    deaths_df = pd.DataFrame({'Gender': genders,
                              'Age Group' : age_groups,
                              'Estimated Deaths' : estimate_deaths,
                              '% of Dead' : death_rates})

    return deaths_df


def interact_risk(age_group, num_deaths_df, gender):
    """Pick out the estimated-deaths row for one gender and age group.

    Rows of ``num_deaths_df`` are matched by position: the first column is
    compared with ``gender`` and the second with ``age_group``. When several
    rows match, the last one is used.

    Returns a dataframe with the same columns as ``num_deaths_df`` holding
    the matching row, or no rows at all when nothing matches.
    """
    rows = num_deaths_df.values.tolist()

    # walk backwards so that the last matching row is the one that is found
    selected = next(
        (row for row in reversed(rows) if row[0] == gender and row[1] == age_group),
        [],
    )

    # build the row as a column labelled by the field names, then flip it
    as_column = pd.DataFrame(selected, index=num_deaths_df.columns)
    return as_column.T


def create_sunburst(num_deaths_df):
    """Show a sunburst chart of estimated deaths by gender, then age group.

    ``num_deaths_df`` must provide the 'Gender', 'Age Group',
    'Estimated Deaths' and '% of Dead' columns; the death counts size the
    wedges and the percentage is added to the hover text.
    """
    # inner ring first, outer ring second
    hierarchy = ['Gender', 'Age Group']
    chart = px.sunburst(
        num_deaths_df,
        path=hierarchy,
        values='Estimated Deaths',
        hover_data=['% of Dead'],
        title='Estimated Deaths According to Demographics',
    )
    chart.show()
         
        
if __name__ == '__main__':
    # dropdown-driven bar charts of the scraped death rates
    type_viz = ['Age', 'Gender', 'Pre-Existing Conditions']
    print('Choose a demographic type from the dropdown menu to display mortality rates:')
    age_rates, gender_rates, disease_rates = data_to_dataframe(get_data())
    interact(show_visualization, age_df=fixed(age_rates), gender_df=fixed(gender_rates),
             disease_df=fixed(disease_rates), demographic=type_viz)
    
    # population head counts per age bracket
    age_breakdown = age_to_df()
    pop_breakdown, total_pop = show_age_floats(age_breakdown)
    
    age_dict = get_age_dict(age_rates, pop_breakdown)
    gender_dict = get_gender_dict(gender_rates)
    
    num_deaths_df = mortality_to_df(age_dict, gender_dict, pop_breakdown, total_pop)
    
    print('Enter options from the following dropdown menus to display estimated deaths: \n')
    # interact_risk takes only age_group, num_deaths_df and gender, so only
    # those are offered; a 'conditions' option has no parameter to feed
    interact(interact_risk, num_deaths_df=fixed(num_deaths_df), age_group=age_dict.keys(),
             gender=gender_dict.keys())
    
    create_sunburst(num_deaths_df)
    # the population figure is filled in from the loaded data so that the
    # text cannot drift away from the numbers behind the chart
    description = '''
        The visualization above shows the estimated age and gender breakdown for those dying of Coronavirus. 
        Though older individuals have a greater likelihood of dying from the disease, the fact that they make  
        up a much smaller percentage of the population leads to the estimated counts that are displayed. 
        The death percentages are taken out of a total population of {:,}.'''.format(total_pop)
    print(description)



Choose a demographic type from the dropdown menu to display mortality rates:


interactive(children=(Dropdown(description='demographic', options=('Age', 'Gender', 'Pre-Existing Conditions')…

Enter options from the following dropdown menus to display estimated deaths: 



interactive(children=(Dropdown(description='age_group', options=('0-14', '15-24', '25-64', '65+'), value='0-14…


        The visualization above shows the estimated age and gender breakdown for those dying of Coronavirus. 
        Though older individuals have a greater likelihood of dying from the disease, the fact that they make  
        up a much smaller percentage of the population leads to the estimated counts that are displayed. 
        The death percentages are taken out of a total population of 7,162,119,434.
