In [21]:
# show_visualizations.py
# This program creates 3 different interactions/visualizations with the user
# and calls the demographic_figures.py program in order to display all 
# visualizations in one location. It interacts with the user by allowing him or
# her to select a country and date, and the program will display the current
# situation of Coronavirus for the specifications. Furthermore, it generates
# 2 different animated choropleth maps: one showing raw confirmed cases and
# the other showing confirmed cases as a percentage of the country's population.
#
# Author: Kristen Friday
# Date: April 21, 2020

import csv

import feather
import pandas as pd
import plotly.express as px
import plotly.io as pio
from bs4 import BeautifulSoup
from ipywidgets import interact, fixed

# function reads file and puts names of files into a list
def open_file_names():
    """Read the names of the update files listed in 'Corona_updates.txt'.

    Every non-blank line of the text file is treated as one file name.
    Surrounding whitespace (including the trailing newline) is removed and
    blank lines are skipped, so an empty or trailing blank line never turns
    into an empty file name.

    Returns:
        list of str: file names in the order they appear in the text file.
    """
    with open('Corona_updates.txt', 'r') as file:
        # strip first, then keep only the lines that still contain text
        stripped_lines = (line.strip() for line in file)
        return [name for name in stripped_lines if name]


# function opens file containing population data (population_countries.csv)
# returns a dictionary with values for total population in 2016 by country
def open_populations():
    """Load population figures from 'population_countries.csv'.

    Returns:
        dict: maps each row's 'Country_Code' value to its 'Year_2016' value.
        Values are kept exactly as read from the CSV (strings).
    """
    with open('population_countries.csv') as pop_data:
        return {
            record['Country_Code']: record['Year_2016']
            for record in csv.DictReader(pop_data)
        }


# function reads file containing names of all feather updates
# takes in a dictionary parameter of population data by country
# returns a list of dataframes representing each update/day
def add_populations(pop_dict):
    """Load every update file and attach a 'Population' column to it.

    The file names come from open_file_names(); each file is read with
    pd.read_feather. For every value in the '3 Char ISO Codes' column the
    population is taken from a small table of hand-entered figures when the
    code is listed there, otherwise from pop_dict.

    Args:
        pop_dict: dictionary of population values keyed by country code.

    Returns:
        list: one dataframe per update file, each with a 'Population' column.

    Raises:
        KeyError: if a code is neither hand-entered nor present in pop_dict.
    """
    # hand-entered populations for the codes that are not looked up in pop_dict
    manual_populations = {
        'TWN': 23780000,
        'VAT': 618,
        'XXX': 0,
        'AIA': 66650000,
        'ESH': 582463,
        'GGY': 67052,
        'JEY': 97857,
    }

    frames = []
    for file_name in open_file_names():
        update_df = pd.read_feather(file_name)

        # one population entry per row, in row order
        per_row = [
            manual_populations[code] if code in manual_populations else pop_dict[code]
            for code in update_df['3 Char ISO Codes']
        ]

        update_df['Population'] = pd.Series(per_row)
        frames.append(update_df)

    return frames
        
    
# function adds dataframe column for confirmed cases as percentage of total population
# parameter: list of all dataframes
# return list of all dataframes with updated column for case ratio
def pop_percentage(df_list):
    """Add a '% of Population' column (confirmed / population * 100) to each frame.

    Every dataframe in df_list is updated in place:

    * rows whose 'Country_Region' is 'Not a country' are removed and the
      index is renumbered from 0 (previously the drop result was discarded
      and the row silently reused the preceding row's population);
    * 'Anguilla' rows are relabelled as 'United Kingdom' / 'gb' / 'GBR' and
      use their own 'Population' entry (previously they also reused the
      preceding row's population);
    * 'Eritrea' and 'Kosovo' use hand-entered populations;
    * all other rows use the numeric value of their 'Population' entry.

    A population that is missing, non-numeric, or not greater than zero gives
    NaN for that row instead of raising.

    Args:
        df_list: list of dataframes with 'Country_Region', 'Confirmed' and
            'Population' columns (plus the two ISO code columns when an
            'Anguilla' row is present).

    Returns:
        list: the same list object, holding the updated dataframes.
    """
    # populations entered by hand, keyed by country name
    manual_populations = {'Eritrea': 5320000, 'Kosovo': 1845000}

    for df in df_list:
        # remove placeholder rows; done in place so the caller's frame changes too
        placeholder_rows = df.index[df['Country_Region'] == 'Not a country']
        if len(placeholder_rows) > 0:
            df.drop(index=placeholder_rows, inplace=True)
            df.reset_index(drop=True, inplace=True)

        # report Anguilla under the United Kingdom
        anguilla = df['Country_Region'] == 'Anguilla'
        if anguilla.any():
            df.loc[anguilla, 'Country_Region'] = 'United Kingdom'
            df.loc[anguilla, '2 Char ISO Codes'] = 'gb'
            df.loc[anguilla, '3 Char ISO Codes'] = 'GBR'

        # the column may mix ints and numeric strings; unusable entries become NaN
        population = pd.to_numeric(df['Population'], errors='coerce').astype(float)
        for country, value in manual_populations.items():
            population = population.mask(df['Country_Region'] == country, value)

        # avoid dividing by zero: non-positive populations yield NaN
        population = population.where(population > 0)

        df['% of Population'] = df['Confirmed'] / population * 100

    return df_list


# function that displays specific information for 1 country in 1 dataframe
def display_country(df_list, country, date):
    """Collect the rows that match one country and one date across all updates.

    A row matches when its second value equals country and its first value
    equals date (values are compared by position, not by column name).

    Args:
        df_list: list of update dataframes; the first one supplies the
            column labels of the result.
        country: value to match against the second column.
        date: value to match against the first column.

    Returns:
        DataFrame: the matching rows, labelled with df_list[0]'s columns.
    """
    matching_rows = [
        record
        for frame in df_list
        for record in frame.values.tolist()
        if record[1] == country and record[0] == date
    ]

    return pd.DataFrame(matching_rows, columns=df_list[0].columns)


# function concatenates all updates into one dataframe
def convert_main_df(df_list):
    """Stack all update dataframes into one dataframe ordered by 'Date'.

    Args:
        df_list: list of dataframes that each contain a 'Date' column.

    Returns:
        DataFrame: the concatenated rows sorted by their 'Date' value.
    """
    stacked = pd.concat(df_list)
    return stacked.sort_values(by='Date')
    
    
# function that stores a list of dates and list of countries
def store_dates_countries(main_df):
    """List the distinct dates and the distinct countries found in main_df.

    Duplicates are removed with dict.fromkeys, which keeps first-seen order
    and avoids rescanning a growing list for every row.

    Args:
        main_df: dataframe with 'Date' and 'Country_Region' columns.

    Returns:
        tuple: (dates, countries_list) where dates keeps the order in which
        each date first appears and countries_list is sorted ascending.
    """
    dates = list(dict.fromkeys(main_df['Date']))
    countries_list = sorted(dict.fromkeys(main_df['Country_Region']))

    return dates, countries_list


# function shows visualization for choropleth maps
def show_choropleth(main_df):
    """Show two animated choropleth maps and return both figures.

    The first map is coloured by the 'Confirmed' column, the second by the
    '% of Population' column. Both are animated over 'Date', located by
    '3 Char ISO Codes', and followed by a printed caption.

    Args:
        main_df: dataframe holding all updates.

    Returns:
        tuple: (confirmed figure, percentage figure).
    """
    def draw_map(color_column, hover_columns, heading, color_limits):
        # build one animated map coloured by color_column, then display it
        world_map = px.choropleth(
            main_df,
            locations='3 Char ISO Codes',
            color=color_column,
            hover_name='Country_Region',
            hover_data=hover_columns,
            labels={'3 Char ISO Codes' : 'Country Code'},
            title=heading,
            range_color=color_limits,
            animation_frame='Date',
        )
        world_map.show()
        return world_map

    # raw confirmed cases
    confirmed = draw_map('Confirmed', ['Deaths'],
                         'Confirmed Cases of Coronavirus', [0, 350000])
    print('''
        The choropleth map above shows the raw number of confirmed Coronavirus cases in each country. Use the slider  
        to animate the spread of the disease starting from March 1 to the current day.''')

    # confirmed cases relative to population
    percent_plot = draw_map('% of Population', ['Deaths', 'Confirmed'],
                            'Confirmed Cases as % of Population', [0, 0.2])
    print('''
        The choropleth map above shows the number of confirmed Coronavirus cases as a percentage of each country\'s 
        total population. Use the slider to animate the spread of the disease starting from March 1 to the current day.''')

    return confirmed, percent_plot

# get HTML death rate data
# the file is parsed inside a with-block so the handle is closed once the
# document has been read (previously it was left open)
with open('Demographics_HTML.html') as html_raw:
    soup = BeautifulSoup(html_raw, 'lxml')

# every <strong> element inside the 'content-inner' div, in document order;
# get_data() reads this module-level list
subsoup = soup.find('div', class_='content-inner')
table_values = subsoup.find_all('strong')

# function that obtains all demographic data from the subsoup
# returns a list containing 3 lists of demographic data (age, gender, pre-existing conditions)
def get_data():
    """Extract the demographic table values from the module-level table_values.

    table_values is scanned for three start markers ('80+ years old', 'Male',
    'Cardiovascular disease') and for the end marker 'Death Rat'. For each
    start marker, the stripped text of the entries from the start index up to
    (not including) the paired end index is collected; the text
    'no fatalities' is stored as '0.0%'.

    Returns:
        list: one list of strings per start marker found, in document order
        (presumably age, gender, pre-existing conditions -- this depends on
        the order of the markers in the HTML).
    """

    # positions in table_values where a table starts / where an end marker sits
    start_indeces = []
    end_indeces = []

    # get range of indices for table values
    for index, value in enumerate(table_values):
        string_val = value.text.strip()
        if string_val == '80+ years old' or string_val == 'Male' or string_val == 'Cardiovascular disease':
            start_indeces.append(index)
        # NOTE(review): the marker is 'Death Rat' (no trailing 'e'); confirm this is
        # the exact text found in the HTML rather than a typo
        elif string_val == 'Death Rat':
            end_indeces.append(index)

    # get rid of end markers that come before the start marker at the same position
    # NOTE(review): deleting while indexing by i shifts the later entries left and
    # only re-checks position i on the next pass as i+1; this also raises IndexError
    # if end_indeces is shorter than start_indeces -- verify against the real HTML
    for i in range(len(start_indeces)):
        if end_indeces[i] < start_indeces[i]:
            del end_indeces[i]

    # empty list to store lists of values in index ranges
    list_values_list = []
    
    for i in range(len(start_indeces)):
        # empty list to store index ranges containing data
        index_range = []

        start_index = int(start_indeces[i])
        for j in range(start_index, int(end_indeces[i])):
            # filter out a piece of irrelevant information
            # add all indices between start and end to index list
            # NOTE(review): 13 is a hard-coded position in table_values; it is only
            # valid for the specific HTML file this was written against
            if j != 13:
                index_range.append(j)
            
        # empty list to store values within range
        values_list = []

        # collect the stripped text of every entry whose position is in index_range
        for index, value in enumerate(table_values):
            if index in index_range:
                if value.text.strip() == 'no fatalities':
                    values_list.append('0.0%')
                else:
                    values_list.append(value.text.strip())
        list_values_list.append(values_list)
    
    return list_values_list


# function takes the list of lists containing death rate values and builds one dataframe per list
# returns 3 dataframes (age, gender, pre-existing conditions), each with a 'Death Rate' column
def data_to_dataframe(values):
    """Turn the three lists of table values into three 'Death Rate' dataframes.

    In each list a label is followed directly by its rate. Labels are read
    every 2 entries in values[0] and every 3 entries in values[1] and
    values[2]; any entries in between are ignored.

    Args:
        values: list whose first three items are lists of strings.

    Returns:
        tuple: (age_df, gender_df, disease_df), each indexed by label with a
        single 'Death Rate' column.
    """
    def build_frame(entries, stride):
        # pair every stride-th entry (the label) with the entry right after it
        label_to_rate = {}
        for position in range(0, len(entries) - 1, stride):
            label_to_rate[entries[position]] = entries[position + 1]
        return pd.DataFrame.from_dict(label_to_rate, columns=['Death Rate'], orient='index')

    age_df = build_frame(values[0], 2)
    gender_df = build_frame(values[1], 3)
    disease_df = build_frame(values[2], 3)

    return age_df, gender_df, disease_df


# function that creates horizontal bar graphs comparing mortality rates for each demographic
# returns 3 bar graph objects
def compare_mortality(age_df, gender_df, disease_df):
    """Build one horizontal bar graph of death rates per demographic table.

    Each dataframe's 'Death Rate' strings have their '%' characters stripped
    and are converted to floats for the x axis; the dataframe index supplies
    the y axis.

    Args:
        age_df, gender_df, disease_df: dataframes with a 'Death Rate' column
            of percentage strings.

    Returns:
        tuple: (age_viz, gender_viz, disease_viz) bar graph objects. Nothing
        is displayed here.
    """
    def rate_bar(frame, heading):
        # percentage strings -> floats, then one horizontal bar per index label
        numeric_rates = [float(text.strip('%')) for text in frame['Death Rate']]
        return px.bar(frame, y=frame.index, x=numeric_rates,
                      labels={'x':'Death Rate (%)', 'y':'Demographic'},
                      title=heading, orientation='h')

    age_viz = rate_bar(age_df, 'Death Rates By Age')
    gender_viz = rate_bar(gender_df, 'Death Rates By Gender')
    disease_viz = rate_bar(disease_df, 'Death Rates By Pre-Existing Conditions')

    return age_viz, gender_viz, disease_viz


# function shows visualization for death rate bar graphs
def show_visualization(age_df, gender_df, disease_df, demographic):
    """Display the death rate bar graph for the chosen demographic.

    All three graphs are built with compare_mortality; 'Age' shows the age
    graph, 'Gender' shows the gender graph, and any other value shows the
    pre-existing conditions graph.
    """
    age_fig, gender_fig, disease_fig = compare_mortality(age_df, gender_df, disease_df)

    if demographic == 'Age':
        chosen = age_fig
    else:
        chosen = gender_fig if demographic == 'Gender' else disease_fig

    chosen.show()

        
# function creates dataframe from age groupings csv data - shows percentages of each age group (Age_Breakdown.csv)
def age_to_df():
    """Load 'Age_Breakdown.csv' into a one-row dataframe.

    The first CSV row supplies the column names and the second row supplies
    the values; values are kept as strings. The file is opened in a
    with-block so it is closed after reading (previously it was left open).

    Returns:
        DataFrame: a single row (index 0) with one column per header entry.

    Raises:
        IndexError: if the second row is missing or has fewer entries than
            the header row.
    """
    # newline='' is what the csv module expects for files handed to csv.reader
    with open('Age_Breakdown.csv', newline='') as csv_file:
        rows = list(csv.reader(csv_file))

    # pair each header entry with the value at the same position in the second row
    data = {column: rows[1][position] for position, column in enumerate(rows[0])}

    return pd.DataFrame(data, index=[0])


# function returns a dictionary that stores the estimated number of people in each age group
# (as integers) and the total population as an integer
def show_age_floats(age_df):
    """Estimate how many people fall into each age bracket.

    The first row of age_df supplies a percentage string for each of the
    columns 'Age 0-14', 'Age 15-24', 'Age 25-64' and 'Age 65+' plus the total
    in '2013 Population'. Each bracket's head count is the total multiplied
    by the bracket's share, truncated to an integer.

    The total is read from the first row explicitly; calling int() on the
    whole column is deprecated in recent pandas and fails for frames with
    more than one row.

    Args:
        age_df: dataframe as produced by age_to_df().

    Returns:
        tuple: (pop_breakdown, population) where pop_breakdown maps
        '0-14', '15-24', '25-64', '65+' to integer head counts and
        population is the integer total.
    """
    age_brackets = ['0-14', '15-24', '25-64', '65+']

    first_row = age_df.iloc[0]
    population = int(first_row['2013 Population'])

    pop_breakdown = {}
    for bracket in age_brackets:
        # e.g. '26%' -> 0.26
        share = float(first_row['Age ' + bracket].strip('%')) / 100
        pop_breakdown[bracket] = int(population * share)

    return pop_breakdown, population
    
    
# function returns a dictionary with updated breakdown of death rates for each age group
# takes in dictionary of population spread and dataframe for the death rates by age
# standardizes the other dictionary of age vulnerability
def get_age_dict(death_rates, pop_breakdown):
    """Average the age death rates into the brackets used by pop_breakdown.

    The 'Death Rate' percentage strings are converted to fractions and their
    order is reversed. Each recognised key of pop_breakdown then receives the
    mean of a fixed set of positions in that reversed list:

    * '0-14'  -> positions 0 and 1
    * '15-24' -> positions 1 and 2
    * '25-64' -> positions 2 to 6
    * '65+'   -> the last two positions

    Keys of pop_breakdown other than these four are skipped.

    Args:
        death_rates: dataframe with a 'Death Rate' column of percentage strings.
        pop_breakdown: dictionary whose keys name the age brackets.

    Returns:
        dict: bracket name -> averaged death rate as a fraction.
    """
    rates = [float(text.strip('%')) /100 for text in death_rates['Death Rate']][::-1]

    # how each bracket is assembled from the reversed rate list
    combine = {
        '0-14': lambda r: (r[0] + r[1]) / 2,
        '15-24': lambda r: (r[1] + r[2]) / 2,
        '25-64': lambda r: (r[2] + r[3] + r[4] + r[5]  + r[6]) / 5,
        '65+': lambda r: (r[len(r)-1] + r[len(r)-2]) / 2,
    }

    return {
        bracket: combine[bracket](rates)
        for bracket in pop_breakdown
        if bracket in combine
    }


# function returns a dictionary that holds gender death rates
# takes in dataframe for gender death rates
def get_gender_dict(gender_df):
    """Convert the gender death rate table into a dictionary of fractions.

    Args:
        gender_df: dataframe with a 'Death Rate' column of percentage strings.

    Returns:
        dict: index label -> death rate as a fraction (e.g. '4.7%' -> 0.047).
    """
    return {
        label: float(gender_df.at[label, 'Death Rate'].strip('%')) / 100
        for label in gender_df.index
    }


# function returns a dictionary that holds disease death rates
# takes in dataframe for disease death rates
def get_disease_dict(disease_df):
    """Convert the pre-existing condition death rate table into a dictionary of fractions.

    Args:
        disease_df: dataframe with a 'Death Rate' column of percentage strings.

    Returns:
        dict: index label -> death rate as a fraction (e.g. '10.5%' -> 0.105).
    """
    return {
        label: float(disease_df.at[label, 'Death Rate'].strip('%')) / 100
        for label in disease_df.index
    }


# function uses breakdown of age groups and death rates to show estimate of how many are dead if infected
# returns a dataframe containing breakdown of estimated deaths according to age and gender
def mortality_to_df(age_dict, gender_dict, pop_dict, population):
    """Estimate deaths for every age group / gender combination.

    For each key of age_dict and each gender, the estimate is half of that
    age group's population (a 50/50 gender split is assumed) multiplied by
    the gender's death rate. '% of Dead' is each combination's share of the
    summed estimates, formatted to four decimals; when the summed estimate is
    zero every share is reported as '0.0000%' instead of dividing by zero.

    NOTE(review): only the keys of age_dict are used -- the age-specific
    death rates themselves do not enter the estimate -- and the population
    argument is not used at all. Confirm whether that is intended.

    Args:
        age_dict: dictionary keyed by age group.
        gender_dict: gender -> death rate as a fraction.
        pop_dict: age group -> number of people in that group.
        population: total population (currently unused).

    Returns:
        DataFrame: columns 'Gender', 'Age Group', 'Estimated Deaths'
        (truncated to int) and '% of Dead', one row per combination.
    """
    age_groups = []
    genders = []
    total_deaths = []

    for age_group in age_dict:
        for gender, gender_rate in gender_dict.items():
            # assume 50% of each age group belongs to each gender
            total_deaths.append((pop_dict[age_group] / 2) * gender_rate)
            age_groups.append(age_group)
            genders.append(gender)

    total_sum = sum(total_deaths)

    # share of all estimated deaths per combination; guard against a zero total
    if total_sum:
        shares = [deaths / total_sum for deaths in total_deaths]
    else:
        shares = [0.0] * len(total_deaths)
    death_rates = ['{:.4f}%'.format(share * 100) for share in shares]

    deaths_df = pd.DataFrame({'Gender': genders,
                              'Age Group' : age_groups,
                              'Estimated Deaths' : [int(deaths) for deaths in total_deaths],
                              '% of Dead' : death_rates})

    return deaths_df


# function interacts with user by selecting demographic information and showing overall death rate
# returns a dataframe containing info for an individual with the selected conditions
def interact_risk(age_group, num_deaths_df, gender):
    """Pick the estimated-deaths row for one gender and one age group.

    Rows are matched by position: the first value must equal gender and the
    second must equal age_group. If several rows match, the last one is used;
    if none match, the result has no rows.

    Args:
        age_group: value to match against the second column.
        num_deaths_df: dataframe of estimated deaths per demographic.
        gender: value to match against the first column.

    Returns:
        DataFrame: the selected row, labelled with num_deaths_df's columns.
    """
    matches = [
        record
        for record in num_deaths_df.values.tolist()
        if (record[0] == gender) and (record[1] == age_group)
    ]
    selected = matches[-1] if matches else []

    # build a single column labelled by the original column names, then flip it into a row
    as_column = pd.DataFrame(selected, index=num_deaths_df.columns)
    return as_column.transpose()


# function plots estimated deaths based on age and gender on sunburst visualization
# takes in argument for dataframe with estimated deaths by demographic
def create_sunburst(num_deaths_df):
    """Show a sunburst chart of estimated deaths and return the figure.

    The chart is nested by 'Gender' then 'Age Group', sized by
    'Estimated Deaths', with '% of Dead' added to the hover text.

    Args:
        num_deaths_df: dataframe of estimated deaths per demographic.

    Returns:
        The sunburst figure (after it has been displayed).
    """
    nesting = ['Gender', 'Age Group']
    sunburst = px.sunburst(
        num_deaths_df,
        path=nesting,
        values='Estimated Deaths',
        hover_data=['% of Dead'],
        title='Estimated Deaths According to Demographics',
    )
    sunburst.show()
    return sunburst
        
        
if __name__ == '__main__':
    # ---- case data: load every update, attach populations and case ratios ----
    pop_dict = open_populations()
    
    df_list = add_populations(pop_dict)
    df_list = pop_percentage(df_list)
    
    main_df = convert_main_df(df_list)
    
    date_list, country_list = store_dates_countries(main_df)
    
    # display specific country info
    print('Enter the name of a country and a date from the drop down menus to display Coronavirus information:')
    # NOTE: interact() hands back the wrapped function, not a dataframe; the
    # name is kept unchanged in case other cells refer to it
    country_df = interact(display_country, df_list=fixed(df_list), country=country_list, date=date_list)
    
    # plot choropleth maps
    fig1, fig2 = show_choropleth(main_df)
    print('\n \n')
    
    # ---- demographic death rates scraped from the HTML page ----
    type_viz = ['Age', 'Gender', 'Pre-Existing Conditions']
    print('Choose a demographic type from the dropdown menu to display mortality rates:')
    age_rates, gender_rates, disease_rates = data_to_dataframe(get_data())
    interact(show_visualization, age_df=fixed(age_rates), gender_df=fixed(gender_rates),
             disease_df=fixed(disease_rates), demographic=type_viz)
    
    # ---- estimated deaths by age group and gender ----
    age_breakdown = age_to_df()
    pop_breakdown, total_pop = show_age_floats(age_breakdown)
    
    age_dict = get_age_dict(age_rates, pop_breakdown)
    gender_dict = get_gender_dict(gender_rates)
    disease_dict = get_disease_dict(disease_rates)
    
    num_deaths_df = mortality_to_df(age_dict, gender_dict, pop_breakdown, total_pop)
    
    print('Enter options from the following dropdown menus to display estimated deaths: \n')
    # interact_risk has no 'conditions' parameter, so only the two options it
    # actually accepts are offered; the option lists are materialised as lists
    interact(interact_risk, num_deaths_df=fixed(num_deaths_df), age_group=list(age_dict),
             gender=list(gender_dict))
    
    fig6 = create_sunburst(num_deaths_df)
    # the total is taken from the loaded data instead of being typed into the text,
    # so the caption cannot drift away from Age_Breakdown.csv
    description = '''
        The visualization above shows the estimated age and gender breakdown for those dying of Coronavirus. 
        Though older individuals have a greater likelihood of dying from the disease, the fact that they make  
        up a much smaller percentage of the population leads to the estimated counts that are displayed. 
        The death percentages are taken out of a total population of {:,}.'''.format(total_pop)
    print(description)
    
    fig3, fig4, fig5 = compare_mortality(age_rates, gender_rates, disease_rates)
    
    # only the percentage choropleth is exported to the website file
    pio.write_html(fig2, file='CoronaWebsite.html', auto_open=True)
    
    
    

Enter the name of a country and a date from the drop down menus to display Coronavirus information:


interactive(children=(Dropdown(description='country', options=('Afghanistan', 'Albania', 'Algeria', 'Andorra',…


        The choropleth map above shows the raw number of confirmed Coronavirus cases in each country. Use the slider  
        to animate the spread of the disease starting from March 1 to the current day.



        The choropleth map above shows the number of confirmed Coronavirus cases as a percentage of each country's 
        total population. Use the slider to animate the spread of the disease starting from March 1 to the current day.

 

Choose a demographic type from the dropdown menu to display mortality rates:


interactive(children=(Dropdown(description='demographic', options=('Age', 'Gender', 'Pre-Existing Conditions')…

Enter options from the following dropdown menus to display estimated deaths: 



interactive(children=(Dropdown(description='age_group', options=('0-14', '15-24', '25-64', '65+'), value='0-14…


        The visualization above shows the estimated age and gender breakdown for those dying of Coronavirus. 
        Though older individuals have a greater likelihood of dying from the disease, the fact that they make  
        up a much smaller percentage of the population leads to the estimated counts that are displayed. 
        The death percentages are taken out of a total population of 7,162,119,434.


In [13]:
import nbinteract as nbi

nbinteract Website_Design.ipynb

SyntaxError: invalid syntax (<ipython-input-13-9f21ab5ac33a>, line 3)