# Data Visualization with Dashboard

This notebook provides the full evaluation of delivery 3 for the course "Enterprise Data Science". The individual steps are separated into different parts. The dynamic dashboard implements a diagram that shows the COVID-19 cases for all countries in the data set, together with the filtered case numbers and the doubling rate. As a bonus, it also shows the percentage of vaccinated people (the `people_vaccinated_per_hundred` column of the data set).

Unlike the provided notebook, OurWorldInData is used as the data source because it provides much more data, including the vaccination figures. As a consequence, many parts had to be adapted to the new data source.

## 1. Fetch New Data from OurWorldInData

In [1]:
# %load "../src/data/get_data.py"
import pandas as pd

def get_data():
    """Download the current Our World in Data COVID-19 data set.

    The CSV is read from the OWID server (comma separated) and stored
    unchanged under ``../data/raw/covid_full_data.csv`` using ``;`` as the
    separator.
    """
    source_url = "https://covid.ourworldindata.org/data/owid-covid-data.csv"
    target_path = "../data/raw/covid_full_data.csv"

    pd.read_csv(source_url, sep=',').to_csv(target_path, sep=";")


# Fetch the raw data only when this module is executed as a script.
if __name__ == '__main__':
    get_data()

## 2. Process Data

In [1]:
# %load "../src/data/process_owid_data.py"
import pandas as pd

def process_owid_data():
    """Reduce the raw OWID data to the columns needed by the dashboard.

    Reads ``../data/raw/covid_full_data.csv``, keeps the date, location,
    total case count and vaccination share, sorts the rows by date and
    writes the result to ``../data/processed/data_owid_selection.csv``.
    """
    relevant_columns = ['date', 'location', 'total_cases', 'people_vaccinated_per_hundred']

    df_raw = pd.read_csv("../data/raw/covid_full_data.csv", sep=";")
    df_sorted = df_raw[relevant_columns].sort_values('date', ascending=True)
    df_selection = df_sorted.reset_index(drop=True).copy()

    # Western Sahara has too little data for the rolling window, so it is removed.
    keep_row = df_selection['location'] != 'Western Sahara'
    df_selection = df_selection[keep_row]

    df_selection.to_csv("../data/processed/data_owid_selection.csv", sep=";")

# Build the processed selection only when this module is executed as a script.
if __name__ == '__main__':
    process_owid_data()

## 3. Filter Data and calculate doubling rate

In [18]:
# %load "../src/features/build_features.py"
from scipy import signal
import pandas as pd
import numpy as np
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept=True)

def savgol_filter(df_input, column='total_cases', window=5, degree=1):
    ''' Savitzky-Golay filter which can be used in a groupby apply function;
        it ensures that the data structure is kept.

        Parameters:
        ----------
        df_input: pd.DataFrame
            data of one group; must contain ``column``
        column: str
            name of the column to be filtered
        window: int
            window size used for filtering; passed to scipy as
            ``window_length`` (scipy's default edge handling requires it to
            be no larger than the number of rows)
        degree: int
            order of the polynomial fitted inside each window

        Returns:
        ----------
        df_result: pd.DataFrame
            copy of ``df_input`` with the additional column
            ``<column>_filtered``
    '''
    # Work on a copy so the caller's frame (or groupby slice) is not modified.
    df_result = df_input.copy()

    # Missing values are replaced by 0 before filtering.
    # attention with the neutral element here
    filter_in = df_input[column].fillna(0)

    df_result[column + '_filtered'] = signal.savgol_filter(
        np.array(filter_in),
        window,  # window size used for filtering
        degree,  # polynomial order
    )
    return df_result

def get_doubling_time_via_regression(in_array):
    ''' Use a linear regression to approximate the doubling rate

        A straight line is fitted by ordinary least squares (with intercept)
        through the three values at the positions -1, 0 and 1. The returned
        value is intercept / slope. The fit is computed in closed form with
        NumPy so that a plain scalar is returned, as required by
        ``rolling(...).apply``.

        Parameters:
        ----------
        in_array : pandas.series
            exactly three consecutive values

        Returns:
        ----------
        Doubling rate: double
            ``inf`` (or ``nan`` if the intercept is 0 as well) when the
            slope is zero
    '''

    y = np.array(in_array, dtype=float)
    assert len(y) == 3

    x = np.arange(-1, 2, dtype=float)

    # Closed-form least squares on centred data: slope = cov(x, y) / var(x)
    x_centered = x - x.mean()
    y_mean = y.mean()
    slope = np.dot(x_centered, y - y_mean) / np.dot(x_centered, x_centered)
    intercept = y_mean - slope * x.mean()

    # A flat series has slope 0; keep NumPy's inf/nan result without the warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        return float(np.float64(intercept) / np.float64(slope))

def rolling_reg(df_input,col='total_cases'):
    ''' Approximate the doubling time with a regression over a rolling window

        Parameters:
        ----------
        df_input: pd.DataFrame
        col: str
            name of the column the rolling regression is applied to
        Returns:
        ----------
        result: pd.Series
            one value per row of ``df_input``; NaN where fewer than three
            observations are available in the window
    '''
    window_size = 3  # must match the length expected by the regression helper

    rolling_windows = df_input[col].rolling(window=window_size,
                                            min_periods=window_size)
    return rolling_windows.apply(get_doubling_time_via_regression, raw=False)

def calc_filtered_data(df_input,filter_on='total_cases'):
    '''  Calculate savgol filter and return merged data frame

        Parameters:
        ----------
        df_input: pd.DataFrame
            must contain the columns 'location' and ``filter_on``
        filter_on: str
            defines the used column
        Returns:
        ----------
        df_output: pd.DataFrame
            the result will be joined as a new column
            ``<filter_on>_filtered`` on a copy of the input data frame
    '''

    must_contain = {'location', filter_on}
    assert must_contain.issubset(set(df_input.columns)), ' Erro in calc_filtered_data not all columns in data frame'

    df_output = df_input.copy() # we need a copy here otherwise the filter_on column will be overwritten

    filtered_column = filter_on + '_filtered'

    # group_keys=False keeps the original row index on the result, so the
    # index-based merge below lines up regardless of the pandas version.
    # filter_on is forwarded so that columns other than the default are filtered.
    pd_filtered_result = (
        df_output[['location', filter_on]]
        .groupby(['location'], group_keys=False)
        .apply(savgol_filter, filter_on)
    )

    df_output = pd.merge(df_output, pd_filtered_result[[filtered_column]],
                         left_index=True, right_index=True, how='left')
    return df_output.copy()





def calc_doubling_rate(df_input,filter_on='total_cases'):
    ''' Calculate approximated doubling rate and return merged data frame

        Parameters:
        ----------
        df_input: pd.DataFrame
            must contain the columns 'location' and ``filter_on``
        filter_on: str
            defines the used column
        Returns:
        ----------
        df_output: pd.DataFrame
            the result will be joined as a new column ``<filter_on>_DR``
            on the input data frame
    '''

    must_contain = {'location', filter_on}
    assert must_contain.issubset(set(df_input.columns)), ' Error in calc_doubling_rate not all columns in data frame'

    dr_column = filter_on + '_DR'

    # Per-location rolling regression; after reset_index the original row
    # index is expected in the column 'level_1'.
    pd_DR_result = df_input.groupby(['location']).apply(rolling_reg, filter_on).reset_index()

    pd_DR_result = pd_DR_result.rename(columns={filter_on: dr_column,
                                                'level_1': 'index'})

    #we do the merge on the index of our big table and on the index column after groupby
    df_output = pd.merge(df_input, pd_DR_result[['index', dr_column]],
                         left_index=True, right_on=['index'], how='left')
    df_output = df_output.drop(columns=['index'])

    return df_output

if __name__ == "__main__":
    # Feature pipeline: load the processed selection, add the filtered case
    # numbers, then the doubling rate of the raw and of the filtered series.
    df_processed_data = calc_filtered_data(
        pd.read_csv("../data/processed/data_owid_selection.csv", sep=";")
    )
    for source_column in ('total_cases', 'total_cases_filtered'):
        df_processed_data = calc_doubling_rate(df_processed_data, source_column)

    df_processed_data.to_csv("../data/processed/data_doubling_filtered.csv", sep=";", index=False)

## 4. Visualization

In [2]:
# %load "../src/visualization/visualize.py"
import pandas as pd
import numpy as np
import plotly.graph_objects as go

import dash
from dash import dcc as doc
from dash.dependencies import Input, Output
from dash import html
import dash_daq as daq

import numpy as np

# Empty placeholder figure; the callback replaces it on first render.
fig = go.Figure()

# Feature table written by the build_features step (one row per location and date).
df_covid = pd.read_csv("../data/processed/data_doubling_filtered.csv", sep=";")
countries = df_covid['location'].unique()

app = dash.Dash()
app.layout = html.Div([
    html.H1('Dynamic Covid-19 Dashboard'),
    # Short description of the dashboard (previously a truncated sentence).
    html.P('This dynamic dashboard shows the confirmed COVID-19 cases, their filtered values, '
           'the approximated doubling rate and the share of vaccinated people for the selected countries.'),
    html.Label('Select the countries to display:'),
    doc.Dropdown(
        id = 'country_drop_down',
        options=[{'label': country, 'value': country} for country in countries],
        value=['Germany'],        # which are pre-selected
        multi=True
    ),
     doc.Markdown('''
        ## Select Timeline of confirmed COVID-19 cases or the approximated doubling time
        '''),


    # The option values are column names of df_covid and are used as such by the callback.
    doc.Dropdown(
    id='doubling_time',
    options=[
        {'label': 'Timeline Total Cases', 'value': 'total_cases'},
        {'label': 'Timeline Total Cases Filtered', 'value': 'total_cases_filtered'},
        {'label': 'Timeline Doubling Rate', 'value': 'total_cases_DR'},
        {'label': 'Timeline Doubling Rate Filtered', 'value': 'total_cases_filtered_DR'},
        # NOTE(review): per the OWID codebook this column counts people with at
        # least one dose; fully vaccinated would be people_fully_vaccinated_per_hundred.
        {'label': 'Percentage Of Vaccinated People (At Least One Dose)', 'value': 'people_vaccinated_per_hundred'},
    ],
    value='total_cases',
    multi=False
    ),
    daq.BooleanSwitch(id='loglin_switch', on=False, label="Logarithmic scale", labelPosition="top"),
    doc.Graph(figure=fig,id='main_window_slope')
])
@app.callback(
    Output('main_window_slope', 'figure'),
    [Input('country_drop_down', 'value'), Input('loglin_switch', 'on'), Input('doubling_time', 'value')])
def update_figure(countries_to_show, switch_state, show_doubling):
    """Build the figure for the selected countries, scale and quantity.

    Parameters
    ----------
    countries_to_show : list of str or None
        Locations chosen in the country dropdown; an empty or cleared
        selection yields a figure without traces.
    switch_state : bool
        True for a logarithmic y axis, False for a linear one.
    show_doubling : str or None
        Name of the ``df_covid`` column to plot; falls back to
        'total_cases' when the dropdown has been cleared.

    Returns
    -------
    dict
        Figure description with the keys 'data' and 'layout'.
    """
    ylabels = {'total_cases':'Total Cases of Infected People', 'total_cases_filtered':'Total Cases of Infected People (Filtered)',
                 'total_cases_DR': 'Doubling Rate of Infected People', 'total_cases_filtered_DR': 'Doubling Rate of Infected People (Filtered)',
                 # NOTE(review): per the OWID codebook this column counts people with at least one dose.
                 'people_vaccinated_per_hundred': 'Percentage of Vaccinated People (At Least One Dose)'}
    value_columns = ['total_cases', 'total_cases_filtered', 'total_cases_DR',
                     'total_cases_filtered_DR', 'people_vaccinated_per_hundred']

    # A cleared dropdown delivers None; use the dropdown's default column then.
    if show_doubling not in ylabels:
        show_doubling = 'total_cases'

    # Case counts are summed; rates and percentages are averaged so that
    # missing values stay missing instead of being summed up to 0.
    # (The former check against 'doubling_rate_filtered' never matched any
    # dropdown value, so everything was summed.)
    aggregation = 'sum' if show_doubling in ('total_cases', 'total_cases_filtered') else 'mean'

    traces = []
    for country in countries_to_show or []:
        df_country = df_covid.loc[df_covid['location'] == country,
                                  ['location', 'date'] + value_columns]
        df_plot = df_country.groupby(['location', 'date']).agg(aggregation).reset_index()

        # Nested dicts instead of 'line_width'/'marker_size': the underscore
        # shorthand is a plotly.py constructor feature, not part of a plain figure dict.
        traces.append(dict(x=df_plot.date,
                             y=df_plot[show_doubling],
                             name=country,
                             opacity=0.9,
                             line={'width': 2},
                             marker={'size': 4},
                             mode='markers+lines'
                          )
                     )

    return {
        'data': traces,
        'layout': dict(width=1280,
                        height=720,
                        title="Covid-19 Dashboard",
                      xaxis={'tickangle':-45,
                            'nticks':20,
                            'tickfont':dict(size=14,color='#7f7f7f'),
                             'title':'Time',
                            },
                      # No fixed 'range': the former value was a string instead of a
                      # list, and a fixed range would not fit all selectable
                      # quantities, so the axis is left to autorange.
                      yaxis={
                          'type': ('log' if switch_state else 'linear'),
                          'title': ylabels[show_doubling] + (', Logarithmic' if switch_state else ''),
                      })
    }

# Start the Dash server only when executed as a script; debug mode is on and
# the reloader is switched off.
if __name__ == '__main__':
    app.run(debug=True, use_reloader=False)

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on
