In [6]:
import pandas as pd
import plotly.express as px
import numpy as np
pd.set_option('display.float_format', lambda x: '%.0f' % x)
import plotly.graph_objects as go
from sklearn.preprocessing import normalize



In [7]:
# Load the COVID-19 dataset (owid-covid-data.csv) from a CSV file on local disk.
# NOTE(review): this is an absolute, user-specific path, so the cell only runs on the
# original author's machine - consider a configurable, relative data directory.
df = pd.read_csv('/Users/sam.ho/Documents/sam_personal/streamlit_apps/covid-19-data/public/data/owid-covid-data.csv')

In [8]:
# Quick sanity summary of the raw frame: number of distinct continent / location values
# and the first and last date present.
# pd.unique keeps a missing value (NaN) as its own entry, so it is included in the counts.
print('Total unique continents:', len(pd.unique(df['continent'])))
print('Total unique countries:', len(pd.unique(df['location'])))
print('Data span:', df['date'].min(), df['date'].max())

Total unique continents: 7
Total unique countries: 231
Data span: 2020-01-01 2021-07-11


# Plotly Formatting Functions
---

In [9]:
def plotly_streamlit_layout(fig, barmode=None, barnorm=None, height=None,width=None):
    """Apply the notebook's shared layout styling to a Plotly figure.

    Parameters
    ----------
    fig : figure object exposing ``update_layout``, ``update_xaxes`` and ``update_yaxes``
    barmode, barnorm, height, width : optional
        Forwarded unchanged to ``fig.update_layout`` (``None`` when not given).

    Returns
    -------
    The same ``fig`` object that was passed in, after the updates were applied.
    """
    axis_line = dict(showline=True, linewidth=1, linecolor='black')
    frame_margin = dict(l=50, r=50, b=50, t=50, pad=2)

    # caller-controlled layout options first
    fig.update_layout(barmode=barmode, barnorm=barnorm, height=height, width=width)

    # thin black axis lines on both axes
    fig.update_xaxes(**axis_line)
    fig.update_yaxes(**axis_line)

    # fixed margins and a small gap between bars
    fig.update_layout(margin=frame_margin)
    fig.update_layout(bargap=0.03)

    return fig

def plotly_streamlit_texts(fig, x_title, y_title):
    """Set the x/y axis titles and use font size 10 for axis titles and tick labels.

    The title font is set through the nested ``title.font`` attribute rather than the
    legacy ``titlefont`` shorthand, which Plotly has deprecated in favour of ``title.font``.

    Parameters
    ----------
    fig : figure object exposing ``update_layout``
    x_title : title text for the x axis
    y_title : title text for the y axis

    Returns
    -------
    The same ``fig`` object that was passed in.
    """
    def _axis_style(title_text):
        # explicit nested dicts: title text + title font, and the tick label font
        return dict(title=dict(text=title_text, font=dict(size=10)),
                    tickfont=dict(size=10))

    fig.update_layout(yaxis=_axis_style(y_title),
                      xaxis=_axis_style(x_title))

    return fig

## Filling and Interpolating Missing Values
---

In [10]:
def get_indexes(x):
    """Split the integer index labels of ``x`` into a leading-gap part and the remainder.

    Parameters
    ----------
    x : pd.Series
        Series with integer index labels and at least one non-null value.

    Returns
    -------
    (index_fill_1, index_interpolate) : tuple of two lists of int
        ``index_fill_1`` holds the labels from the first label up to, but excluding, the
        label of the first non-null entry. ``index_interpolate`` holds the labels from the
        first non-null entry up to and including the last label.

    Raises
    ------
    IndexError
        If ``x`` is empty or has no non-null values.
    """
    first_label = x.index[0]
    last_label = x.index[-1]
    first_valid_label = x.dropna().index[0]

    index_fill_1 = list(range(first_label, first_valid_label))
    # +1 because range() excludes its stop value: without it the final label is in
    # neither list and callers that filter on these labels silently drop the last row
    index_interpolate = list(range(first_valid_label, last_label + 1))
    return index_fill_1,index_interpolate

def update_series(x):
    """Fill the gaps of a numeric series: leading gap -> 1, later gaps -> interpolated.

    * If ``x`` has no non-null value at all, every entry becomes 1.
    * Otherwise missing values before the first non-null entry become 1, and missing
      values from the first non-null entry onwards are filled by ``Series.interpolate``
      (linear between known points; trailing gaps repeat the last known value).

    The result has the same length and index as ``x`` (including the final row) and does
    not depend on the index being made of sorted integer labels.

    Parameters
    ----------
    x : pd.Series

    Returns
    -------
    pd.Series
        A new series with the gaps filled; ``x`` itself is not modified.
    """
    if not x.notna().any():
        return x.fillna(1)

    # Forward-direction interpolation leaves the values before the first known point
    # untouched (still NaN), so the fillna that follows only affects that leading gap.
    return x.interpolate(limit_direction='forward').fillna(1)

    

In [11]:
def update_series(x):
    """Fill the gaps of a numeric series: leading gap -> 1, later gaps -> interpolated.

    * If ``x`` has no non-null value at all, every entry becomes 1.
    * Otherwise missing values before the first non-null entry become 1, and missing
      values from the first non-null entry onwards are filled by ``Series.interpolate``
      (linear between known points; trailing gaps repeat the last known value).

    The result has the same length and index as ``x`` (including the final row) and does
    not depend on the index being made of sorted integer labels.

    Parameters
    ----------
    x : pd.Series

    Returns
    -------
    pd.Series
        A new series with the gaps filled; ``x`` itself is not modified.
    """
    if not x.notna().any():
        return x.fillna(1)

    # Forward-direction interpolation leaves the values before the first known point
    # untouched (still NaN), so the fillna that follows only affects that leading gap.
    return x.interpolate(limit_direction='forward').fillna(1)

    

In [12]:
# Columns that get their gaps filled / interpolated for each country before the data is
# aggregated to weekly rows.
transform_cols = [
    'people_vaccinated_per_hundred',
    'total_vaccinations',
    'total_deaths',
    'total_deaths_per_million',
    'total_cases_per_million',
    'icu_patients_per_million',
    'hosp_patients_per_million',
]

In [13]:
def get_clean_covid_data_new(df):
    """Build a weekly, per-location version of the raw COVID frame for plotting.

    For every distinct ``location`` the rows are sorted by date, the columns listed in the
    module-level ``transform_cols`` are gap-filled with ``update_series``, and the rows are
    aggregated to weekly rows holding the maximum of each column within that week. The
    per-location frames are then stacked and cleaned up (see the inline comments).

    Parameters
    ----------
    df : pd.DataFrame
        Raw data with at least ``location``, ``continent``, ``date``, ``population`` and
        every column named in ``transform_cols``. It is not modified.

    Returns
    -------
    pd.DataFrame
        One row per location and week from 2020-02-09 onwards, sorted by date, with
        ``date`` as a string column, remaining missing values replaced by 0, rows without
        a continent removed and duplicate column labels dropped.
    """

    country_dfs = []

    # loop through each country
    for country in df.location.unique():
        # .copy() gives an independent frame, so the assignments below act on it directly
        # instead of on a slice of ``df`` (which triggers pandas' SettingWithCopyWarning)
        df_country = df[df.location == country].copy()
        df_country['date'] = pd.to_datetime(df_country['date'])  # string date -> datetime
        df_country = df_country.sort_values(by='date')  # sort by date

        for col in transform_cols:
            df_country[col] = update_series(df_country[col])

        # we will group by week and use max as agg so each row will represent the max value in any given week
        df_country = df_country.groupby(pd.Grouper(key='date', freq='W')).max()

        country_dfs.append(df_country)  # append unique country dataframe to list

    df_final = pd.concat(country_dfs)

    df_final = df_final.reset_index().sort_values(by=['location', 'date'])

    # we only want countries so no need to include continent / world aggregates at this stage
    # we can apply aggregation later on
#     df_final = df_final[~df_final.location.isin(df.continent)]
#     df_final = df_final[~df_final.location.isin(['World', 'European Union'])]
    df_final = df_final.fillna(0)

    # this is a hack to make sure Plotly animations and legends work properly
#     df_final.total_vaccinations = df_final.total_vaccinations.apply(lambda x: int(2) if x == 0 else x)

    df_final.population = df_final.population.astype(int)
    df_final.total_vaccinations = df_final.total_vaccinations.astype(int)

    df_final = df_final.sort_values(by='date', ascending=True)
    df_final = df_final[df_final.date >= '2020-02-09']
    df_final.date = df_final.date.astype(str)
    # missing continents were replaced by 0 in the fillna above; drop those rows
    df_final = df_final[df_final.continent != 0]

    # more Plotly hacks
#     up_zero = df_final[df_final.date == '2020-02-09']
#     normal = df_final[df_final.date != '2020-02-09']
#     up_zero.total_deaths = 1

#     df_final = pd.concat([up_zero, normal])
#     df_final = df_final[df_final.continent != 0]

    # keep only the first occurrence of any repeated column label
    df_final = df_final.loc[:, ~df_final.columns.duplicated()]
    return df_final

# Columns kept for the app. Every name is listed exactly once: selecting a column twice
# creates duplicate column labels, and re-running `df = df[cols_for_app]` on such a frame
# would multiply those columns again.
cols_for_app = ['continent','location','date','total_deaths','total_deaths_per_million',
         'total_cases_per_million','icu_patients_per_million','hosp_patients_per_million',
         'people_vaccinated_per_hundred','total_vaccinations',
         'gdp_per_capita','population','stringency_index',
         'population_density', 'median_age', 'aged_65_older',
         'aged_70_older', 'extreme_poverty',
         'cardiovasc_death_rate', 'diabetes_prevalence',
         'female_smokers','male_smokers', 'handwashing_facilities',
         'hospital_beds_per_thousand','life_expectancy']

# restrict the raw frame to the columns above, then build the cleaned weekly frame
df = df[cols_for_app]

df_final = get_clean_covid_data_new(df)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



# Index Analysis
---

In [14]:
# One row per location, indexed by location, taken from columns 1-2 of df_final
# (presumably continent and location - verify against the column order of df_final).
df_country_unique = (
    df_final[df_final.columns[1:3]]
    .drop_duplicates(subset=['location'], keep='first')
    .set_index('location')
)

# Per-location maximum of every column from position 3 onwards, placed side by side with
# the frame above; the location index is then turned back into a regular column.
df_analysis = (
    pd.concat(
        [df_country_unique, df_final.groupby('location')[df_final.columns[3:]].max()],
        axis=1,
    )
    .reset_index()
    .rename(columns={'index': 'location'})
)

# Display label -> df_analysis column name, for the variables offered on the plot axes.
variable_dic = {
    'GDP per Capita': 'gdp_per_capita',
    'Population': 'population',
    'Stringency_index': 'stringency_index',
    'Population Density': 'population_density',
    'Median Age': 'median_age',
    'Aged 65 or older': 'aged_65_older',
    'Aged 70 or older': 'aged_70_older',
    'Extreme Poverty': 'extreme_poverty',
    'Cardiovascular Death Rate': 'cardiovasc_death_rate',
    'Diabetes Prevalance': 'diabetes_prevalence',
    'Female Smokers': 'female_smokers',
    'Male Smokers': 'male_smokers',
    'Handwashing Facilities': 'handwashing_facilities',
    'Hospital Beds per Thousand': 'hospital_beds_per_thousand',
    'Life Expectancy': 'life_expectancy',
}

# Display label -> column used to colour the markers.
colour_dic = {
    'Country': 'location',
    'Continent': 'continent',
}

# Display label -> column used to size the markers (None means no sizing).
size_dic = {
    'Total Deaths': 'total_deaths',
    'Total Deaths per Million': 'total_deaths_per_million',
    'Total Cases per Million': 'total_cases_per_million',
    'People Vaccinated per Hundred': 'people_vaccinated_per_hundred',
    'No sizing': None,
}

# c1,c2, c3, c4 = st.beta_columns((2,2,2,2))

# x_metric = c1.selectbox('X axis', list(variable_dic.keys()), 5)
# y_metric = c2.selectbox('Y axis', list(variable_dic.keys()), 2)
# size_by = c3.selectbox('Size markers by', list(size_dic.keys()), 0)
# colour_by = c4.selectbox('Colour markers by', ('Country', 'Continent'), 0)
# marker_size = st.slider('Adjust marker size for readability', 1, 1000, step=1)

# x = variable_dic[x_metric]
# y= variable_dic[y_metric]
# size_by = size_dic[size_by]
# colour_by = colour_dic[colour_by]

# .copy() makes df_plot an independent frame, so adding the colour columns below does not
# assign into a slice of df_analysis (the cause of pandas' SettingWithCopyWarning).
df_plot = df_analysis[['total_deaths','total_deaths_per_million', 'population_density', 'gdp_per_capita', 'location','continent']].copy()

# Integer code per location / continent, numbered in order of first appearance in df_plot.
# Numbering from a set() would depend on set iteration order, which is not stable between
# interpreter sessions for strings, so the codes could change from run to run.
unique_countries_dic = {name: code for code, name in enumerate(df_plot['location'].unique())}
unique_continents_dic = {name: code for code, name in enumerate(df_plot['continent'].unique())}
df_plot['location_colour'] = df_plot['location'].map(unique_countries_dic)
df_plot['continent_colour'] = df_plot['continent'].map(unique_continents_dic)
# st.dataframe(df_plot.head(10))

# df_plot['size_by_marker'] = df_plot[size_by] * marker_size
from sklearn.preprocessing import normalize

def reshape_for_plot(df, col, marker_size):
    """Return column ``col`` rescaled for use as marker sizes.

    The column is passed to ``normalize`` as a single row (default norm), and the
    normalised values are multiplied by ``marker_size``.

    Returns
    -------
    pd.Series
        The scaled values with a fresh default index (the index of ``df`` is not kept).
    """
    unit_row = normalize([df[col]])[0]  # wrap as one row, then unwrap the normalised row
    return pd.Series(unit_row * marker_size)







A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [17]:
def plot_scatter(df, x, y, size, marker_size, average_kind ='Mean'):
    """Build a log-log scatter figure with one trace (and legend entry) per row of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a ``location`` column plus the columns named by ``x``, ``y``, ``size``.
    x, y : str
        Column names plotted on the x and y axes.
    size : str
        Column that drives the marker size (via ``reshape_for_plot``) and is shown, as an
        integer, in the hover text.
    marker_size : number
        Scale factor handed to ``reshape_for_plot``.
    average_kind : str, default 'Mean'
        'Mean' draws the dashed reference lines at the column means; any other value
        draws them at the medians.

    Returns
    -------
    plotly.graph_objects.Figure
    """

    fig_scatter = go.Figure() # get a graph objects figure
    df_to_plot = df.reset_index(drop=True)
    palette = px.colors.qualitative.Plotly
    n_rows = df_to_plot.shape[0]

    # marker sizes are computed once for the whole frame instead of once per row;
    # skipped for an empty frame, where no trace is drawn anyway
    marker_sizes = reshape_for_plot(df_to_plot, size, marker_size) if n_rows else []
    size_label = size.capitalize().replace('_', ' ')

    # loop through each row and add marker and various bits of meta data
    for i in range(n_rows):
        location = df_to_plot['location'][i]
        fig_scatter.add_trace(go.Scatter(
            x=np.array(df_to_plot[x][i]),
            y=np.array(df_to_plot[y][i]),
            name=location,
            hovertext='<b>' + location + '</b>' + '<br>' + size_label + ' : ' + \
                      str(int(df_to_plot[size][i])),
            hoverinfo="text",
            mode='markers',

            # cycle through the palette so any number of rows gets a colour
            marker=dict(size=marker_sizes[i], opacity=0.5,
                        color=palette[i % len(palette)])))

    # add vertical and horizontal lines to represent mean or median
    if average_kind == 'Mean':
        x_centre, y_centre = df_to_plot[x].mean(), df_to_plot[y].mean()
    else:
        x_centre, y_centre = df_to_plot[x].median(), df_to_plot[y].median()
    fig_scatter.add_vline(x=x_centre, line_width=1, line_dash="dash", line_color="grey")
    fig_scatter.add_hline(y=y_centre, line_width=1, line_dash="dash", line_color="grey")

    # use log scale for x and y - makes plot more readable
    fig_scatter.update_xaxes(type="log")
    fig_scatter.update_yaxes(type="log")

    # some formatting of plot - legend marker size, figure size and show legend
    fig_scatter.update_layout(legend={'itemsizing': 'constant'})
    fig_scatter.update_layout(
                              width=1000,
                              height=1000,
                              showlegend=True)

    # axis title / tick fonts, set through title.font rather than the legacy titlefont
    # shorthand, which Plotly has deprecated in favour of title.font
    fig_scatter.update_layout(
        yaxis=dict(title=dict(text=y.capitalize().replace('_', ' '), font=dict(size=15)),
                   tickfont=dict(size=10)),
        xaxis=dict(title=dict(text=x.capitalize().replace('_',' '), font=dict(size=15)),
                   tickfont=dict(size=10)))

    return fig_scatter

In [21]:



# Cross-plot restricted to rows whose continent is 'Asia'. The styled figure is the last
# expression of the cell, so the notebook renders it.
fig_cross_plot = plot_scatter(
    df_plot[df_plot.continent == 'Asia'],
    x='population_density',
    y='gdp_per_capita',
    size='total_deaths_per_million',
    marker_size=250,
)
plotly_streamlit_layout(fig_cross_plot, height=800, width=1600)

# st.plotly_chart(fig_cross_plot)
# fig_cross_plot.update_traces(marker_size={50})

# fig_cross_plot.update_traces(marker=dict(size=list(df_plot.size_by_marker)))


# Vaccines Animation
---

In [14]:
# Scratch example of list.insert(index, item).
# NOTE(review): the original call left out the index argument, which is a SyntaxError;
# 0 is a placeholder - confirm the intended position, or remove this scratch cell.
x = [1,2,3]
x.insert(0,'po')

In [69]:
# Show the rows of df_final whose location is 'Africa'.
df_final.loc[df_final['location'] == 'Africa']

Unnamed: 0,date,continent,location,total_deaths,total_deaths_per_million,total_cases_per_million,icu_patients_per_million,hosp_patients_per_million,people_vaccinated_per_hundred,total_vaccinations,...,aged_65_older,aged_70_older,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy


In [19]:


# Animated bubble chart: one frame per date, one bubble per location.
fig = px.scatter(
    df_final,
    x="population",
    y="people_vaccinated_per_hundred",
    animation_frame=df_final.date,
    animation_group="location",
    size="total_deaths_per_million",
    hover_name="location",
    log_y=False,
    log_x=True,
    color=df_final.continent,
    # fixed axis ranges so the axes do not rescale between frames
    range_x=[50000 ,df_final.population.max()*1.4],
    range_y=[-10,100],
    size_max=50,
)


# fig.update_layout(height=800,width=1000)

# last expression of the cell: the styled figure is rendered by the notebook
plotly_streamlit_layout(fig, height=800, width=1600)


## Distributions

In [126]:
# Load a CSV from a remote URL, using its first column as the index (needs network access).
# NOTE(review): `test` is not referenced by any other cell visible in this notebook -
# confirm it is still needed.
test = pd.read_csv('https://www.dropbox.com/s/4jgheggd1dak5pw/data_visualization.csv?raw=1', index_col=0)


In [155]:
# Pairwise correlation matrix of every df_analysis column from position 2 onwards
# (the first two columns are skipped).
df_heatmap = df_analysis[df_analysis.columns[2:]].copy().corr()

# Human-readable labels on both axes: 'total_deaths' -> 'Total deaths'.
df_heatmap.columns = df_heatmap.columns.map(lambda name: name.capitalize().replace('_', ' '))
df_heatmap.index = df_heatmap.columns

def get_heatmap(df):
    """Build an 800x800 heatmap figure showing only the part of ``df`` below the diagonal.

    Parameters
    ----------
    df : pd.DataFrame
        Square frame (e.g. a correlation matrix); its column labels are used for both axes.

    Returns
    -------
    plotly.graph_objects.Figure
        A single 'Greens' heatmap trace with the colour range fixed to [-0.01, 1], no
        grid lines and a reversed y axis.
    """
    # True on and above the diagonal -> those cells are blanked out
    upper_triangle = np.triu(np.ones_like(df, dtype=bool))
    lower_only = df.mask(upper_triangle)
    labels = lower_only.columns.values

    trace = go.Heatmap(
        z=lower_only,
        x=labels,
        y=labels,
        zmin=-0.01,
        zmax=1,
        xgap=1,
        ygap=1,
        colorscale='Greens',
    )

    figure_layout = go.Layout(
        title_x=0.5,
        width=800,
        height=800,
        xaxis_showgrid=False,
        yaxis_showgrid=False,
        yaxis_autorange='reversed',
    )

    return go.Figure(data=[trace], layout=figure_layout)

In [73]:
from numpy.random import randn, seed
from scipy.stats import spearmanr

# fix the global NumPy random state so the draws below are repeatable
seed(1)

# three synthetic samples of 1000 points; each later sample is the previous one plus
# freshly drawn noise
data1 = 100 + 20 * randn(1000)
data2 = data1 + (10 * randn(1000) + 50)
data3 = data2 + (10 * randn(1000) + 50)

# Spearman rank correlation between the first two samples (the p-value is not used)
corr, _ = spearmanr(data1, data2)
print('Spearmans correlation: %.3f' % corr)

Spearmans correlation: 0.872
