### COVID-19 
* Explore covid19 data
* Thanks to Johns Hopkins University for providing the data: https://github.com/CSSEGISandData/COVID-19


In [29]:
# essential libraries
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
from matplotlib import pyplot as plt
import plotly.graph_objects as go
from fbprophet import Prophet
import pycountry
import plotly.express as px
import plotly.io as pio
from functools import reduce 
from plotly.subplots import make_subplots
import streamlit as st

# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation warnings are hidden too.
import warnings
warnings.filterwarnings('ignore')

#### Loading & Pre-processing files from github

* The data can be read directly from GitHub, as below
* `url4` points at a single file in **csse_covid_19_daily_reports** whose name is the report date, so it has to be updated to load a newer day

In [30]:
    
# Root of the JHU CSSE data folder on GitHub; every file read below lives under it.
CSSE_DATA_URL = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data"
TIME_SERIES_URL = CSSE_DATA_URL + "/csse_covid_19_time_series"

# Date of the daily report to load, written as it appears in the file name.
# This is the only value that has to be edited to pick up a newer snapshot.
DAILY_REPORT_DATE = "04-07-2020"

# time series data (confirmed, deaths, recovered)
url1 = TIME_SERIES_URL + "/time_series_covid19_confirmed_global.csv"
url2 = TIME_SERIES_URL + "/time_series_covid19_deaths_global.csv"
url3 = TIME_SERIES_URL + "/time_series_covid19_recovered_global.csv"

# daily report snapshot for DAILY_REPORT_DATE
url4 = CSSE_DATA_URL + "/csse_covid_19_daily_reports/" + DAILY_REPORT_DATE + ".csv"

df_confirmed = pd.read_csv(url1)
df_deaths = pd.read_csv(url2)
df_recovered = pd.read_csv(url3)

# this file should have the latest figures
df = pd.read_csv(url4, parse_dates=['Last_Update'])


In [31]:
# Give the three time-series frames the same 'Country' column name
for series_frame in (df_confirmed, df_recovered, df_deaths):
    series_frame.rename(columns={'Country/Region': 'Country'}, inplace=True)

# The daily report uses different headers; align them with the names used above
df.rename(columns={'Last_Update': 'Date', 'Country_Region': 'Country'}, inplace=True)

#### Time Series Files 

* Work out confirmed, deaths and recovered cases 
* Files are available at https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series ; the code that loads them is above


In [32]:
# Replace the index of df_confirmed with a fresh default one; the old index is dropped, not kept as a column
df_confirmed=df_confirmed.reset_index(drop=True)

In [33]:
# Split the column names: the first four are kept as identifiers, the rest are melted into 'Date'
confirmed_columns = df_confirmed.columns.to_list()
cols_list, dates_list = confirmed_columns[:4], confirmed_columns[4:]

In [34]:
# Wide -> long: one row per identifier combination and Date, holding the confirmed count
df_confirmedM = df_confirmed.melt(
    id_vars=cols_list,
    value_vars=dates_list,
    var_name='Date',
    value_name='Confirmed',
)

In [35]:
# Deaths series: split the columns into identifiers and dates ...
death_columns = df_deaths.columns.to_list()
cols_list, dates_list = death_columns[:4], death_columns[4:]

# ... then reshape wide -> long with the count stored under 'Deaths'
df_deathsM = df_deaths.melt(
    id_vars=cols_list,
    value_vars=dates_list,
    var_name='Date',
    value_name='Deaths',
)

In [36]:
# Recovered series: split the columns into identifiers and dates ...
recovered_columns = df_recovered.columns.to_list()
cols_list, dates_list = recovered_columns[:4], recovered_columns[4:]

# ... then reshape wide -> long with the count stored under 'Recovered'
df_recoveredM = df_recovered.melt(
    id_vars=cols_list,
    value_vars=dates_list,
    var_name='Date',
    value_name='Recovered',
)


In [37]:
# Merge the three long-format series into one frame, joining them one after another
df_all = [df_confirmedM, df_deathsM, df_recoveredM]
merge_keys = cols_list + ['Date']

covid19 = df_all[0]
for series_frame in df_all[1:]:
    # outer join keeps rows that appear in only one of the series
    covid19 = pd.merge(covid19, series_frame, on=merge_keys, how='outer')

In [38]:
# Relabel the 'West Bank and Gaza' rows as 'Palestine'
is_west_bank_and_gaza = covid19['Country'].eq('West Bank and Gaza')
covid19.loc[is_west_bank_and_gaza, 'Country'] = 'Palestine'

#### Tidy the data 
* Create `df_covid19` as a copy of `covid19`
* Reshape `df_covid19` for easy visualisation of the data 

In [39]:
# Long format: one row per identifier combination (first five columns of covid19)
# and case type, with the value in 'Count'
cols_ids = covid19.columns[:5]
cases = ['Confirmed', 'Deaths', 'Recovered']

df_covid19 = (
    covid19
    .melt(id_vars=cols_ids, value_vars=cases, var_name='Cases', value_name='Count')
    .assign(
        # parse the month/day/two-digit-year strings; any malformed value raises
        Date=lambda tidy: pd.to_datetime(tidy['Date'], format='%m/%d/%y', errors='raise'),
        # week number of the year as a zero-padded string
        Week=lambda tidy: tidy['Date'].dt.strftime('%W'),
    )
)


In [40]:
# Active cases = Confirmed - Deaths - Recovered.
# The series were combined with an outer merge, so some rows have no recovered
# figure; those are treated as 0 recovered so they still get an Active value.
covid19['Active'] = covid19['Confirmed'] - covid19['Deaths'] - covid19['Recovered'].fillna(0)

# save files
covid19[covid19.Country=='Italy'].to_csv('data/Italy.csv')
df_covid19.to_csv('data/df_covid19.csv')

* Change the date to `datetime` type


In [41]:
# Parse with the explicit month/day/two-digit-year format (the same one used
# for df_covid19) instead of relying on format inference
covid19['Date'] = pd.to_datetime(covid19['Date'], format='%m/%d/%y')

#### Top countries by confirmed cases 
* Worldwide 
* EU countries 


In [42]:
# EU countries, used to filter the per-country table by name.
# - 'Austria' was missing from the list.
# - The JHU data spells the country 'Czechia'; 'Czech Republic' is kept as well
#   so data using the older name still matches.
# - 'United Kingdom' is kept from the original list.
#   NOTE(review): the UK left the EU on 31 Jan 2020 - confirm it should stay.
eus = ['Spain','Belgium','Bulgaria','Croatia','Cyprus','Czech Republic','Denmark',
       'Estonia','Finland','France','Germany','Greece','Hungary',
       'Ireland','Italy','Latvia','Lithuania','Luxembourg','Malta',
       'Netherlands','Poland','Portugal','Romania','Slovakia','Slovenia','Sweden','United Kingdom',
       'Austria','Czechia']


In [43]:
# Country totals per date: sums the rows that share a Country and Date
df_grouped = covid19.groupby(['Country', 'Date'], as_index=False).agg(
    {'Confirmed': 'sum', 'Deaths': 'sum', 'Active': 'sum', 'Recovered': 'sum'})

# Highest value each country reached in every column, ranked by confirmed cases.
# The columns are selected with a list: indexing a groupby with a bare tuple of
# names is deprecated and raises on pandas >= 2.0.
df_grouped = (
    df_grouped
    .groupby('Country')[['Confirmed', 'Deaths', 'Recovered', 'Active']]
    .max()
    .reset_index()
    .sort_values(by='Confirmed', ascending=False)
    .reset_index(drop=True)
)

# every country name, ordered by confirmed cases (highest first)
top_confirmed = df_grouped['Country'].to_list()


In [44]:
# European subset of the ranking; the row position in df_grouped becomes a 1-based 'World Rank'
in_eus = df_grouped['Country'].isin(eus)
df_eus = (
    df_grouped[in_eus]
    .reset_index()
    .rename(columns={'index': 'World Rank'})
)
df_eus['World Rank'] += 1

In [45]:
## countries ranking lists
# df_grouped is re-sorted in place once per metric (highest first) and the
# country order is recorded each time; it is left sorted by 'Recovered'.
ranked_countries = {}
for metric in ('Confirmed', 'Deaths', 'Recovered'):
    df_grouped.sort_values(by=metric, ascending=False, inplace=True)
    ranked_countries[metric] = df_grouped['Country'].to_list()

top_conf = ranked_countries['Confirmed']
top_death = ranked_countries['Deaths']
top_rec = ranked_countries['Recovered']

#### Countries Ranking 
* A function that shows the ranking of an arbitrary number of countries by case counts
* Ranking by confirmed, deaths, and recovered 
* Change `cases`, `top`, and other parameters to get different results

In [46]:

def top_countries_by_cases_by_date(top=30,least=False,byDate='28.01.2020', cases='Confirmed',title='28.03.2020'):
    """Build a bar chart of countries ranked by one case type, using data up to a date.

    Reads the module-level ``covid19`` frame, keeps the rows whose ``Date`` is
    on or before ``byDate``, sums the rows per country and date, takes each
    country's maximum per column, and plots ``top`` countries.

    Parameters
    ----------
    top : int
        Number of countries to plot.
    least : bool
        NOTE: the name is inverted with respect to the behaviour. ``True``
        plots the ``top`` countries with the HIGHEST values of ``cases``;
        ``False`` (the default) plots the ``top`` countries with the LOWEST
        values. Kept this way because existing calls rely on it.
    byDate : str or datetime-like
        Cut-off compared against the ``Date`` column (``Date <= byDate``).
    cases : str
        Column to rank and plot, e.g. 'Confirmed', 'Deaths' or 'Recovered'.
    title : str
        Text appended to the end of the chart title.

    Returns
    -------
    plotly.graph_objects.Figure
        The bar chart; it is not shown by this function.
    """
    temp = covid19.copy()
    temp['Date'] = pd.to_datetime(temp['Date'])

    # keep only the observations up to the cut-off date
    temp = temp.loc[temp['Date'] <= byDate]

    temp = temp.groupby(['Country', 'Date'], as_index=False).agg(
        {'Confirmed': 'sum', 'Deaths': 'sum', 'Active': 'sum', 'Recovered': 'sum'})

    # Columns are selected with a list: indexing a groupby with a bare tuple of
    # names is deprecated and raises on pandas >= 2.0.
    temp = (
        temp
        .groupby('Country')[['Confirmed', 'Deaths', 'Recovered', 'Active']]
        .max()
        .reset_index()
        .sort_values(by=cases, ascending=False)
        .reset_index(drop=True)
    )

    # temp is sorted highest-first, so the head is the top of the ranking and
    # the tail is the bottom (see the note on `least` in the docstring).
    if least:
        temp = temp[:top]
        ranking_label = 'Top '
    else:
        temp = temp[temp.shape[0] - top:]
        ranking_label = 'Bottom '

    # one bar colour per case type; anything else gets the third colour
    bar_colors = {
        'Confirmed': 'rgb(26, 30, 250)',
        'Deaths': 'rgb(255, 60, 30)',
    }
    colors = bar_colors.get(cases, 'rgb(100, 255, 150)')

    fig = go.Figure(data=[go.Bar(
        x=temp['Country'],
        y=temp[cases],
        text=temp[cases],
        marker_color=colors
    )])

    fig.update_layout(template='plotly_white')
    fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
    fig.update_yaxes(title_text="Number of Cases", hoverformat=".3f")
    # the title now states which end of the ranking is shown
    fig.update_layout(
        title_text="Number of " + cases + " Cases: " + ranking_label + str(top) + ' Countries ' + title,
        title_x=0.5)
    return fig


#### Top n Countries by specific date
* top `n` countries worldwide by covid19 cases
* Change `byDate` to see how the ranking of countries varies over time 
* Change the `boolean` (the `least` argument) to False to show the countries with the fewest confirmed cases
* By `Confirmed` cases


In [47]:
# Latest date in the data; Series.max() is vectorised, unlike the builtin max()
# which walks the column element by element
byDate = pd.to_datetime(covid19['Date'].max())
top_countries_by_cases_by_date(30, True, byDate, 'Confirmed', ' by ' + byDate.strftime("%d/%m/%y"))


* Top `n` countries by `Deaths` cases 


In [48]:
# 30 highest-ranked countries by deaths, up to byDate
top_countries_by_cases_by_date(
    top=30,
    least=True,
    byDate=byDate,
    cases='Deaths',
    title=' by ' + byDate.strftime("%d/%m/%y"),
)


* Top `n` countries by `Recovered` cases 


In [49]:
# 30 highest-ranked countries by recovered cases, up to byDate
top_countries_by_cases_by_date(
    top=30,
    least=True,
    byDate=byDate,
    cases='Recovered',
    title=' by ' + byDate.strftime("%d/%m/%y"),
)
