In [1]:
import pandas as pd

## Data load

For our analysis we will use three data sets:
<br>* Original WiD data set enriched with the information if a country has a women leader from [CFR](https://www.cfr.org/article/womens-power-index)
<br>* [CoronaNet research](https://www.coronanet-project.org/index.html) project with policies implemented per country during the pandemic.
<br>* [Oxford Covid-19 Government Response Tracker](https://github.com/OxCGRT)

In [56]:
df_original = pd.read_csv("COVID19 Hackathon Dataset.csv")
df_policies = pd.read_csv("coronanet_release.csv", encoding= 'unicode_escape')
df_oxford_policies = pd.read_csv("OxCGRT_latest.csv", encoding= 'unicode_escape')

## Data exploration

In [3]:
df_original.shape

(41782, 38)

In [4]:
#Add columns with normalised values of total cases and total deaths per 100k population
df_original["Total_Cases_100k"] = (df_original["Total_Cases"]/df_original["Population"])*100000
df_original["Total_Deaths_100k"] = (df_original["Total_Deaths"]/df_original["Population"])*100000

In [5]:
df_original.describe()

Unnamed: 0,New_Cases,New_Deaths,Total_Cases,Total_Deaths,Weekly_Cases,Weekly_Deaths,Population_Density,Fertility_Rate,GDP_PPP,Corruption,...,Population Aged 65 and above % of Total,Urban Population,Death Rate Per 1000 (2017),"PM2.5 air pollution, mean annual exposure (micrograms per cubic meter) (2017)","PM2.5 air pollution, population exposed to levels exceeding WHO guideline value (% of total) (2017)",Hospital Beds Year,Hospital Beds Per 1000 People,Woman Head of State,Total_Cases_100k,Total_Deaths_100k
count,41506.0,41506.0,41506.0,41506.0,40840.0,40840.0,41782.0,40397.0,41570.0,41011.0,...,39596.0,40009.0,39796.0,40209.0,40209.0,40209.0,40209.0,41782.0,39749.0,39749.0
mean,861.19043,25.197393,64160.93,2497.838457,5974.403844,176.669956,404.073928,2.561276,21688.276305,-0.003089,...,9.577075,28005170.0,7.741435,28.110566,87.504096,2012.472034,2.925681,0.127758,193.703339,5.696097
std,4912.421116,134.616653,379742.2,12420.394436,33641.343934,868.522808,2255.075253,1.197593,21277.864357,1.03651,...,6.643959,86124540.0,2.770453,19.864664,28.56314,2.36763,2.388715,0.333825,429.910617,13.450049
min,-8261.0,-1918.0,0.0,0.0,-3864.0,-1625.0,2.11,1.23,628.0,-1.800085,...,1.156549,23800.0,1.169,5.861331,0.0,2004.0,0.1,0.0,0.0,0.0
25%,0.0,0.0,107.0,1.0,11.0,0.0,36.2,1.71,5020.0,-0.790734,...,3.544727,2186104.0,5.845,14.61134,96.370659,2011.0,1.2,0.0,2.541236,0.016451
50%,19.0,0.0,1786.5,35.0,172.0,2.0,92.6,2.01,14100.0,-0.216403,...,7.300455,6084994.0,7.277,22.196055,100.0,2013.0,2.3,0.0,27.336654,0.541034
75%,232.0,4.0,16708.0,339.0,1680.25,29.0,223.0,3.12,33500.0,0.639966,...,15.200841,21844760.0,9.498,37.926503,100.0,2014.0,4.0,0.0,172.179863,3.758113
max,97894.0,4928.0,7501612.0,210909.0,652390.0,19006.0,26300.0,7.0,116000.0,2.21243,...,28.002049,842934000.0,15.5,99.734374,100.0,2015.0,13.4,1.0,4482.344521,101.241284


In [6]:
# Info about columns types and if they have null values
df_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41782 entries, 0 to 41781
Data columns (total 40 columns):
 #   Column                                                                                               Non-Null Count  Dtype  
---  ------                                                                                               --------------  -----  
 0   Date                                                                                                 41782 non-null  object 
 1   Continent                                                                                            41782 non-null  object 
 2   Countries                                                                                            41782 non-null  object 
 3   New_Cases                                                                                            41506 non-null  float64
 4   New_Deaths                                                                                           41506

In [7]:
df_policies.head(2)

Unnamed: 0.1,Unnamed: 0,record_id,policy_id,entry_type,correct_type,update_type,update_level,description,date_announced,date_start,...,travel_mechanism,compliance,enforcer,index_high_est,index_med_est,index_low_est,index_country_rank,link,date_updated,recorded_date
0,1,R_3nCudDknlhUIjpODg,2975738,new_entry,original,,,"March 6, Afghanistan """"Measures have been take...",2020-03-06,2020-03-06,...,,Voluntary/Recommended but No Penalties,"Ministry/Department of Health,Other (Please sp...",53.029449,50.604971,48.280188,137.0,https://www.etilaatroz.com/94246/fears-rumors-...,2020-08-21,2020-08-21 18:32:29
1,2,R_3nCudDknlhUIjpOCi,2975738,new_entry,original,,,"March 6, Afghanistan """"Measures have been take...",2020-03-06,2020-03-06,...,,Voluntary/Recommended but No Penalties,"Ministry/Department of Health,Other (Please sp...",53.029449,50.604971,48.280188,137.0,https://www.etilaatroz.com/94246/fears-rumors-...,2020-08-21,2020-08-21 18:32:29


In [8]:
# Type of policies and how many by which countries have been made data preparation

In [9]:
policy_countries = df_policies["country"].nunique()

In [10]:
print("Total of countries ind dataset: {}".format(policy_countries))

Total of countries ind dataset: 201


In [11]:
print("Total of policies worldwide: {}".format((df_policies["type"].value_counts()).sum()))

Total of policies worldwide: 71764


In [12]:
df_sum = df_policies.groupby(["type"], as_index=False)[["policy_id","country"]].agg(['count','nunique'])

In [13]:
df_sum.columns = [f"{x}_{y}" for x, y in df_sum.columns.to_flat_index()]
df_sum.reset_index(inplace=True)

In [14]:
df_sum.drop(["policy_id_nunique","country_count"], axis=1, inplace=True)

In [15]:
# Convert values to percentages
df_sum = df_sum.apply(lambda x: 100*x/float(x.sum()) if "policy_id_count" in x.name else 100*x/policy_countries if "country_nunique" in x.name  else x)

In [16]:
#Policies in women and men led countries data preparation 

In [17]:
women_countries = df_original[df_original["Woman Head of State"]==1]["Countries"].unique()

In [18]:
print("Number of countries led by women: {}".format(len(women_countries)))

Number of countries led by women: 21


In [19]:
women_countries

array(['Bangladesh', 'Barbados', 'Belgium', 'Bolivia', 'Denmark',
       'Estonia', 'Ethiopia', 'Finland', 'Gabon', 'Georgia', 'Germany',
       'Greece', 'Iceland', 'Nepal', 'New Zealand', 'Norway', 'Serbia',
       'Singapore', 'Slovak Republic', 'Switzerland',
       'Trinidad and Tobago'], dtype=object)

In [20]:
df_policies_women = df_policies[df_policies["country"].isin(women_countries)]

In [21]:
df_sum_women = df_policies_women.groupby(["type"], as_index=False)[["policy_id","country"]].agg(['count','nunique'])
df_sum_women.columns = [f"women_{x}_{y}" for x, y in df_sum_women.columns.to_flat_index()]
df_sum_women.reset_index(inplace=True)
df_sum_women.drop(["women_policy_id_nunique","women_country_count"], axis=1, inplace=True)
df_sum_women = df_sum_women.apply(lambda x: 100*x/float(x.sum()) if "policy_id_count" in x.name else 100*x/len(women_countries) if "country_nunique" in x.name  else x)

In [22]:
men_countries = df_original[df_original["Woman Head of State"]==0]["Countries"].unique()

In [23]:
print("Number of countries led by men: {}".format(len(men_countries)))

Number of countries led by men: 159


In [24]:
df_policies_men = df_policies[df_policies["country"].isin(men_countries)]

In [25]:
df_sum_men = df_policies_men.groupby(["type"], as_index=False)[["policy_id","country"]].agg(['count','nunique'])
df_sum_men.columns = [f"men_{x}_{y}" for x, y in df_sum_men.columns.to_flat_index()]
df_sum_men.reset_index(inplace=True)
df_sum_men.drop(["men_policy_id_nunique","men_country_count"], axis=1, inplace=True)
df_sum_men = df_sum_men.apply(lambda x: 100*x/float(x.sum()) if "policy_id_count" in x.name else 100*x/len(men_countries) if "country_nunique" in x.name  else x)

In [26]:
df_m_w = df_sum_men.merge(df_sum_women)

In [27]:
# Filter anf split the original dataset into two, one per group of countries either led by a woman or a man
# As filter we consider the government aspects: 'Country','Corruption', 
#'Government_Effectiveness', 'Political_Stability', 'Rule_of_Law', 'Government_Healthcare_Spend'

In [28]:
df_orig_men = df_original[df_original["Woman Head of State"]==0]
df_orig_men_filter = df_orig_men[['Country','Corruption', 'Government_Effectiveness', 'Political_Stability', 'Rule_of_Law', 'Government_Healthcare_Spend']].drop_duplicates()

In [29]:
df_orig_women = df_original[df_original["Woman Head of State"]==1]
df_orig_women_filter = df_orig_women[['Country','Corruption', 'Government_Effectiveness', 'Political_Stability', 'Rule_of_Law', 'Government_Healthcare_Spend']].drop_duplicates()

In [30]:
df_orig_men_filter["gender"] = 0
df_orig_women_filter["gender"] = 1

In [31]:
df_orig_men_filter.shape

(159, 7)

In [32]:
df_orig_women_filter.shape

(21, 7)

In [33]:
# Merge both filtered datasets in one
df_total = pd.concat([df_orig_women_filter, df_orig_men_filter], ignore_index=True)

In [34]:
df_total.shape

(180, 7)

In [35]:
df_total.head()

Unnamed: 0,Country,Corruption,Government_Effectiveness,Political_Stability,Rule_of_Law,Government_Healthcare_Spend,gender
0,Bangladesh,-0.905022,-0.748399,-1.03248,-0.640236,0.166933,1
1,Barbados,1.411373,0.426973,0.921433,0.394909,0.442009,1
2,Belgium,1.510279,1.171748,0.412356,1.3679,0.772174,1
3,Bolivia,-0.62591,-0.322099,-0.242841,-1.14962,0.685277,1
4,Denmark,2.147795,1.871631,0.959067,1.833378,0.84024,1


In [36]:
# Save filtered dataframes to csv for further exploration in tableau
df_orig_men_filter.to_csv("df_orig_men_filter.csv", index = False)
df_orig_women_filter.to_csv("df_orig_women_filter.csv", index = False)
df_total.to_csv("df_total.csv", index= False)

## Female vs men led countries stats

In [37]:
df_orig_women_filter

Unnamed: 0,Country,Corruption,Government_Effectiveness,Political_Stability,Rule_of_Law,Government_Healthcare_Spend,gender
3224,Bangladesh,-0.905022,-0.748399,-1.03248,-0.640236,0.166933,1
3432,Barbados,1.411373,0.426973,0.921433,0.394909,0.442009,1
3918,Belgium,1.510279,1.171748,0.412356,1.3679,0.772174,1
4814,Bolivia,-0.62591,-0.322099,-0.242841,-1.14962,0.685277,1
10600,Denmark,2.147795,1.871631,0.959067,1.833378,0.84024,1
12740,Estonia,1.506318,1.191938,0.599967,1.240627,0.746641,1
13022,Ethiopia,-0.486167,-0.60747,-1.343246,-0.428289,0.249452,1
13432,Finland,2.21243,1.984208,0.92492,2.046279,0.767446,1
13996,Gabon,-0.851205,-0.812283,-0.2476,-0.706186,0.632852,1
14408,Georgia,0.707991,0.614571,-0.426488,0.328048,0.372146,1


### Compare stats of key government aspects in women vs men led countries

Comparing government statistics between men and women led countries, the mean values of the levels of corruption, <br>government effectiveness, political stability and rule of law are positive for the group of women led countries <br>whereas the mean values for all of these scores are negative for the men lead countries. This contrast shows a <br>tendency for a better perception of women led countries in these key aspects. 
<br>Another interesting insight is the government healthcare spend which is in average higher in the group of women <br>led countries.   


In [38]:
df_orig_women_filter.describe()

Unnamed: 0,Corruption,Government_Effectiveness,Political_Stability,Rule_of_Law,Government_Healthcare_Spend,gender
count,21.0,21.0,21.0,21.0,21.0,21.0
mean,0.84775,0.768086,0.411492,0.723433,0.589605,1.0
std,1.214719,1.044917,0.835461,1.080432,0.221689,0.0
min,-0.905022,-0.902338,-1.343246,-1.14962,0.166933,1.0
25%,-0.370215,0.11162,-0.242841,-0.146404,0.442009,1.0
50%,1.411373,0.707461,0.599967,0.529058,0.632852,1.0
75%,2.00971,1.670251,0.959067,1.833378,0.772174,1.0
max,2.21243,2.231474,1.541482,2.046279,0.854731,1.0


In [39]:
df_orig_men_filter.describe()

Unnamed: 0,Corruption,Government_Effectiveness,Political_Stability,Rule_of_Law,Government_Healthcare_Spend,gender
count,156.0,156.0,158.0,156.0,149.0,159.0
mean,-0.192866,-0.154589,-0.207723,-0.191209,0.500164,0.0
std,0.919053,0.951249,0.998288,0.937486,0.218406,0.0
min,-1.800085,-2.449409,-3.002496,-2.338622,0.051,0.0
25%,-0.848347,-0.727142,-0.730904,-0.817479,0.343314,0.0
50%,-0.335464,-0.245134,-0.125859,-0.319704,0.500583,0.0
75%,0.398896,0.387111,0.625504,0.435845,0.679763,0.0
max,2.139578,1.944976,1.612102,1.895983,0.948234,0.0


## Policy measures in 2020 globaly

In [40]:
pd.options.display.float_format = "{:.2f}".format
general_policy = df_sum.sort_values(by="policy_id_count", ascending=False)

In [41]:
general_policy.columns

Index(['type', 'policy_id_count', 'country_nunique'], dtype='object')

In [42]:
general_policy.rename(columns = {"type":"Type of policy","policy_id_count":"Percentage of total policies [%]", "country_nunique": "Countries implementing the policy [%]"}, inplace = True)

In [43]:
general_policy.to_csv("general_policy.csv", index = False)

In [44]:
# Percentage of policies made on each topic (policy_id_count) and 
# percentage of countries implementing them (country_nunique)
general_policy

Unnamed: 0,Type of policy,Percentage of total policies [%],Countries implementing the policy [%]
16,Restriction and Regulation of Government Services,41.91,68.16
1,Closure and Regulation of Schools,10.8,94.03
18,Social Distancing,6.79,80.1
0,Anti-Disinformation Measures,5.14,38.81
15,Restriction and Regulation of Businesses,5.11,83.58
6,Health Resources,5.07,89.55
17,Restrictions of Mass Gatherings,3.68,92.54
4,External Border Restrictions,3.14,98.51
12,Other Policy Not Listed Above,2.72,81.09
14,Quarantine,2.54,88.06


## Stringency and Government response index to the pandemic globaly

In [45]:
df_oxford_policies_women = df_oxford_policies[df_oxford_policies["CountryName"].isin(women_countries)]

In [46]:
df_ox_sum_women = df_oxford_policies_women.groupby(["CountryName"], as_index=False)[["StringencyIndex","GovernmentResponseIndex"]].mean()

In [47]:
df_ox_cases_women = df_oxford_policies_women.groupby(["CountryName"], as_index=False)[["ConfirmedCases","ConfirmedDeaths"]].max()

In [48]:
df_ox_sum_women = df_ox_sum_women.merge(df_ox_cases_women, left_on = "CountryName", right_on = "CountryName")

In [49]:
df_oxford_policies_men = df_oxford_policies[df_oxford_policies["CountryName"].isin(men_countries)]

In [50]:
df_ox_sum_men = df_oxford_policies_men.groupby(["CountryName"], as_index=False)[["StringencyIndex","GovernmentResponseIndex"]].mean()

In [51]:
df_ox_cases_men = df_oxford_policies_men.groupby(["CountryName"], as_index=False)[["ConfirmedCases","ConfirmedDeaths"]].max()

In [52]:
df_ox_sum_men = df_ox_sum_men.merge(df_ox_cases_men, left_on = "CountryName", right_on = "CountryName")

In [53]:
df_ox_sum_men.nlargest(10, "ConfirmedDeaths")

Unnamed: 0,CountryName,StringencyIndex,GovernmentResponseIndex,ConfirmedCases,ConfirmedDeaths
132,United States,49.45,51.08,7916100.0,216872.0
16,Brazil,56.26,48.6,5140863.0,151747.0
54,India,62.82,63.36,7307097.0,111266.0
81,Mexico,53.11,41.52,829396.0,84898.0
131,United Kingdom,52.31,55.69,654644.0,43155.0
60,Italy,53.77,55.75,372799.0,36289.0
98,Peru,65.51,62.06,856951.0,33512.0
116,Spain,51.25,52.66,908056.0,33413.0
45,France,48.36,52.41,779063.0,33037.0
56,Iran,41.83,42.13,513219.0,29349.0


In [54]:
df_ox_sum_women.nlargest(10, "ConfirmedDeaths")

Unnamed: 0,CountryName,StringencyIndex,GovernmentResponseIndex,ConfirmedCases,ConfirmedDeaths
2,Belgium,48.18,51.0,181418.0,10278.0
10,Germany,47.6,49.58,341223.0,9710.0
3,Bolivia,66.62,53.47,139141.0,8377.0
0,Bangladesh,61.75,56.02,382959.0,5593.0
19,Switzerland,41.16,42.82,68450.0,1814.0
6,Ethiopia,57.37,48.59,86430.0,1312.0
16,Serbia,46.74,46.02,35251.0,768.0
4,Denmark,44.96,50.51,33593.0,675.0
13,Nepal,67.01,63.21,117996.0,675.0
11,Greece,46.86,47.75,23495.0,469.0


## Policy measures in 2020 in men and women led countries

In [138]:
df_m_w.sort_values(by="men_policy_id_count", ascending=False)

Unnamed: 0,type,men_policy_id_count,men_country_nunique,women_policy_id_count,women_country_nunique
16,Restriction and Regulation of Government Services,38.09,64.15,49.17,80.95
1,Closure and Regulation of Schools,10.59,88.68,12.58,90.48
18,Social Distancing,6.89,75.47,6.42,80.95
0,Anti-Disinformation Measures,6.33,38.36,3.59,38.1
6,Health Resources,5.29,84.91,4.19,85.71
15,Restriction and Regulation of Businesses,5.29,79.87,3.91,90.48
17,Restrictions of Mass Gatherings,3.57,89.31,4.9,95.24
4,External Border Restrictions,3.57,92.45,2.3,90.48
14,Quarantine,2.76,83.02,2.26,90.48
12,Other Policy Not Listed Above,2.63,79.25,1.62,71.43


## Prediction using sigmoid function

Based on the insights we gathered about government responses to the pandemic and how they differ in men and women led countries, our next question was if their curves of covid-19 cases and deaths will flatten in the near future. 
We built a predictive model based on a paper made at the [TU Eindhoven](https://assets.tue.nl/fileadmin/content/pers/2020/03%20March/TUe%20-%20Technical_Report_Prediction_Corona_Virus.pdf) where a three-parameter logistic growth was used to obtain evidence that this curve is suitable for accumulated covid-19 related data. We fitted this sigmoid function to the total cases and deaths per country from the timeframe in the original dataset (31.12.2019 to 07.10.2020). As group of countries we selected the top 10 women and men led countries by total amount of cases as of 07.10.
In the sigmoid function we used, M is the maximum number of cases, 𝛼 the number of days at which the expected number of counts is half way the maximum,and 𝛽 > 0 the growth parameter. 
It is important to note that the growth curves do not (necessarily) provide predictions of the true number of infections and deaths in the population due to the covid-19 virus, but only describes the growth in confirmed number of infections and deaths.    
Model implementation is inspired by kaggle kernels by [group16](https://www.kaggle.com/corochann/covid-19-current-situation-on-october) and [corochann](https://www.kaggle.com/corochann/covid-19-current-situation-on-october)

In [139]:
import numpy as np
import scipy as sp
from scipy import optimize

In [140]:
import datetime as dt
df_model = df_original.copy()

In [142]:
# Convert time from str to time
df_model['Date'] = df_model['Date'].apply(lambda x:dt.datetime.strptime(x,"%d/%m/%Y"))

In [143]:
df_original['Date'].max()

'31/12/2019'

In [144]:
df_model['Date'].min()

Timestamp('2019-12-31 00:00:00')

In [145]:
# Create dataframe for the model with only data, country and total and normalised values of cases and deaths
country_df = df_model.groupby(['Date', 'Countries'])[['Total_Cases', 'Total_Cases_100k', 'Total_Deaths','Total_Deaths_100k']].sum().reset_index()

In [148]:
# Convert values to int 
country_df[["Total_Cases","Total_Cases_100k","Total_Deaths","Total_Deaths_100k"]] = country_df[["Total_Cases","Total_Cases_100k","Total_Deaths","Total_Deaths_100k"]].astype(int)

In [149]:
# Define sigmoid function for the model and error function for the training
def sigmoid(t, M, beta, alpha, offset=0):
    alpha += offset
    return M / (1 + np.exp(-beta * (t - alpha)))

def error(x, y, params):
    M, beta, alpha = params
    y_pred = sigmoid(x, M, beta, alpha)

    # apply weight, latest number is more important than past.
    weight = np.arange(len(y_pred)) ** 2
    loss_mse = np.mean((y_pred - y) ** 2 * weight)
    return loss_mse

In [162]:
target_date = country_df['Date'].max()

print('Date: ', target_date)
for i in [1, 10, 100, 1000, 10000]:
    n_countries = len(country_df.query('(Date == @target_date) & Total_Cases > @i'))
    print(f'{n_countries} countries have more than {i} total cases')

Date:  2020-10-07 00:00:00
178 countries have more than 1 total cases
178 countries have more than 10 total cases
171 countries have more than 100 total cases
154 countries have more than 1000 total cases
100 countries have more than 10000 total cases


In [182]:
# Define function to fit the model to predict until 15.01.2020
# Dates are taken into account when number of cases if above threshold (int)
# Currently we are using 100 for total cummulated cases and 0.1 for normalised cases 
# Input parameters: 
# exclude_days-> number of days not to include in the training date 
# country_list
# case -> e.g. Total_Cases, Death_cases, Total_Cases_100k, Death_cases_100k

def fit_sigmoid(exclude_days, country_list, case):
    target_country_df_list = []
    pred_df_list = []
    for target_country in country_list:
        print('target_country', target_country)
        # --- Train ---
        target_country_df = country_df.query('Countries == @target_country')

        #train_start_date = target_country_df['date'].min()
        train_start_date = target_country_df.query(case+' > 0.1')['Date'].min()
        #print("Start date:{}".format(train_start_date))
        train_end_date = pd.to_datetime(target_date) - pd.Timedelta(f'{exclude_days} days')
        #print("End date:{}".format(train_end_date))
        if (train_start_date!= np.datetime64('NaT')) and (train_end_date!= np.datetime64('NaT')):
            target_date_df = target_country_df.query('(Date >= @train_start_date) & (Date <= @train_end_date)')
            print("Target date df:{}".format(target_date_df.size))
            if len(target_date_df) <= 7:
                print('WARNING: the data is not enough, use 7 more days...')
                train_start_date -= pd.Timedelta('7 days')
                target_date_df = target_country_df.query('(Date >= @train_start_date) & (Date <= @train_end_date)')

            confirmed = target_date_df[case].values
            #print(confirmed.shape)
            x = np.arange(len(confirmed))

            lossfun = lambda params: error(x, confirmed, params)

            res = sp.optimize.minimize(lossfun, x0=[np.max(confirmed) * 5, 0.04, 2 * len(confirmed) / 3.], method='nelder-mead')
            M, beta, alpha = res.x
 
            # --- Pred ---
            pred_start_date = target_country_df['Date'].min()
            pred_end_date = pd.to_datetime('2021-01-15')
            days = int((pred_end_date - pred_start_date) / pd.Timedelta('1 days'))
            # print('pred start', pred_start_date, 'end', pred_end_date, 'days', days)

            x = np.arange(days)
            offset = (train_start_date - pred_start_date) / pd.Timedelta('1 days')
            print('train_start_date', train_start_date, 'offset', offset, 'params', M, beta, alpha)
            y_pred = sigmoid(x, M, beta, alpha, offset=offset)
            # target_country_df['confirmed_pred'] = y_pred

            all_dates = [pred_start_date + np.timedelta64(x, 'D') for x in range(days)]
            pred_df = pd.DataFrame({
                'date': all_dates,
                'country': target_country,
                'confirmed_pred': y_pred,
            })

            target_country_df_list.append(target_country_df)
            pred_df_list.append(pred_df)
        else:
            continue
    return target_country_df_list, pred_df_list

In [170]:
# Libraries for the plot
from plotly import tools, subplots
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
import plotly.io as pio
pio.templates.default = "plotly_dark"

In [171]:
# Define function to generate random colours in the plot
def gen_random_color(min_value=0, max_value=256) -> str:
    """Generate random color for plotly"""
    r, g, b = np.random.randint(min_value, max_value, 3)
    return f'rgb({r},{g},{b})'

In [172]:
def plot_sigmoid_fitting(target_country_df_list, pred_df_list, title,countries,case):
    n_countries = len(countries)

    # --- Plot ---
    fig = go.Figure()

    for i in range(n_countries):
        target_country = countries[i]
        country_pop = (df_original.loc[df_original["Countries"] == countries[i]]["Population"].max()).astype(int)
        target_country_df = target_country_df_list[i]
        #target_country_df["total_cases_norm"] = (target_country_df_list[i]["Total_Cases"]/country_pop)*100000
        pred_df = pred_df_list[i]
        #pred_df["confirmed_pred_norm"] = (pred_df_list[i]["confirmed_pred"]/country_pop)*100000
        
        color = gen_random_color(min_value=20)
        # Prediction
        fig.add_trace(go.Scatter(
            x=pred_df['date'], y=pred_df['confirmed_pred'],
            name=f'{target_country}_pred',
            line=dict(color=color, dash='dash')
        ))

        # Ground truth
        fig.add_trace(go.Scatter(
            #x=target_country_df['Date'], y=target_country_df[case],
            x=target_country_df['Date'], y=target_country_df[case],
            mode='markers', name=f'{target_country}_actual',
            line=dict(color=color),
        ))
    fig.update_layout(
        title=title, xaxis_title='Date', yaxis_title=case,
        width=900,
        height=600,)
    fig.show()

In [174]:
# Define the group of countries to make the prediction
# We select the top 10 countries by total cases
df_original_men = df_original[df_original["Woman Head of State"]==0].drop_duplicates("Countries",keep='last') 
top_men_countries = df_original_men.sort_values(by=["Total_Cases"], ascending=False).nlargest(10,["Total_Cases"])["Countries"].values
df_original_women = df_original[df_original["Countries"].isin(women_countries)].drop_duplicates("Countries",keep='last') 
top_women_countries = df_original_women.sort_values(by=["Total_Cases"], ascending=False).nlargest(11,["Total_Cases"])["Countries"].values

In [175]:
top_men_countries

array(['United States', 'India', 'Brazil', 'Russia', 'Colombia', 'Peru',
       'Spain', 'Argentina', 'Mexico', 'South Africa'], dtype=object)

In [176]:
top_women_countries

array(['Bangladesh', 'Germany', 'Bolivia', 'Belgium', 'Nepal', 'Ethiopia',
       'Singapore', 'Switzerland', 'Serbia', 'Denmark', 'Greece'],
      dtype=object)

In [173]:
target_country_df_list, pred_df_list = fit_sigmoid(7, top_women_countries,"Total_Cases")

target_country Bangladesh
Target date df:1062
train_start_date 2020-04-07 00:00:00 offset 29.0 params 387770.0024066557 0.033547560717860206 102.89640492038784
target_country Germany
Target date df:1284
train_start_date 2020-03-01 00:00:00 offset 61.0 params 2915344.071731949 0.004482949827179712 714.9580158875697
target_country Bolivia
Target date df:1104
train_start_date 2020-03-31 00:00:00 offset 19.0 params 142404.69832245348 0.04375330442659332 119.19198302288919
target_country Belgium
Target date df:1260
train_start_date 2020-03-05 00:00:00 offset 65.0 params 2195721.160699239 0.0067180708686988035 648.3878137829927
target_country Nepal
Target date df:876
train_start_date 2020-05-08 00:00:00 offset 129.0 params 1733722.7366164355 0.02317644003711407 277.4292714301869
target_country Ethiopia
Target date df:990
train_start_date 2020-04-19 00:00:00 offset 36.0 params 83144.63657996044 0.05961945782742707 127.55924764300913
target_country Singapore
Target date df:1284
train_start_dat

In [169]:
plot_sigmoid_fitting(target_country_df_list, pred_df_list, 'Sigmoid prediction of cases in top 10 women led countries', top_women_countries, 'Total_Cases')

In [177]:
# Predictions without Nepal who just started a wave
top_women_countries = ['Bangladesh', 'Germany', 'Bolivia', 'Belgium', 'Ethiopia',
       'Singapore', 'Switzerland', 'Serbia', 'Denmark', 'Greece']

In [178]:
target_country_df_list, pred_df_list = fit_sigmoid(7, top_women_countries,"Total_Cases")

target_country Bangladesh
Target date df:1062
train_start_date 2020-04-07 00:00:00 offset 29.0 params 387770.0024066557 0.033547560717860206 102.89640492038784
target_country Germany
Target date df:1284
train_start_date 2020-03-01 00:00:00 offset 61.0 params 2915344.071731949 0.004482949827179712 714.9580158875697
target_country Bolivia
Target date df:1104
train_start_date 2020-03-31 00:00:00 offset 19.0 params 142404.69832245348 0.04375330442659332 119.19198302288919
target_country Belgium
Target date df:1260
train_start_date 2020-03-05 00:00:00 offset 65.0 params 2195721.160699239 0.0067180708686988035 648.3878137829927
target_country Ethiopia
Target date df:990
train_start_date 2020-04-19 00:00:00 offset 36.0 params 83144.63657996044 0.05961945782742707 127.55924764300913
target_country Singapore
Target date df:1284
train_start_date 2020-03-01 00:00:00 offset 61.0 params 58855.07161231446 0.03222839290714907 86.29791359040715
target_country Switzerland
Target date df:1248
train_star

In [179]:
plot_sigmoid_fitting(target_country_df_list, pred_df_list, 'Sigmoid prediction of cases in top 10 women led countries', top_women_countries, 'Total_Cases')

In [180]:
target_country_df_list, pred_df_list = fit_sigmoid(7, top_men_countries,"Total_Cases")

target_country United States
Target date df:1272
train_start_date 2020-03-03 00:00:00 offset 63.0 params 8639182.745799184 0.02495540146659761 148.37455666205133
target_country India
Target date df:1188
train_start_date 2020-03-17 00:00:00 offset 77.0 params 9827594.136502309 0.035690752598835335 181.7056876827678
target_country Brazil
Target date df:1200
train_start_date 2020-03-15 00:00:00 offset 75.0 params 5399200.220451617 0.0335096461964709 140.47666946257397
target_country Russia
Target date df:1182
train_start_date 2020-03-18 00:00:00 offset 78.0 params 1246032.3841723646 0.024186836412422495 105.59778459006613
target_country Colombia
Target date df:1176
train_start_date 2020-03-19 00:00:00 offset 12.0 params 917358.3717542945 0.04628885969071163 150.65296682838627
target_country Peru
Target date df:1182
train_start_date 2020-03-18 00:00:00 offset 11.0 params 1216262.5017746375 0.022135113658529715 162.51539445736597
target_country Spain
Target date df:1278
train_start_date 202

In [181]:
plot_sigmoid_fitting(target_country_df_list, pred_df_list, 'Sigmoid prediction of cases in top 10 men led countries', top_men_countries, 'Total_Cases')

In [183]:
target_country_df_list, pred_df_list = fit_sigmoid(7, top_women_countries,"Total_Cases_100k")

target_country Bangladesh
Target date df:996
train_start_date 2020-04-18 00:00:00 offset 40.0 params 239.46818170340913 0.032790175600106504 92.52474734402031
target_country Germany
Target date df:1242
train_start_date 2020-03-08 00:00:00 offset 68.0 params 2253.614541420019 0.004663346673623649 581.6166226645335
target_country Bolivia
Target date df:1092
train_start_date 2020-04-02 00:00:00 offset 21.0 params 1236.5263477780688 0.04376502111486017 117.22188196634745
target_country Belgium
Target date df:1260
train_start_date 2020-03-05 00:00:00 offset 65.0 params 19612.15774954314 0.006660007139249186 656.537436340046
target_country Ethiopia
Target date df:732
train_start_date 2020-06-01 00:00:00 offset 79.0 params 71.73796636181419 0.06376309010652273 83.83257536042862
target_country Singapore
Target date df:1380
train_start_date 2020-02-14 00:00:00 offset 45.0 params 1024.8776972124974 0.0337903731906155 102.57780128377341
target_country Switzerland
Target date df:1254
train_start_d

In [185]:
plot_sigmoid_fitting(target_country_df_list, pred_df_list, 'Sigmoid prediction of cases per 100,000 population in top 10 women led countries', top_women_countries, 'Total_Cases_100k')

In [188]:
target_country_df_list, pred_df_list = fit_sigmoid(7, top_men_countries,"Total_Cases_100k")

target_country United States
Target date df:1194
train_start_date 2020-03-16 00:00:00 offset 76.0 params 2614.700013311636 0.025239735885216094 134.8419448793254
target_country India
Target date df:996
train_start_date 2020-04-18 00:00:00 offset 109.0 params 724.7739036401913 0.035519407918071594 150.2402203066512
target_country Brazil
Target date df:1140
train_start_date 2020-03-25 00:00:00 offset 85.0 params 2565.9192656477953 0.033296379574351304 130.6997989088373
target_country Russia
Target date df:1110
train_start_date 2020-03-30 00:00:00 offset 90.0 params 882.9124184247664 0.022730331609701067 95.36275891699454
target_country Colombia
Target date df:1122
train_start_date 2020-03-28 00:00:00 offset 21.0 params 1820.831533433016 0.04634466954123655 141.63974139430974
target_country Peru
Target date df:1152
train_start_date 2020-03-23 00:00:00 offset 16.0 params 3758.218159406859 0.022039537549735068 157.97956513268315
target_country Spain
Target date df:1254
train_start_date 2020

In [190]:
plot_sigmoid_fitting(target_country_df_list, pred_df_list, 'Sigmoid prediction of cases per 100,000 population in top 10 men led countries', top_men_countries, 'Total_Cases_100k')