In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats
from scipy.stats import zscore
from scipy.stats import stats
from scipy.stats import linregress
import os
import seaborn as sns
import re
import datetime
import nbconvert
import glob
import plotly.express as px
import plotly.graph_objects as go
import cufflinks as cf
import chart_studio

from statsmodels.compat import lzip
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [2]:
# Files to Load (csv files downloaded from the internet)

# Country information files
file1 = "data/unsd.csv"
file2 = "data/six-dem_mod.csv"

# Covid Case timeseries file
file3 = "data/covid-confirmed-cases-since-100th-case_hkg.csv"
file4 = "data/worldometer.csv"

# Country timeseries files
file5 = "data/population-density-vs-prosperity.csv"
file6 = "data/urban-and-rural-population.csv"
file7 = "data/urban-vs-rural-majority.csv"
file8 = "data/life-expectancy-at-birth-total-years.csv"
file9 = "data/median-age.csv"

# Read File and store into Pandas data frame
unsd = pd.read_csv(file1)
hoefstede = pd.read_csv(file2)

covid_data = pd.read_csv(file3, parse_dates=['Date'])
wom = pd.read_csv(file4,parse_dates=['date'])

density_vs_prosperity = pd.read_csv(file5)
urban_rural_population = pd.read_csv(file6)
urban_vs_rural_majority = pd.read_csv(file7)
life_expectancy = pd.read_csv(file8)
median_age = pd.read_csv(file9)


# Create 'countries' dataframe with all country information
#### This is in Tidy format

In [3]:
unsd.columns

Index(['Global Code', 'Global Name', 'Region Code', 'Region Name',
       'Sub-region Code', 'Sub-region Name', 'Intermediate Region Code',
       'Intermediate Region Name', 'Country or Area', 'M49 Code', 'Code',
       'Least Developed Countries (LDC)',
       'Land Locked Developing Countries (LLDC)',
       'Small Island Developing States (SIDS)',
       'Developed / Developing Countries'],
      dtype='object')

In [4]:
#Format columns
unsd = unsd.rename(columns={"Global Code":"globalCode",
                            "Global Name":"globalName",
                            'Region Code':"regionCode",
                            'Region Name':"regionName",
                            'Sub-region Code':"subRegionCode",
                            'Sub-region Name':"subRegionName",
                            'Intermediate Region Code':"interRegionCode",
                            'Intermediate Region Name':"interRegionName",
                            'Country or Area':"country",
                            'M49 Code':"m49",
                            'Code':"code",
                            'Least Developed Countries (LDC)':"ldc",
                            'Land Locked Developing Countries (LLDC)':"lldc",
                            'Small Island Developing States (SIDS)':"sids",
                            'Developed / Developing Countries':"development"
                            })

In [5]:
unsd.drop(columns=['interRegionCode','interRegionName'])

Unnamed: 0,globalCode,globalName,regionCode,regionName,subRegionCode,subRegionName,country,m49,code,ldc,lldc,sids,development
0,1,World,142,Asia,34,Southern Asia,Afghanistan,4,AFG,1.0,1.0,,Developing
1,1,World,150,Europe,154,Northern Europe,Åland Islands,248,ALA,,,,Developed
2,1,World,150,Europe,39,Southern Europe,Albania,8,ALB,,,,Developed
3,1,World,2,Africa,15,Northern Africa,Algeria,12,DZA,,,,Developing
4,1,World,9,Oceania,61,Polynesia,American Samoa,16,ASM,,,1.0,Developing
5,1,World,150,Europe,39,Southern Europe,Andorra,20,AND,,,,Developed
6,1,World,2,Africa,202,Sub-Saharan Africa,Angola,24,AGO,1.0,,,Developing
7,1,World,19,Americas,419,Latin America and the Caribbean,Anguilla,660,AIA,,,1.0,Developing
8,1,World,10,Antartica,100,AntarticaSubRegion,Antarctica,10,ATA,,,,Developing
9,1,World,19,Americas,419,Latin America and the Caribbean,Antigua and Barbuda,28,ATG,,,1.0,Developing


In [6]:
hoefstede.columns

Index(['ctr', 'country', 'pdi', 'idv', 'mas', 'uai', 'ltowvs', 'ivr'], dtype='object')

In [7]:
#Format columns
hoefstede = hoefstede.rename(columns={"ctr":"code"
                            })

In [8]:
hoefstede = hoefstede.replace('#NULL!','')

In [9]:
hoefstede[['pdi','idv','mas','uai','ltowvs','ivr']] = hoefstede[['pdi','idv','mas','uai','ltowvs','ivr']].apply(pd.to_numeric)

In [10]:
hoefstede.head()

Unnamed: 0,code,country,pdi,idv,mas,uai,ltowvs,ivr
0,GTM,Guatemala,95.0,6.0,37.0,101.0,,
1,ECU,Ecuador,78.0,8.0,63.0,67.0,,
2,PAN,Panama,95.0,11.0,44.0,86.0,,
3,VEN,Venezuela,81.0,12.0,73.0,76.0,16.0,100.0
4,COL,Colombia,67.0,13.0,64.0,80.0,13.0,83.0


In [11]:
# Merge data
countries = pd.merge(unsd, hoefstede, on=["code"])
countries.shape

(118, 22)

In [12]:
countries.columns

Index(['globalCode', 'globalName', 'regionCode', 'regionName', 'subRegionCode',
       'subRegionName', 'interRegionCode', 'interRegionName', 'country_x',
       'm49', 'code', 'ldc', 'lldc', 'sids', 'development', 'country_y', 'pdi',
       'idv', 'mas', 'uai', 'ltowvs', 'ivr'],
      dtype='object')

In [13]:
countries.drop(['country_y'], axis=1,inplace=True)

In [14]:
countries = countries.rename(columns={'country_x': 'country'})

In [15]:
countries.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118 entries, 0 to 117
Data columns (total 21 columns):
globalCode         118 non-null int64
globalName         118 non-null object
regionCode         118 non-null int64
regionName         118 non-null object
subRegionCode      118 non-null int64
subRegionName      118 non-null object
interRegionCode    38 non-null float64
interRegionName    36 non-null object
country            118 non-null object
m49                118 non-null int64
code               118 non-null object
ldc                17 non-null float64
lldc               15 non-null float64
sids               7 non-null float64
development        118 non-null object
pdi                102 non-null float64
idv                102 non-null float64
mas                102 non-null float64
uai                102 non-null float64
ltowvs             97 non-null float64
ivr                95 non-null float64
dtypes: float64(10), int64(4), object(7)
memory usage: 20.3+ KB


In [16]:
# Export CSV
countries.to_csv('countries.csv')  

In [17]:
# Create multiindex using unsd naming conventions
#countries = countries.set_index(['globalName','regionName','subRegionName','code'])

In [18]:
countries.head()

Unnamed: 0,globalCode,globalName,regionCode,regionName,subRegionCode,subRegionName,interRegionCode,interRegionName,country,m49,...,ldc,lldc,sids,development,pdi,idv,mas,uai,ltowvs,ivr
0,1,World,150,Europe,39,Southern Europe,,,Albania,8,...,,,,Developed,90.0,20.0,80.0,70.0,61.0,15.0
1,1,World,2,Africa,15,Northern Africa,,,Algeria,12,...,,,,Developing,,,,,26.0,32.0
2,1,World,150,Europe,39,Southern Europe,,,Andorra,20,...,,,,Developed,,,,,,65.0
3,1,World,2,Africa,202,Sub-Saharan Africa,17.0,Middle Africa,Angola,24,...,1.0,,,Developing,83.0,18.0,20.0,60.0,15.0,83.0
4,1,World,19,Americas,419,Latin America and the Caribbean,5.0,South America,Argentina,32,...,,,,Developing,49.0,46.0,56.0,86.0,20.0,62.0


# Create 'covid' dataframe with timeseries & confirmed cases

In [19]:
covid_data.columns

Index(['Entity', 'Code', 'Date', ' (cases)',
       'Number of days since the 100th confirmed case (days)'],
      dtype='object')

In [20]:
covid_data.head()

Unnamed: 0,Entity,Code,Date,(cases),Number of days since the 100th confirmed case (days)
0,Afghanistan,AFG,2019-12-31,0.0,
1,Afghanistan,AFG,2020-01-01,0.0,
2,Afghanistan,AFG,2020-01-02,0.0,
3,Afghanistan,AFG,2020-01-03,0.0,
4,Afghanistan,AFG,2020-01-04,0.0,


In [21]:
#Format columns
covid = covid_data.rename(columns={'Entity': "country",
                              'Code' : "code",
                              'Date': "date",
                              ' (cases)' : "cases",
                              'Number of days since the 100th confirmed case (days)':"daysGT100"
                                         })
covid = covid[['code','country','date','cases','daysGT100']]

In [22]:
covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12768 entries, 0 to 12767
Data columns (total 5 columns):
code         11092 non-null object
country      12768 non-null object
date         12768 non-null datetime64[ns]
cases        12761 non-null float64
daysGT100    4459 non-null float64
dtypes: datetime64[ns](1), float64(2), object(2)
memory usage: 498.8+ KB


In [23]:
covid.head()

Unnamed: 0,code,country,date,cases,daysGT100
0,AFG,Afghanistan,2019-12-31,0.0,
1,AFG,Afghanistan,2020-01-01,0.0,
2,AFG,Afghanistan,2020-01-02,0.0,
3,AFG,Afghanistan,2020-01-03,0.0,
4,AFG,Afghanistan,2020-01-04,0.0,


In [24]:
covidGT100 = covid[covid['daysGT100'] >= 0]
covidGT100.dropna(subset=['code'],inplace=True)

In [25]:
covidGT100['logCases'] = np.log(covidGT100['cases'])

In [26]:
covidGT100.to_csv('covidGT100.csv')

In [27]:
covidGT100.head()

Unnamed: 0,code,country,date,cases,daysGT100,logCases
79,AFG,Afghanistan,2020-03-29,106.0,0.0,4.663439
80,AFG,Afghanistan,2020-03-30,114.0,1.0,4.736198
81,AFG,Afghanistan,2020-03-31,141.0,2.0,4.94876
82,AFG,Afghanistan,2020-04-01,166.0,3.0,5.111988
83,AFG,Afghanistan,2020-04-02,192.0,4.0,5.257495


In [29]:
hoefstede.head()

Unnamed: 0,code,country,pdi,idv,mas,uai,ltowvs,ivr
0,GTM,Guatemala,95.0,6.0,37.0,101.0,,
1,ECU,Ecuador,78.0,8.0,63.0,67.0,,
2,PAN,Panama,95.0,11.0,44.0,86.0,,
3,VEN,Venezuela,81.0,12.0,73.0,76.0,16.0,100.0
4,COL,Colombia,67.0,13.0,64.0,80.0,13.0,83.0


In [30]:
# Merge data
covidGT100Hoef = pd.merge(covidGT100, urban_rural_population, on=["code"])
covidGT100Hoef.shape

KeyError: 'code'

In [None]:
density_vs_prosperity.shape

In [None]:
density_vs_prosperity.dropna(subset=['Code'],inplace=True)
density_vs_prosperity.shape

In [None]:
urban_rural_population.dropna(subset=['Code'],inplace=True)
urban_rural_population.shape

In [None]:
urban_vs_rural_majority.dropna(subset=['Code'],inplace=True)
urban_vs_rural_majority.shape

In [None]:
life_expectancy.dropna(subset=['Code'],inplace=True)
life_expectancy.shape

In [None]:
median_age.dropna(subset=['Code'],inplace=True)
median_age.shape

In [None]:
covid_data.shape

# Create Worldometer dataframe

In [None]:
wom.head()

In [None]:
# Merge data
merged_data = pd.merge(density_vs_prosperity, urban_rural_population, on=["Entity","Code", "Year"])
merged_data.shape

In [None]:
merged_data = pd.merge(merged_data, urban_vs_rural_majority, on=["Entity", "Code", "Year"])
merged_data.shape

In [None]:
merged_data = pd.merge(merged_data, life_expectancy, on=["Entity", "Code", "Year"])
merged_data.shape

In [None]:
merged_data = pd.merge(merged_data, median_age, on=["Entity", "Code", "Year"])
merged_data.shape

In [None]:
merged_data.columns

In [None]:
#Format columns
merged_data = merged_data.rename(columns={'Code':'code',
                                          'Year':'date',
                                          "Population density (people per km² of land area)": "popDensity",
                                          "GDP per capita (constant 2011 international $)" : "gdp",
                                         "Total population (Gapminder)": "gapPop",
                                         "Urban population": "urbanPop",
                                         "Rural population": "ruralPop",
                                         "Urban (%)":"urbanPct",
                                         "Rural (%)": "ruralPct",
                                         "Life expectancy at birth, total (years) (years)":"lifeExpectancy",
                                         "UN Population Division (Median Age) (2017) (years)":"medianAge"
                                         })

In [None]:
unsd.columns

In [None]:
merged_data = pd.merge(merged_data, unsd, on=["code"])
merged_data.shape

In [None]:
wom.columns

In [None]:
merged_data = pd.merge(merged_data, wom, on=["code"])
merged_data.shape

In [None]:
merged_data.head()

In [None]:
merged_data.head()

In [None]:
merged_data.columns

In [None]:
merged_data.columns

In [None]:
merged_data.head()

In [None]:
hoefstede.shape

In [None]:
covid.shape

In [None]:
merged_data = pd.merge(merged_data, hoefstede, on=[ "code"])
#merged_data.head()

In [None]:
merged_data.drop(['Entity_y','date_y'], axis=1,inplace=True)

In [None]:
merged_data = merged_data.rename(columns={"Entity_x": "Entity",
                                         "date_x":'date'})

In [None]:
merged_data['date'].max()

In [None]:
merged_data = merged_data[merged_data['date'] == 2015]
#merged_data.head()

In [None]:
merged_data.columns

In [None]:
merged_data.head()

In [None]:
merged_data.columns

In [None]:
merged_data.shape

In [None]:
# Export CSV
merged_data.to_csv('mergetest.csv')  

In [None]:
#merged_data.info()

In [None]:
covid.info()

In [None]:
merged_data = pd.merge(merged_data, covid, on=[ "code"])
merged_data.columns

In [None]:
merged_data.drop(['date_x'], axis=1,inplace=True)

In [None]:
merged_data = merged_data.rename(columns={"date_y":'date'})

In [None]:
merged_data.columns

In [None]:
# Drop rows where days_gt_100 is null
#merged_data = merged_data[merged_data.days_gt_100 != 0]
merged_data.dropna(subset=['daysGT100'],inplace=True)
merged_data.shape

In [None]:
merged_data['logCases'] = np.log(merged_data['cases'])
merged_data['casesPop'] = (merged_data['cases']/merged_data['calcPop'])
merged_data['logCasesPop'] = np.log(merged_data['cases']/merged_data['calcPop'])
merged_data['logCasesMPop'] = np.log(merged_data['casesMPop'])
# merged_data['log_cases'] = np.log(merged_data['cases'])
# merged_data['log_cases'] = np.log(merged_data['cases'])

In [None]:
merged_data.info()

In [None]:
merged_data.columns

In [None]:
# Get rid of sparse data columns
merged_data.drop(['globalCode','interRegionCode','ldc','lldc','sids'], axis=1,inplace=True)

In [None]:
# Export CSV
merged_data.to_csv('merged_data_modified.csv')  

In [None]:
merged_data['date'] = pd.to_timedelta(merged_data['date']).dt.total_seconds().astype(int)
#merged_data['Date'] = pd.to_datetime(merged_data['Date'], format='%Y-%m-%d').dt.time

In [None]:
merged_data.head()

In [None]:

# kip_data = merged_data.groupby('code')[['date','logCases']].apply(.expanding(lambda x: print(x.date,x.logCases))
# kip_data.head()

In [None]:
# def make_m(x):
#     y = merged_data['date'].iloc[0:len(x)]
#     return np.polyfit(x, y, 1)[0]
# def make_b(x):
#     y = merged_data['date'].iloc[0:len(x)]
#     return np.polyfit(x, y, 1)[1]
# merged_data['new'] = merged_data['date'].expanding().apply(make_m, raw=True)*merged_data['date'] + merged_data['date'].expanding().apply(make_b, raw=True)

In [None]:
# merged_data.to_csv('kiptest2.csv')

In [None]:
#merged_data['kslope'] = (merged_data.groupby('code')[['date','logCases']] linregress(x.date,x.logCases)[0]
#merged_data['kslope'] = (merged_data.groupby('code')[['date','logCases']].expanding().apply(lambda x : pd.Series(linregress(x['date'],x['logCases']))[0]))

## Create subset of Countries that have reached 5 days > 100 cases

In [None]:
cases5 = merged_data[merged_data.groupby('code')['daysGT100'].transform(lambda x: (x == 5).sum())>0]

In [None]:
cases5 = cases5[cases5['daysGT100'] <= 5]

In [None]:
cases5group = cases5.groupby('code')

In [None]:
cases5group.head()

In [None]:
cases5slopes = (cases5group.apply(lambda x: pd.Series(linregress(x['daysGT100'],x['logCases'])))\
        .rename(columns={
        0: 'slope',
        1: 'intercept',
        2: 'rvalue',
        3: 'pvalue',
        4: 'stderr'
    }))
casesPop5slopes = (cases5group.apply(lambda x: pd.Series(linregress(x['daysGT100'],x['logCasesPop'])))\
        .rename(columns={
        0: 'slopePop',
        1: 'interceptPop',
        2: 'rvaluePop',
        3: 'pvaluePop',
        4: 'stderrPop'
    }))

In [None]:
cases5slopes.to_csv('cases5slopes.csv')

In [None]:
merged5 = merged_data[merged_data.daysGT100 == 5]

In [None]:
mergedcases5 = pd.merge(merged5, cases5slopes, on=["code"])
mergedcases5 = pd.merge(mergedcases5, casesPop5slopes, on=["code"])

In [None]:
#mergedcases5.dropna(subset=['idv'],inplace=True)

In [None]:
# Export CSV
mergedcases5.to_csv('cases5.csv')  

In [None]:
corr5 = mergedcases5.corr()

In [None]:
# Plot the Pearson Correlation reduced matrix
matrix = np.triu(corr5)
fig, ax = plt.subplots(figsize=(15,9)) 
ax = sns.heatmap(corr5,   mask = matrix, cmap = 'RdYlBu')
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.title('Pearson Correlation matrix for Countries at 5 days since 100th Covid case')

In [None]:
corr5.to_csv('corrcases5.csv')

## Create subset of Countries that have reached 10 days > 100 cases

In [None]:
cases10 = merged_data[merged_data.groupby('code')['daysGT100'].transform(lambda x: (x == 10).sum())>0]

In [None]:
cases10 = cases10[cases10['daysGT100'] <= 10]

In [None]:
cases10.to_csv('cases10precalc.csv')

In [None]:
cases10group = cases10.groupby('code')

In [None]:
cases10slopes = (cases10group.apply(lambda x: pd.Series(linregress(x['daysGT100'],x['logCases'])))\
        .rename(columns={
        0: 'slope',
        1: 'intercept',
        2: 'rvalue',
        3: 'pvalue',
        4: 'stderr'
    }))
casesPop10slopes = (cases10group.apply(lambda x: pd.Series(linregress(x['daysGT100'],x['logCasesPop'])))\
        .rename(columns={
        0: 'slopePop',
        1: 'interceptPop',
        2: 'rvaluePop',
        3: 'pvaluePop',
        4: 'stderrPop'
    }))

In [None]:
merged10 = merged_data[merged_data.daysGT100 == 10]

In [None]:
merged10.shape

In [None]:
mergedcases10 = pd.merge(merged10, cases10slopes, on=["code"])
mergedcases10 = pd.merge(mergedcases10, casesPop10slopes, on=["code"])

In [None]:
# Export CSV
mergedcases10.to_csv('cases10.csv')  

In [None]:
corr10 = mergedcases10.corr()

In [None]:
# Plot the Pearson Correlation reduced matrix
matrix = np.triu(corr10)
fig, ax = plt.subplots(figsize=(15,9)) 
ax = sns.heatmap(corr10,   mask = matrix, cmap = 'RdYlBu')
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.title('Pearson Correlation matrix for Countries at 10 days since 100th Covid case')

In [None]:
corr10.to_csv('corrcases10.csv')

## Create subset of Countries that have reached 20 days > 100 cases

In [None]:
cases20 = merged_data[merged_data.groupby('code')['daysGT100'].transform(lambda x: (x == 20).sum())>0]

In [None]:
cases20 = cases20[cases20['daysGT100'] <= 20]

In [None]:
cases20.to_csv('cases20inter.csv')

In [None]:
cases20group = cases20.groupby('code')

In [None]:
cases20slopes = (cases20group.apply(lambda x: pd.Series(linregress(x['daysGT100'],x['logCases'])))\
        .rename(columns={
        0: 'slope',
        1: 'intercept',
        2: 'rvalue',
        3: 'pvalue',
        4: 'stderr'
    }))
casesPop20slopes = (cases20group.apply(lambda x: pd.Series(linregress(x['daysGT100'],x['logCasesPop'])))\
        .rename(columns={
        0: 'slopePop',
        1: 'interceptPop',
        2: 'rvaluePop',
        3: 'pvaluePop',
        4: 'stderrPop'
    }))

In [None]:
cases20slopes.to_csv('cases20slopes.csv')

In [None]:
merged20 = merged_data[merged_data.daysGT100 == 20]

In [None]:
mergedcases20 = pd.merge(merged20, cases20slopes, on=["code"])
mergedcases20 = pd.merge(mergedcases20, casesPop20slopes, on=["code"])

In [None]:
# Export CSV
mergedcases20.to_csv('cases20.csv')  

In [None]:
corr20 = mergedcases20.corr()

In [None]:
# Plot the Pearson Correlation reduced matrix
matrix = np.triu(corr20)
fig, ax = plt.subplots(figsize=(15,9)) 
ax = sns.heatmap(corr20,   mask = matrix, cmap = 'RdYlBu')
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.title('Pearson Correlation matrix for Countries at 20 days since 100th Covid case')

In [None]:
corr20.to_csv('corrcases20.csv')

## Create subset of Countries that have reached 30 days > 100 cases

In [None]:
cases30 = merged_data[merged_data.groupby('code')['daysGT100'].transform(lambda x: (x == 30).sum())>0]

In [None]:
cases30 = cases30[cases30['daysGT100'] <= 30]

In [None]:
cases30group = cases30.groupby('code')

In [None]:
cases30slopes = (cases30group.apply(lambda x: pd.Series(linregress(x['daysGT100'],x['logCases'])))\
        .rename(columns={
        0: 'slope',
        1: 'intercept',
        2: 'rvalue',
        3: 'pvalue',
        4: 'stderr'
    }))
casesPop30slopes = (cases30group.apply(lambda x: pd.Series(linregress(x['daysGT100'],x['logCasesPop'])))\
        .rename(columns={
        0: 'slopePop',
        1: 'interceptPop',
        2: 'rvaluePop',
        3: 'pvaluePop',
        4: 'stderrPop'
    }))

In [None]:
merged30 = merged_data[merged_data.daysGT100 == 30]

In [None]:
mergedcases30 = pd.merge(merged30, cases30slopes, on=["code"])
mergedcases30 = pd.merge(mergedcases30, casesPop30slopes, on=["code"])

In [None]:
# Export CSV
mergedcases30.to_csv('cases30.csv')  

In [None]:
corr30 = mergedcases30.corr()

In [None]:
# Plot the Pearson Correlation reduced matrix
matrix = np.triu(corr30)
fig, ax = plt.subplots(figsize=(15,9)) 
ax = sns.heatmap(corr30,   mask = matrix, cmap = 'RdYlBu')
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.title('Pearson Correlation matrix for Countries at 30 days since 100th Covid case')

In [None]:
corr30.to_csv('corrcases30.csv')

## Create subset of Countries that have reached 35 days > 100 cases

In [None]:
cases35 = merged_data[merged_data.groupby('code')['daysGT100'].transform(lambda x: (x == 35).sum())>0]

In [None]:
cases35 = cases35[cases35['daysGT100'] <= 35]

In [None]:
cases35group = cases35.groupby('code')

In [None]:
cases35slopes = (cases35group.apply(lambda x: pd.Series(linregress(x['daysGT100'],x['logCases'])))\
        .rename(columns={
        0: 'slope',
        1: 'intercept',
        2: 'rvalue',
        3: 'pvalue',
        4: 'stderr'
    }))
casesPop35slopes = (cases35group.apply(lambda x: pd.Series(linregress(x['daysGT100'],x['logCasesPop'])))\
        .rename(columns={
        0: 'slopePop',
        1: 'interceptPop',
        2: 'rvaluePop',
        3: 'pvaluePop',
        4: 'stderrPop'
    }))

In [None]:
merged35 = merged_data[merged_data.daysGT100 == 35]

In [None]:
mergedcases35 = pd.merge(merged35, cases35slopes, on=["code"])
mergedcases35 = pd.merge(mergedcases35, casesPop35slopes, on=["code"])

In [None]:
# Export CSV
mergedcases35.to_csv('cases35.csv')  

In [None]:
corr35 = mergedcases35.corr()

In [None]:
# Plot the Pearson Correlation reduced matrix
matrix = np.triu(corr35)
fig, ax = plt.subplots(figsize=(15,9)) 
ax = sns.heatmap(corr35,   mask = matrix, cmap = 'RdYlBu')
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.title('Pearson Correlation matrix for Countries at 30 days since 100th Covid case')

In [None]:
corr35.to_csv('corrcases35.csv')

## Create subset of Countries that have reached 40 days > 100 cases

In [None]:
cases40 = merged_data[merged_data.groupby('code')['daysGT100'].transform(lambda x: (x == 40).sum())>0]

In [None]:
cases40 = cases40[cases40['daysGT100'] <= 40]

In [None]:
#cases40.to_csv('cases40inter.csv')

In [None]:
cases40group = cases40.groupby('code')

In [None]:
cases40slopes = (cases40group.apply(lambda x: pd.Series(linregress(x['daysGT100'],x['logCases'])))\
        .rename(columns={
        0: 'slope',
        1: 'intercept',
        2: 'rvalue',
        3: 'pvalue',
        4: 'stderr'
    }))
casesPop40slopes = (cases40group.apply(lambda x: pd.Series(linregress(x['daysGT100'],x['logCasesPop'])))\
        .rename(columns={
        0: 'slopePop',
        1: 'interceptPop',
        2: 'rvaluePop',
        3: 'pvaluePop',
        4: 'stderrPop'
    }))

In [None]:
merged40 = merged_data[merged_data.daysGT100 == 40]

In [None]:
mergedcases40 = pd.merge(merged40, cases40slopes, on=["code"])
mergedcases40 = pd.merge(mergedcases40, casesPop40slopes, on=["code"])

In [None]:
# Export CSV
mergedcases40.to_csv('cases40.csv')  

In [None]:
corr40 = mergedcases40.corr()

In [None]:
# Plot the Pearson Correlation reduced matrix
matrix = np.triu(corr40)
fig, ax = plt.subplots(figsize=(15,9)) 
ax = sns.heatmap(corr40,   mask = matrix, cmap = 'RdYlBu')
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.title('Pearson Correlation matrix for Countries at 40 days since 100th Covid case')

In [None]:
corr40.to_csv('corrcases40.csv')

### Concatonate the dataframes back together into one dataframe

In [None]:
# Concatonate all the dataframes 
cases_df = pd.concat([mergedcases40, mergedcases35, mergedcases30, mergedcases20, mergedcases10, mergedcases5])

In [None]:
cases_df.head()

In [None]:
cases_df.to_csv('cases_df.csv')

In [None]:
cases_df.columns

In [None]:
cases_df.drop(['country_y'], axis=1,inplace=True)

In [None]:
cases_df = cases_df.rename(columns={'country_x': 'country'})

In [None]:
cases_df.head()
multi_slope = cases_df.set_index(['globalName','regionName','subRegionName','country'])
multi_slope = multi_slope.sort_index()
print(multi_slope.shape)
multi_slope.head()

### Function to easily do a panel of scatterplots easily

In [None]:
def makePanelScatterplot(data_df, x_data,y_data,color_col,split_col):
    x_label = x_data
    y_label = y_data
    ylim_min = data_df[y_data].min()*1.05
    ylim_max = data_df[y_data].max()*1.05
    xlim_min = data_df[x_data].min()*1.05
    xlim_max = data_df[x_data].max()*1.05

    g = sns.lmplot(x=x_data, y=y_data, data=data_df,\
           hue=color_col, col=split_col,col_wrap=3)

        
    g = (g.set_axis_labels(x_label, y_label)
          .set(ylim=(ylim_min, ylim_max),xlim=(xlim_min, xlim_max))
          .fig.subplots_adjust(wspace=.02))

# Slope of # Covid Cases vs. Power-Distance (PDI) of Culture

In [None]:
makePanelScatterplot(cases_df,"pdi","slopePop","daysGT100","daysGT100")

# Slope of # Covid Cases vs. Individualism/Collectivism of Culture

In [None]:
makePanelScatterplot(cases_df,"idv","slopePop","daysGT100","daysGT100")

# Slope of # Covid Cases vs. Indulgence (IVR) of Culture

In [None]:
makePanelScatterplot(cases_df,"ivr","slopePop","daysGT100","daysGT100")

# Slope of # Covid Cases vs. Long-term Orientation (LTOWVS) of Culture

In [None]:
makePanelScatterplot(cases_df,"ltowvs","slopePop","daysGT100","daysGT100")

# Slope of # Covid Cases vs. Masculine/Feminine (MAS) of Culture

In [None]:
makePanelScatterplot(cases_df,"mas","slope","daysGT100","daysGT100")

# Slope of # Covid Cases vs. Uncertainty Avoidance (UAI) of Culture

In [None]:
makePanelScatterplot(cases_df,"uai","slope","daysGT100","daysGT100")

In [None]:
cases_df.columns

In [None]:
makePanelScatterplot(cases_df,"lifeExpectancy","medianAge","daysGT100","daysGT100")

# Change is

In [None]:
makePanelScatterplot(cases_df,"logCasesPop","slopePop","daysGT100","daysGT100")

In [None]:
cases_df.columns

In [None]:
multi_slope.head()

In [None]:
multi_slope.to_csv('multi_slope_kip.csv')

In [None]:
makePanelScatterplot(cases_df,"ivr","slope","daysGT100","daysGT100")

In [None]:
makePanelScatterplot(cases_df,"idv","casesPop","daysGT100","daysGT100")

In [None]:
def olsRegressionAnalysis (df,df_name,dep_col,ind_col):
    stat_dep_col = dep_col.replace(' ','_').replace('-','_').replace('(','_').replace(')','')
    stat_ind_col = ind_col.replace(' ','_').replace('-','_').replace('(','_').replace(')','')
    stat_col_list = [stat_dep_col,stat_ind_col]
    #print(f'stat_dep_col: {stat_dep_col}, stat_ind_col: {stat_ind_col}')
    col_list = [dep_col,ind_col]

    col_dict = {col_list[i]: stat_col_list[i] for i in range(len(col_list))} 
    #print(col_dict)
    stat_df = df[col_list].dropna()
    stat_df.rename(columns=col_dict,inplace=True)
    print(f'\u001b[34m{dep_col}\u001b[0m fitted against \u001b[34m{ind_col}\u001b[0m \
    using \x1b[31m{df_name}\x1b[0m dataframe:\n')
    print(f'We have {stat_df.shape[0]} rows left after dropping Null values\n')
    model_string = stat_dep_col + " ~ " + stat_ind_col
    all_model = ols(model_string, data=stat_df).fit()
    print(all_model.summary())

In [None]:
cases_df.columns

In [None]:
cases_df

In [None]:
# # Create tidy datasets for Country then Covid
# country_cols = ['Entity', 'Code', 'popDensity', 'gdp', 'gapPop', 'urbanPop', 'ruralPop',
#        'urbanPct', 'ruralPct', 'lifeExpectancy', 'globalCode', 'globalName',
#        'regionCode', 'regionName', 'subRegionCode', 'subRegionName',
#        'interRegionCode', 'interRegionName', 'country', 'm49', 'ldc', 'lldc',
#        'sids', 'development', 'countryWom', 'calcPop', 'medianAge', 'pdi',
#        'idv', 'mas', 'uai', 'ltowvs', 'ivr']
# countries = cases_df[country_cols].drop_duplicates()
# countries = countries.reset_index(drop=True)
# countries = countries.set_index(['Code'])
# countries.head()

In [None]:
countries.columns

In [None]:
cases_df.columns

In [None]:
# Create Covid cases table
covid_vc = pd.merge(cases_df, countries, on=['globalName',
       'regionCode', 'regionName', 'subRegionName',
       'interRegionName', 'country', 'm49', 'development', 'pdi',
       'idv', 'mas', 'uai', 'ltowvs', 'ivr'])
covid_vc.columns

In [None]:
covid_vc.drop(['code_y'], axis=1,inplace=True)

In [None]:
covid_vc = covid_vc.rename(columns={'code_x': 'code'})

In [None]:
covid_vc = covid_vc[['code','date', 'cases', 'daysGT100',
       'logCases', 'casesPop', 'logCasesPop', 'logCasesMPop', 'slope',
       'intercept', 'rvalue', 'pvalue', 'stderr', 'slopePop', 'interceptPop',
       'rvaluePop', 'pvaluePop', 'stderrPop']]
covid_vc

In [None]:
tidy_df = cases_df.melt(id_vars="daysGT100")

In [None]:
cases_df.head()

In [None]:
tidy_df.columns

In [None]:
df = cases_df[cases_df['daysGT100'] >= 30].sort_values(by=['casesPop'])

fig = px.scatter(df, x="country", y="casesPop", color="idv", 
              color_continuous_scale= px.colors.sequential.Inferno,
              title="Confirmed Covid cases per Capita vs. Hoefstede 'Individualism' Score",
             hover_name="country", hover_data=['cases','slope','ivr'])
fig.update_traces(marker=dict(size=12,
                              line=dict(width=1,
                                        color='DarkSlateGrey')),
                  selector=dict(mode="markers"))


fig.show()

In [None]:
df = cases_df[cases_df['daysGT100'] >= 35]

fig = px.line(df, x="daysGT100", y="slope", color="country", title="layout.hovermode='x unified'",
             hover_name="country", hover_data=['country','slope','ivr'])
fig.update_traces(mode="markers+lines")


fig.show()

In [None]:
wom.head()

In [None]:
df = wow[cases_df['totalCases'] >= 100].sort_values(by=['totalCases'])

fig = px.scatter(df, x="countryWom", y="casesMPop", color="idv", 
              color_continuous_scale= px.colors.sequential.Inferno,
              title="layout.hovermode='x unified'",
             hover_name="country", hover_data=['cases','slope','ivr'])
fig.update_traces(marker=dict(size=12,
                              line=dict(width=1,
                                        color='DarkSlateGrey')),
                  selector=dict(mode="markers"))


fig.show()

In [None]:
#olsRegressionAnalysis(cases40,"log_cases","slope","Code")

In [None]:

#merged_data['slope'] = merged_data.groupby('Code').diff()(lambda v: linregress(v.Date, v.cases)[0])
merged_data['slope'] = (merged_data.groupby('code')[['date','logCases']].expanding().apply(lambda x: linregress(x['Date'],x['cases'])[0]))
#merged_data['slope'] = (merged_data.groupby('Code')['Date','cases'].apply(lambda x,y : x.shift().expanding().apply(lambda v: linregress([0], [1])[0])))

In [None]:
#jupyter nbconvert merge_data_modified.ipynb --to html --output covidCulture.html