# Zindi - Challenge

https://zindi.africa/competitions/predict-the-global-spread-of-covid-19

April 9, 2020

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib as plt 
import requests
import lxml.html as lh

## Import Data

In [2]:
#Read train 
deaths = pd.read_csv('data/deaths.csv')
cases = pd.read_csv('data/cases.csv')
deaths.drop(['Province/State'], axis=True)
deaths.shape

(274, 84)

## Add variables


### Temperature 

In [3]:
def web_scraping_data(link):
    #Create a handle, page, to handle the contents of the website
    page = requests.get(link)
    doc = lh.fromstring(page.content)
    tr_elements = doc.xpath('//tr')

    tr_elements = doc.xpath('//tr')
    col=[]
    i=0
    for t in tr_elements[0]:
        i+=1
        name=t.text_content()
        #print ('%d:"%s"'%(i,name))
        col.append((name,[]))

    for j in range(1,len(tr_elements)):
        T=tr_elements[j]
        i=0

        for t in T.iterchildren():
            data=t.text_content() 
            if i>0:
                try:
                    data=int(data)
                except:
                    pass

            col[i][1].append(data)
            i+=1

    Dict={title:column for (title,column) in col}
    df=pd.DataFrame(Dict)
    
    return df

In [4]:
code_country = web_scraping_data("https://www.iban.com/country-codes")

In [5]:
code_country.shape

(249, 4)

Source 
- https://datacatalog.worldbank.org/dataset/climate-change-knowledge-portal-historical-data

In [6]:
temp = pd.read_csv('variable/temp.csv')

In [7]:
temp.shape

(178, 14)

In [8]:
temperature = pd.merge(left=temp, right=code_country, left_on='ISO_3DIGIT', right_on='Alpha-3 code')

In [9]:
temperature = temperature.drop(['Numeric','Alpha-2 code'], axis=True)

In [10]:
death1 = pd.merge(left=deaths, right=temperature, left_on='Territory', right_on='Country')

In [11]:
death1.shape

(229, 100)

I am loosing 45 countries which are 

In [12]:
l1 = list(death1.Country)
l2 = list(deaths.Territory)
[x for x in l2 if x not in l1]

['Andorra',
 'Antigua and Barbuda',
 'Bahrain',
 'Barbados',
 'Cabo Verde',
 'Democratic Republic of the Congo (the)',
 'Faroe Islands',
 'Greenland',
 'French Polynesia',
 'Mayotte',
 'Reunion',
 'Saint Barthelemy',
 'St Martin',
 "Democratic People's Republic of Korea (the)",
 'Liechtenstein',
 'Maldives',
 'Malta',
 'Republic of Moldova (the)',
 'Monaco',
 'Aruba',
 'Curacao',
 'North Macedonia',
 'Saint Lucia',
 'San Marino',
 'Seychelles',
 'Singapore',
 'Taiwan',
 'United Republic of Tanzania (the)',
 'Bermuda',
 'Cayman Islands',
 'Montserrat',
 'Dominica',
 'Grenada',
 'Syrian Arab Republic (the)',
 'Saint Kitts and Nevis',
 'Sao Tome and Principe',
 'Kiribati',
 'Marshall Islands (the)',
 'Micronesia (Federated States of)',
 'Nauru',
 'Palau',
 'Republic of Korea (the)',
 'Samoa',
 'Tonga',
 'Tuvalu']

### Others epidemy 

SARS-CoV

China Canada, Hong Kong Special Administrative Region of China, Chinese Taipei, Singapore, and Hanoi in Viet Nam.

Ebola 

The Democratic Republic of the Congo
Gabon
Guinea
Italy
Ivory Coast
Liberia
Mali
Nigeria
Philippines
Russia
Senegal
Sierra Leone
South Africa
South Sudan
Spain
Uganda
United Kingdom
United States
What causes Ebol

### Age / density / wealth /sex 

https://datacatalog.worldbank.org/search?search_api_views_fulltext_op=AND&f%5B0%5D=field_collection_field%3A2026&sort_by=field_wbddh_modified_date

https://data.worldbank.org/indicator/en.pop.dnst

In [13]:
def wordbankdata(variable, tab):
    data = pd.read_csv('variable/'+variable+'.csv')
    data = data[['Country Code', '2018']]
    death = pd.merge(left=data, right=tab, left_on='Country Code', right_on='Alpha-3 code')
    death[variable] = death['2018']
    death.drop(['2018'], axis=True, inplace=True)
    return(death)

variable = '%+65'

In [14]:
death2 = wordbankdata(variable, death1)

In [15]:
l1 = list(death1.Country)
l2 = list(death2.Country)
[x for x in l1 if x not in l2]

['French Guiana', 'Guadeloupe']

In [16]:
variable = '15-64pop'
death2 = wordbankdata(variable, death2)
death2 = wordbankdata('density', death2)
death2 = wordbankdata( 'sex', death2)

In [17]:
rich = pd.read_csv('variable/income.csv')
rich = rich[['Country Code', 'IncomeGroup']]

In [18]:
death2 = pd.merge(left=rich, right=death2, left_on='Country Code', right_on='Alpha-3 code')

In [19]:
death2.drop(['Alpha-3 code','Country Code_y', 'Country Code_x','Country Code_y' ,'Country'], axis=1, inplace=True)


In [20]:
death2.head()

Unnamed: 0,Country Code,IncomeGroup,Province/State,Country/Region,Territory,Population,Lat,Long,1/22/20,1/23/20,...,Aug_Temp,Sept_temp,Oct_temp,Nov_Temp,Dec_temp,Annual_temp,%+65,15-64pop,density,sex
0,AFG,Low income,,Afghanistan,Afghanistan,26023100.0,33.0,65.0,0,0,...,23.77,19.03,12.99,7.0,2.43,12.92,2.584927,54.324898,56.93776,48.635847
1,AGO,Lower middle income,,Angola,Angola,24383301.0,-11.2027,17.8739,0,0,...,19.9,22.19,23.18,22.79,22.61,21.51,2.216374,50.974702,24.713052,50.530463
2,ALB,Upper middle income,,Albania,Albania,2895947.0,41.1533,20.1683,0,0,...,20.48,17.16,12.27,7.58,3.65,11.27,13.744736,68.58239,104.612263,49.063095
3,ARE,High income,,United Arab Emirates,United Arab Emirates (the),,24.0,54.0,0,0,...,33.55,31.74,28.34,24.06,20.28,26.83,1.085001,84.31149,135.60911,30.636688
4,ARG,Upper middle income,,Argentina,Argentina,42669500.0,-38.4161,-63.6167,0,0,...,9.02,11.53,14.67,17.54,19.83,14.22,11.117789,64.121277,16.25851,51.237348


In [21]:
death2.shape

(227, 104)

In [22]:
deaths.shape

(274, 84)

## Split by continent

In [23]:
continent = pd.read_csv('variable/locations.csv')

In [24]:
continent.head()

Unnamed: 0,countriesAndTerritories,location,continent,population_year,population
0,Afghanistan,Afghanistan,Asia,2020.0,38928341.0
1,Albania,Albania,Europe,2020.0,2877800.0
2,Algeria,Algeria,Africa,2020.0,43851043.0
3,Andorra,Andorra,Europe,2020.0,77265.0
4,Angola,Angola,Africa,2020.0,32866268.0


In [25]:
continent.shape

(207, 5)

In [26]:
continent.drop(['population_year','population','countriesAndTerritories'], axis=1, inplace=True)

In [27]:
death2 = pd.merge(left=continent, right=death1, left_on='location', right_on='Country/Region')

In [28]:
death2.shape

(216, 102)

In [29]:
l1 = list(death4.location)
l2 = list(death3['Country/Region'])
[x for x in l2 if x not in l1]

NameError: name 'death4' is not defined

In [None]:
death4.head()

### Infected cases March 1 / March 15 / March 30

We take the number of cases every 15 days since the beginning of March. 

In [None]:
cases = pd.read_csv('data/cases.csv')

In [None]:
cases[['2/15/20', '3/1/20', '3/30/20', '3/30/20', 'Country/Region']]

## Commorbidity / Risk Factors

### Obesity 

https://www.kaggle.com/arttua/who-obesity-by-country-2016

In [None]:
obesity = pd.read_csv('variable/obesity.csv')
obesity['Country'] = obesity['Unnamed: 0']
obesity.drop('Unnamed: 0', axis=1, inplace=True)

In [None]:
death3 = pd.merge(left=obesity, right=death2, left_on='Country', right_on='Country/Region')
death3.shape

In [None]:
death2.shape

### 

## Medical care

### Hospital beds

https://www.kaggle.com/hamzael1/hospital-beds-by-country

In [None]:
beds = pd.read_csv('variable/beds.csv')

In [None]:
beds.count() / len(beds)

In [None]:
beds= beds[['Country Code', '2011']]

MERGE !

### Health Systems - WHO 


In [None]:
hs = pd.read_csv('variable/Health_systems.csv')

In [None]:
hs.shape

## Strict mesure de confinement 

1 - 10

In [None]:
sum(deaths.Territory == 'France')

In [None]:
deaths[deaths.Territory == 'France']