In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import requests
import sqlite3
import pymongo

## Extracting the longitude Latitude table

In [2]:
# found a table containing longitude and latitudes of countries and 
# brought it into pandas

long_lat=pd.read_html("https://developers.google.com/public-data/docs/canonical/countries_csv",skiprows=1)[0]

# renaming columns
long_lat.columns = ['country_code', 'latitude','longitude','name']

# printing dataframe
long_lat.head()

Unnamed: 0,country_code,latitude,longitude,name
0,AE,23.424076,53.847818,United Arab Emirates
1,AF,33.93911,67.709953,Afghanistan
2,AG,17.060816,-61.796428,Antigua and Barbuda
3,AI,18.220554,-63.068615,Anguilla
4,AL,41.153332,20.168331,Albania


In [3]:
#  country code 
converting_country_code=pd.read_html("https://www.worldatlas.com/aatlas/ctycodes.htm",skiprows=1)[0]
converting_country_code.columns= ['name','country_code_2','country_code_3','x','y']
converting_country_code=converting_country_code.drop(['x', 'y'], axis=1)
converting_country_code.head()


Unnamed: 0,name,country_code_2,country_code_3
0,Afghanistan,AF,AFG
1,Albania,AL,ALB
2,Algeria,DZ,DZA
3,American Samoa,AS,ASM
4,Andorra,AD,AND


In [4]:
merged_col=pd.merge(converting_country_code, long_lat, left_on='country_code_2',right_on="country_code",how="inner")
merged_col.head()



Unnamed: 0,name_x,country_code_2,country_code_3,country_code,latitude,longitude,name_y
0,Afghanistan,AF,AFG,AF,33.93911,67.709953,Afghanistan
1,Albania,AL,ALB,AL,41.153332,20.168331,Albania
2,Algeria,DZ,DZA,DZ,28.033886,1.659626,Algeria
3,American Samoa,AS,ASM,AS,-14.270972,-170.132217,American Samoa
4,Angola,AO,AGO,AO,-11.202692,17.873887,Angola


In [5]:
# cleaning up the pandas dataframe
# dropping cols I dont need
merged_col=merged_col.drop(['name_y','country_code','country_code_2'], axis=1)
merged_col.head()


Unnamed: 0,name_x,country_code_3,latitude,longitude
0,Afghanistan,AFG,33.93911,67.709953
1,Albania,ALB,41.153332,20.168331
2,Algeria,DZA,28.033886,1.659626
3,American Samoa,ASM,-14.270972,-170.132217
4,Angola,AGO,-11.202692,17.873887


In [6]:
# renaming cols
merged_col.columns = ['country','country_code','latitude','longitude']

In [7]:
merged_col.head()


Unnamed: 0,country,country_code,latitude,longitude
0,Afghanistan,AFG,33.93911,67.709953
1,Albania,ALB,41.153332,20.168331
2,Algeria,DZA,28.033886,1.659626
3,American Samoa,ASM,-14.270972,-170.132217
4,Angola,AGO,-11.202692,17.873887


In [10]:
# Used the WHO API to get access to how they add the country code and will be 
# merging it with our own table to make sure the data we intend to parse in future from WHO alligns with our 
# merged_col
url = "http://apps.who.int/gho/athena/api/COUNTRY?format=json"
country_r = requests.get(url).json()



In [11]:
# checking to see what needs to be added into the country_r to output the country_code 

country_r['dimension'][0]['code'][0]['label']


'AFG'

In [12]:
# Creating a list of all the country_codes I can get from the WHO website

who_country_list=[]

country_code=country_r['dimension'][0]['code']

for country in country_code:
    who_country_list.append(country["label"])

In [13]:
# converting the list into a dataframe and renaming the col. I did this 
# so I can merge this dataframe with the one we made earlier to make sure all the who countries are in the mergel_col

who_df=pd.DataFrame(who_country_list)
who_df.columns=["who_country"]
who_df.head()


Unnamed: 0,who_country
0,AFG
1,ALB
2,DZA
3,AND
4,AGO


In [14]:
# merging the two data frames together on country code. this output 226 
# countries which is sufficent for our data set
who_and_others=pd.merge(merged_col, who_df, left_on='country_code',right_on="who_country",how="inner")
who_and_others.head()


Unnamed: 0,country,country_code,latitude,longitude,who_country
0,Afghanistan,AFG,33.93911,67.709953,AFG
1,Albania,ALB,41.153332,20.168331,ALB
2,Algeria,DZA,28.033886,1.659626,DZA
3,American Samoa,ASM,-14.270972,-170.132217,ASM
4,Angola,AGO,-11.202692,17.873887,AGO


In [15]:
who_and_others=who_and_others.drop(['who_country'], axis=1)


In [16]:
who_and_others.head(1)
# who_and_others.shape

Unnamed: 0,country,country_code,latitude,longitude
0,Afghanistan,AFG,33.93911,67.709953


## Extracting the Disease data using API

In [17]:
## diseases calls

url = "http://apps.who.int/gho/athena/data/GHO/"
url_after = "?format=html&filter=COUNTRY:*"
malaria_code = "WHS3_48"
yel_fev_code = "WHS3_50"
leprosy_code = "WHS3_45"

In [18]:
malaria_url = url + malaria_code + url_after
yel_fev_url = url + yel_fev_code + url_after
leprosy_url = url + leprosy_code + url_after

In [19]:
mal_raw = pd.read_html(malaria_url)[0]

In [20]:
yel_raw = pd.read_html(yel_fev_url)[0]

In [21]:
lep_raw = pd.read_html(leprosy_url)[0]

In [22]:
# display data to observe
display(mal_raw.head())
display(yel_raw.head())
display(lep_raw.head())

Unnamed: 0,GHO,PUBLISHSTATE,YEAR,REGION,COUNTRY,DISPLAY VALUE,NUMERIC VALUE,LOW RANGE,HIGH RANGE,Comment
0,Malaria - number of reported confirmed cases,Published,2011,Western Pacific,Cambodia,57423,57423.0,,,
1,Malaria - number of reported confirmed cases,Published,2005,Africa,Gabon,70644,70644.0,,,
2,Malaria - number of reported confirmed cases,Published,2017,Africa,Central African Republic,383309,383309.0,,,
3,Malaria - number of reported confirmed cases,Published,2014,Americas,Costa Rica,0,0.0,,,
4,Malaria - number of reported confirmed cases,Published,2008,Africa,Congo,117291,117291.0,,,


Unnamed: 0,GHO,PUBLISHSTATE,YEAR,REGION,WORLDBANKINCOMEGROUP,COUNTRY,DISPLAY VALUE,NUMERIC VALUE,LOW RANGE,HIGH RANGE,Comment
0,Yellow fever - number of reported cases,Published,2008,Europe,High-income,France,0,0.0,,,
1,Yellow fever - number of reported cases,Published,2004,Africa,Lower-middle-income,Zambia,0,0.0,,,
2,Yellow fever - number of reported cases,Published,2016,Americas,Lower-middle-income,Guatemala,0,0.0,,,
3,Yellow fever - number of reported cases,Published,2015,Africa,Lower-middle-income,Ghana,0,0.0,,,
4,Yellow fever - number of reported cases,Published,2004,Africa,Low-income,Guinea-Bissau,0,0.0,,,


Unnamed: 0,GHO,PUBLISHSTATE,YEAR,REGION,COUNTRY,DISPLAY VALUE,NUMERIC VALUE,LOW RANGE,HIGH RANGE,Comment
0,Number of new leprosy cases,Published,2006,,Kiribati,41,41.0,,,
1,Number of new leprosy cases,Published,2008,,Iraq,0,0.0,,,
2,Number of new leprosy cases,Published,2007,,Eswatini,No data,,,,
3,Number of new leprosy cases,Published,2014,,Guinea-Bissau,No data,,,,
4,Number of new leprosy cases,Published,2012,,Brazil,33303,33303.0,,,


In [23]:
mal_cols = mal_raw.loc[:, ['YEAR', 'COUNTRY', 'NUMERIC VALUE']].copy('deep')
mal_cols['DISEASE'] = 'Malaria'

In [24]:
yel_cols = yel_raw.loc[:, ['YEAR', 'COUNTRY', 'NUMERIC VALUE']].copy('deep')
yel_cols['DISEASE'] = 'Yellow Fever'

In [25]:
lep_cols = lep_raw.loc[:, ['YEAR', 'COUNTRY', 'NUMERIC VALUE']].copy('deep')
lep_cols['DISEASE'] = 'Leprosy'

In [26]:
# observe changes to data
display(mal_cols.count())
display(yel_cols.count())
display(lep_cols.count())

YEAR             1710
COUNTRY          1710
NUMERIC VALUE    1710
DISEASE          1710
dtype: int64

YEAR             2706
COUNTRY          2706
NUMERIC VALUE    2706
DISEASE          2706
dtype: int64

YEAR             2412
COUNTRY          2412
NUMERIC VALUE    1618
DISEASE          2412
dtype: int64

In [27]:
dfs = [mal_cols, yel_cols, lep_cols]

In [28]:
disease_df = pd.concat(dfs, sort=True)

In [29]:
disease_df = disease_df[['YEAR', 'COUNTRY', 'DISEASE', 'NUMERIC VALUE']]
disease_df.head()

Unnamed: 0,YEAR,COUNTRY,DISEASE,NUMERIC VALUE
0,2011,Cambodia,Malaria,57423.0
1,2005,Gabon,Malaria,70644.0
2,2017,Central African Republic,Malaria,383309.0
3,2014,Costa Rica,Malaria,0.0
4,2008,Congo,Malaria,117291.0


In [30]:
# can fill Nan by 0 as this will make the following analysis ignore the missing data
disease_df['NUMERIC VALUE'].fillna(0)


0         57423.0
1         70644.0
2        383309.0
3             0.0
4        117291.0
5          1741.0
6          1025.0
7             3.0
8        280550.0
9           313.0
10           29.0
11         4345.0
12          531.0
13        32037.0
14        19725.0
15       142309.0
16       333871.0
17            6.0
18        31436.0
19        54216.0
20        17583.0
21          762.0
22       268912.0
23          544.0
24       281182.0
25        48441.0
26      2118815.0
27         2421.0
28       465004.0
29            0.0
          ...    
2382          3.0
2383          0.0
2384          2.0
2385       6114.0
2386          5.0
2387          0.0
2388          3.0
2389          0.0
2390          0.0
2391          0.0
2392          2.0
2393        207.0
2394          0.0
2395        169.0
2396          0.0
2397          0.0
2398          0.0
2399          0.0
2400          0.0
2401        400.0
2402          0.0
2403          0.0
2404          5.0
2405        759.0
2406      

In [51]:
# getting temp data
temp_url = 'https://www.ncdc.noaa.gov/cag/global/time-series/globe/land_ocean/1/12/1980-2019'
temp_df = pd.read_html(temp_url)[0].drop('Rank', axis=1)

In [52]:
# remove °C and cast to numeric
for row in temp_df.iterrows():
    row[1]['Anomaly(1901-2000 Base Period)'] = row[1]['Anomaly(1901-2000 Base Period)'][:-2]

In [53]:
temp_df

Unnamed: 0,Year,Anomaly(1901-2000 Base Period)
0,1980,0.27°C
1,1981,0.43°C
2,1982,0.44°C
3,1983,0.28°C
4,1984,-0.09°C
5,1985,0.20°C
6,1986,0.22°C
7,1987,0.53°C
8,1988,0.36°C
9,1989,0.40°C
