In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import requests
import sqlite3
import pymongo

## Extracting the longitude Latitude table

In [13]:
# found a table containing longitude and latitudes of countries and 
# brought it into pandas

long_lat=pd.read_html("https://developers.google.com/public-data/docs/canonical/countries_csv",skiprows=1)[0]

# renaming columns
long_lat.columns = ['country_code', 'latitude','longitude','name']

# printing dataframe
long_lat.head()

Unnamed: 0,country_code,latitude,longitude,name
0,AE,23.424076,53.847818,United Arab Emirates
1,AF,33.93911,67.709953,Afghanistan
2,AG,17.060816,-61.796428,Antigua and Barbuda
3,AI,18.220554,-63.068615,Anguilla
4,AL,41.153332,20.168331,Albania


In [14]:
#  country code 
converting_country_code=pd.read_html("https://www.worldatlas.com/aatlas/ctycodes.htm",skiprows=1)[0]
converting_country_code.columns= ['name','country_code_2','country_code_3','x','y']
converting_country_code=converting_country_code.drop(['x', 'y'], axis=1)
converting_country_code.head()


Unnamed: 0,name,country_code_2,country_code_3
0,Afghanistan,AF,AFG
1,Albania,AL,ALB
2,Algeria,DZ,DZA
3,American Samoa,AS,ASM
4,Andorra,AD,AND


In [15]:
merged_col=pd.merge(converting_country_code, long_lat, left_on='country_code_2',right_on="country_code",how="inner")
merged_col.head()



Unnamed: 0,name_x,country_code_2,country_code_3,country_code,latitude,longitude,name_y
0,Afghanistan,AF,AFG,AF,33.93911,67.709953,Afghanistan
1,Albania,AL,ALB,AL,41.153332,20.168331,Albania
2,Algeria,DZ,DZA,DZ,28.033886,1.659626,Algeria
3,American Samoa,AS,ASM,AS,-14.270972,-170.132217,American Samoa
4,Angola,AO,AGO,AO,-11.202692,17.873887,Angola


In [16]:
# cleaning up the pandas dataframe
# dropping cols I dont need
merged_col=merged_col.drop(['name_y','country_code','country_code_2'], axis=1)
merged_col.head()


Unnamed: 0,name_x,country_code_3,latitude,longitude
0,Afghanistan,AFG,33.93911,67.709953
1,Albania,ALB,41.153332,20.168331
2,Algeria,DZA,28.033886,1.659626
3,American Samoa,ASM,-14.270972,-170.132217
4,Angola,AGO,-11.202692,17.873887


In [17]:
# renaming cols
merged_col.columns = ['country','country_code','latitude','longitude']

In [18]:
merged_col.head()


Unnamed: 0,country,country_code,latitude,longitude
0,Afghanistan,AFG,33.93911,67.709953
1,Albania,ALB,41.153332,20.168331
2,Algeria,DZA,28.033886,1.659626
3,American Samoa,ASM,-14.270972,-170.132217
4,Angola,AGO,-11.202692,17.873887


In [19]:
# Saved the merged cols into a csv
# merged_col.to_csv("/Users/muhammadwaliji/Desktop/project_2/country_code.csv")


In [20]:
# Used the WHO API to get access to how they add the country code and will be 
# merging it with our own table to make sure the data we intend to parse in future from WHO alligns with our 
# merged_col
url = "http://apps.who.int/gho/athena/api/COUNTRY?format=json"
country_r = requests.get(url).json()


In [21]:
# checking to see what needs to be added into the country_r to output the country_code 

country_r['dimension'][0]['code'][0]['label']


'AFG'

In [22]:
# Creating a list of all the country_codes I can get from the WHO website

who_country_list=[]

country_code=country_r['dimension'][0]['code']

for country in country_code:
    who_country_list.append(country["label"])

In [23]:
# converting the list into a dataframe and renaming the col. I did this 
# so I can merge this dataframe with the one we made earlier to make sure all the who countries are in the mergel_col

who_df=pd.DataFrame(who_country_list)
who_df.columns=["who_country"]
who_df.head()


Unnamed: 0,who_country
0,AFG
1,ALB
2,DZA
3,AND
4,AGO


In [24]:
# merging the two data frames together on country code. this output 226 
# countries which is sufficent for our data set
who_and_others=pd.merge(merged_col, who_df, left_on='country_code',right_on="who_country",how="inner")
who_and_others.head()


Unnamed: 0,country,country_code,latitude,longitude,who_country
0,Afghanistan,AFG,33.93911,67.709953,AFG
1,Albania,ALB,41.153332,20.168331,ALB
2,Algeria,DZA,28.033886,1.659626,DZA
3,American Samoa,ASM,-14.270972,-170.132217,ASM
4,Angola,AGO,-11.202692,17.873887,AGO


In [25]:
who_and_others=who_and_others.drop(['who_country'], axis=1)


In [26]:
who_and_others.head(1)
# who_and_others.shape

Unnamed: 0,country,country_code,latitude,longitude
0,Afghanistan,AFG,33.93911,67.709953


In [27]:
# who_and_others.to_csv("/Users/muhammadwaliji/Desktop/project_2/who_inc_country_code.csv")


## Extracting the Disease data using API

In [28]:
## diseases calls

url = "http://apps.who.int/gho/athena/data/GHO/"
url_after = "?format=html&filter=COUNTRY:*"
malaria_code = "WHS3_48"
yel_fev_code = "WHS3_50"
leprosy_code = "WHS3_45"

In [29]:
malaria_url = url + malaria_code + url_after
yel_fev_url = url + yel_fev_code + url_after
leprosy_url = url + leprosy_code + url_after

In [30]:
mal_raw = pd.read_html(malaria_url)[0]

In [31]:
yel_raw = pd.read_html(yel_fev_url)[0]

In [32]:
lep_raw = pd.read_html(leprosy_url)[0]

In [33]:
# display data to observe
display(mal_raw.head())
display(yel_raw.head())
display(lep_raw.head())

Unnamed: 0,GHO,PUBLISHSTATE,YEAR,REGION,COUNTRY,DISPLAY VALUE,NUMERIC VALUE,LOW RANGE,HIGH RANGE,Comment
0,Malaria - number of reported confirmed cases,Published,2013,Eastern Mediterranean,Afghanistan,39263,39263.0,,,
1,Malaria - number of reported confirmed cases,Published,2015,Eastern Mediterranean,Afghanistan,86895,86895.0,,,
2,Malaria - number of reported confirmed cases,Published,2012,Africa,Algeria,55,55.0,,,
3,Malaria - number of reported confirmed cases,Published,2014,Africa,Algeria,0,0.0,,,
4,Malaria - number of reported confirmed cases,Published,2016,Africa,Angola,3794253,3794253.0,,,


Unnamed: 0,GHO,PUBLISHSTATE,YEAR,REGION,WORLDBANKINCOMEGROUP,COUNTRY,DISPLAY VALUE,NUMERIC VALUE,LOW RANGE,HIGH RANGE,Comment
0,Yellow fever - number of reported cases,Published,2008,Europe,High-income,France,0,0.0,,,
1,Yellow fever - number of reported cases,Published,2004,Africa,Lower-middle-income,Zambia,0,0.0,,,
2,Yellow fever - number of reported cases,Published,2016,Americas,Lower-middle-income,Guatemala,0,0.0,,,
3,Yellow fever - number of reported cases,Published,2015,Africa,Lower-middle-income,Ghana,0,0.0,,,
4,Yellow fever - number of reported cases,Published,2004,Africa,Low-income,Guinea-Bissau,0,0.0,,,


Unnamed: 0,GHO,PUBLISHSTATE,YEAR,REGION,COUNTRY,DISPLAY VALUE,NUMERIC VALUE,LOW RANGE,HIGH RANGE,Comment
0,Number of new leprosy cases,Published,2005,,Algeria,0,0.0,,,
1,Number of new leprosy cases,Published,2005,,Bahrain,0,0.0,,,
2,Number of new leprosy cases,Published,2005,,Cook Islands,0,0.0,,,
3,Number of new leprosy cases,Published,2005,,Democratic People's Republic of Korea,0,0.0,,,
4,Number of new leprosy cases,Published,2005,,Djibouti,0,0.0,,,


In [34]:
mal_cols = mal_raw.loc[:, ['YEAR', 'COUNTRY', 'NUMERIC VALUE']].copy('deep')
mal_cols['DISEASE'] = 'Malaria'

In [35]:
yel_cols = yel_raw.loc[:, ['YEAR', 'COUNTRY', 'NUMERIC VALUE']].copy('deep')
yel_cols['DISEASE'] = 'Yellow Fever'

In [36]:
lep_cols = lep_raw.loc[:, ['YEAR', 'COUNTRY', 'NUMERIC VALUE']].copy('deep')
lep_cols['DISEASE'] = 'Leprosy'

In [37]:
# observe changes to data
display(mal_cols.head())
display(yel_cols.head())
display(lep_cols.head())

Unnamed: 0,YEAR,COUNTRY,NUMERIC VALUE,DISEASE
0,2013,Afghanistan,39263.0,Malaria
1,2015,Afghanistan,86895.0,Malaria
2,2012,Algeria,55.0,Malaria
3,2014,Algeria,0.0,Malaria
4,2016,Angola,3794253.0,Malaria


Unnamed: 0,YEAR,COUNTRY,NUMERIC VALUE,DISEASE
0,2008,France,0.0,Yellow Fever
1,2004,Zambia,0.0,Yellow Fever
2,2016,Guatemala,0.0,Yellow Fever
3,2015,Ghana,0.0,Yellow Fever
4,2004,Guinea-Bissau,0.0,Yellow Fever


Unnamed: 0,YEAR,COUNTRY,NUMERIC VALUE,DISEASE
0,2005,Algeria,0.0,Leprosy
1,2005,Bahrain,0.0,Leprosy
2,2005,Cook Islands,0.0,Leprosy
3,2005,Democratic People's Republic of Korea,0.0,Leprosy
4,2005,Djibouti,0.0,Leprosy


In [38]:
dfs = [mal_cols, yel_cols, lep_cols]

In [39]:
disease_df = pd.concat(dfs)

In [42]:
disease_df = disease_df[['YEAR', 'COUNTRY', 'DISEASE', 'NUMERIC VALUE']]
disease_df['NUMERIC VALUE'].fillna(0, inplace=True)
disease_df.count()

YEAR             6828
COUNTRY          6828
DISEASE          6828
NUMERIC VALUE    6828
dtype: int64

In [43]:
disease_df.loc[
    (disease_df['COUNTRY'] == 'Congo') &
    (disease_df['DISEASE'] == 'Malaria')
].sort_values('YEAR')


Unnamed: 0,YEAR,COUNTRY,DISEASE,NUMERIC VALUE
259,2000,Congo,Malaria,15751.0
260,2001,Congo,Malaria,11981.0
1113,2002,Congo,Malaria,7677.0
1114,2003,Congo,Malaria,1633.0
687,2004,Congo,Malaria,293.0
1328,2005,Congo,Malaria,67.0
474,2007,Congo,Malaria,103213.0
1329,2008,Congo,Malaria,117291.0
688,2009,Congo,Malaria,92855.0
47,2011,Congo,Malaria,37744.0


# Temperature data from NOAA


In [44]:
temp_data_url="https://www.ncdc.noaa.gov/cag/global/time-series/globe/land_ocean/12/9/1980-2019"

In [45]:
temp_data_df=pd.read_html(temp_data_url)[0]
temp_data_df=temp_data_df.drop(['Rank'], axis=1)

In [46]:
temp_data_df.head(6)

Unnamed: 0,Year,Anomaly(1901-2000 Base Period)
0,1980,0.29°C
1,1981,0.30°C
2,1982,0.19°C
3,1983,0.33°C
4,1984,0.22°C
5,1985,0.10°C


In [47]:
temp_data_df.loc[:,"Anomaly(1901-2000 Base Period)"] = temp_data_df["Anomaly(1901-2000 Base Period)"].str.replace("°C","")
temp_data_df["Anomaly(1901-2000 Base Period)"] = pd.to_numeric(temp_data_df["Anomaly(1901-2000 Base Period)"])


In [48]:
temp_data_df["Difference"]=temp_data_df["Anomaly(1901-2000 Base Period)"]-0.10

In [49]:
temp_data_df.sort_values('Difference')

Unnamed: 0,Year,Anomaly(1901-2000 Base Period),Difference
5,1985,0.1,0.0
2,1982,0.19,0.09
4,1984,0.22,0.12
6,1986,0.22,0.12
13,1993,0.27,0.17
0,1980,0.29,0.19
14,1994,0.29,0.19
9,1989,0.29,0.19
1,1981,0.3,0.2
12,1992,0.3,0.2


## Option 1: Use pd.to_sql

In [37]:
# SQlite file name and location

# fix this when running on your own computer as this is the file name for Schehrbano's computer

sqlite_filename = '/Users/muhammadwaliji/Desktop/project_2/global_diseases.sqlite'


In [38]:
# connection to the sqlite file 
conn = sqlite3.connect(sqlite_filename)
cur = conn.cursor()

In [39]:
# Adding the country long lat to a sqlite file
who_and_others.to_sql("country_long_lat", conn, if_exists="replace")


In [40]:
# Adding the disease instances table to a sqlite file
disease_df.to_sql('instances_table', conn, if_exists='replace')

  dtype=dtype)


In [41]:
temp_data_df.to_sql('temp_data', conn, if_exists='replace')

  dtype=dtype)


In [42]:
# At this point we are essentially done adding things to sqlite
#  if this is the end, run the following lines
# cur.close()
# conn.close()

# If you want to run sql queries now, do as follows 
# |
# |
# |
# V

In [43]:
# Easy way of reading the data back directly into pandas.

# df_from_sqlite = pd.read_sql_query("""
# SELECT * FROM country_long_lat
# """, conn)

In [44]:
# df_from_sqlite.head()

In [45]:
cur.close()
conn.close()