In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import requests
from pymongo import MongoClient

## Extracting the longitude Latitude table

In [2]:
# found a table containing longitude and latitudes of countries and 
# brought it into pandas

long_lat=pd.read_html("https://developers.google.com/public-data/docs/canonical/countries_csv",skiprows=1)[0]

# renaming columns
long_lat.columns = ['country_code', 'latitude','longitude','name']

# printing dataframe
long_lat.head()

Unnamed: 0,country_code,latitude,longitude,name
0,AE,23.424076,53.847818,United Arab Emirates
1,AF,33.93911,67.709953,Afghanistan
2,AG,17.060816,-61.796428,Antigua and Barbuda
3,AI,18.220554,-63.068615,Anguilla
4,AL,41.153332,20.168331,Albania


In [3]:
#  country code 
converting_country_code=pd.read_html("https://www.worldatlas.com/aatlas/ctycodes.htm",skiprows=1)[0]
converting_country_code.columns= ['name','country_code_2','country_code_3','x','y']
converting_country_code=converting_country_code.drop(['x', 'y'], axis=1)
converting_country_code.head()


Unnamed: 0,name,country_code_2,country_code_3
0,Afghanistan,AF,AFG
1,Albania,AL,ALB
2,Algeria,DZ,DZA
3,American Samoa,AS,ASM
4,Andorra,AD,AND


In [4]:
merged_col=pd.merge(converting_country_code, long_lat, left_on='country_code_2',right_on="country_code",how="inner")
merged_col.head()



Unnamed: 0,name_x,country_code_2,country_code_3,country_code,latitude,longitude,name_y
0,Afghanistan,AF,AFG,AF,33.93911,67.709953,Afghanistan
1,Albania,AL,ALB,AL,41.153332,20.168331,Albania
2,Algeria,DZ,DZA,DZ,28.033886,1.659626,Algeria
3,American Samoa,AS,ASM,AS,-14.270972,-170.132217,American Samoa
4,Angola,AO,AGO,AO,-11.202692,17.873887,Angola


In [5]:
# cleaning up the pandas dataframe
# dropping cols I dont need
merged_col=merged_col.drop(['name_y','country_code','country_code_2'], axis=1)
merged_col.head()


Unnamed: 0,name_x,country_code_3,latitude,longitude
0,Afghanistan,AFG,33.93911,67.709953
1,Albania,ALB,41.153332,20.168331
2,Algeria,DZA,28.033886,1.659626
3,American Samoa,ASM,-14.270972,-170.132217
4,Angola,AGO,-11.202692,17.873887


In [6]:
# renaming cols
merged_col.columns = ['country','country_code','latitude','longitude']

In [7]:
merged_col.head()


Unnamed: 0,country,country_code,latitude,longitude
0,Afghanistan,AFG,33.93911,67.709953
1,Albania,ALB,41.153332,20.168331
2,Algeria,DZA,28.033886,1.659626
3,American Samoa,ASM,-14.270972,-170.132217
4,Angola,AGO,-11.202692,17.873887


In [8]:
# Accessing WHO country list to narrow lat/longs to relevent names
country_r = requests.get("http://apps.who.int/gho/athena/api/COUNTRY?format=json").json()



In [9]:
# checking to see what needs to be added into the country_r to output the country_code 

country_r['dimension'][0]['code'][0]['label']


'AFG'

In [10]:
# Creating a list of all the country_codes I can get from the WHO website

who_country_list=[]

country_code=country_r['dimension'][0]['code']

for country in country_code:
    who_country_list.append(country["label"])

In [11]:
# converting the list into a dataframe and renaming the col. I did this 
# so I can merge this dataframe with the one we made earlier to make sure all the who countries are in the mergel_col
who_df=pd.DataFrame(who_country_list)
who_df.columns=["who_country"]
who_df.head()


Unnamed: 0,who_country
0,AFG
1,ALB
2,DZA
3,AND
4,AGO


In [12]:
# merging the two data frames together on country code. this output 226 
# countries which is sufficent for our data set
who_and_others=pd.merge(merged_col, who_df, left_on='country_code',right_on="who_country",how="inner")
who_and_others.head()


Unnamed: 0,country,country_code,latitude,longitude,who_country
0,Afghanistan,AFG,33.93911,67.709953,AFG
1,Albania,ALB,41.153332,20.168331,ALB
2,Algeria,DZA,28.033886,1.659626,DZA
3,American Samoa,ASM,-14.270972,-170.132217,ASM
4,Angola,AGO,-11.202692,17.873887,AGO


In [13]:
who_and_others=who_and_others.drop(['who_country'], axis=1)


In [14]:
who_and_others.head(1)
# who_and_others.shape

Unnamed: 0,country,country_code,latitude,longitude
0,Afghanistan,AFG,33.93911,67.709953


## Extracting the Disease data using API

In [15]:
## diseases calls

url_before = "http://apps.who.int/gho/athena/data/GHO/"
url_after = "?format=html&filter=COUNTRY:*"
malaria_code = "WHS3_48"
yel_fev_code = "WHS3_50"
leprosy_code = "WHS3_45"
code_list = ["WHS3_48", "WHS3_50", "WHS3_45"]
disease_list = ["Malaria", "Yellow Fever", "Leprosy"]

In [16]:
malaria_url = url_before + malaria_code + url_after
yel_fev_url = url_before + yel_fev_code + url_after
leprosy_url = url_before + leprosy_code + url_after

In [17]:
mal_raw = pd.read_html(malaria_url)[0]

In [18]:
yel_raw = pd.read_html(yel_fev_url)[0]

In [19]:
lep_raw = pd.read_html(leprosy_url)[0]

In [20]:
# display data to observe
display(mal_raw.head())
display(yel_raw.head())
display(lep_raw.head())

Unnamed: 0,GHO,PUBLISHSTATE,YEAR,REGION,COUNTRY,DISPLAY VALUE,NUMERIC VALUE,LOW RANGE,HIGH RANGE,Comment
0,Malaria - number of reported confirmed cases,Published,2011,Western Pacific,Cambodia,57423,57423.0,,,
1,Malaria - number of reported confirmed cases,Published,2005,Africa,Gabon,70644,70644.0,,,
2,Malaria - number of reported confirmed cases,Published,2017,Africa,Central African Republic,383309,383309.0,,,
3,Malaria - number of reported confirmed cases,Published,2014,Americas,Costa Rica,0,0.0,,,
4,Malaria - number of reported confirmed cases,Published,2008,Africa,Congo,117291,117291.0,,,


Unnamed: 0,GHO,PUBLISHSTATE,YEAR,REGION,WORLDBANKINCOMEGROUP,COUNTRY,DISPLAY VALUE,NUMERIC VALUE,LOW RANGE,HIGH RANGE,Comment
0,Yellow fever - number of reported cases,Published,2008,Europe,High-income,France,0,0.0,,,
1,Yellow fever - number of reported cases,Published,2004,Africa,Lower-middle-income,Zambia,0,0.0,,,
2,Yellow fever - number of reported cases,Published,2016,Americas,Lower-middle-income,Guatemala,0,0.0,,,
3,Yellow fever - number of reported cases,Published,2015,Africa,Lower-middle-income,Ghana,0,0.0,,,
4,Yellow fever - number of reported cases,Published,2004,Africa,Low-income,Guinea-Bissau,0,0.0,,,


Unnamed: 0,GHO,PUBLISHSTATE,YEAR,REGION,COUNTRY,DISPLAY VALUE,NUMERIC VALUE,LOW RANGE,HIGH RANGE,Comment
0,Number of new leprosy cases,Published,2006,,Kiribati,41,41.0,,,
1,Number of new leprosy cases,Published,2008,,Iraq,0,0.0,,,
2,Number of new leprosy cases,Published,2007,,Eswatini,No data,,,,
3,Number of new leprosy cases,Published,2014,,Guinea-Bissau,No data,,,,
4,Number of new leprosy cases,Published,2012,,Brazil,33303,33303.0,,,


In [41]:
mal_cols = mal_raw.loc[:, ['YEAR', 'COUNTRY', 'NUMERIC VALUE']].copy('deep')
mal_cols['DISEASE'] = 'Malaria'

In [42]:
yel_cols = yel_raw.loc[:, ['YEAR', 'COUNTRY', 'NUMERIC VALUE']].copy('deep')
yel_cols['DISEASE'] = 'Yellow Fever'

In [43]:
lep_cols = lep_raw.loc[:, ['YEAR', 'COUNTRY', 'NUMERIC VALUE']].copy('deep')
lep_cols['DISEASE'] = 'Leprosy'

In [24]:
# observe changes to data
display(mal_cols.head())
display(yel_cols.head())
display(lep_cols.head())

Unnamed: 0,YEAR,COUNTRY,NUMERIC VALUE,DISEASE
0,2011,Cambodia,57423.0,Malaria
1,2005,Gabon,70644.0,Malaria
2,2017,Central African Republic,383309.0,Malaria
3,2014,Costa Rica,0.0,Malaria
4,2008,Congo,117291.0,Malaria


Unnamed: 0,YEAR,COUNTRY,NUMERIC VALUE,DISEASE
0,2008,France,0.0,Yellow Fever
1,2004,Zambia,0.0,Yellow Fever
2,2016,Guatemala,0.0,Yellow Fever
3,2015,Ghana,0.0,Yellow Fever
4,2004,Guinea-Bissau,0.0,Yellow Fever


Unnamed: 0,YEAR,COUNTRY,NUMERIC VALUE,DISEASE
0,2006,Kiribati,41.0,Leprosy
1,2008,Iraq,0.0,Leprosy
2,2007,Eswatini,,Leprosy
3,2014,Guinea-Bissau,,Leprosy
4,2012,Brazil,33303.0,Leprosy


In [44]:
dfs = [mal_cols, yel_cols, lep_cols]

In [45]:
disease_df = pd.concat(dfs)

In [46]:
disease_df.columns = ['year', 'country', 'instances', 'disease']
disease_df.head()

Unnamed: 0,year,country,instances,disease
0,2011,Cambodia,57423.0,Malaria
1,2005,Gabon,70644.0,Malaria
2,2017,Central African Republic,383309.0,Malaria
3,2014,Costa Rica,0.0,Malaria
4,2008,Congo,117291.0,Malaria


In [29]:
disease_df["instances"]=disease_df["instances"].fillna(0)
disease_df.count()

year         6828
country      6828
disease      6828
instances    6828
dtype: int64

# Temperature data from NOAA


In [30]:
temp_data_url="https://www.ncdc.noaa.gov/cag/global/time-series/globe/land_ocean/12/9/1980-2019"

In [31]:
temp_data_df=pd.read_html(temp_data_url)[0]
temp_data_df=temp_data_df.drop(['Rank'], axis=1)

In [32]:
temp_data_df.head(6)

Unnamed: 0,Year,Anomaly(1901-2000 Base Period)
0,1980,0.29°C
1,1981,0.30°C
2,1982,0.19°C
3,1983,0.33°C
4,1984,0.22°C
5,1985,0.10°C


In [33]:
temp_data_df.loc[:,"Anomaly(1901-2000 Base Period)"] = temp_data_df["Anomaly(1901-2000 Base Period)"].str.replace("°C","")
temp_data_df["Anomaly(1901-2000 Base Period)"] = pd.to_numeric(temp_data_df["Anomaly(1901-2000 Base Period)"])


In [34]:
temp_data_df["Normalized Temp"]=temp_data_df["Anomaly(1901-2000 Base Period)"]-temp_data_df["Anomaly(1901-2000 Base Period)"].min()

In [35]:
temp_data_df.head(6)

Unnamed: 0,Year,Anomaly(1901-2000 Base Period),Normalized Temp
0,1980,0.29,0.19
1,1981,0.3,0.2
2,1982,0.19,0.09
3,1983,0.33,0.23
4,1984,0.22,0.12
5,1985,0.1,0.0


In [55]:
# # initiate mongo connection
client = MongoClient()

In [58]:
db = client['disease_db']

In [59]:
collection = db.disease_collection


In [37]:
# make df with latitudes and disease instance information
disease_lat = pd.merge(disease_df, who_and_others[['country', 'latitude']], on='country')
disease_lat.head()

Unnamed: 0,year,country,disease,instances,latitude
0,2011,Cambodia,Malaria,57423.0,12.565679
1,2002,Cambodia,Malaria,46902.0,12.565679
2,2000,Cambodia,Malaria,62442.0,12.565679
3,2009,Cambodia,Malaria,64595.0,12.565679
4,2015,Cambodia,Malaria,33930.0,12.565679


In [38]:
# make dict for JSON format
years = list(temp_data_df['Year'])

In [39]:
# make dict of one list to upload to mongo
JSON = {
    'data':[]
}
# up to 2016 to match yellow fever data
for year in years[:-2]:
    # initialize dict to add info for each year
    document = {}
    # add temp
    document['temp'] = temp_data_df.loc[
        temp_data_df['Year'] == year, 
        ['Normalized Temp']
    ].values[0][0]
    # add year
    document['year'] = year
    # add disease list, to which disease dicts will be added
    document['disease'] = []
    # iterate over diseases in the dataframe
    for disease in disease_lat['disease'].unique():
        disease_dict = {}
        # check that there is info for a given disease for that year
        if not disease_lat.loc[(disease_lat['disease'] == disease) &
                            (disease_lat['year'] == year) &
                            (disease_lat['instances'] > 2), 
                            :].empty:
            # if there is info, query min and max latitude of spread for that year
            try:
                max_lat = disease_lat.loc[(disease_lat['disease'] == disease) &
                                      (disease_lat['year'] == year) &
                                      (disease_lat['instances'] > 2), 
                                      :]['latitude'].max()
                min_lat = disease_lat.loc[(disease_lat['disease'] == disease) &
                                      (disease_lat['year'] == year) &
                                      (disease_lat['instances'] > 2), 
                                      :]['latitude'].min()
                disease_dict['name'] = disease
                disease_dict['max_lat'] = max_lat
                disease_dict['min_lat'] = min_lat
                # add disease info to list for that year
                document['disease'].append(disease_dict)
            except:
                pass
    # add each year's dictionary to overall list
    JSON['data'].append(document)


In [40]:
JSON

{'data': [{'temp': 0.18999999999999997,
   'year': 1980,
   'disease': [{'name': 'Yellow Fever',
     'max_lat': 9.081999,
     'min_lat': -14.235004}]},
  {'temp': 0.19999999999999998,
   'year': 1981,
   'disease': [{'name': 'Yellow Fever',
     'max_lat': 14.497401000000002,
     'min_lat': -14.235004}]},
  {'temp': 0.09,
   'year': 1982,
   'disease': [{'name': 'Yellow Fever',
     'max_lat': 7.946527000000001,
     'min_lat': -14.235004}]},
  {'temp': 0.23,
   'year': 1983,
   'disease': [{'name': 'Yellow Fever',
     'max_lat': 12.238333,
     'min_lat': -14.235004}]},
  {'temp': 0.12,
   'year': 1984,
   'disease': [{'name': 'Yellow Fever',
     'max_lat': 12.238333,
     'min_lat': -14.235004}]},
  {'temp': 0.0,
   'year': 1985,
   'disease': [{'name': 'Yellow Fever',
     'max_lat': 12.238333,
     'min_lat': -14.235004}]},
  {'temp': 0.12,
   'year': 1986,
   'disease': [{'name': 'Yellow Fever',
     'max_lat': 9.081999,
     'min_lat': -14.235004}]},
  {'temp': 0.22,
   'yea

In [176]:
collection.update({}, JSON, upsert=True)

  """Entry point for launching an IPython kernel.


{'n': 1, 'nModified': 1, 'ok': 1.0, 'updatedExisting': True}

In [182]:
collection.find()[0]

{'_id': ObjectId('5ca564b109db5618be288b0c'),
 'data': [{'temp': 0.18999999999999997,
   'year': 1980,
   'disease': [{'name': 'Yellow Fever',
     'max_lat': 9.081999,
     'min_lat': -14.235004}]},
  {'temp': 0.19999999999999998,
   'year': 1981,
   'disease': [{'name': 'Yellow Fever',
     'max_lat': 14.497401000000002,
     'min_lat': -14.235004}]},
  {'temp': 0.09,
   'year': 1982,
   'disease': [{'name': 'Yellow Fever',
     'max_lat': 7.946527000000001,
     'min_lat': -14.235004}]},
  {'temp': 0.23,
   'year': 1983,
   'disease': [{'name': 'Yellow Fever',
     'max_lat': 12.238333,
     'min_lat': -14.235004}]},
  {'temp': 0.12,
   'year': 1984,
   'disease': [{'name': 'Yellow Fever',
     'max_lat': 12.238333,
     'min_lat': -14.235004}]},
  {'temp': 0.0,
   'year': 1985,
   'disease': [{'name': 'Yellow Fever',
     'max_lat': 12.238333,
     'min_lat': -14.235004}]},
  {'temp': 0.12,
   'year': 1986,
   'disease': [{'name': 'Yellow Fever',
     'max_lat': 9.081999,
     'min