## Visualization Portfolio Project

##### This is a mini visualization project that combines web automation and data visualization, using key Python 3 packages, to produce an up-to-date display of all confirmed COVID-19 cases in Ontario by location.

##### Web Automation
This step fetches the latest data from the data.ontario.ca site.

In [1]:
# Very basic web automation: download the latest case data with the requests library.

import requests

url = 'https://data.ontario.ca/dataset/f4112442-bdc8-45d2-be3c-12efae72fb27/resource/455fd63b-603d-4608-8216-7d8647f43350/download/conposcovidloc.csv'
# Bound the wait so a stalled connection cannot hang the notebook indefinitely.
myfile = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of saving an error page as if it were the dataset.
myfile.raise_for_status()
# The context manager closes the file handle even if the write fails part-way.
with open('data/conposcovidloc.csv', 'wb') as csv_file:
    csv_file.write(myfile.content)

3808434

##### Data Analysis (Cleaning and Exploration)

In [2]:
import numpy as np
import pandas as pd
import googlemaps
import geopandas as gpd
import seaborn
from shapely.geometry import Point, Polygon

In [3]:
# Read the municipal polygon shapefile and order the rows alphabetically by
# official name (OFF_NAME), then preview the first rows.
Counties = (
    gpd.read_file('data/FME_b03600530703068/obm_OBMUSER_Municipal_upper_tier_poly.shp')
    .sort_values('OFF_NAME')
)
Counties.head(3)

Unnamed: 0,GEODB_OID,OBJECTID,OBJECT_ID,DESCR,GUT_NUMBER,OFF_NAME,LOC_NAME,SHAPE_AREA,SHAPE_LEN,geometry
6,7,7,57438.0,"Municipal, Upper Tier",1293.0,ALGOMA,ALGOMA,6.259083,18.455851,"POLYGON ((-83.39369 46.23813, -83.40462 46.240..."
42,43,43,57460.0,"Municipal, Upper Tier",1054.0,BRANT,BRANT,0.123686,1.698624,"POLYGON ((-80.06115 42.99635, -80.06309 42.993..."
20,21,21,57468.0,"Municipal, Upper Tier",1054.0,BRUCE,BRUCE,0.47927,5.444395,"POLYGON ((-81.10314 44.45166, -81.09335 44.410..."


In [4]:
covid_19 = pd.read_csv('data/conposcovidloc.csv')

# One-hot encode the categorical columns (one indicator column per category).
# dtype=float is requested explicitly: recent pandas versions return boolean
# dummies by default, and replace(0, np.nan) leaves boolean columns untouched,
# which would make the later groupby(...).count() count every row instead of
# only the rows that belong to each category.
cv0 = pd.get_dummies(
    covid_19,
    columns=['Outcome1', 'Client_Gender', 'Age_Group'],
    dtype=float,
)
# Drop the first column of the file (selected by position, not by name).
cv0 = cv0.drop(cv0.columns[0], axis=1)
# Turn every 0 into NaN so that .count() only tallies the 1s created above.
cv0 = cv0.replace(0, np.nan)
cv0.head(3)

Unnamed: 0,Accurate_Episode_Date,Case_AcquisitionInfo,Reporting_PHU,Reporting_PHU_Address,Reporting_PHU_City,Reporting_PHU_Postal_Code,Reporting_PHU_Website,Reporting_PHU_Latitude,Reporting_PHU_Longitude,Outcome1_Fatal,...,Age_Group_20s,Age_Group_30s,Age_Group_40s,Age_Group_50s,Age_Group_60s,Age_Group_70s,Age_Group_80s,Age_Group_90s,Age_Group_<20,Age_Group_Unknown
0,2020-04-29,Information pending,Peel Public Health,7120 Hurontario Street,Mississauga,L5W 1N4,www.peelregion.ca/health/,43.647471,-79.708893,,...,,,,1.0,,,,,,
1,2020-04-12,Contact of a confirmed case,Ottawa Public Health,100 Constellation Drive,Ottawa,K2G 6J8,www.ottawapublichealth.ca,45.345665,-75.763912,,...,,1.0,,,,,,,,
2,2020-04-08,Neither,Ottawa Public Health,100 Constellation Drive,Ottawa,K2G 6J8,www.ottawapublichealth.ca,45.345665,-75.763912,,...,,1.0,,,,,,,,


In [5]:
# Sanity check: count, per column, the cells that are still equal to 0.
# Every entry of the result should be 0.
zero_cells = cv0.isin([0])
zero_cells.sum()

Accurate_Episode_Date        0
Case_AcquisitionInfo         0
Reporting_PHU                0
Reporting_PHU_Address        0
Reporting_PHU_City           0
Reporting_PHU_Postal_Code    0
Reporting_PHU_Website        0
Reporting_PHU_Latitude       0
Reporting_PHU_Longitude      0
Outcome1_Fatal               0
Outcome1_Not Resolved        0
Outcome1_Resolved            0
Client_Gender_(blank)        0
Client_Gender_FEMALE         0
Client_Gender_MALE           0
Client_Gender_OTHER          0
Client_Gender_TRANSGENDER    0
Client_Gender_UNKNOWN        0
Age_Group_20s                0
Age_Group_30s                0
Age_Group_40s                0
Age_Group_50s                0
Age_Group_60s                0
Age_Group_70s                0
Age_Group_80s                0
Age_Group_90s                0
Age_Group_<20                0
Age_Group_Unknown            0
dtype: int64

In [6]:
# Because the zeros were converted to NaN, .count() (which skips NaN) applied
# after .groupby() tallies only the 1s that .get_dummies created for each
# categorical value, giving per-health-unit counts.

group_keys = ['Reporting_PHU', 'Reporting_PHU_Latitude', 'Reporting_PHU_Longitude', 'Reporting_PHU_City']
unused_columns = ['Accurate_Episode_Date', 'Case_AcquisitionInfo', 'Reporting_PHU_Postal_Code', 'Reporting_PHU_Website']

cv1 = (
    cv0.groupby(group_keys)
    .count()
    .reset_index()
    .drop(unused_columns, axis=1)
    # Remove the space from the column name so it is easier to reference later.
    .rename(columns={'Outcome1_Not Resolved': 'Outcome1_Not_Resolved'})
)
cv1.head()

Unnamed: 0,Reporting_PHU,Reporting_PHU_Latitude,Reporting_PHU_Longitude,Reporting_PHU_City,Reporting_PHU_Address,Outcome1_Fatal,Outcome1_Not_Resolved,Outcome1_Resolved,Client_Gender_(blank),Client_Gender_FEMALE,...,Age_Group_20s,Age_Group_30s,Age_Group_40s,Age_Group_50s,Age_Group_60s,Age_Group_70s,Age_Group_80s,Age_Group_90s,Age_Group_<20,Age_Group_Unknown
0,Algoma Public Health Unit,46.532373,-84.314836,Sault Ste. Marie,16,0,3,13,0,9,...,2,1,3,3,4,2,0,1,0,0
1,Brant County Health Unit,43.151811,-80.274374,Brantford,108,4,9,95,0,62,...,20,9,21,22,10,9,6,3,8,0
2,Chatham-Kent Health Unit,42.403861,-82.208561,Chatham,90,1,6,83,0,26,...,23,25,19,9,11,2,1,0,0,0
3,Durham Region Health Department,43.898605,-78.940341,Whitby,1155,145,207,803,0,732,...,99,107,131,215,140,110,175,160,18,0
4,Eastern Ontario Health Unit,45.029152,-74.736298,Cornwall,131,7,32,92,0,68,...,9,18,13,25,28,13,11,8,5,1


In [7]:
# Pass the relevant health-unit city names back into the Counties GeoDataFrame.
# city_order holds one cv1 row label per Counties row, in Counties' current
# row order; the same label may appear more than once.

city_order = [
    0, 1, 5, 22, 31, 3, 26, 32, 12, 5,
    6, 7, 8, 9, 10, 11, 12, 18, 2, 13,
    14, 14, 12, 27, 30, 15, 25, 16, 17, 7,
    19, 26, 17, 20, 11, 21, 4, 10, 18, 24,
    25, 4, 27, 27, 28, 29, 7, 23, 31, 33,
]
county_city = [cv1['Reporting_PHU_City'][row_label] for row_label in city_order]

# Assigning a plain list is positional, so the k-th city goes to the k-th row.
Counties['CITY'] = county_city

Counties.head(3)

Unnamed: 0,GEODB_OID,OBJECTID,OBJECT_ID,DESCR,GUT_NUMBER,OFF_NAME,LOC_NAME,SHAPE_AREA,SHAPE_LEN,geometry,CITY
6,7,7,57438.0,"Municipal, Upper Tier",1293.0,ALGOMA,ALGOMA,6.259083,18.455851,"POLYGON ((-83.39369 46.23813, -83.40462 46.240...",Sault Ste. Marie
42,43,43,57460.0,"Municipal, Upper Tier",1054.0,BRANT,BRANT,0.123686,1.698624,"POLYGON ((-80.06115 42.99635, -80.06309 42.993...",Brantford
20,21,21,57468.0,"Municipal, Upper Tier",1054.0,BRUCE,BRUCE,0.47927,5.444395,"POLYGON ((-81.10314 44.45166, -81.09335 44.410...",Owen Sound


In [8]:
import whratio

# Total cases per health unit = resolved + fatal + not-yet-resolved.
cv1['TOTAL_CASES'] = cv1['Outcome1_Resolved'] + cv1['Outcome1_Fatal'] + cv1['Outcome1_Not_Resolved']
cv1['PERCENTAGE_Recovered'] = (cv1['Outcome1_Resolved'] / cv1['TOTAL_CASES']) * 100

# Male:female ratio as a presentable "m:f" string. whratio.as_int returns a
# tuple, which is unpacked straight into the format string. Iterating over the
# two columns (instead of a hard-coded range(34)) keeps this correct however
# many health units the freshly downloaded data contains.
cv1['GENDER_RATIO1'] = [
    '{0}:{1}'.format(*whratio.as_int(male_count, female_count))
    for male_count, female_count in zip(cv1['Client_Gender_MALE'], cv1['Client_Gender_FEMALE'])
]

cv1.head()

Unnamed: 0,Reporting_PHU,Reporting_PHU_Latitude,Reporting_PHU_Longitude,Reporting_PHU_City,Reporting_PHU_Address,Outcome1_Fatal,Outcome1_Not_Resolved,Outcome1_Resolved,Client_Gender_(blank),Client_Gender_FEMALE,...,Age_Group_50s,Age_Group_60s,Age_Group_70s,Age_Group_80s,Age_Group_90s,Age_Group_<20,Age_Group_Unknown,TOTAL_CASES,PERCENTAGE_Recovered,GENDER_RATIO1
0,Algoma Public Health Unit,46.532373,-84.314836,Sault Ste. Marie,16,0,3,13,0,9,...,3,4,2,0,1,0,0,16,81.25,7:9
1,Brant County Health Unit,43.151811,-80.274374,Brantford,108,4,9,95,0,62,...,22,10,9,6,3,8,0,108,87.962963,23:31
2,Chatham-Kent Health Unit,42.403861,-82.208561,Chatham,90,1,6,83,0,26,...,9,11,2,1,0,0,0,90,92.222222,32:13
3,Durham Region Health Department,43.898605,-78.940341,Whitby,1155,145,207,803,0,732,...,215,140,110,175,160,18,0,1155,69.52381,141:244
4,Eastern Ontario Health Unit,45.029152,-74.736298,Cornwall,131,7,32,92,0,68,...,25,28,13,11,8,5,1,131,70.229008,63:68


In [9]:
# Point takes (x, y), so longitude must always come before latitude.
geometry = [
    Point(longitude, latitude)
    for longitude, latitude in zip(cv1['Reporting_PHU_Longitude'], cv1['Reporting_PHU_Latitude'])
]
# Wrap the per-health-unit table in a GeoDataFrame and keep a CSV copy on disk.
geo_cv = gpd.GeoDataFrame(cv1, geometry=geometry)
geo_cv.to_csv('data/geo_cv.csv')

geo_cv.head(3)

Unnamed: 0,Reporting_PHU,Reporting_PHU_Latitude,Reporting_PHU_Longitude,Reporting_PHU_City,Reporting_PHU_Address,Outcome1_Fatal,Outcome1_Not_Resolved,Outcome1_Resolved,Client_Gender_(blank),Client_Gender_FEMALE,...,Age_Group_60s,Age_Group_70s,Age_Group_80s,Age_Group_90s,Age_Group_<20,Age_Group_Unknown,TOTAL_CASES,PERCENTAGE_Recovered,GENDER_RATIO1,geometry
0,Algoma Public Health Unit,46.532373,-84.314836,Sault Ste. Marie,16,0,3,13,0,9,...,4,2,0,1,0,0,16,81.25,7:9,POINT (-84.31484 46.53237)
1,Brant County Health Unit,43.151811,-80.274374,Brantford,108,4,9,95,0,62,...,10,9,6,3,8,0,108,87.962963,23:31,POINT (-80.27437 43.15181)
2,Chatham-Kent Health Unit,42.403861,-82.208561,Chatham,90,1,6,83,0,26,...,11,2,1,0,0,0,90,92.222222,32:13,POINT (-82.20856 42.40386)


##### **Feature Engineering**
Here, some basic feature engineering is performed to strengthen the visualization later on.

In [10]:
# Look up, for every Counties row, the recovery percentage and the total case
# count of the health-unit row that city_order assigns to it.
percentages = [geo_cv['PERCENTAGE_Recovered'][row_label] for row_label in city_order]
amounts = [geo_cv['TOTAL_CASES'][row_label] for row_label in city_order]


def _log_of_positive(values):
    """Return the natural log of ``values``, with NaN wherever a value is not > 0.

    A plain np.log would turn a 0 into -inf, which would then become the
    minimum of the column and distort any colour scale built from it.
    """
    return np.log(values.where(values > 0))


# reset_index(drop=True) renumbers the rows 0..n-1 without adding an 'index'
# column, and (unlike the in-place variant) gives the same result on a re-run.
Counties = Counties.reset_index(drop=True)
Counties['PERCENTAGE_Recovered'] = percentages
Counties['PERCENTAGE_Recovered_log'] = _log_of_positive(Counties['PERCENTAGE_Recovered'])
Counties['TOTAL_CASES'] = amounts
# percentage recovered * total cases / 100, i.e. the number of resolved cases.
Counties['PERCENTAGE_TOTAL'] = Counties['PERCENTAGE_Recovered'] * Counties['TOTAL_CASES'] / 100
Counties['PERCENTAGE_TOTAL_log'] = _log_of_positive(Counties['PERCENTAGE_TOTAL'])
# errors='ignore' keeps the cell re-runnable once the ID columns are already gone.
Counties = Counties.drop(columns=['GEODB_OID', 'OBJECTID', 'OBJECT_ID'], errors='ignore')
Counties.head(3)

Unnamed: 0,DESCR,GUT_NUMBER,OFF_NAME,LOC_NAME,SHAPE_AREA,SHAPE_LEN,geometry,CITY,PERCENTAGE_Recovered,PERCENTAGE_Recovered_log,TOTAL_CASES,PERCENTAGE_TOTAL,PERCENTAGE_TOTAL_log
0,"Municipal, Upper Tier",1293.0,ALGOMA,ALGOMA,6.259083,18.455851,"POLYGON ((-83.39369 46.23813, -83.40462 46.240...",Sault Ste. Marie,81.25,4.397531,16,13.0,2.564949
1,"Municipal, Upper Tier",1054.0,BRANT,BRANT,0.123686,1.698624,"POLYGON ((-80.06115 42.99635, -80.06309 42.993...",Brantford,87.962963,4.476916,108,95.0,4.553877
2,"Municipal, Upper Tier",1054.0,BRUCE,BRUCE,0.47927,5.444395,"POLYGON ((-81.10314 44.45166, -81.09335 44.410...",Owen Sound,90.47619,4.505087,84,76.0,4.330733


##### Data Visualization with GoogleMaps API and BOKEH

In [11]:
import json
from bokeh.io import output_notebook, show, output_file
from bokeh.plotting import figure
from bokeh.models import GeoJSONDataSource, AjaxDataSource,ColumnDataSource, LinearColorMapper, ColorBar, Label, HoverTool
from bokeh.models import Range1d, Cross, NumeralTickFormatter, ColorMapper, Slider
from bokeh.models.glyphs import MultiLine
from bokeh.palettes import brewer, mpl

import os

# Credentials must never be stored in the notebook. The key is read from the
# GOOGLE_MAPS_API_KEY environment variable instead.
# NOTE(review): the key that was previously committed here should be revoked.
api_key = os.environ.get('GOOGLE_MAPS_API_KEY')
# The client is only created when a key is configured, so the rest of the
# notebook (which builds the Bokeh sources below) still runs without one.
gmaps = googlemaps.Client(key=api_key) if api_key else None

# County polygons: GeoDataFrame -> parsed GeoJSON -> JSON string.
Counties_json = json.loads(Counties.to_json())
json_data = json.dumps(Counties_json)

# GeoJSON source holding the county polygons for plotting.
geosource = GeoJSONDataSource(geojson=json_data)

# Health-unit points: GeoDataFrame -> parsed GeoJSON -> JSON string.
geo_csv_json = json.loads(geo_cv.to_json())
json_data2 = json.dumps(geo_csv_json)

# GeoJSON source holding the health-unit points for plotting.
geosource2 = GeoJSONDataSource(geojson=json_data2)

In [17]:

fig = figure(title = 'Ontario Covid 19 Cases by Municipality', plot_height = 750 , plot_width = 900
            ,active_scroll='wheel_zoom')
fig.xgrid.grid_line_color = None
fig.ygrid.grid_line_color = None

# Initialize the view on southern and eastern Ontario.
left, right, bottom, top = -83.5, -74.0, 41.5, 47.0
fig.x_range = Range1d(left, right)
fig.y_range = Range1d(bottom, top)

# Colour map for the county polygons: log of PERCENTAGE_TOTAL
# (recovery percentage * total cases / 100).
palette1 = brewer['RdYlGn'][11]
cmap1 = LinearColorMapper(palette = palette1,
                          low = Counties['PERCENTAGE_TOTAL_log'].min(),
                          high = Counties['PERCENTAGE_TOTAL_log'].max())

# Linear colour map over raw case totals per health unit. It is kept available
# but is not attached to any glyph or legend in this figure.
cmap2 = LinearColorMapper(palette = palette1,
                          low = geo_cv['TOTAL_CASES'].min(),
                          high = geo_cv['TOTAL_CASES'].max())

# Add the county polygons, filled according to cmap1.
f2 = fig.patches('xs', 'ys', source = geosource, line_color = 'black', line_width = 0.25,
                 line_alpha = 0.9, fill_alpha = 0.6,
                 fill_color = {'field': 'PERCENTAGE_TOTAL_log', 'transform': cmap1})

# The colour bar must use the same mapper as the polygons it explains;
# binding it to a different mapper would label the colours with the wrong scale.
fig.add_layout(ColorBar(color_mapper = cmap1, location = 'bottom_right', label_standoff = 10,
                        title = 'log(resolved cases)'))

hover2 = HoverTool(renderers = [f2], tooltips = [('Municipality Name', '@OFF_NAME')])
fig.add_tools(hover2)

# Add one marker per public health unit, with its own hover details.
f1 = fig.circle(x = 'Reporting_PHU_Longitude', y = 'Reporting_PHU_Latitude', size = 10,
                color = 'black', line_color = 'white', source = geosource2, fill_alpha = 0.55)
hover1 = HoverTool(renderers = [f1], tooltips = [('Health Unit', '@Reporting_PHU'),
                                                 ('Health Unit City', '@Reporting_PHU_City'),
                                                 ('Fatalities', '@Outcome1_Fatal'),
                                                 ('Percentage Recovered', '@PERCENTAGE_Recovered'),
                                                 ('Total Cases', '@TOTAL_CASES'),
                                                 ('Male to Female Ratio', '@GENDER_RATIO1')])

fig.add_tools(hover1)

# Render Bokeh output inline in the Jupyter notebook.
output_notebook()

# Display the figure.
show(fig)