# Visualization and data analysis of output indicators 

This notebook presents data visualization and analysis for output indicators from the Global indicator project.   
The analyses consist of two major components:  
   1. Within-city variations
    - Show maps of walkability indicators for all cities and do a visual sanity check to see if any issue occurs
    - Interpret the within-city variation patterns
    - Pick one or two cities as examples, plot and compare different indicators, and interpret the within-city variations and whether or not they represent the real-world situation

   2. Between-city analysis
    - Show tables of measurements and raw indicator values, rank cities from the highest to the lowest, and interpret the results
    - Plot in a world map using graduated symbol or color to visualize and compare indicators across cities
    - Create box plot to compare median statistics across cities
    - We could perhaps do an analysis similar to the policy indicators analysis, color-coding cities based on the Lancet study thresholds?
    

**Note: Refer to the [workflow documentation](https://github.com/gboeing/global-indicators/blob/master/documentation/workflow.md) for indicators tables and description**
    
    

In [None]:
import json
import os
import sys
import time
from functools import partial
from multiprocessing import Pool, cpu_count, Value, Manager, Process

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import osmnx as ox
import pandas as pd
import seaborn as sns
from shapely.geometry import Point

%matplotlib inline

In [None]:
# get the work directory
dirname = os.path.abspath('../process/')

# The configuration file lives in the "configuration" folder next to the
# processing scripts. A notebook kernel is not started with user-supplied
# command-line arguments (sys.argv[1] is normally one of the kernel launcher's
# own flags), so the file name is set explicitly instead of read from sys.argv.
jsonFile = 'configuration/cities.json'

In [None]:
# read json file
# Same file as before ("<dirname>/configuration/cities.json"), but built from
# `dirname` and stored in jsonPath so the path that is opened is the one shown.
jsonPath = os.path.join(dirname, 'configuration', 'cities.json')
try:
    with open(jsonPath) as json_file:
        config = json.load(json_file)
except Exception:
    # Every cell below needs `config`. Re-raise so the notebook stops here with
    # the real cause instead of failing on the next line with a NameError.
    print('Failed to read json file.')
    raise

folder = config['folder']
input_folder = config['input_folder']

# read city names from json
cities = list(config["gpkgNames"].keys())
print("Cities:{}".format(cities))

# the path of "global_indicators_hex_250m.gpkg"
gpkgOutput_hex250 = os.path.join(dirname, folder,
                                 config['output_hex_250m'])

# create the path of "global_indicators_city.gpkg"
gpkgOutput_cities = os.path.join(dirname, folder,
                                 config['global_indicators_city'])

# 1. Within-city variations

## All cities hex-level walkability maps

In [None]:
col = 'all_cities_walkability'

# Size the grid from the number of cities. A fixed 6x4 grid silently drops every
# city after the 24th (zip stops at the shorter input) and leaves empty framed
# panels when there are fewer cities than panels.
ncols = 4
nrows = max(1, -(-len(cities) // ncols))  # ceiling division
# 20/6 inches per row keeps the original 20x20 figure when there are 6 rows
fig, axes = plt.subplots(nrows=nrows, ncols=ncols,
                         figsize=(20, 20 * nrows / 6), squeeze=False)
panels = axes.flatten()

for ax, city in zip(panels, cities):
    # read file
    hex250 = gpd.read_file(gpkgOutput_hex250, layer=city)
    city_bound = gpd.read_file(gpkgOutput_cities, layer=city)

    # plot indicators, then the city boundary on top (both draw onto `ax`)
    hex250.plot(ax=ax, column=col, scheme='NaturalBreaks', k=6, cmap='inferno_r', edgecolor='none')
    city_bound.plot(ax=ax, color='none', edgecolor='blue')

    ax.set_title(city, fontsize=12)
    ax.set_axis_off()

# hide the panels that did not receive a city
for ax in panels[len(cities):]:
    ax.set_axis_off()

# add a title to the figure
fig.suptitle('Walkability Index', y=0.95, fontsize=20, weight='bold')
#fig.text(0.1, 0, 'Note: Walkability index relative to all cities - sum of the z-scores of pop and intersection density, and daily living generated at the hex level; ranked by median', 
  #       size=12, color='#555555')


#fig.savefig('figure/walkability_maps_popunweighted.png', dpi=600)


plt.show()

## All cities hex-level indicators boxplot

In [None]:
# concat all hexes for all cities in one big dataframe
all_hexes=pd.DataFrame()

for city in cities:
    #read file
    hex250 = gpd.read_file(gpkgOutput_hex250, layer=city)
    hex250['city'] = city
    # append all hexes into one dataframe for analysis
    all_hexes = pd.concat([all_hexes, hex250], ignore_index=True)

len(all_hexes)

In [None]:
# list the columns available in the combined hex-level frame
all_hexes.columns

In [None]:
col = 'all_cities_walkability'
note = 'Note: Walkability index relative to all cities - sum of the z-scores of population and intersection density, and daily living generated at the hex level; ranked by median \n'
title = 'Walkability Index  \n (place-based estimates, unweighted by population)'
fig_path = 'figure/walkability_popunweighted.png'


In [None]:
col =  'local_daily_living'
note = 'Note: Daily living scores - sum of the binary accessibility to all daily living destinations (i.e. supermarket, convience, and public transport); ranked by median \n'
title = 'Daily Living Scores'
fig_path = 'figure/dailyliving.png'


In [None]:
col =  'local_nh_population_density'
note = 'Note: local_nh_population_density - local neighbourhood population per square kilometre at the hex level; ranked by median \n'
title = 'Local population Density'
fig_path = 'figure/population.png'


In [None]:
col =  'local_nh_intersection_density'
note = 'Note: local_nh_intersection_density - local neighbourhood intersections per square kilometre at the hex level; ranked by median \n'
title = 'Local Intersection Density'
fig_path = 'figure/intersection.png'


In [None]:
# library & dataset
import seaborn as sns


#order by median
median_order = all_hexes.groupby(by=['study_region'])[col].median().sort_values().index

fig, ax = plt.subplots(figsize=(20, 10))

# Just switch x and y
ax=sns.boxplot(ax=ax, y=all_hexes['study_region'], x=all_hexes[col], order=median_order, palette="Blues", width=0.6)
ax.tick_params(axis='both', which='major', labelsize=14)
ax.set_xlabel(col, fontsize=16)
ax.set_ylabel('Study region', fontsize=16)
#ax.set(xlim=(all_hexes[col].min(), all_hexes[col].max()))

# add a title to the figure
fig.suptitle(title, y=0.95, fontsize=20, weight='bold')

fig.text(0.1, 0, note, 
         fontsize=12, color='#555555')

#fig.savefig(fig_path, dpi=600)

plt.show()

## All cities pop-weighted hex-level indicators boxplot (work-in-progress)

Pop-weighted hex-level indicators are not currently in our processing scripts, but we may want to include them for consistency with the city-level indicators.

In [None]:
import setup_config as sc

def aggregation_hexes_pop_weighted(input_gdf, out_gdf, fieldNames):
    """
    Weight hexagon-level indicators by each hexagon's share of population.

    For every pair in fieldNames, the target column of out_gdf is set, row by
    row, to  population * source value / total population, where the population
    column is the one named by sc.cities_parameters["pop_est"]. The result is a
    per-hexagon contribution; nothing is summed to the city level here.

    Parameters
    ----------
    input_gdf : data frame
        holds the population column and the source indicator columns
    out_gdf : data frame
        receives the weighted columns (modified in place)
    fieldNames : iterable of pairs
        item [0] is the source column in input_gdf, item [1] the column
        written to out_gdf

    Returns
    -------
    out_gdf, with one added or overwritten column per pair
    """
    for pair in fieldNames:
        source, target = pair[0], pair[1]
        population = input_gdf[sc.cities_parameters["pop_est"]]
        # numerator: population times indicator, per hexagon
        weighted = population * input_gdf[source]
        # denominator: total population over all rows of input_gdf
        out_gdf[target] = weighted / population.sum()
    return out_gdf


def calc_hexes_pop_pct_indicators(gpkg_hex_250m, city, gpkg_input, gpkg_output):
    """
    Build population-weighted hex-level indicators for one city.

    Reads the `city` layer from gpkg_hex_250m, left-joins onto it (by the
    "index" column) the hexagon layer named by sc.cities_parameters["hex250"]
    from gpkg_input, and adds the weighted columns produced by
    aggregation_hexes_pop_weighted. Columns of the input layer whose names
    clash with the processed layer get an "_origin" suffix.

    Parameters
    ----------
    gpkg_hex_250m : path of the geopackage with the processed hex indicators
    city : layer name to read from gpkg_hex_250m
    gpkg_input : path of the city's input geopackage
    gpkg_output : path of the output geopackage; currently unused because the
        write step below is commented out

    Returns
    -------
    the joined hex data frame including the weighted columns
    """
    processed_hex = gpd.read_file(gpkg_hex_250m, layer=city)
    original_hex = gpd.read_file(gpkg_input, layer=sc.cities_parameters["hex250"])

    # join pop_est from original hex to processed hex
    processed_hex = processed_hex.join(
        original_hex.set_index("index"),
        on="index",
        how="left",
        rsuffix="_origin",
    )

    # pair each hex-level source field with the name of its weighted column
    # NOTE(review): the two slices are assumed to line up one-to-one; zip
    # silently drops trailing names if their lengths differ - confirm in setup_config
    field_pairs = list(zip(sc.hex_fieldNames[3:-1], sc.city_fieldNames[2:-1]))

    # the same frame is both the source and the destination of the new columns
    weighted_hex = aggregation_hexes_pop_weighted(processed_hex, processed_hex, field_pairs)

    #gdf_hexes.to_file(gpkg_output, layer=city, driver="GPKG")
    return weighted_hex

In [None]:
# read pre-prepared sample point stats of each city from disk
# full path of each city's input geopackage, in the order listed in the config
gpkgInput_ori = [
    os.path.join(dirname, input_folder, gpkg)
    for gpkg in config["gpkgNames"].values()
]

In [None]:
cities = list(config["gpkgNames"].keys())

# Pair each city with its input geopackage by position (gpkgInput_ori follows
# the order of config["gpkgNames"]) and compute its weighted hex indicators.
weighted_frames = [
    calc_hexes_pop_pct_indicators(gpkgOutput_hex250, city,
                                  gpkgInput, gpkgOutput_hex250)
    for city, gpkgInput in zip(cities, gpkgInput_ori)
]

# stack all cities with one concat instead of growing the frame inside the loop
all_hexes1 = pd.concat(weighted_frames, ignore_index=True) if weighted_frames else pd.DataFrame()

all_hexes1.columns

In [None]:
# check if the results are the same as city-level outputs
all_hexes1[['study_region', 'all_cities_pop_walkability', 'all_cities_pop_z_daily_living',
       'all_cities_pop_z_nh_intersection_density',
       'all_cities_pop_z_nh_population_density']].groupby('study_region').sum()

In [None]:
# seaborn (sns) is imported once in the notebook's import cell
col = 'all_cities_pop_walkability'

# x-axis window for the weighted scores
# NOTE(review): hand-picked limits; values outside them are not shown - confirm they still fit the data
weighted_xlim = (-0.005, 0.008)

# order by median
median_order = all_hexes1.groupby('study_region')[col].median().sort_values().index

fig, ax = plt.subplots(figsize=(20, 10))

# regions on the y axis and values on the x axis give horizontal boxes
sns.boxplot(ax=ax, data=all_hexes1, y='study_region', x=col, order=median_order, palette="Blues", width=0.6)
ax.tick_params(axis='both', which='major', labelsize=14)
ax.set_xlabel('Walkability index (Z Scores)', fontsize=16)
ax.set_ylabel('Study region', fontsize=16)
ax.set(xlim=weighted_xlim)

# add a title to the figure
fig.suptitle('Walkability Index  \n ( weighted by population)', y=0.95, fontsize=20, weight='bold')

fig.text(0.1, 0, 'Note: Population-weighted walkability index relative to all cities - sum of the population-weighted z-scores of pop and intersection density, and daily living generated at the hex level; ranked by median \n', 
         fontsize=12, color='#555555')

#fig.savefig('figure/walkability_popweighted.png', dpi=600)

plt.show()

# 2. Between-city variations

In [None]:
# Read each city's layer from the city-level geopackage and stack them with one
# concat. DataFrame.append, used here before, was removed in pandas 2.0.
city_frames = [gpd.read_file(gpkgOutput_cities, layer=city) for city in cities]

# an empty city list still yields an (empty) GeoDataFrame, as before
cities_ind = pd.concat(city_frames, ignore_index=True) if city_frames else gpd.GeoDataFrame()

In [None]:
#correct vic study region column
cities_ind['Study region'] = cities_ind['Study region'].fillna('Vic')
cities_ind.columns

In [None]:
# show pop-weighted walkability score ranking relative to all cities
cities_ind[['Study region', 'all_cities_walkability', 'all_cities_pop_walkability'
           ]].sort_values('all_cities_pop_walkability').reset_index().drop(columns=['index'])

In [None]:
# save these indicators in csv file
cities_ind[['Study region', 'all_cities_pop_walkability',
       'all_cities_pop_z_daily_living',
       'all_cities_pop_z_nh_intersection_density',
       'all_cities_pop_z_nh_population_density', 
        'pop_daily_living',  'pop_walkability',
       'pop_nh_intersection_density', 'pop_nh_pop_density',
       'pop_pct_access_500m_convenience_binary',
       'pop_pct_access_500m_fresh_food_market_binary',
       'pop_pct_access_500m_pt_any_binary',
       'pop_pct_access_500m_pt_gtfs_any_binary',
       'pop_pct_access_500m_pt_gtfs_freq_20_binary',
       'pop_pct_access_500m_pt_gtfs_freq_30_binary',
       'pop_pct_access_500m_pt_osm_any_binary',
       'pop_pct_access_500m_public_open_space_any_binary',
       'pop_pct_access_500m_public_open_space_large_binary', 
        'all_cities_walkability', 'all_cities_z_daily_living', 
        'all_cities_z_nh_intersection_density',
       'all_cities_z_nh_population_density',
       'local_daily_living', 'local_nh_intersection_density',
       'local_nh_population_density', 'local_walkability', 
       'urban_sample_point_count']].to_csv('figure/globe_cities_results_Sept2020.csv')

## Other visualizations to consider (work-in-progress):
1. global mapping to plot indicators: [this site](https://geopandas.org/mapping.html)

In [None]:
# attach the continent table to the city indicators by city name; the outer
# join keeps cities that appear in only one of the two tables
continents = pd.read_csv('cities_continents.csv')
cities_point = pd.merge(
    cities_ind,
    continents,
    how='outer',
    left_on='Study region',
    right_on='City',
)
cities_point.columns

In [None]:
# creat point geometry to plot in the map
list_lat = []   # create empty lists
list_long = []

for index, row in cities_point.iterrows(): # iterate over rows in dataframe
    City = row['City']
    Country = row['Country']
    query = str(City) +','+str(Country)

    results = ox.geocode(query)   
    lat = results[1]
    long = results[0]

    list_lat.append(lat)
    list_long.append(long)

# create new columns from lists   
cities_point['lat'] = list_lat   
cities_point['lon'] = list_long

In [None]:
# generate point geometry columns 
cities_point = cities_point.rename(columns={'geometry':'poly_geometry'})

from shapely.geometry import Point
cities_point['geometry'] = cities_point.apply(lambda row: Point(row['lat'], row['lon']), axis=1)

In [None]:
import mpl_toolkits
import matplotlib
from mpl_toolkits.basemap import Basemap
#[A world map of #surf tweets](https://python-graph-gallery.com/315-a-world-map-of-surf-tweets/)
#https://matplotlib.org/basemap/users/index.html

# Set the dimension of the figure
my_dpi=96
plt.figure(figsize=(2600/my_dpi, 1800/my_dpi), dpi=my_dpi)

# Make the background map
m=Basemap(llcrnrlon=-150, llcrnrlat=-65,urcrnrlon=180,urcrnrlat=80)
m.drawmapboundary(fill_color='#A6CAE0', linewidth=0)
m.fillcontinents(color='grey', alpha=0.3)
m.drawcoastlines(linewidth=0.1, color="white")
 
# prepare a color for each point depending on the continent.
cities_point['labels_enc'] = pd.factorize(cities_point['Continents'])[0]

m.scatter(cities_point['lat'], cities_point['lon'], marker='^', s=200,
          c=cities_point['labels_enc'], cmap="Set1", alpha=0.9)
 
# copyright and source data info
#plt.text( -170, -58,'walkability', ha='left', va='bottom', size=9, color='#555555' )
 

Note: Negative z-scores cannot be plotted directly as bubble maps. If a variable takes negative values, then it cannot be directly assigned to point size as an encoding: after all, how can a shape have a negative area? Additional information needs to be encoded alongside shape size in order to indicate negative values. For example, you might have filled circles indicate positive values and unfilled circles indicate negative values. As another alternative, you might have positive points in one color, and negative points in a distinct, different color.