# Visualization and data analysis of output indicators 

This notebook presents data visualization and analysis for output indicators from the Global indicator project.   
The analyses consist of two major components:  
   1. Within-city variations
    - Show maps of walkability indicators for all cities and do a visual sanity check to see if any issue occurs
    - Interpret the within-city variation patterns
    - Pick one or two cities as examples, plot different indicators and compare, interprete the within-city variations and how that may or may not represent the real-world situation

   2. Between-city analysis
    - Show tables for measurements and raw indicator number, rank cities from the highest to the lowest, and interprete the results
    - Plot in a world map using graduated symbol or color to visualize and compare indicators across cities
    - Create box plot to compare median statistics across cities
    - We could may be do similar analyses like policy indicators analyses to color code cities based on the lancet study threshold?
    

**Note: Refer to the [workflow documentation](https://github.com/gboeing/global-indicators/blob/master/documentation/workflow.md) for indicators tables and description**
    
    

In [1]:
import geopandas as gpd
import pandas as pd
import os
import json
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
image_path = './images'
dpi = 300

process_folder = '../process'
process_config_path = '../process/configuration/cities.json'

In [3]:
with open(process_config_path) as json_file:
    config = json.load(json_file)

output_folder = os.path.join(process_folder, config['folder'])
input_folder = os.path.join(process_folder, config['input_folder'])

# the path of "global_indicators_hex_250m.gpkg"
gpkgOutput_hex250 = os.path.join(output_folder, config['output_hex_250m'])

# create the path of "global_indicators_city.gpkg"
gpkgOutput_cities = os.path.join(output_folder, config['global_indicators_city'])

In [4]:
cities = ['adelaide',
 'auckland',
 'baltimore',
 'bangkok',
 'barcelona',
 'belfast',
 'bern',
 'chennai',
 'mexico_city',
 'cologne',
 'ghent',
 'graz',
 'hanoi',
 'hong_kong',
 'lisbon',
 'melbourne',
 'odense',
 'olomouc',
 'sao_paulo',
 'phoenix',
 'seattle',
 'sydney',
 'valencia',
 'vic']

## All cities hex-level indicators boxplot

In [5]:
# concat all hexes for all cities in one big dataframe
all_hexes=pd.DataFrame()

for city in cities:
    #read file
    hex250 = gpd.read_file(gpkgOutput_hex250, layer=city)
    hex250['city'] = city
    # append all hexes into one dataframe for analysis
    all_hexes = pd.concat([all_hexes, hex250], ignore_index=True)

len(all_hexes)

334003

In [6]:
all_hexes.columns

Index(['index', 'study_region', 'urban_sample_point_count',
       'pct_access_500m_fresh_food_market_binary',
       'pct_access_500m_convenience_binary',
       'pct_access_500m_pt_osm_any_binary',
       'pct_access_500m_public_open_space_any_binary',
       'pct_access_500m_public_open_space_large_binary',
       'pct_access_500m_pt_gtfs_any_binary',
       'pct_access_500m_pt_gtfs_freq_30_binary',
       'pct_access_500m_pt_gtfs_freq_20_binary',
       'pct_access_500m_pt_any_binary', 'local_nh_population_density',
       'local_nh_intersection_density', 'local_daily_living',
       'local_walkability', 'all_cities_z_nh_population_density',
       'all_cities_z_nh_intersection_density', 'all_cities_z_daily_living',
       'all_cities_walkability', 'geometry', 'city'],
      dtype='object')

In [7]:
figs = {'all_cities_walkability': {'note': 'Sum z-scores of pop density + intersect density + daily living, hex-level',
                                   'title': 'Walkability Index (place-based, not pop weighted)',
                                   'filename': 'boxplot-walkability_unweighted.png'},
        'local_daily_living': {'note': 'Sum of binary accessibility to all daily living destinations',
                               'title': 'Daily Living Scores',
                               'filename': 'boxplot-daily_living.png'},
        'local_nh_population_density': {'note': 'Population per km2, hex-level',
                                        'title': 'Local Population Density',
                                        'filename': 'boxplot-pop_density.png'},
        'local_nh_intersection_density': {'note': 'Intersections per km2, hex-level',
                                          'title': 'Local Intersection Density',
                                          'filename': 'boxplot-intersect_density.png'}
}

In [8]:
sns.set_style('whitegrid') #visual styles
sns.set_context('paper') #presets for scaling figure element sizes

In [9]:
for col, details in figs.items():

    #order by median
    median_order = all_hexes.groupby(by=['study_region'])[col].median().sort_values().index

    fig, ax = plt.subplots(figsize=(8, 6))

    # switch x and y
    ax=sns.boxplot(ax=ax, y=all_hexes['study_region'], x=all_hexes[col],
                   order=median_order, palette='Blues', width=0.6,
                   fliersize=0.05, boxprops={'alpha':0.7})
    ax.tick_params(axis='both', which='major')
    ax.set_xlabel(details['note'])
    ax.set_ylabel('')

    # add a title to the figure
    ax.set_title(details['title'], fontsize=16)
    fig.tight_layout()

    save_path = os.path.join(image_path, details['filename'])
    fig.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()

## All cities pop-weighted hex-level indicators boxplot (work-in-progress)

Pop-weighted hex-level indicators are not currently in our processing scripts but we may want to include this for consistency with the city-level indicators

In [10]:
def aggregation_hexes_pop_weighted(input_gdf, out_gdf, fieldNames):
    """
    Aggregating hexagon level indicators by weighted population
    Parameters

    """
    # loop over each indicators field names of input and output gdf
    for field in fieldNames:
        # calculate the population weighted indicators based on input hexagon layer
        # sum to aggregate up to the city level
        out_gdf[field[1]] = (input_gdf[sc.cities_parameters["pop_est"]] * input_gdf[field[0]]) / (
            input_gdf[sc.cities_parameters["pop_est"]].sum())
    return out_gdf


def calc_hexes_pop_pct_indicators(gpkg_hex_250m, city, gpkg_input, gpkg_output):
    """
    Calculate population-weighted hex-level indicators,
    and save to output geopackage
    """
    gdf_hex = gpd.read_file(gpkg_hex_250m, layer=city)

    gdf_hex_origin = gpd.read_file(gpkg_input, layer=sc.cities_parameters["hex250"])
    # join pop_est from original hex to processed hex
    gdf_hex = gdf_hex.join(gdf_hex_origin.set_index("index"), on="index", how="left", rsuffix="_origin")

    # hex-level field names from city-specific hex indicators gpkg
    fieldNames = sc.hex_fieldNames[3:-1]
    
    # new file names for population-weighted city-level indicators
    fieldNames_new = sc.city_fieldNames[2:-1]
    
    # calculate the population weighted city-level indicators
    gdf_hexes = aggregation_hexes_pop_weighted(gdf_hex, gdf_hex, list(zip(fieldNames, fieldNames_new)))
    
    #gdf_hexes.to_file(gpkg_output, layer=city, driver="GPKG")
    return gdf_hexes

In [11]:
# read pre-prepared sample point stats of each city from disk
gpkgInput_ori = []
for gpkg in list(config["gpkgNames"].values()):
    gpkgInput_ori.append(os.path.join(dirname, input_folder, gpkg))

NameError: name 'dirname' is not defined

In [None]:
all_hexes1=pd.DataFrame()
cities = list(config["gpkgNames"].keys())

for index, gpkgInput in enumerate(gpkgInput_ori):
    #print(index, gpkgInput)
    gdf_hexes = calc_hexes_pop_pct_indicators(gpkgOutput_hex250, cities[index], 
                                      gpkgInput, gpkgOutput_hex250) 
    # append all hexes into one dataframe for analysis
    all_hexes1 = pd.concat([all_hexes1, gdf_hexes], ignore_index=True)

all_hexes1.columns

In [None]:
# check if the results are the same as city-level outputs
all_hexes1[['study_region', 'all_cities_pop_walkability', 'all_cities_pop_z_daily_living',
       'all_cities_pop_z_nh_intersection_density',
       'all_cities_pop_z_nh_population_density']].groupby('study_region').sum()

In [None]:
# library & dataset
import seaborn as sns

col ='all_cities_pop_walkability'

#order by median
median_order = all_hexes1.groupby(by=['study_region'])[col].median().sort_values().index

fig, ax = plt.subplots(figsize=(20, 10))

# Just switch x and y
ax=sns.boxplot(ax=ax, y=all_hexes1['study_region'], x=all_hexes1[col], order=median_order, palette="Blues", width=0.6)
ax.tick_params(axis='both', which='major', labelsize=14)
ax.set_xlabel('Walkability index (Z Scores)', fontsize=16)
ax.set_ylabel('Study region', fontsize=16)
ax.set(xlim=(-0.005, 0.008))

# add a title to the figure
fig.suptitle('Walkability Index  \n ( weighted by population)', y=0.95, fontsize=20, weight='bold')

fig.text(0.1, 0, 'Note: Population-weighted walkability index relative to all cities - sum of the population-weighted z-scores of pop and intersection density, and daily living generated at the hex level; ranked by median \n', 
         fontsize=12, color='#555555')

#fig.savefig('figure/walkability_popweighted.png', dpi=600)

plt.show()