# Visualization and data analysis of output indicators 

This notebook presents data visualization and analysis for output indicators from the Global indicator project.   
The analyses consist of two major components:  
   1. Within-city variations
    - Show maps of walkability indicators for all cities and do a visual sanity check to see if any issue occurs
    - Interpret the within-city variation patterns
    - Pick one or two cities as examples, plot and compare different indicators, and interpret the within-city variations and whether they reflect the real-world situation

   2. Between-city analysis
    - Show tables of measurements and raw indicator values, rank cities from highest to lowest, and interpret the results
    - Plot on a world map using graduated symbols or colors to visualize and compare indicators across cities
    - Create box plots to compare median statistics across cities
    - We could maybe do analyses similar to the policy indicator analyses and color-code cities based on the Lancet study thresholds?
    

**Note: Refer to the [workflow documentation](https://github.com/gboeing/global-indicators/blob/master/documentation/workflow.md) for indicators tables and description**
    
    

In [1]:
import geopandas as gpd
import os
import pandas as pd
import json
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Figure export settings: output directory and resolution for saved images.
image_path = './images'
dpi = 300

# Location of the processing pipeline and of its city configuration file,
# which lives under the process folder.
process_folder = '../process'
process_config_path = process_folder + '/configuration/cities.json'

In [3]:
# Parse the processing configuration; folder and file names are looked up
# in it by key below.
with open(process_config_path) as config_file:
    config = json.load(config_file)

# Output and input folders are both resolved relative to the process folder.
output_folder = os.path.join(
    process_folder, config['folder']
)
input_folder = os.path.join(
    process_folder, config['input_folder']
)

# Hexagon-level output GeoPackage ("global_indicators_hex_250m.gpkg").
gpkgOutput_hex250 = os.path.join(
    output_folder, config['output_hex_250m']
)

# City-level output GeoPackage ("global_indicators_city.gpkg").
gpkgOutput_cities = os.path.join(
    output_folder, config['global_indicators_city']
)

In [4]:
# Study regions to analyse. Each name is used below as the layer name when
# reading the city-level GeoPackage, so the spelling and order are kept as-is.
cities = [
    'adelaide', 'auckland', 'baltimore', 'bangkok',
    'barcelona', 'belfast', 'bern', 'chennai',
    'mexico_city', 'cologne', 'ghent', 'graz',
    'hanoi', 'hong_kong', 'lisbon', 'melbourne',
    'odense', 'olomouc', 'sao_paulo', 'phoenix',
    'seattle', 'sydney', 'valencia', 'vic',
]

In [5]:
# Read every city's layer from the city-level GeoPackage and stack them into
# a single table with a fresh 0..n-1 index.
#
# The layers are collected in a list and concatenated once: DataFrame.append
# was deprecated in pandas 1.4 and removed in pandas 2.0, and appending inside
# the loop re-copied the accumulated frame on every iteration.
city_frames = [gpd.read_file(gpkgOutput_cities, layer=city) for city in cities]

# pd.concat raises on an empty list, so fall back to an empty GeoDataFrame
# (what the previous loop produced when there were no cities).
cities_ind = (
    pd.concat(city_frames, ignore_index=True) if city_frames else gpd.GeoDataFrame()
)

In [6]:
#correct vic study region column
cities_ind['Study region'] = cities_ind['Study region'].fillna('Vic')
cities_ind.columns

Index(['Study region', 'urban_sample_point_count',
       'pop_pct_access_500m_fresh_food_market_binary',
       'pop_pct_access_500m_convenience_binary',
       'pop_pct_access_500m_pt_osm_any_binary',
       'pop_pct_access_500m_public_open_space_any_binary',
       'pop_pct_access_500m_public_open_space_large_binary',
       'pop_pct_access_500m_pt_gtfs_any_binary',
       'pop_pct_access_500m_pt_gtfs_freq_30_binary',
       'pop_pct_access_500m_pt_gtfs_freq_20_binary',
       'pop_pct_access_500m_pt_any_binary', 'pop_nh_pop_density',
       'pop_nh_intersection_density', 'pop_daily_living', 'pop_walkability',
       'all_cities_pop_z_nh_population_density',
       'all_cities_pop_z_nh_intersection_density',
       'all_cities_pop_z_daily_living', 'all_cities_walkability', 'geometry',
       'db', 'area_sqkm'],
      dtype='object')

In [7]:
# show pop-weighted walkability score ranking relative to all cities
cities_ind[['Study region', 'all_cities_walkability', 'all_cities_pop_walkability'
           ]].sort_values('all_cities_pop_walkability').reset_index().drop(columns=['index'])

KeyError: "['all_cities_pop_walkability'] not in index"

In [None]:
# save these indicators in csv file
cities_ind[['Study region', 'all_cities_pop_walkability',
       'all_cities_pop_z_daily_living',
       'all_cities_pop_z_nh_intersection_density',
       'all_cities_pop_z_nh_population_density', 
        'pop_daily_living',  'pop_walkability',
       'pop_nh_intersection_density', 'pop_nh_pop_density',
       'pop_pct_access_500m_convenience_binary',
       'pop_pct_access_500m_fresh_food_market_binary',
       'pop_pct_access_500m_pt_any_binary',
       'pop_pct_access_500m_pt_gtfs_any_binary',
       'pop_pct_access_500m_pt_gtfs_freq_20_binary',
       'pop_pct_access_500m_pt_gtfs_freq_30_binary',
       'pop_pct_access_500m_pt_osm_any_binary',
       'pop_pct_access_500m_public_open_space_any_binary',
       'pop_pct_access_500m_public_open_space_large_binary', 
        'all_cities_walkability', 'all_cities_z_daily_living', 
        'all_cities_z_nh_intersection_density',
       'all_cities_z_nh_population_density',
       'local_daily_living', 'local_nh_intersection_density',
       'local_nh_population_density', 'local_walkability', 
       'urban_sample_point_count']].to_csv('images/globe_cities_results_Sept2020.csv')

## Other visualizations to consider (work-in-progress):
1. global mapping to plot indicators: [this site](https://geopandas.org/mapping.html)

In [None]:
# Join the city indicators with the lookup table in cities_continents.csv.
# The CSV supplies the 'City' merge key; later cells also read 'Country' and
# 'Continents' from the merged table (presumably CSV columns — confirm).
# An outer join keeps rows that have no match on either side.
continents = pd.read_csv('cities_continents.csv')
cities_point = pd.merge(
    cities_ind,
    continents,
    how='outer',
    left_on='Study region',
    right_on='City',
)

# Display the columns of the merged table.
cities_point.columns

In [None]:
# Geocode every city to obtain the coordinates used to plot it on the map.
#
# `ox` was used here without being imported in any earlier cell, so the cell
# failed with a NameError on a fresh kernel.
import osmnx as ox

# One "City,Country" query string per row, in row order.
queries = [
    str(city_name) + ',' + str(country_name)
    for city_name, country_name in zip(cities_point['City'], cities_point['Country'])
]
geocoded = [ox.geocode(query) for query in queries]

# NOTE(review): per the OSMnx documentation geocode returns (lat, lng), so
# element [1] stored under 'lat' is presumably the longitude and element [0]
# stored under 'lon' the latitude. The column contents are left exactly as
# before because the later cells pass 'lat' as the x coordinate and 'lon' as
# the y coordinate (Point(...) and m.scatter(...)); rename the columns
# together with those cells rather than here alone.
list_lat = [point[1] for point in geocoded]
list_long = [point[0] for point in geocoded]

# create new columns from lists
cities_point['lat'] = list_lat
cities_point['lon'] = list_long

In [None]:
# Move the existing geometry column to 'poly_geometry' so that 'geometry'
# can hold the point geometries built below.
cities_point = cities_point.rename(columns={'geometry': 'poly_geometry'})

from shapely.geometry import Point


def _row_to_point(row):
    """Build a shapely Point from a row's 'lat' and 'lon' values, in that argument order."""
    # NOTE(review): Point takes (x, y); whether 'lat' really holds the x
    # coordinate depends on how the geocoding cell filled these columns — confirm.
    return Point(row['lat'], row['lon'])


cities_point['geometry'] = cities_point.apply(_row_to_point, axis=1)

In [None]:
import matplotlib
import mpl_toolkits

# TODO: use cartopy not basemap (which was deprecated years ago)
from mpl_toolkits.basemap import Basemap

# Figure size is given in pixels and converted to inches via the dpi.
my_dpi = 96
fig_width_in = 2600 / my_dpi
fig_height_in = 1800 / my_dpi
plt.figure(figsize=(fig_width_in, fig_height_in), dpi=my_dpi)

# Background map: bounding box given by lower-left / upper-right corners.
m = Basemap(
    llcrnrlon=-150,
    llcrnrlat=-65,
    urcrnrlon=180,
    urcrnrlat=80,
)
m.drawmapboundary(fill_color='#A6CAE0', linewidth=0)
m.fillcontinents(color='grey', alpha=0.3)
m.drawcoastlines(linewidth=0.1, color="white")

# Encode each continent as an integer so it can drive the marker colour.
cities_point['labels_enc'] = pd.factorize(cities_point['Continents'])[0]

# One triangle marker per city; 'lat' is passed as x and 'lon' as y, matching
# how the geocoding cell filled those columns.
m.scatter(
    cities_point['lat'],
    cities_point['lon'],
    marker='^',
    s=200,
    c=cities_point['labels_enc'],
    cmap="Set1",
    alpha=0.9,
)

# copyright and source data info
#plt.text( -170, -58,'walkability', ha='left', va='bottom', size=9, color='#555555' )
 

Note: Bubble maps cannot directly show negative z-scores. If a variable takes negative values, it cannot be directly assigned to point size as an encoding: after all, how can a shape have a negative area? Additional information needs to be encoded alongside the shape size to indicate negative values. For example, filled circles could indicate positive values and unfilled circles negative values. Alternatively, positive points could be drawn in one color and negative points in a distinct, different color.