In [None]:
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio

import numpy as np

from urllib.request import urlopen
import json
import geopandas
from collections import OrderedDict
from math import floor

In [None]:
# Load the 2010 NYC census-tract shapefile and keep the tract identifiers.
nyc_census = geopandas.read_file('nyc_data/nyc_census_tracts/nyc_census_2010.shp')

# Census-tract GEOIDs are 11-digit codes, which do not fit in 32 bits.  Plain
# `int` maps to a 32-bit integer on some platforms (e.g. Windows with NumPy < 2),
# so the 64-bit type is requested explicitly.
nyc_all_geoids = nyc_census['GEOID'].astype('int64')
nyc_all_geoids.shape

In [None]:
# Tract boundaries as GeoJSON; passed as the `geojson` argument of the choropleth maps.
# GeoJSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's default text encoding.
with open('nyc_data/nyc_census_tracts/nyc_tracts.geojson', encoding='utf-8') as geojson_file:
    nyc_tracts = json.load(geojson_file)

Processing test set ground truth flows:

In [None]:
# Ground-truth commuting flows of the test set.  'count' is renamed to
# 'gt_count' so it stays distinct from the 'pred_count' columns used later.
gt_flows = (
    pd.read_csv('nyc_data/LODES/CommutingFlow_2015_test_exp.csv')
    .rename(columns={'count': 'gt_count'})
)
gt_flows.shape

In [None]:
# Total ground-truth commuters leaving each home tract (h_geoid) and arriving
# at each work tract (w_geoid).
# Only 'gt_count' is aggregated: a plain `.sum()` over every column would also
# add up the other tract's identifiers, which yields meaningless numbers.
gt_orig_all = gt_flows.groupby('h_geoid', as_index=False)['gt_count'].sum()
gt_dest_all = gt_flows.groupby('w_geoid', as_index=False)['gt_count'].sum()

In [None]:
gt_orig_all[gt_orig_all['gt_count']==0].shape

In [None]:
gt_dest_all[gt_dest_all['gt_count']==0].shape

32 out of 2168 destination tracts receive no commuters at all!

In [None]:
# The destination totals are later used as the denominator of the percentage
# delta, so a total of 0 is replaced by 1.
gt_dest_all['gt_count'] = gt_dest_all['gt_count'].replace(0, 1)
# Sanity check: shape of the rows that still have a zero total.
gt_dest_all.loc[gt_dest_all['gt_count'] == 0].shape

Processing test set prediction flows using OSM features:   

In [None]:
pred_flows = pd.read_csv('nyc_predicted_flows_exp.csv')
pred_flows.shape

In [None]:
gt_flows[gt_flows['gt_count'] == 0].shape

In [None]:
pred_flows[pred_flows.pred_count == 0].shape

In [None]:
pred_flows[pred_flows['pred_count'] < 0]['pred_count'].shape

In [None]:
# Number of flows with a negative prediction although the ground truth is zero.
# NOTE(review): the mask combines pred_flows and gt_flows by index, so this
# assumes both CSVs list the same origin-destination pairs in the same row
# order -- confirm.
gt_flows[(pred_flows['pred_count'] < 0) & (gt_flows.gt_count==0)].shape

In [None]:
pred_flows[pred_flows['pred_count'] < 0]['pred_count'].sum()

In [None]:
pred_flows[pred_flows['pred_count'] < 0].h_geoid.unique().shape

In [None]:
pred_flows[pred_flows['pred_count'] < 0].w_geoid.unique().shape

In [None]:
# Floor every predicted count at zero (negative predictions become 0).
pred_flows['pred_count'] = pred_flows['pred_count'].clip(lower=0)
# Sum of whatever negative predictions remain after the clipping.
pred_flows.loc[pred_flows['pred_count'] < 0, 'pred_count'].sum()

In [None]:
gt_flows.shape

In [None]:
gt_flows[gt_flows['gt_count'] != 0].shape

In [None]:
gt_flows[gt_flows['gt_count'] == 0].shape

In [None]:
gt_flows[(pred_flows['pred_count'] == 0) & (gt_flows.gt_count==0)].shape

In [None]:
gt_flows[(pred_flows['pred_count'] != 0) & (gt_flows.gt_count!=0)].shape

In [None]:
gt_flows['gt_count'].sum()

In [None]:
pred_flows['pred_count'].sum()

In [None]:
pred_flows.pred_count.round(0).sum()

In [None]:
# Total predicted commuters per home tract and per work tract.
# Only 'pred_count' is aggregated; summing the remaining columns (the other
# tract's identifiers) with a plain `.sum()` would produce meaningless values.
pred_orig_all = pred_flows.groupby('h_geoid', as_index=False)['pred_count'].sum()
pred_dest_all = pred_flows.groupby('w_geoid', as_index=False)['pred_count'].sum()

In [None]:
gt_orig_all['gt_count'].sum()

In [None]:
pred_orig_all['pred_count'].sum()

In [None]:
gt_dest_all['gt_count'].sum()

In [None]:
pred_dest_all['pred_count'].sum()

In [None]:
# delta percentage of all origin flows
#
# Predicted and ground-truth totals are joined on the tract id rather than on
# row position, so a tract that is missing from (or ordered differently in) the
# predictions is never paired with another tract's total -- it gets NaN instead.
# The merge also returns a fresh frame, so the column assignments below do not
# write into a slice of gt_orig_all.
delta_orig_all = gt_orig_all[['h_geoid', 'gt_count']].merge(
    pred_orig_all[['h_geoid', 'pred_count']],
    on='h_geoid', how='left', validate='one_to_one')
delta_orig_all['delta'] = delta_orig_all['pred_count'] - delta_orig_all['gt_count']
# signed error relative to the ground truth, in percent
delta_orig_all['delta_percent'] = (delta_orig_all['delta'] / delta_orig_all['gt_count']) * 100
delta_orig_all = delta_orig_all.sort_values(by=['delta_percent'], ascending=False)
delta_orig_all

In [None]:
bin_min = delta_orig_all.delta_percent.min()
bin_max = delta_orig_all.delta_percent.max()
print(bin_min)
print(bin_max)

In [None]:
# Bin edges (in percent) for the origin deltas.  The lowest edge -71 is
# hard-coded and the highest edge is the bin_max global; the 1e-9 offsets put
# each inner edge just off the round value.
# NOTE(review): pd.cut needs strictly increasing edges, so this presumably
# relies on bin_max > 90 and on no delta below -71 -- confirm against the
# range printed above.
bins = [-71,-50.000000001,-30.000000001,-10.000000001,10.000000001,30.000000001,50.000000001,
        70.000000001,90.000000001,bin_max]
# One label per interval, ordered from the most negative to the most positive bin.
labels = ['- 51-70%', '- 31-50%','- 11-30%', '± 10%','+ 11-30%', '+ 31-50%','+ 51-70%','+ 71-90%',' > 90%']

In [None]:
pd.cut(delta_orig_all['delta_percent'], precision=9,
                                bins=bins,retbins=True,include_lowest=True)[0].cat.categories

In [None]:
# Copy of the origin deltas with the coarse percentage-bin label appended as
# the last column.
delta_orig_all_binned = delta_orig_all.assign(
    delta_label=pd.cut(
        delta_orig_all['delta_percent'],
        bins=bins,
        labels=labels,
        precision=9,
        include_lowest=True,
    )
)

delta_orig_all_binned

In [None]:
# delta percentage of all destination flows
#
# Predicted and ground-truth totals are joined on the tract id rather than on
# row position, so a tract that is missing from (or ordered differently in) the
# predictions is never paired with another tract's total -- it gets NaN instead.
# The merge also returns a fresh frame, so the column assignments below do not
# write into a slice of gt_dest_all.
delta_dest_all = gt_dest_all[['w_geoid', 'gt_count']].merge(
    pred_dest_all[['w_geoid', 'pred_count']],
    on='w_geoid', how='left', validate='one_to_one')
delta_dest_all['delta'] = delta_dest_all['pred_count'] - delta_dest_all['gt_count']
# signed error relative to the ground truth, in percent
delta_dest_all['delta_percent'] = (delta_dest_all['delta'] / delta_dest_all['gt_count']) * 100
delta_dest_all = delta_dest_all.sort_values(by=['delta_percent'], ascending=False)
delta_dest_all

In [None]:
bin_min = delta_dest_all.delta_percent.min()
bin_max = delta_dest_all.delta_percent.max()
print(bin_min)
print(bin_max)

In [None]:
# Bin edges (in percent) for the destination deltas: hard-coded lowest edge
# -91, highest edge taken from the bin_max global.
# NOTE(review): pd.cut needs strictly increasing edges, so this presumably
# relies on bin_max > 90 and on no delta below -91 -- confirm against the
# range printed above.
bins = [-91,-70.000000001,-50.000000001,-30.000000001,-10.000000001,10.000000001,30.000000001,
        50.000000001,70.000000001,90.000000001,bin_max]
# One label per interval, ordered from the most negative to the most positive bin.
labels = ['- 71-90%', '- 51-70%', '- 31-50%','- 11-30%', '± 10%','+ 11-30%', '+ 31-50%',
          '+ 51-70%','+ 71-90%',' > 90%']

In [None]:
pd.cut(delta_dest_all['delta_percent'], precision=9,
                                bins=bins,retbins=True,include_lowest=True)[0].cat.categories

In [None]:
# Copy of the destination deltas with the coarse percentage-bin label appended
# as the last column.
delta_dest_all_binned = delta_dest_all.assign(
    delta_label=pd.cut(
        delta_dest_all['delta_percent'],
        bins=bins,
        labels=labels,
        precision=9,
        include_lowest=True,
    )
)

delta_dest_all_binned

Processing test set prediction flows using GMEL features:  

In [None]:
pred_flows_GMEL = pd.read_csv('nyc_predicted_flows_exp_GMEL.csv')
pred_flows_GMEL.shape

In [None]:
pred_flows_GMEL[pred_flows_GMEL['pred_count'] < 0]['pred_count'].sum()

In [None]:
# Floor every GMEL-predicted count at zero (negative predictions become 0).
pred_flows_GMEL['pred_count'] = pred_flows_GMEL['pred_count'].clip(lower=0)
# Sum of whatever negative predictions remain after the clipping.
pred_flows_GMEL.loc[pred_flows_GMEL['pred_count'] < 0, 'pred_count'].sum()

In [None]:
gt_flows['gt_count'].sum()

In [None]:
pred_flows_GMEL['pred_count'].sum()

In [None]:
# Total GMEL-predicted commuters per home tract and per work tract.
# Only 'pred_count' is aggregated; summing the remaining columns (the other
# tract's identifiers) with a plain `.sum()` would produce meaningless values.
pred_orig_all_GMEL = pred_flows_GMEL.groupby('h_geoid', as_index=False)['pred_count'].sum()
pred_dest_all_GMEL = pred_flows_GMEL.groupby('w_geoid', as_index=False)['pred_count'].sum()

In [None]:
gt_orig_all['gt_count'].sum()

In [None]:
pred_orig_all_GMEL['pred_count'].sum()

In [None]:
gt_dest_all['gt_count'].sum()

In [None]:
pred_dest_all_GMEL['pred_count'].sum()

In [None]:
# delta percentage of all GMEL origin flows
#
# Predicted and ground-truth totals are joined on the tract id rather than on
# row position, so a tract that is missing from (or ordered differently in) the
# predictions is never paired with another tract's total -- it gets NaN instead.
# The merge also returns a fresh frame, so the column assignments below do not
# write into a slice of gt_orig_all.
delta_orig_all_GMEL = gt_orig_all[['h_geoid', 'gt_count']].merge(
    pred_orig_all_GMEL[['h_geoid', 'pred_count']],
    on='h_geoid', how='left', validate='one_to_one')
delta_orig_all_GMEL['delta'] = delta_orig_all_GMEL['pred_count'] - delta_orig_all_GMEL['gt_count']
# signed error relative to the ground truth, in percent
delta_orig_all_GMEL['delta_percent'] = (delta_orig_all_GMEL['delta'] / delta_orig_all_GMEL['gt_count']) * 100
delta_orig_all_GMEL = delta_orig_all_GMEL.sort_values(by=['delta_percent'], ascending=False)
delta_orig_all_GMEL

In [None]:
bin_min = delta_orig_all_GMEL.delta_percent.min()
bin_max = delta_orig_all_GMEL.delta_percent.max()
print(bin_min)
print(bin_max)

In [None]:
# Bin edges (in percent) for the GMEL origin deltas: hard-coded lowest edge
# -71, highest edge taken from the bin_max global.
# NOTE(review): pd.cut needs strictly increasing edges, so this presumably
# relies on bin_max > 90 and on no delta below -71 -- confirm against the
# range printed above.
bins = [-71,-50.000000001,-30.000000001,-10.000000001,10.000000001,30.000000001,50.000000001,
        70.000000001,90.000000001,bin_max]
# One label per interval, ordered from the most negative to the most positive bin.
labels = ['- 51-70%', '- 31-50%','- 11-30%', '± 10%','+ 11-30%', '+ 31-50%','+ 51-70%','+ 71-90%',' > 90%']

In [None]:
pd.cut(delta_orig_all_GMEL['delta_percent'], precision=9,
                                bins=bins,retbins=True,include_lowest=True)[0].cat.categories

In [None]:
# Copy of the GMEL origin deltas with the coarse percentage-bin label appended
# as the last column.
delta_orig_all_binned_GMEL = delta_orig_all_GMEL.assign(
    delta_label=pd.cut(
        delta_orig_all_GMEL['delta_percent'],
        bins=bins,
        labels=labels,
        precision=9,
        include_lowest=True,
    )
)

delta_orig_all_binned_GMEL

In [None]:
# delta percentage of all GMEL destination flows
#
# Predicted and ground-truth totals are joined on the tract id rather than on
# row position, so a tract that is missing from (or ordered differently in) the
# predictions is never paired with another tract's total -- it gets NaN instead.
# The merge also returns a fresh frame, so the column assignments below do not
# write into a slice of gt_dest_all.
delta_dest_all_GMEL = gt_dest_all[['w_geoid', 'gt_count']].merge(
    pred_dest_all_GMEL[['w_geoid', 'pred_count']],
    on='w_geoid', how='left', validate='one_to_one')
delta_dest_all_GMEL['delta'] = delta_dest_all_GMEL['pred_count'] - delta_dest_all_GMEL['gt_count']
# signed error relative to the ground truth, in percent
delta_dest_all_GMEL['delta_percent'] = (delta_dest_all_GMEL['delta'] / delta_dest_all_GMEL['gt_count']) * 100
delta_dest_all_GMEL = delta_dest_all_GMEL.sort_values(by=['delta_percent'], ascending=False)
delta_dest_all_GMEL

In [None]:
# Range of the GMEL destination deltas.  The range has to come from the GMEL
# frame (not from the OSM frame delta_dest_all): bin_max becomes the top bin
# edge for the GMEL binning below, and a range taken from the other model would
# leave GMEL values above it without a label.
bin_min = delta_dest_all_GMEL.delta_percent.min()
bin_max = delta_dest_all_GMEL.delta_percent.max()
print(bin_min)
print(bin_max)

In [None]:
# Bin edges (in percent) for the GMEL destination deltas: hard-coded lowest
# edge -91, highest edge taken from the bin_max global.
# NOTE(review): pd.cut needs strictly increasing edges, so this presumably
# relies on bin_max > 90 and on no delta below -91 -- confirm against the
# range printed above.
bins = [-91,-70.000000001,-50.000000001,-30.000000001,-10.000000001,10.000000001,30.000000001,
        50.000000001,70.000000001,90.000000001,bin_max]
# One label per interval, ordered from the most negative to the most positive bin.
labels = ['- 71-90%', '- 51-70%', '- 31-50%','- 11-30%', '± 10%','+ 11-30%', '+ 31-50%',
          '+ 51-70%','+ 71-90%',' > 90%']

In [None]:
pd.cut(delta_dest_all_GMEL['delta_percent'], precision=9,
                                bins=bins,retbins=True,include_lowest=True)[0].cat.categories

In [None]:
# Copy of the GMEL destination deltas with the coarse percentage-bin label
# appended as the last column.
delta_dest_all_binned_GMEL = delta_dest_all_GMEL.assign(
    delta_label=pd.cut(
        delta_dest_all_GMEL['delta_percent'],
        bins=bins,
        labels=labels,
        precision=9,
        include_lowest=True,
    )
)

delta_dest_all_binned_GMEL

In [None]:
delta_orig_all_binned.groupby('delta_label').count()

In [None]:
delta_orig_all_binned_GMEL.groupby('delta_label').count()

In [None]:
delta_dest_all_binned.groupby('delta_label').count()

In [None]:
delta_dest_all_binned_GMEL.groupby('delta_label').count()

Choropleth maps

In [None]:
# Choropleth of the binned delta percentage per origin tract (OSM features).
fig = px.choropleth(
    delta_orig_all_binned,
    geojson=nyc_tracts,
    featureidkey="properties.GEOID",
    locations='h_geoid',
    color='delta_label',
    # five blues, one green, three reds
    color_discrete_sequence=[
        'rgb(188,215,239)', 'rgb(107,174,214)', 'rgb(66,146,198)',
        'rgb(33,113,181)', 'rgb(8,48,107)',
        'rgb(124, 252, 0)',
        'rgb(103,0,13)', 'rgb(203,24,29)', 'rgb(251,106,74)',
    ],
)

fig.update_geos(fitbounds="locations", visible=False)
fig.update_traces(marker_line_width=0.1, marker_opacity=0.8)
# No margins, fixed canvas size, legend at (0.8, 0.8); the call returns the
# figure, so as the last expression it also renders it.
fig.update_layout(
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
    width=1024, height=720, font_size=20,  # tune the font_size to your needs
    legend_x=0.8, legend_y=0.8,
    legend_font_size=16,
    legend_title='Delta Percentage',
)
#fig.show()

#pio.write_image(fig, 'nyc_osm_origin_map.pdf')

In [None]:
# Choropleth of the binned delta percentage per origin tract (GMEL features).
fig = px.choropleth(
    delta_orig_all_binned_GMEL,
    geojson=nyc_tracts,
    featureidkey="properties.GEOID",
    locations='h_geoid',
    color='delta_label',
    # five blues, one green, three reds
    color_discrete_sequence=[
        'rgb(188,215,239)', 'rgb(107,174,214)', 'rgb(66,146,198)',
        'rgb(33,113,181)', 'rgb(8,48,107)',
        'rgb(124, 252, 0)',
        'rgb(103,0,13)', 'rgb(203,24,29)', 'rgb(251,106,74)',
    ],
)

fig.update_geos(fitbounds="locations", visible=False)
fig.update_traces(marker_line_width=0.1, marker_opacity=0.8)
# No margins, fixed canvas size, legend at (0.8, 0.8); the call returns the
# figure, so as the last expression it also renders it.
fig.update_layout(
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
    width=1024, height=720, font_size=20,  # tune the font_size to your needs
    legend_x=0.8, legend_y=0.8,
    legend_font_size=16,
    legend_title='Delta Percentage',
)
#fig.show()

#pio.write_image(fig, 'nyc_gmel_origin_map.pdf')

In [None]:
# Choropleth of the binned delta percentage per destination tract (OSM features).
fig = px.choropleth(
    delta_dest_all_binned,
    geojson=nyc_tracts,
    featureidkey="properties.GEOID",
    locations='w_geoid',
    color='delta_label',
    # five blues, one green, four reds
    color_discrete_sequence=[
        'rgb(188,215,239)', 'rgb(107,174,214)', 'rgb(66,146,198)',
        'rgb(33,113,181)', 'rgb(8,48,107)',
        'rgb(124, 252, 0)',
        'rgb(103,0,13)', 'rgb(203,24,29)', 'rgb(234,106,74)', 'rgb(251,106,74)',
    ],
)

fig.update_geos(fitbounds="locations", visible=False)
fig.update_traces(marker_line_width=0.1, marker_opacity=0.8)
# No margins, fixed canvas size, legend at (0.8, 0.8); the call returns the
# figure, so as the last expression it also renders it.
fig.update_layout(
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
    width=1024, height=720, font_size=20,  # tune the font_size to your needs
    legend_x=0.8, legend_y=0.8,
    legend_font_size=16,
    legend_title='Delta Percentage',
)

#pio.write_image(fig, 'nyc_osm_destination_map.pdf')

In [None]:
# Choropleth of the binned delta percentage per destination tract (GMEL features).
fig = px.choropleth(
    delta_dest_all_binned_GMEL,
    geojson=nyc_tracts,
    featureidkey="properties.GEOID",
    locations='w_geoid',
    color='delta_label',
    # five blues, one green, four reds
    color_discrete_sequence=[
        'rgb(188,215,239)', 'rgb(107,174,214)', 'rgb(66,146,198)',
        'rgb(33,113,181)', 'rgb(8,48,107)',
        'rgb(124, 252, 0)',
        'rgb(103,0,13)', 'rgb(203,24,29)', 'rgb(234,106,74)', 'rgb(251,106,74)',
    ],
)

fig.update_geos(fitbounds="locations", visible=False)
fig.update_traces(marker_line_width=0.1, marker_opacity=0.8)
# No margins, fixed canvas size, legend at (0.8, 0.8); the call returns the
# figure, so as the last expression it also renders it.
fig.update_layout(
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
    width=1024, height=720, font_size=20,  # tune the font_size to your needs
    legend_x=0.8, legend_y=0.8,
    legend_font_size=16,
    legend_title='Delta Percentage',
)

#pio.write_image(fig, 'nyc_gmel_destination_map.pdf')

Histograms with 10% bins

In [None]:
# OSM origin flows: split the tracts by the sign of their percentage delta

delta_below_zero = delta_orig_all.loc[delta_orig_all['delta_percent'].lt(0)]
delta_equals_zero = delta_orig_all.loc[delta_orig_all['delta_percent'].eq(0)]
delta_above_zero = delta_orig_all.loc[delta_orig_all['delta_percent'].gt(0)]
# one shape per line: below, equal to, above zero
print(delta_below_zero.shape, delta_equals_zero.shape, delta_above_zero.shape, sep='\n')

In [None]:
bin_min = delta_orig_all.delta_percent.min()
bin_max = delta_orig_all.delta_percent.max()
print(bin_min)
print(bin_max)

In [None]:
# 10%-wide bin edges for the OSM origin histogram, kept separately for negative
# and positive deltas.  The negative side starts at a hard-coded -71; the
# positive side ends at the bin_max global.
# NOTE(review): the two lists leave a gap of +/-1e-9 around zero, and pd.cut
# needs increasing edges, so this presumably relies on bin_max > 100 -- confirm.
bins_below_zeros = [-71, -60.000000001, -50.000000001, -40.000000001, -30.000000001, 
                    -20.000000001, -10.000000001,  -0.000000001]
bins_above_zeros = [0.000000001, 10.000000001, 20.000000001, 30.000000001, 40.000000001, 50.000000001,
                     60.000000001, 70.000000001, 80.000000001, 90.000000001, 100.000000001, bin_max]

# One label per interval of the matching edge list, in the same order.
labels_below = ['- 61-70%', '- 51-60%','- 41-50%', '- 31-40%','- 21-30%', '- 11-20%','- up to 10%']
lables_above = ['+ up to 10%', '+ 11-20%','+ 21-30%', '+ 31-40%','+ 41-50%', '+ 51-60%','+ 61-70%', '+ 71-80%',
                '+ 81-90%', '+ 91-100%',' > 100%']

In [None]:
pd.cut(delta_below_zero['delta_percent'], precision=9,
                                bins=bins_below_zeros,retbins=True,include_lowest=True)[0].cat.categories

In [None]:
pd.cut(delta_above_zero['delta_percent'], precision=9,
                                bins=bins_above_zeros,retbins=True,include_lowest=True)[0].cat.categories

In [None]:
# Append the 10%-bin label to the tracts with a negative delta (OSM origin).
# `assign` is used instead of the in-place `insert` so the cell can be re-run:
# a second `insert` fails because the 'delta_label' column already exists.
delta_below_zero = delta_below_zero.assign(
    delta_label=pd.cut(delta_below_zero['delta_percent'], precision=9,
                       bins=bins_below_zeros,
                       labels=labels_below, include_lowest=True))

delta_below_zero

In [None]:
# Append the 10%-bin label to the tracts with a positive delta (OSM origin).
# `assign` is used instead of the in-place `insert` so the cell can be re-run:
# a second `insert` fails because the 'delta_label' column already exists.
delta_above_zero = delta_above_zero.assign(
    delta_label=pd.cut(delta_above_zero['delta_percent'], precision=9,
                       bins=bins_above_zeros,
                       labels=lables_above, include_lowest=True))

delta_above_zero

In [None]:
# Stack the labelled negative and positive parts and order them by delta.
# Rows with a delta of exactly zero belong to neither part and are left out.
# This rebinds delta_orig_all_binned, replacing the coarse binning used for the maps.
delta_orig_all_binned = (
    pd.concat([delta_below_zero, delta_above_zero], ignore_index=True)
    .sort_values(by=['delta_percent'])
)
delta_orig_all_binned.shape

In [None]:
delta_orig_all_binned.isnull().values.any()

In [None]:
# Placeholder rows, one per label listed here, in the column order of
# delta_orig_all_binned.  Each row has a gt_count of 1e-9 and zeros elsewhere,
# so it adds practically nothing to a sum of gt_count.
# NOTE(review): ' < 100%' presumably stands for deltas below -100% -- confirm
# the intended label text.
data = [[0,0.000000001,0,0,0,' < 100%'], [0,0.000000001,0,0,0,'- 91-100%'],
        [0,0.000000001,0,0,0,'- 81-90%'],[0,0.000000001,0,0,0,'- 71-80%']]
empty_bins = pd.DataFrame(data,columns=delta_orig_all_binned.columns.to_list())

In [None]:
delta_orig_all_binned = pd.concat([empty_bins, delta_orig_all_binned], ignore_index=True)
delta_orig_all_binned.shape

In [None]:
# OSM destination flows: split the tracts by the sign of their percentage delta

delta_below_zero = delta_dest_all.loc[delta_dest_all['delta_percent'].lt(0)]
delta_equals_zero = delta_dest_all.loc[delta_dest_all['delta_percent'].eq(0)]
delta_above_zero = delta_dest_all.loc[delta_dest_all['delta_percent'].gt(0)]
# one shape per line: below, equal to, above zero
print(delta_below_zero.shape, delta_equals_zero.shape, delta_above_zero.shape, sep='\n')

In [None]:
bin_min = delta_dest_all.delta_percent.min()
bin_max = delta_dest_all.delta_percent.max()
print(bin_min)
print(bin_max)

In [None]:
# 10%-wide bin edges for the OSM destination histogram, kept separately for
# negative and positive deltas.  The positive side ends at the bin_max global.
# NOTE(review): deltas below the first edge (about -90) fall outside every bin,
# and pd.cut needs increasing edges, so this presumably relies on
# bin_max > 100 -- confirm.
bins_below_zeros = [-90.000000001,-80.000000001,-70.000000001, -60.000000001, -50.000000001, -40.000000001,
                    -30.000000001, -20.000000001, -10.000000001,  -0.000000001]
bins_above_zeros = [0.000000001, 10.000000001, 20.000000001, 30.000000001, 40.000000001, 50.000000001,
                     60.000000001, 70.000000001, 80.000000001, 90.000000001, 100.000000001, bin_max]

# One label per interval of the matching edge list, in the same order.
labels_below = ['- 81-90%', '- 71-80%', '- 61-70%', '- 51-60%','- 41-50%', '- 31-40%','- 21-30%', 
                '- 11-20%','- up to 10%']
lables_above = ['+ up to 10%', '+ 11-20%','+ 21-30%', '+ 31-40%','+ 41-50%', '+ 51-60%','+ 61-70%', '+ 71-80%',
                '+ 81-90%', '+ 91-100%',' > 100%']

In [None]:
pd.cut(delta_below_zero['delta_percent'], precision=9,
                                bins=bins_below_zeros,retbins=True,include_lowest=True)[0].cat.categories

In [None]:
pd.cut(delta_above_zero['delta_percent'], precision=9,
                                bins=bins_above_zeros,retbins=True,include_lowest=True)[0].cat.categories

In [None]:
# Append the 10%-bin label to the tracts with a negative delta (OSM destination).
# `assign` is used instead of the in-place `insert` so the cell can be re-run:
# a second `insert` fails because the 'delta_label' column already exists.
delta_below_zero = delta_below_zero.assign(
    delta_label=pd.cut(delta_below_zero['delta_percent'], precision=9,
                       bins=bins_below_zeros,
                       labels=labels_below, include_lowest=True))

delta_below_zero

In [None]:
# Append the 10%-bin label to the tracts with a positive delta (OSM destination).
# `assign` is used instead of the in-place `insert` so the cell can be re-run:
# a second `insert` fails because the 'delta_label' column already exists.
delta_above_zero = delta_above_zero.assign(
    delta_label=pd.cut(delta_above_zero['delta_percent'], precision=9,
                       bins=bins_above_zeros,
                       labels=lables_above, include_lowest=True))

delta_above_zero

In [None]:
# Stack the labelled negative and positive parts and order them by delta.
# Rows with a delta of exactly zero belong to neither part and are left out.
# This rebinds delta_dest_all_binned, replacing the coarse binning used for the maps.
delta_dest_all_binned = (
    pd.concat([delta_below_zero, delta_above_zero], ignore_index=True)
    .sort_values(by=['delta_percent'])
)
delta_dest_all_binned.shape

In [None]:
delta_dest_all_binned.isnull().values.any()

In [None]:
# Placeholder rows, one per label listed here, in the column order of
# delta_dest_all_binned.  Each row has a gt_count of 1e-9 and zeros elsewhere,
# so it adds practically nothing to a sum of gt_count.
# NOTE(review): ' < 100%' presumably stands for deltas below -100% -- confirm
# the intended label text.
data = [[0,0.000000001,0,0,0,' < 100%'], [0,0.000000001,0,0,0,'- 91-100%']]
empty_bins = pd.DataFrame(data,columns=delta_dest_all_binned.columns.to_list())

In [None]:
delta_dest_all_binned = pd.concat([empty_bins, delta_dest_all_binned], ignore_index=True)
delta_dest_all_binned.shape

In [None]:
# GMEL origin flows: split the tracts by the sign of their percentage delta

delta_below_zero = delta_orig_all_GMEL.loc[delta_orig_all_GMEL['delta_percent'].lt(0)]
delta_equals_zero = delta_orig_all_GMEL.loc[delta_orig_all_GMEL['delta_percent'].eq(0)]
delta_above_zero = delta_orig_all_GMEL.loc[delta_orig_all_GMEL['delta_percent'].gt(0)]
# one shape per line: below, equal to, above zero
print(delta_below_zero.shape, delta_equals_zero.shape, delta_above_zero.shape, sep='\n')

In [None]:
bin_min = delta_orig_all_GMEL.delta_percent.min()
bin_max = delta_orig_all_GMEL.delta_percent.max()
print(bin_min)
print(bin_max)

In [None]:
# 10%-wide bin edges for the GMEL origin histogram, kept separately for
# negative and positive deltas.  The negative side starts at a hard-coded -71;
# the positive side ends at the bin_max global.
# NOTE(review): the two lists leave a gap of +/-1e-9 around zero, and pd.cut
# needs increasing edges, so this presumably relies on bin_max > 100 -- confirm.
bins_below_zeros = [-71, -60.000000001, -50.000000001, -40.000000001, -30.000000001, 
                    -20.000000001, -10.000000001,  -0.000000001]
bins_above_zeros = [0.000000001, 10.000000001, 20.000000001, 30.000000001, 40.000000001, 50.000000001,
                     60.000000001, 70.000000001, 80.000000001, 90.000000001, 100.000000001, bin_max]

# One label per interval of the matching edge list, in the same order.
labels_below = ['- 61-70%', '- 51-60%','- 41-50%', '- 31-40%','- 21-30%', '- 11-20%','- up to 10%']
lables_above = ['+ up to 10%', '+ 11-20%','+ 21-30%', '+ 31-40%','+ 41-50%', '+ 51-60%','+ 61-70%', '+ 71-80%',
                '+ 81-90%', '+ 91-100%',' > 100%']

In [None]:
pd.cut(delta_below_zero['delta_percent'], precision=9,
                                bins=bins_below_zeros,retbins=True,include_lowest=True)[0].cat.categories

In [None]:
pd.cut(delta_above_zero['delta_percent'], precision=9,
                                bins=bins_above_zeros,retbins=True,include_lowest=True)[0].cat.categories

In [None]:
# Append the 10%-bin label to the tracts with a negative delta (GMEL origin).
# `assign` is used instead of the in-place `insert` so the cell can be re-run:
# a second `insert` fails because the 'delta_label' column already exists.
delta_below_zero = delta_below_zero.assign(
    delta_label=pd.cut(delta_below_zero['delta_percent'], precision=9,
                       bins=bins_below_zeros,
                       labels=labels_below, include_lowest=True))

delta_below_zero

In [None]:
# Append the 10%-bin label to the tracts with a positive delta (GMEL origin).
# `assign` is used instead of the in-place `insert` so the cell can be re-run:
# a second `insert` fails because the 'delta_label' column already exists.
delta_above_zero = delta_above_zero.assign(
    delta_label=pd.cut(delta_above_zero['delta_percent'], precision=9,
                       bins=bins_above_zeros,
                       labels=lables_above, include_lowest=True))

delta_above_zero

In [None]:
# Stack the labelled negative and positive parts and order them by delta.
# Rows with a delta of exactly zero belong to neither part and are left out.
# This rebinds delta_orig_all_binned_GMEL, replacing the coarse binning used for the maps.
delta_orig_all_binned_GMEL = (
    pd.concat([delta_below_zero, delta_above_zero], ignore_index=True)
    .sort_values(by=['delta_percent'])
)
delta_orig_all_binned_GMEL.shape

In [None]:
delta_orig_all_binned_GMEL.isnull().values.any()

In [None]:
# Placeholder rows, one per label listed here, in the column order of
# delta_orig_all_binned_GMEL.  Each row has a gt_count of 1e-9 and zeros
# elsewhere, so it adds practically nothing to a sum of gt_count.
# NOTE(review): ' < 100%' presumably stands for deltas below -100% -- confirm
# the intended label text.
data = [[0,0.000000001,0,0,0,' < 100%'], [0,0.000000001,0,0,0,'- 91-100%'],
        [0,0.000000001,0,0,0,'- 81-90%'],[0,0.000000001,0,0,0,'- 71-80%']]
empty_bins = pd.DataFrame(data,columns=delta_orig_all_binned_GMEL.columns.to_list())

In [None]:
delta_orig_all_binned_GMEL = pd.concat([empty_bins, delta_orig_all_binned_GMEL], ignore_index=True)
delta_orig_all_binned_GMEL.shape

In [None]:
# GMEL destination flows: split the tracts by the sign of their percentage delta

delta_below_zero = delta_dest_all_GMEL.loc[delta_dest_all_GMEL['delta_percent'].lt(0)]
delta_equals_zero = delta_dest_all_GMEL.loc[delta_dest_all_GMEL['delta_percent'].eq(0)]
delta_above_zero = delta_dest_all_GMEL.loc[delta_dest_all_GMEL['delta_percent'].gt(0)]
# one shape per line: below, equal to, above zero
print(delta_below_zero.shape, delta_equals_zero.shape, delta_above_zero.shape, sep='\n')

In [None]:
bin_min = delta_dest_all_GMEL.delta_percent.min()
bin_max = delta_dest_all_GMEL.delta_percent.max()
print(bin_min)
print(bin_max)

In [None]:
# 10%-wide bin edges for the GMEL destination histogram, kept separately for
# negative and positive deltas.  The positive side ends at the bin_max global.
# NOTE(review): deltas below the first edge (about -90) fall outside every bin,
# and pd.cut needs increasing edges, so this presumably relies on
# bin_max > 100 -- confirm.
bins_below_zeros = [-90.000000001,-80.000000001,-70.000000001, -60.000000001, -50.000000001,
                    -40.000000001, -30.000000001, -20.000000001, -10.000000001,  -0.000000001]
bins_above_zeros = [0.000000001, 10.000000001, 20.000000001, 30.000000001, 40.000000001, 50.000000001,
                     60.000000001, 70.000000001, 80.000000001, 90.000000001, 100.000000001, bin_max]

# One label per interval of the matching edge list, in the same order.
labels_below = ['- 81-90%', '- 71-80%', '- 61-70%', '- 51-60%','- 41-50%', '- 31-40%','- 21-30%', 
                '- 11-20%','- up to 10%']
lables_above = ['+ up to 10%', '+ 11-20%','+ 21-30%', '+ 31-40%','+ 41-50%', '+ 51-60%','+ 61-70%', '+ 71-80%',
                '+ 81-90%', '+ 91-100%',' > 100%']

In [None]:
pd.cut(delta_below_zero['delta_percent'], precision=9,
                                bins=bins_below_zeros,retbins=True,include_lowest=True)[0].cat.categories

In [None]:
pd.cut(delta_above_zero['delta_percent'], precision=9,
                                bins=bins_above_zeros,retbins=True,include_lowest=True)[0].cat.categories

In [None]:
# Append the 10%-bin label to the tracts with a negative delta (GMEL destination).
# `assign` is used instead of the in-place `insert` so the cell can be re-run:
# a second `insert` fails because the 'delta_label' column already exists.
delta_below_zero = delta_below_zero.assign(
    delta_label=pd.cut(delta_below_zero['delta_percent'], precision=9,
                       bins=bins_below_zeros,
                       labels=labels_below, include_lowest=True))

delta_below_zero

In [None]:
# Append the 10%-bin label to the tracts with a positive delta (GMEL destination).
# `assign` is used instead of the in-place `insert` so the cell can be re-run:
# a second `insert` fails because the 'delta_label' column already exists.
delta_above_zero = delta_above_zero.assign(
    delta_label=pd.cut(delta_above_zero['delta_percent'], precision=9,
                       bins=bins_above_zeros,
                       labels=lables_above, include_lowest=True))

delta_above_zero

In [None]:
# Stack the labelled negative and positive parts and order them by delta.
# Rows with a delta of exactly zero belong to neither part and are left out.
# This rebinds delta_dest_all_binned_GMEL, replacing the coarse binning used for the maps.
delta_dest_all_binned_GMEL = (
    pd.concat([delta_below_zero, delta_above_zero], ignore_index=True)
    .sort_values(by=['delta_percent'])
)
delta_dest_all_binned_GMEL.shape

In [None]:
delta_dest_all_binned_GMEL.isnull().values.any()

In [None]:
# Placeholder rows, one per label listed here, in the column order of
# delta_dest_all_binned_GMEL.  Each row has a gt_count of 1e-9 and zeros
# elsewhere, so it adds practically nothing to a sum of gt_count.
# NOTE(review): ' < 100%' presumably stands for deltas below -100% -- confirm
# the intended label text.
data = [[0,0.000000001,0,0,0,' < 100%'], [0,0.000000001,0,0,0,'- 91-100%']]
empty_bins = pd.DataFrame(data,columns=delta_dest_all_binned_GMEL.columns.to_list())

In [None]:
delta_dest_all_binned_GMEL = pd.concat([empty_bins, delta_dest_all_binned_GMEL], ignore_index=True)
delta_dest_all_binned_GMEL.shape

In [None]:
# Histogram of ground-truth commuters per delta-percentage bin:
# origin flows, OSM features

fig = px.histogram(
    delta_orig_all_binned,
    x='delta_label',
    y='gt_count',
    labels={'delta_label': 'delta percentage', 'gt_count': 'commuters'},
)
# fixed y range and canvas size
fig.update_layout(yaxis_range=[0, 120000], width=1024, height=720, font_size=24);
#fig.show()
#pio.write_image(fig, 'nyc_osm_origin_hist.pdf')

In [None]:
# Histogram of ground-truth commuters per delta-percentage bin:
# destination flows, OSM features

fig = px.histogram(
    delta_dest_all_binned,
    x='delta_label',
    y='gt_count',
    labels={'delta_label': 'delta percentage', 'gt_count': 'commuters'},
)
# fixed y range and canvas size
fig.update_layout(yaxis_range=[0, 160000], width=1024, height=720, font_size=24)
#fig.show()
#pio.write_image(fig, 'nyc_osm_destination_hist.pdf')

In [None]:
# Histogram of ground-truth commuters per delta-percentage bin:
# origin flows, GMEL features

fig = px.histogram(
    delta_orig_all_binned_GMEL,
    x='delta_label',
    y='gt_count',
    labels={'delta_label': 'delta percentage', 'gt_count': 'commuters'},
)
# fixed y range and canvas size
fig.update_layout(yaxis_range=[0, 120000], width=1024, height=720, font_size=24);
#fig.show()
#pio.write_image(fig, 'nyc_gmel_origin_hist.pdf')

In [None]:
# Histogram of ground-truth commuters per delta-percentage bin:
# destination flows, GMEL features

fig = px.histogram(
    delta_dest_all_binned_GMEL,
    x='delta_label',
    y='gt_count',
    labels={'delta_label': 'delta percentage', 'gt_count': 'commuters'},
)
# fixed y range and canvas size
fig.update_layout(yaxis_range=[0, 160000], width=1024, height=720, font_size=24)
#fig.show()
#pio.write_image(fig, 'nyc_gmel_destination_hist.pdf')

NYC Scatter plots:

In [None]:
# Predicted (OSM) against ground-truth commuters per origin tract.
fig = px.scatter(
    x=gt_orig_all.gt_count,
    y=pred_orig_all.pred_count,  #trendline='ols',
    labels={'x': 'Ground truth commuters count',
            'y': 'Prediction commuters count'},
)
# same range on both axes, fixed canvas size
fig.update_layout(
    xaxis_range=[-100, 1640],
    yaxis_range=[-100, 1640],
    width=1024, height=720, font_size=24,
)

# y = x reference line from (0, 0) to (1600, 1600)
fig.add_shape(
    type="line",
    x0=0, y0=0, x1=1600, y1=1600,
    line=dict(color='#636EFA', width=2),
)
#fig.show()
#pio.write_image(fig, 'nyc_osm_origin_scatter.pdf')

In [None]:
# Predicted (GMEL) against ground-truth commuters per origin tract.
fig = px.scatter(
    x=gt_orig_all.gt_count,
    y=pred_orig_all_GMEL.pred_count,  #trendline='ols',
    labels={'x': 'Ground truth commuters count',
            'y': 'Prediction commuters count'},
)
# same range on both axes, fixed canvas size
fig.update_layout(
    xaxis_range=[-100, 1640],
    yaxis_range=[-100, 1640],
    width=1024, height=720, font_size=24,
)

# y = x reference line from (0, 0) to (1600, 1600)
fig.add_shape(
    type="line",
    x0=0, y0=0, x1=1600, y1=1600,
    line=dict(color='#636EFA', width=2),
)
#fig.show()
#pio.write_image(fig, 'nyc_gmel_origin_scatter.pdf')

In [None]:
# Predicted (OSM) against ground-truth commuters per destination tract, log-log axes.
fig = px.scatter(
    x=gt_dest_all.gt_count,
    y=pred_dest_all.pred_count,
    log_x=True,
    log_y=True,  #trendline='ols',
    labels={'x': 'Ground truth commuters count',
            'y': 'Prediction commuters count'},
)
fig.update_layout(width=1024, height=720, font_size=24)
# y = x reference line from (1, 1) to (10000, 10000)
fig.add_shape(
    type="line",
    x0=1, y0=1, x1=10000, y1=10000,
    line=dict(color='#636EFA', width=2),
)
#fig.show()
#pio.write_image(fig, 'nyc_osm_destination_log_scatter.pdf')

In [None]:
# Predicted (GMEL) against ground-truth commuters per destination tract, log-log axes.
fig = px.scatter(
    x=gt_dest_all.gt_count,
    y=pred_dest_all_GMEL.pred_count,
    log_x=True,
    log_y=True,  #trendline='ols',
    labels={'x': 'Ground truth commuters count',
            'y': 'Prediction commuters count'},
)
fig.update_layout(width=1024, height=720, font_size=24)
# y = x reference line from (1, 1) to (10000, 10000)
fig.add_shape(
    type="line",
    x0=1, y0=1, x1=10000, y1=10000,
    line=dict(color='#636EFA', width=2),
)
#fig.show()
#pio.write_image(fig, 'nyc_gmel_destination_log_scatter.pdf')

Some stats:

In [None]:
gt_orig_all.describe().gt_count.to_frame().rename(columns={'gt_count' : 'GT origin stats'})

In [None]:
gt_dest_all.describe().gt_count.to_frame().rename(columns={'gt_count' : 'GT destination stats'})

Processing test flows for a single origin tract and a single destination tract (OSM and GMEL predictions):

In [None]:
gt_flows[gt_flows.h_geoid==gt_flows.w_geoid]

In [None]:
# Origin tract whose total outflow equals the median total.
# With an even number of tracts the median can fall between two totals and
# match no row; in that case fall back to the tract closest to the median
# instead of failing on an empty selection.
orig_median = gt_orig_all.gt_count.median()
orig_candidates = gt_orig_all[gt_orig_all.gt_count == orig_median]
if orig_candidates.empty:
    nearest_idx = (gt_orig_all.gt_count - orig_median).abs().idxmin()
    orig_candidates = gt_orig_all.loc[[nearest_idx]]
orig_geoid = orig_candidates.iloc[0].h_geoid

In [None]:
# Destination tract whose total inflow equals the median total; as before, the
# second matching tract is taken when there is one.
# If no tract matches the median exactly, fall back to the tract closest to it;
# if only one tract qualifies, use that one instead of failing on iloc[1].
dest_median = gt_dest_all.gt_count.median()
dest_candidates = gt_dest_all[gt_dest_all.gt_count == dest_median]
if dest_candidates.empty:
    nearest_idx = (gt_dest_all.gt_count - dest_median).abs().idxmin()
    dest_candidates = gt_dest_all.loc[[nearest_idx]]
dest_geoid = dest_candidates.iloc[min(1, len(dest_candidates) - 1)].w_geoid

In [None]:
gt_flows.shape

In [None]:
# Ground-truth flows into the fixed destination tract, one row per origin ...
gt_orig_dest_frozen = gt_flows[gt_flows['w_geoid'] == dest_geoid][['h_geoid', 'gt_count']].copy()
# ... and out of the fixed origin tract, one row per destination.
gt_dest_orig_frozen = gt_flows[gt_flows['h_geoid'] == orig_geoid][['w_geoid', 'gt_count']].copy()

In [None]:
pred_flows.shape

In [None]:
# OSM predictions: flows into the fixed destination / out of the fixed origin.
pred_orig_dest_frozen = pred_flows[pred_flows['w_geoid'] == dest_geoid][['h_geoid', 'pred_count']].copy()
pred_dest_orig_frozen = pred_flows[pred_flows['h_geoid'] == orig_geoid][['w_geoid', 'pred_count']].copy()
# GMEL predictions: same two selections.
GMEL_orig_dest_frozen = pred_flows_GMEL[pred_flows_GMEL['w_geoid'] == dest_geoid][['h_geoid', 'pred_count']].copy()
GMEL_dest_orig_frozen = pred_flows_GMEL[pred_flows_GMEL['h_geoid'] == orig_geoid][['w_geoid', 'pred_count']].copy()

In [None]:
# Round the predicted counts of all four single-tract frames to whole numbers
# (each frame is updated in place).

for _frame in (pred_orig_dest_frozen, pred_dest_orig_frozen,
               GMEL_orig_dest_frozen, GMEL_dest_orig_frozen):
    _frame['pred_count'] = _frame['pred_count'].round(0)

Processing predicted origin, single destination

In [None]:
gt_flows.shape

In [None]:
gt_orig_dest_frozen.shape

In [None]:
gt_orig_dest_frozen[gt_orig_dest_frozen.gt_count == 0].shape

In [None]:
pred_orig_dest_frozen[pred_orig_dest_frozen.pred_count==0].shape

In [None]:
GMEL_orig_dest_frozen[GMEL_orig_dest_frozen.pred_count==0].shape

In [None]:
# Every flow starts out as 'not matched'.
for _frame in (gt_orig_dest_frozen, pred_orig_dest_frozen, GMEL_orig_dest_frozen):
    _frame['label'] = 'not matched'

In [None]:
# A flow counts as 'matched' when prediction and ground truth agree on whether
# any commuters exist: both are zero, or both are positive.
osm_agrees = (
    ((gt_orig_dest_frozen.gt_count == 0) & (pred_orig_dest_frozen.pred_count == 0))
    | ((gt_orig_dest_frozen.gt_count > 0) & (pred_orig_dest_frozen.pred_count > 0))
)
# the ground-truth frame carries the OSM verdict
gt_orig_dest_frozen.loc[osm_agrees, 'label'] = 'matched'
pred_orig_dest_frozen.loc[osm_agrees, 'label'] = 'matched'

gmel_agrees = (
    ((gt_orig_dest_frozen.gt_count == 0) & (GMEL_orig_dest_frozen.pred_count == 0))
    | ((gt_orig_dest_frozen.gt_count > 0) & (GMEL_orig_dest_frozen.pred_count > 0))
)
GMEL_orig_dest_frozen.loc[gmel_agrees, 'label'] = 'matched'

In [None]:
pred_orig_dest_frozen.groupby(by='label').count()

In [None]:
GMEL_orig_dest_frozen.groupby(by='label').count()

In [None]:
gt_orig_dest_frozen[(gt_orig_dest_frozen.gt_count == 0) & (pred_orig_dest_frozen.pred_count == 0)].shape

In [None]:
gt_orig_dest_frozen[(gt_orig_dest_frozen.gt_count == 0) & (GMEL_orig_dest_frozen.pred_count == 0)].shape

In [None]:
gt_orig_dest_frozen[(gt_orig_dest_frozen.gt_count != 0) & (pred_orig_dest_frozen.pred_count != 0)].shape

In [None]:
gt_orig_dest_frozen[(gt_orig_dest_frozen.gt_count != 0) & (GMEL_orig_dest_frozen.pred_count != 0)].shape

In [None]:
#gt_orig_dest_frozen.groupby(by='gt_count').count()

In [None]:
#pred_orig_dest_frozen.groupby(by='pred_count').count()

In [None]:
no_orig_flows = nyc_all_geoids[~nyc_all_geoids.isin(gt_orig_dest_frozen.h_geoid)].to_frame().\
    rename(columns={'GEOID' : 'h_geoid'})
no_orig_flows['gt_count'] = -1
no_orig_flows['label'] = 'no flows'
no_orig_flows.shape

In [None]:
no_orig_flows.loc[no_orig_flows.h_geoid==dest_geoid, 'label'] = 'destination'
no_orig_flows.loc[no_orig_flows.h_geoid==dest_geoid, 'gt_count'] = 0

In [None]:
gt_orig_dest_frozen = pd.concat([gt_orig_dest_frozen,no_orig_flows], ignore_index=True)
gt_orig_dest_frozen.shape

In [None]:
no_orig_flows = no_orig_flows.rename(columns={'gt_count' : 'pred_count'}).copy()
pred_orig_dest_frozen = pd.concat([pred_orig_dest_frozen,no_orig_flows], ignore_index=True)
pred_orig_dest_frozen.shape

In [None]:
GMEL_orig_dest_frozen = pd.concat([GMEL_orig_dest_frozen,no_orig_flows], ignore_index=True)
GMEL_orig_dest_frozen.shape

In [None]:
pred_orig_dest_frozen.groupby(by='label').count()

In [None]:
GMEL_orig_dest_frozen.groupby(by='label').count()

In [None]:
# Sort each frame by its numeric count (largest first), then store the count as a string.
gt_orig_dest_frozen = (
    gt_orig_dest_frozen
    .sort_values(by=['gt_count'], ascending=False)
    .astype({'gt_count': str})
)
pred_orig_dest_frozen = (
    pred_orig_dest_frozen
    .sort_values(by=['pred_count'], ascending=False)
    .astype({'pred_count': str})
)
GMEL_orig_dest_frozen = (
    GMEL_orig_dest_frozen
    .sort_values(by=['pred_count'], ascending=False)
    .astype({'pred_count': str})
)

Processing predicted destination, single origin

In [None]:
gt_orig_dest_frozen.shape

In [None]:
gt_dest_orig_frozen.shape

In [None]:
gt_dest_orig_frozen[gt_dest_orig_frozen.gt_count == 0].shape

In [None]:
pred_dest_orig_frozen[pred_dest_orig_frozen.pred_count==0].shape

In [None]:
GMEL_dest_orig_frozen[GMEL_dest_orig_frozen.pred_count==0].shape

In [None]:
gt_dest_orig_frozen['label'] = 'not matched'
pred_dest_orig_frozen['label'] = 'not matched'
GMEL_dest_orig_frozen['label'] = 'not matched'

In [None]:
gt_dest_orig_frozen.loc[((gt_dest_orig_frozen.gt_count == 0) & (pred_dest_orig_frozen.pred_count == 0) |
                         (gt_dest_orig_frozen.gt_count > 0) & (pred_dest_orig_frozen.pred_count > 0)),
                        'label'] = 'matched'

pred_dest_orig_frozen.loc[((gt_dest_orig_frozen.gt_count == 0) & (pred_dest_orig_frozen.pred_count == 0) |
                         (gt_dest_orig_frozen.gt_count > 0) & (pred_dest_orig_frozen.pred_count > 0)),
                        'label'] = 'matched'
GMEL_dest_orig_frozen.loc[((gt_dest_orig_frozen.gt_count == 0) & (GMEL_dest_orig_frozen.pred_count == 0) |
                         (gt_dest_orig_frozen.gt_count > 0) & (GMEL_dest_orig_frozen.pred_count > 0)),
                        'label'] = 'matched'

In [None]:
pred_dest_orig_frozen.groupby(by='label').count()

In [None]:
GMEL_dest_orig_frozen.groupby(by='label').count()

In [None]:
gt_dest_orig_frozen[(gt_dest_orig_frozen.gt_count == 0) & (pred_dest_orig_frozen.pred_count == 0)].shape

In [None]:
gt_dest_orig_frozen[(gt_dest_orig_frozen.gt_count == 0) & (GMEL_dest_orig_frozen.pred_count == 0)].shape

In [None]:
gt_dest_orig_frozen[(gt_dest_orig_frozen.gt_count != 0) & (pred_dest_orig_frozen.pred_count != 0)].shape

In [None]:
gt_dest_orig_frozen[(gt_dest_orig_frozen.gt_count != 0) & (GMEL_dest_orig_frozen.pred_count != 0)].shape

In [None]:
#gt_dest_orig_frozen.groupby(by='gt_count').count()

In [None]:
#pred_dest_orig_frozen.groupby(by='pred_count').count()

In [None]:
no_dest_flows = nyc_all_geoids[~nyc_all_geoids.isin(gt_dest_orig_frozen.w_geoid)].to_frame().\
    rename(columns={'GEOID' : 'w_geoid'})
no_dest_flows['gt_count'] = -1
no_dest_flows['label'] = 'no flows'
no_dest_flows.shape

In [None]:
no_dest_flows.loc[no_dest_flows.w_geoid==orig_geoid, 'label'] = 'origin'
no_dest_flows.loc[no_dest_flows.w_geoid==orig_geoid, 'gt_count'] = 0

In [None]:
gt_dest_orig_frozen = pd.concat([gt_dest_orig_frozen,no_dest_flows], ignore_index=True)
gt_dest_orig_frozen.shape

In [None]:
no_dest_flows = no_dest_flows.rename(columns={'gt_count' : 'pred_count'}).copy()
pred_dest_orig_frozen = pd.concat([pred_dest_orig_frozen,no_dest_flows], ignore_index=True)
pred_dest_orig_frozen.shape

In [None]:
GMEL_dest_orig_frozen = pd.concat([GMEL_dest_orig_frozen,no_dest_flows], ignore_index=True)
GMEL_dest_orig_frozen.shape

In [None]:
# Sort each frame by its numeric count (largest first), then store the count as a string.
gt_dest_orig_frozen = (
    gt_dest_orig_frozen
    .sort_values(by=['gt_count'], ascending=False)
    .astype({'gt_count': str})
)
pred_dest_orig_frozen = (
    pred_dest_orig_frozen
    .sort_values(by=['pred_count'], ascending=False)
    .astype({'pred_count': str})
)
GMEL_dest_orig_frozen = (
    GMEL_dest_orig_frozen
    .sort_values(by=['pred_count'], ascending=False)
    .astype({'pred_count': str})
)

Origin and destination maps!

In [None]:
# OSM origin, single destination

fig = px.choropleth(pred_orig_dest_frozen, geojson=nyc_tracts, locations='h_geoid', featureidkey="properties.GEOID",\
                 color='label',
                 color_discrete_sequence=[
                    'rgb(124, 252, 0)','rgb(107,174,214)','rgb(255, 0, 0)','rgb(255,255,255)']
                    #'rgb(33,113,181)','rgb(8,48,107)','rgb(235, 249, 246)']
                 )

fig.update_geos(fitbounds="locations", visible=False)
fig.update_traces(marker_line_width=0.1, marker_opacity=0.8)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.update_layout(width =1024, height=720, font_size=20); 
fig.update_layout(legend_x=0.8) 
fig.update_layout(legend_y=0.8) 
fig.update_layout(legend=dict(font=dict(size= 16)))
fig.update_layout(legend_title='Origin')
#fig.show()

#pio.write_image(fig, 'nyc_osm_dest_frozen.pdf')

In [None]:
# OSM destination, single origin

fig = px.choropleth(pred_dest_orig_frozen, geojson=nyc_tracts, locations='w_geoid', featureidkey="properties.GEOID",\
                 color='label',
                 color_discrete_sequence=[
                    'rgb(124, 252, 0)','rgb(107,174,214)','rgb(255, 0, 0)','rgb(255,255,255)']
                 )

fig.update_geos(fitbounds="locations", visible=False)
fig.update_traces(marker_line_width=0.1, marker_opacity=0.8)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.update_layout(width =1024, height=720, font_size=20); 
fig.update_layout(legend_x=0.8) 
fig.update_layout(legend_y=0.8) 
fig.update_layout(legend=dict(font=dict(size= 16)))
fig.update_layout(legend_title='Destination')
#fig.show()

#pio.write_image(fig, 'nyc_osm_orig_frozen.pdf')

In [None]:
# GMEL origin, single destination

fig = px.choropleth(GMEL_orig_dest_frozen, geojson=nyc_tracts, locations='h_geoid', featureidkey="properties.GEOID",\
                 color='label',
                 color_discrete_sequence=[
                    'rgb(124, 252, 0)','rgb(107,174,214)','rgb(255, 0, 0)','rgb(255,255,255)']
                    #'rgb(33,113,181)','rgb(8,48,107)','rgb(235, 249, 246)']
                 )

fig.update_geos(fitbounds="locations", visible=False)
fig.update_traces(marker_line_width=0.1, marker_opacity=0.8)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.update_layout(width =1024, height=720, font_size=20); 
fig.update_layout(legend_x=0.8) 
fig.update_layout(legend_y=0.8) 
fig.update_layout(legend=dict(font=dict(size= 16)))
fig.update_layout(legend_title='Origin')
#fig.show()

#pio.write_image(fig, 'nyc_gmel_dest_frozen.pdf')

In [None]:
# GMEL destination, single origin

fig = px.choropleth(GMEL_dest_orig_frozen, geojson=nyc_tracts, locations='w_geoid', featureidkey="properties.GEOID",\
                 color='label',
                 color_discrete_sequence=[
                    'rgb(124, 252, 0)','rgb(107,174,214)','rgb(255, 0, 0)','rgb(255,255,255)']
                 )

fig.update_geos(fitbounds="locations", visible=False)
fig.update_traces(marker_line_width=0.1, marker_opacity=0.8)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.update_layout(width =1024, height=720, font_size=20); 
fig.update_layout(legend_x=0.8) 
fig.update_layout(legend_y=0.8) 
fig.update_layout(legend=dict(font=dict(size= 16)))
fig.update_layout(legend_title='Destination')
#fig.show()

#pio.write_image(fig, 'nyc_gmel_orig_frozen.pdf')

In [None]:
gt_orig_dest_frozen.shape

In [None]:
gt_dest_orig_frozen.shape

In [None]:
gt_orig_dest_frozen.groupby(by='label').count()

In [None]:
gt_dest_orig_frozen.groupby(by='label').count()

FFX analysis:

In [None]:
# Load the FFX census-tract shapefile and keep its GEOIDs as integers.
ffx_census = geopandas.read_file('ffx_data/ffx_census_tracts/ffx_census_2010.shp')
ffx_census.head()  # not displayed: only the last expression of a cell is rendered
ffx_all_geoids = ffx_census['GEOID'].astype(int)
ffx_all_geoids.shape

Processing test set ground truth flows:

In [None]:
gt_flows_ffx = pd.read_csv('ffx_data/LODES/CommutingFlow_2015_test_exp.csv')  
gt_flows_ffx = gt_flows_ffx.rename(columns={'count' : 'gt_count'})
gt_flows_ffx.shape

In [None]:
gt_orig_ffx = gt_flows_ffx.groupby(['h_geoid'], as_index=False).sum()
gt_dest_ffx = gt_flows_ffx.groupby(['w_geoid'], as_index=False).sum()

In [None]:
gt_orig_ffx[gt_orig_ffx['gt_count']==0].shape

In [None]:
gt_dest_ffx[gt_dest_ffx['gt_count']==0].shape

In [None]:
gt_dest_ffx.loc[gt_dest_ffx['gt_count']==0,'gt_count'] = 1
gt_dest_ffx[gt_dest_ffx['gt_count']==0].shape

Processing test set prediction flows using OSM features:   

In [None]:
pred_flows_ffx = pd.read_csv('ffx_predicted_flows_exp_osm.csv')
pred_flows_ffx.shape

In [None]:
gt_flows_ffx[gt_flows_ffx['gt_count'] == 0].shape

In [None]:
pred_flows_ffx[pred_flows_ffx.pred_count == 0].shape

In [None]:
pred_flows_ffx[pred_flows_ffx['pred_count'] < 0]['pred_count'].shape

In [None]:
gt_flows_ffx[(pred_flows_ffx['pred_count'] < 0) & (gt_flows_ffx.gt_count==0)].shape

In [None]:
pred_flows_ffx[pred_flows_ffx['pred_count'] < 0]['pred_count'].sum()

In [None]:
pred_flows_ffx.loc[pred_flows_ffx['pred_count'] < 0, 'pred_count'] = 0
pred_flows_ffx[pred_flows_ffx['pred_count'] < 0]['pred_count'].sum()

In [None]:
gt_flows_ffx['gt_count'].sum()

In [None]:
pred_flows_ffx['pred_count'].sum()

In [None]:
pred_orig_ffx = pred_flows_ffx.groupby(['h_geoid'], as_index=False).sum()
pred_dest_ffx = pred_flows_ffx.groupby(['w_geoid'], as_index=False).sum()

In [None]:
gt_orig_ffx['gt_count'].sum()

In [None]:
pred_orig_ffx['pred_count'].sum()

In [None]:
gt_dest_ffx['gt_count'].sum()

In [None]:
pred_dest_ffx['pred_count'].sum()

FFX Scatter plots:

In [None]:
fig = px.scatter(x=gt_orig_ffx.gt_count, y=pred_orig_ffx.pred_count, #trendline='ols',
                 labels={
                     'x': 'Ground truth commuters count',
                     'y': 'Prediction commuters count'
                 })
fig.update_layout(xaxis_range=[-50,950])
fig.update_layout(yaxis_range=[-50,950])
fig.update_layout(width =1024, height=720, font_size=24)

fig.add_shape(type="line",x0=0, y0=0, x1=900, y1=900,
    line=dict(color='#636EFA',width=2)
)
#fig.show()
#pio.write_image(fig, 'ffx_osm_origin_scatter.pdf')

In [None]:
fig = px.scatter(x=gt_dest_ffx.gt_count, y=pred_dest_ffx.pred_count,log_x=True, log_y=True, #trendline='ols',
                 labels={
                     'x': 'Ground truth commuters count',
                     'y': 'Prediction commuters count'
                 })
fig.update_layout(width =1024, height=720, font_size=24)

fig.add_shape(type="line",x0=1, y0=1, x1=6000, y1=6000,
    line=dict(color='#636EFA',width=2)
)
#fig.show()
#pio.write_image(fig, 'ffx_osm_destination_log_scatter.pdf')

FFX stats:

In [None]:
gt_orig_ffx.describe().gt_count.to_frame().rename(columns={'gt_count' : 'GT origin stats'})

In [None]:
gt_dest_ffx.describe().gt_count.to_frame().rename(columns={'gt_count' : 'GT destination stats'})

FFX histograms with 10% bins:

In [None]:
# delta percentage of all origin flows

delta_orig_ffx = gt_orig_ffx[['h_geoid', 'gt_count']]
delta_orig_ffx['pred_count'] = pred_orig_ffx['pred_count']
delta_orig_ffx['delta'] = pred_orig_ffx['pred_count'] - gt_orig_ffx['gt_count']
delta_orig_ffx['delta_percent'] = ((delta_orig_ffx['delta'] / delta_orig_ffx['gt_count'])*100)
delta_orig_ffx = delta_orig_ffx.sort_values(by=['delta_percent'], ascending=False)
delta_orig_ffx

In [None]:
# delta percentage of all destination flows

delta_dest_ffx = gt_dest_ffx[['w_geoid', 'gt_count']]
delta_dest_ffx['pred_count'] = pred_dest_ffx['pred_count']
delta_dest_ffx['delta'] = pred_dest_ffx['pred_count'] - gt_dest_ffx['gt_count']
delta_dest_ffx['delta_percent'] = ((delta_dest_ffx['delta'] / delta_dest_ffx['gt_count'])*100)
delta_dest_ffx = delta_dest_ffx.sort_values(by=['delta_percent'], ascending=False)
delta_dest_ffx

In [None]:
# OSM orign flows

delta_below_zero = delta_orig_ffx[delta_orig_ffx.delta_percent < 0]
delta_equals_zero = delta_orig_ffx[delta_orig_ffx.delta_percent == 0]
delta_above_zero = delta_orig_ffx[delta_orig_ffx.delta_percent > 0]
print(delta_below_zero.shape)
print(delta_equals_zero.shape)
print(delta_above_zero.shape)

In [None]:
bin_min = delta_orig_ffx.delta_percent.min()
bin_max = delta_orig_ffx.delta_percent.max()
print(bin_min)
print(bin_max)

In [None]:
bins_below_zeros = [-50.000000001, -40.000000001, -30.000000001, 
                    -20.000000001, -10.000000001,  -0.000000001]
bins_above_zeros = [0.000000001, 10.000000001, 20.000000001, 30.000000001, 40.000000001, 50.000000001,
                     60.000000001, 70.000000001, 80.000000001, 90.000000001, 100.000000001, bin_max]

labels_below = ['- 41-50%', '- 31-40%','- 21-30%', '- 11-20%','- up to 10%']
lables_above = ['+ up to 10%', '+ 11-20%','+ 21-30%', '+ 31-40%','+ 41-50%', '+ 51-60%','+ 61-70%', '+ 71-80%',
                '+ 81-90%', '+ 91-100%',' > 100%']

In [None]:
pd.cut(delta_below_zero['delta_percent'], precision=9,
                                bins=bins_below_zeros,retbins=True,include_lowest=True)[0].cat.categories

In [None]:
pd.cut(delta_above_zero['delta_percent'], precision=9,
                                bins=bins_above_zeros,retbins=True,include_lowest=True)[0].cat.categories

In [None]:
delta_below_zero.insert(delta_below_zero.shape[1],
                        'delta_label',
                        pd.cut(delta_below_zero['delta_percent'], precision=9,
                                bins=bins_below_zeros,
                                labels=labels_below, include_lowest = True))

delta_below_zero

In [None]:
delta_above_zero.insert(delta_above_zero.shape[1],
                        'delta_label',
                        pd.cut(delta_above_zero['delta_percent'], precision=9,
                                bins=bins_above_zeros,
                                labels=lables_above, include_lowest = True))

delta_above_zero                                

In [None]:
delta_orig_ffx_binned = pd.concat([delta_below_zero, delta_above_zero], ignore_index=True)
delta_orig_ffx_binned = delta_orig_ffx_binned.sort_values(by=['delta_percent'])
delta_orig_ffx_binned.shape

In [None]:
delta_orig_ffx_binned.isnull().values.any()

In [None]:
data = [[0,0.000000001,0,0,0,' < 100%'], [0,0.000000001,0,0,0,'- 91-100%'],
        [0,0.000000001,0,0,0,'- 81-90%'],[0,0.000000001,0,0,0,'- 71-80%'],
        [0,0.000000001,0,0,0,'- 61-70%'],[0,0.000000001,0,0,0,'- 51-60%']]
empty_bins = pd.DataFrame(data,columns=delta_orig_ffx_binned.columns.to_list())

In [None]:
delta_orig_ffx_binned = pd.concat([empty_bins, delta_orig_ffx_binned], ignore_index=True)
delta_orig_ffx_binned.shape

In [None]:
# OSM destination flows

delta_below_zero = delta_dest_ffx[delta_dest_ffx.delta_percent < 0]
delta_equals_zero = delta_dest_ffx[delta_dest_ffx.delta_percent == 0]
delta_above_zero = delta_dest_ffx[delta_dest_ffx.delta_percent > 0]
print(delta_below_zero.shape)
print(delta_equals_zero.shape)
print(delta_above_zero.shape)

In [None]:
bin_min = delta_dest_ffx.delta_percent.min()
bin_max = delta_dest_ffx.delta_percent.max()
print(bin_min)
print(bin_max)

In [None]:
bins_below_zeros = [-90.000000001,-80.000000001,-70.000000001, -60.000000001, -50.000000001, -40.000000001,
                    -30.000000001, -20.000000001, -10.000000001,  -0.000000001]
bins_above_zeros = [0.000000001, 10.000000001, 20.000000001, 30.000000001, 40.000000001, 50.000000001,
                     60.000000001, 70.000000001, 80.000000001, 90.000000001, 100.000000001, bin_max]

labels_below = ['- 81-90%', '- 71-80%', '- 61-70%', '- 51-60%','- 41-50%', '- 31-40%','- 21-30%', 
                '- 11-20%','- up to 10%']
lables_above = ['+ up to 10%', '+ 11-20%','+ 21-30%', '+ 31-40%','+ 41-50%', '+ 51-60%','+ 61-70%', '+ 71-80%',
                '+ 81-90%', '+ 91-100%',' > 100%']

In [None]:
pd.cut(delta_below_zero['delta_percent'], precision=9,
                                bins=bins_below_zeros,retbins=True,include_lowest=True)[0].cat.categories

In [None]:
pd.cut(delta_above_zero['delta_percent'], precision=9,
                                bins=bins_above_zeros,retbins=True,include_lowest=True)[0].cat.categories

In [None]:
delta_below_zero.insert(delta_below_zero.shape[1],
                        'delta_label',
                        pd.cut(delta_below_zero['delta_percent'], precision=9,
                                bins=bins_below_zeros,
                                labels=labels_below, include_lowest = True))

delta_below_zero

In [None]:
delta_above_zero.insert(delta_above_zero.shape[1],
                        'delta_label',
                        pd.cut(delta_above_zero['delta_percent'], precision=9,
                                bins=bins_above_zeros,
                                labels=lables_above, include_lowest = True))

delta_above_zero                                

In [None]:
delta_dest_ffx_binned = pd.concat([delta_below_zero, delta_above_zero], ignore_index=True)
delta_dest_ffx_binned = delta_dest_ffx_binned.sort_values(by=['delta_percent'])
delta_dest_ffx_binned.shape

In [None]:
delta_dest_ffx_binned.isnull().values.any()

In [None]:
data = [[0,0.000000001,0,0,0,' < 100%'], [0,0.000000001,0,0,0,'- 91-100%']]
empty_bins = pd.DataFrame(data,columns=delta_dest_ffx_binned.columns.to_list())

In [None]:
delta_dest_ffx_binned = pd.concat([empty_bins, delta_dest_ffx_binned], ignore_index=True)
delta_dest_ffx_binned.shape

In [None]:
# delta percentages histogram of all origin flows using OSM features

fig = px.histogram(delta_orig_ffx_binned,x='delta_label',y='gt_count',
                   labels={'delta_label':'delta percentage','gt_count':'commuters'})
fig.update_layout(yaxis_range=[0,10000])
fig.update_layout(width =1024, height=720, font_size=24); 
#fig.show()
#pio.write_image(fig, 'ffx_osm_origin_hist.pdf')

In [None]:
# delta percentages histogram of all destination flows using OSM features

fig = px.histogram(delta_dest_ffx_binned,x='delta_label',y='gt_count',
                   labels={'delta_label':'delta percentage','gt_count':'commuters'})
fig.update_layout(yaxis_range=[0,10000])
fig.update_layout(width =1024, height=720, font_size=24)
#fig.show()
#pio.write_image(fig, 'ffx_osm_destination_hist.pdf')

Processing FFX transfer learning test set prediction flows using OSM features:   

In [None]:
pred_flows_ffx = pd.read_csv('ffx_predicted_flows_exp_osm_transfer.csv')
pred_flows_ffx.shape

In [None]:
gt_flows_ffx[gt_flows_ffx['gt_count'] == 0].shape

In [None]:
pred_flows_ffx[pred_flows_ffx.pred_count == 0].shape

In [None]:
pred_flows_ffx[pred_flows_ffx['pred_count'] < 0]['pred_count'].shape

In [None]:
gt_flows_ffx[(pred_flows_ffx['pred_count'] < 0) & (gt_flows_ffx.gt_count==0)].shape

In [None]:
pred_flows_ffx[pred_flows_ffx['pred_count'] < 0]['pred_count'].sum()

In [None]:
pred_flows_ffx.loc[pred_flows_ffx['pred_count'] < 0, 'pred_count'] = 0
pred_flows_ffx[pred_flows_ffx['pred_count'] < 0]['pred_count'].sum()

In [None]:
gt_flows_ffx['gt_count'].sum()

In [None]:
pred_flows_ffx['pred_count'].sum()

In [None]:
pred_orig_ffx = pred_flows_ffx.groupby(['h_geoid'], as_index=False).sum()
pred_dest_ffx = pred_flows_ffx.groupby(['w_geoid'], as_index=False).sum()

In [None]:
gt_orig_ffx['gt_count'].sum()

In [None]:
pred_orig_ffx['pred_count'].sum()

In [None]:
gt_dest_ffx['gt_count'].sum()

In [None]:
pred_dest_ffx['pred_count'].sum()

FFX scatter plots (transfer learning):

In [None]:
fig = px.scatter(x=gt_orig_ffx.gt_count, y=pred_orig_ffx.pred_count, #trendline='ols',
                 labels={
                     'x': 'Ground truth origin count',
                     'y': 'OSM features origin prediction count'
                 })
fig.update_layout(xaxis_range=[-50,950])
fig.update_layout(yaxis_range=[-50,950])
fig.update_layout(width =1024, height=720, font_size=24)

fig.add_shape(type="line",x0=0, y0=0, x1=900, y1=900,
    line=dict(color='#636EFA',width=2)
)
#fig.show()

#pio.write_image(fig, 'ffx_osm_origin_scatter_transfer.pdf')

In [None]:
fig = px.scatter(x=gt_dest_ffx.gt_count, y=pred_dest_ffx.pred_count,log_x=True, log_y=True, #trendline='ols',
                 labels={
                     'x': 'Ground truth destination count',
                     'y': 'OSM features destination prediction count'
                 })
fig.update_layout(width =1024, height=720, font_size=24)

fig.add_shape(type="line",x0=1, y0=1, x1=6000, y1=6000,
    line=dict(color='#636EFA',width=2)
)
#fig.show()

#pio.write_image(fig, 'ffx_osm_destination_log_scatter_transfer.pdf')

FFX histograms with 10% bins (transfer learning):

In [None]:
# delta percentage of all origin flows

delta_orig_ffx = gt_orig_ffx[['h_geoid', 'gt_count']]
delta_orig_ffx['pred_count'] = pred_orig_ffx['pred_count']
delta_orig_ffx['delta'] = pred_orig_ffx['pred_count'] - gt_orig_ffx['gt_count']
delta_orig_ffx['delta_percent'] = ((delta_orig_ffx['delta'] / delta_orig_ffx['gt_count'])*100)
delta_orig_ffx = delta_orig_ffx.sort_values(by=['delta_percent'], ascending=False)
delta_orig_ffx

In [None]:
# delta percentage of all destination flows

delta_dest_ffx = gt_dest_ffx[['w_geoid', 'gt_count']]
delta_dest_ffx['pred_count'] = pred_dest_ffx['pred_count']
delta_dest_ffx['delta'] = pred_dest_ffx['pred_count'] - gt_dest_ffx['gt_count']
delta_dest_ffx['delta_percent'] = ((delta_dest_ffx['delta'] / delta_dest_ffx['gt_count'])*100)
delta_dest_ffx = delta_dest_ffx.sort_values(by=['delta_percent'], ascending=False)
delta_dest_ffx

In [None]:
# OSM orign flows

delta_below_zero = delta_orig_ffx[delta_orig_ffx.delta_percent < 0]
delta_equals_zero = delta_orig_ffx[delta_orig_ffx.delta_percent == 0]
delta_above_zero = delta_orig_ffx[delta_orig_ffx.delta_percent > 0]
print(delta_below_zero.shape)
print(delta_equals_zero.shape)
print(delta_above_zero.shape)

In [None]:
bin_min = delta_orig_ffx.delta_percent.min()
bin_max = delta_orig_ffx.delta_percent.max()
print(bin_min)
print(bin_max)

In [None]:
bins_below_zeros = [-70.000000001, -60.000000001, -50.000000001, -40.000000001, -30.000000001, 
                    -20.000000001, -10.000000001,  -0.000000001]
bins_above_zeros = [0.000000001, 10.000000001, 20.000000001, 30.000000001, 40.000000001, 50.000000001,
                     60.000000001, 70.000000001, 80.000000001, 90.000000001, 100.000000001, bin_max]

labels_below = ['- 61-70%', '- 51-60%', '- 41-50%', '- 31-40%','- 21-30%', '- 11-20%','- up to 10%']
lables_above = ['+ up to 10%', '+ 11-20%','+ 21-30%', '+ 31-40%','+ 41-50%', '+ 51-60%','+ 61-70%', '+ 71-80%',
                '+ 81-90%', '+ 91-100%',' > 100%']

In [None]:
pd.cut(delta_below_zero['delta_percent'], precision=9,
                                bins=bins_below_zeros,retbins=True,include_lowest=True)[0].cat.categories

In [None]:
pd.cut(delta_above_zero['delta_percent'], precision=9,
                                bins=bins_above_zeros,retbins=True,include_lowest=True)[0].cat.categories

In [None]:
delta_below_zero.insert(delta_below_zero.shape[1],
                        'delta_label',
                        pd.cut(delta_below_zero['delta_percent'], precision=9,
                                bins=bins_below_zeros,
                                labels=labels_below, include_lowest = True))

delta_below_zero

In [None]:
delta_above_zero.insert(delta_above_zero.shape[1],
                        'delta_label',
                        pd.cut(delta_above_zero['delta_percent'], precision=9,
                                bins=bins_above_zeros,
                                labels=lables_above, include_lowest = True))

delta_above_zero                                

In [None]:
delta_orig_ffx_binned = pd.concat([delta_below_zero, delta_above_zero], ignore_index=True)
delta_orig_ffx_binned = delta_orig_ffx_binned.sort_values(by=['delta_percent'])
delta_orig_ffx_binned.shape

In [None]:
delta_orig_ffx_binned.isnull().values.any()

In [None]:
data = [[0,0.000000001,0,0,0,' < 100%'], [0,0.000000001,0,0,0,'- 91-100%'],
        [0,0.000000001,0,0,0,'- 81-90%'],[0,0.000000001,0,0,0,'- 71-80%']]
empty_bins = pd.DataFrame(data,columns=delta_orig_ffx_binned.columns.to_list())

In [None]:
delta_orig_ffx_binned = pd.concat([empty_bins, delta_orig_ffx_binned], ignore_index=True)
delta_orig_ffx_binned.shape

In [None]:
# OSM destination flows

delta_below_zero = delta_dest_ffx[delta_dest_ffx.delta_percent < 0]
delta_equals_zero = delta_dest_ffx[delta_dest_ffx.delta_percent == 0]
delta_above_zero = delta_dest_ffx[delta_dest_ffx.delta_percent > 0]
print(delta_below_zero.shape)
print(delta_equals_zero.shape)
print(delta_above_zero.shape)

In [None]:
bin_min = delta_dest_ffx.delta_percent.min()
bin_max = delta_dest_ffx.delta_percent.max()
print(bin_min)
print(bin_max)

In [None]:
bins_below_zeros = [-80.000000001,-70.000000001, -60.000000001, -50.000000001, -40.000000001,
                    -30.000000001, -20.000000001, -10.000000001,  -0.000000001]
bins_above_zeros = [0.000000001, 10.000000001, 20.000000001, 30.000000001, 40.000000001, 50.000000001,
                     60.000000001, 70.000000001, 80.000000001, 90.000000001, 100.000000001, bin_max]

labels_below = ['- 71-80%', '- 61-70%', '- 51-60%','- 41-50%', '- 31-40%','- 21-30%', 
                '- 11-20%','- up to 10%']
lables_above = ['+ up to 10%', '+ 11-20%','+ 21-30%', '+ 31-40%','+ 41-50%', '+ 51-60%','+ 61-70%', '+ 71-80%',
                '+ 81-90%', '+ 91-100%',' > 100%']

In [None]:
pd.cut(delta_below_zero['delta_percent'], precision=9,
                                bins=bins_below_zeros,retbins=True,include_lowest=True)[0].cat.categories

In [None]:
pd.cut(delta_above_zero['delta_percent'], precision=9,
                                bins=bins_above_zeros,retbins=True,include_lowest=True)[0].cat.categories

In [None]:
delta_below_zero.insert(delta_below_zero.shape[1],
                        'delta_label',
                        pd.cut(delta_below_zero['delta_percent'], precision=9,
                                bins=bins_below_zeros,
                                labels=labels_below, include_lowest = True))

delta_below_zero

In [None]:
delta_above_zero.insert(delta_above_zero.shape[1],
                        'delta_label',
                        pd.cut(delta_above_zero['delta_percent'], precision=9,
                                bins=bins_above_zeros,
                                labels=lables_above, include_lowest = True))

delta_above_zero                                

In [None]:
delta_dest_ffx_binned = pd.concat([delta_below_zero, delta_above_zero], ignore_index=True)
delta_dest_ffx_binned = delta_dest_ffx_binned.sort_values(by=['delta_percent'])
delta_dest_ffx_binned.shape

In [None]:
delta_dest_ffx_binned.isnull().values.any()

In [None]:
data = [[0,0.000000001,0,0,0,' < 100%'], [0,0.000000001,0,0,0,'- 91-100%'], [0,0.000000001,0,0,0,'- 81-90%']]
empty_bins = pd.DataFrame(data,columns=delta_dest_ffx_binned.columns.to_list())

In [None]:
delta_dest_ffx_binned = pd.concat([empty_bins, delta_dest_ffx_binned], ignore_index=True)
delta_dest_ffx_binned.shape

In [None]:
# delta percentages histogram of all origin flows using OSM features

fig = px.histogram(delta_orig_ffx_binned,x='delta_label',y='gt_count',
                   labels={'delta_label':'delta percentage','gt_count':'commuters'})
fig.update_layout(yaxis_range=[0,10000])
fig.update_layout(width =1024, height=720, font_size=24); 
#fig.show()
#pio.write_image(fig, 'ffx_osm_origin_hist_transfer.pdf')

In [None]:
# delta percentages histogram of all destination flows using OSM features

fig = px.histogram(delta_dest_ffx_binned,x='delta_label',y='gt_count',
                   labels={'delta_label':'delta percentage','gt_count':'commuters'})
fig.update_layout(yaxis_range=[0,14000])
fig.update_layout(width =1024, height=720, font_size=24)
#fig.show()
#pio.write_image(fig, 'ffx_osm_destination_hist_transfer.pdf')

In [None]:
# All results end!