# 0. Initialization
Run the cell below once, then restart the runtime. Do not run it again in the same session: every additional run clones the repository again.

In [None]:
# Clone GitHub repository in the virtual environment provided in Google Colab.
!git clone https://github.com/henryhyunwookim/NYC-Transportation-and-Socioeconomic-Data-Analysis.git

# Install requirements.
%cd NYC-Transportation-and-Socioeconomic-Data-Analysis
%cd socio
!pip install -r requirements.txt
     

## After restart, run the cells below.
# 1. Load libraries

In [None]:
import os
# Switch to the cloned repository root before the project import below.
# NOTE(review): presumably this is what makes `socio.functions` importable — confirm.
os.chdir("/content/NYC-Transportation-and-Socioeconomic-Data-Analysis")

import pandas as pd
import json
from mpl_toolkits.axes_grid1.axes_divider import make_axes_locatable
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objs as go
import os
import plotly.express as px
import ipywidgets as widgets

# Project helpers (star import; `fill_value`, used further down, is expected to come from here).
from socio.functions import*

from google.colab import files
# The data-loading cell uses paths relative to the `socio` directory
# ('data/...' and the parent directory of the current working directory).
%cd socio


In [None]:
#helper functions

def create_scatter(s1,s2,cd='PU_average_fare',
                   title = '', labels = '', a = '',
                   x1=0,y1=0,b_size=30.,
                   x2=0,y2=0,t2='',
                   x3=0,y3=0,t3='',
                   annotate = 1
                   ):
    """Build a bubble chart of community districts, one trace per borough.

    The data always comes from the notebook-level ``df_taxi_subway_socio``
    frame; the figure is returned without being shown.

    Parameters
    ----------
    s1, s2 : str
        Column names plotted on the x and y axis.
    cd : str
        Column name that drives the bubble area.
    title : str
        Figure title.
    labels : dict or ''
        Column-name -> display-name mapping, forwarded to ``px.scatter``.
    a : str
        Text of the main arrow callout. An empty string disables *all*
        annotations (including the secondary ones below).
    x1, y1 : number
        Data coordinates the main callout points at.
    b_size : float
        Bubble scaling factor used in ``sizeref = 2 * max(size) / b_size**2``
        (Plotly's recommended formula, where this is the desired maximum
        marker size in pixels).
    x2, y2, t2 : number, number, str
        Position and text of a second annotation; added only when ``x2 != 0``.
    x3, y3, t3 : number, number, str
        Position and text of a third annotation; added only when
        ``annotate == 1`` and ``x3 != 0``.
    annotate : int
        ``1`` places the callout text at pixel offset (30, 60) from the
        arrow head; any other value places it at (-40, -130).

    Returns
    -------
    plotly.graph_objs.Figure
    """
    # The colour names only serve as intermediate category keys for the
    # traces; each trace is renamed back to its borough right after creation.
    colors = {'Bronx':'red', 'Manhattan':'green', 'Queens':'blue', 'Staten Island':'orange', 'Brooklyn':'purple'}
    color_names = {key: borough for borough, key in colors.items()}

    data = df_taxi_subway_socio
    trace_keys = data['borough_x'].map(colors)

    fig = px.scatter(data,
                     x=s1,
                     y=s2,
                     size=cd,
                     color=trace_keys,
                     hover_name="CDTAName",
                     title=title,
                     labels=labels
                    )

    def _use_borough_name(trace):
        # Swap the intermediate key for the borough name in the legend and hover text.
        borough = color_names[trace.name]
        trace.update(name=borough,
                     legendgroup=borough,
                     hovertemplate=trace.hovertemplate.replace(trace.name, borough))

    fig.for_each_trace(_use_borough_name)

    # Only rescale the bubbles. px.scatter has already given every borough
    # trace the sizes of its own rows; assigning the full-frame column to
    # marker.size here would hand each trace the sizes of the frame's first
    # rows instead of the sizes of the points it actually contains.
    fig.update_traces(marker=dict(
                        sizemode='area',
                        sizeref=2. * data[cd].max() / (b_size ** 2),
                        sizemin=4
                        )
                     )

    if a != '':
        # The two layouts differ only in where the callout text sits
        # relative to the arrow head (pixel offsets).
        text_dx, text_dy = (30, 60) if annotate == 1 else (-40, -130)
        fig.add_annotation(
            x=x1,
            y=y1,
            xref="x",
            yref="y",
            text=a,
            showarrow=True,
            font=dict(
                family="Courier New, monospace",
                size=13,
                color="#ffffff"
                ),
            align="center",
            arrowhead=2,
            arrowsize=1,
            arrowwidth=2,
            arrowcolor="#636363",
            ax=text_dx,
            ay=text_dy,
            bordercolor="#c7c7c7",
            borderwidth=2,
            borderpad=4,
            bgcolor="#ff7f0e",
            opacity=0.8
            )
        if x2 != 0:
            fig.add_annotation(x=x2, y=y2, text=t2)
        # The third annotation is only supported by the default layout.
        if annotate == 1 and x3 != 0:
            fig.add_annotation(x=x3, y=y3, text=t3)

    fig.update_layout(
        autosize=False,
        width=700,
        height=600)

    return fig

def create_scatter_interactive(s1,s2,cd='PU_average_fare',
                   title = '', labels = '', a = '',
                   x1=0,y1=0,b_size=30.,
                   x2=0,y2=0,t2='',
                   x3=0,y3=0,t3='',
                   annotate = 1
                   ):
    """Build the same figure as ``create_scatter`` and display it immediately.

    Intended as the callback for ``widgets.interactive_output``, which
    discards the return value, so the figure has to be shown explicitly.
    All parameters have the same meaning as in ``create_scatter`` and are
    forwarded unchanged.

    Returns
    -------
    plotly.graph_objs.Figure
        The figure that was shown.
    """
    # Delegate instead of keeping a second copy of the plotting code.
    fig = create_scatter(s1, s2, cd=cd,
                         title=title, labels=labels, a=a,
                         x1=x1, y1=y1, b_size=b_size,
                         x2=x2, y2=y2, t2=t2,
                         x3=x3, y3=y3, t3=t3,
                         annotate=annotate)

    fig.show()

    return fig

# 2. Load data from the three sources (taxi, subway and socioeconomic)

In [99]:
# Taxi and subway extracts sit in sibling folders of the current working directory.
df_taxi = pd.read_csv(filepath_or_buffer=os.path.dirname(os.getcwd()) +'/taxi/data/cdta_df/cdta_df.csv')
df_subway = pd.read_csv(filepath_or_buffer=os.path.dirname(os.getcwd()) +'/subway/data/subway_df/subway_cdta.csv')

# Socioeconomic inputs are read relative to the current working directory.
df_socio = pd.read_csv(filepath_or_buffer='data/socioecoomic.csv')
# NOTE: unpickling executes code stored in the file; only load pickles from a trusted source.
gdf_cd = pd.read_pickle(filepath_or_buffer='data/merge_data.pkl')

# 3. Creating CD code mapping and merge all data files

In [100]:
# Derive the mapping code for every row; `fill_value` is called once per row.
gdf_cd['CD_map'] = gdf_cd.apply(fill_value, axis=1)

In [101]:
# Taxi + subway, matched on the CDTA code, ordered by borough and then by
# descending subway entries.
df_taxi_subway = (
    df_taxi
    .merge(df_subway, how="inner", left_on='CDTA', right_on='cdtaCode')
    .sort_values(by=["borough_x", 'net_entries'], ascending=[True, False])
)

# Attach the combined transit figures to the community-district frame.
df_taxi_subway_socio = gdf_cd.merge(
    df_taxi_subway,
    how="inner",
    left_on='CD_map',
    right_on='CDTA',
)
# Debugging aid: df_taxi_subway_socio.columns.tolist()

# 4. Create new dataframe for aggregate data and format columns

In [102]:
# Round the two taxi averages to two decimals.
df_taxi_subway_socio[['PU_average_fare', 'PU_average_duration (min)']] = (
    df_taxi_subway_socio[['PU_average_fare', 'PU_average_duration (min)']].round(2)
)

In [103]:
# Borough-level totals of taxi pickups and subway net entries.
# Select the two numeric columns *before* summing: summing the whole frame
# also aggregates every non-numeric column (pandas >= 2.0 no longer drops
# them silently), which is wasted work and can raise on unsupported dtypes.
# `as_index=False` keeps `borough_x` as a regular column, so re-running the
# cell never depends on an in-place index reset.
df_taxi_subway_aggr = (
    df_taxi_subway
    .groupby("borough_x", as_index=False)[['PU_total_trip_count', 'net_entries']]
    .sum()
)
df_taxi_subway_aggr['total'] = df_taxi_subway_aggr['PU_total_trip_count'] + df_taxi_subway_aggr['net_entries']

# Display only: the stored frame keeps its alphabetical borough order.
df_taxi_subway_aggr.sort_values(['total'], ascending = False)

Unnamed: 0,borough_x,PU_total_trip_count,net_entries,total
2,Manhattan,17016178,162311230.0,179327408.0
1,Brooklyn,149480,72878938.0,73028418.0
3,Queens,1096299,54052566.0,55148865.0
0,Bronx,34281,31638430.0,31672711.0
4,Staten Island,161,774846.0,775007.0


# 5. Select the indicators for analysis

In [104]:
# Columns offered for exploration; the order is kept because the dropdowns
# further down list them in exactly this sequence.
indicator_columns = [
    # socioeconomic indicators
    'Car-free commute (% of commuters)',
    'Disabled population',
    'Foreign-born population',
    'Mean travel time to work (minutes)',
    'Median household income (2021$)',
    'Percent Asian',
    'Percent Black',
    'Percent Hispanic',
    'Percent white',
    'Population',
    'Population aged 25+ without a high school diploma',
    'Population aged 65+',
    'Population density (1,000 persons per square mile)',
    'Poverty rate',
    'Poverty rate, population aged 65+',
    'Poverty rate, population under 18 years old',
    'Public Housing (properties)',
    'Public Housing (units)',
    'Public housing (% of rental units)',
    'Racial diversity index',
    'Serious crime rate (per 1,000 residents)',
    'Serious crime rate, property (per 1,000 residents)',
    'Serious crime rate, violent (per 1,000 residents)',
    'Unemployment rate',
    # taxi indicators
    'PU_average_fare',
    'PU_average_duration (min)',
    'PU_total_trip_count',
    # subway indicators
    'station_counts',
    'net_entries',
]
gdf_cd_small = df_taxi_subway_socio[indicator_columns]

# Visualization 1: The analysis of the net entries to subway stations and total trip counts by taxi aggregated at the borough level 

In [105]:
# Stats based on total trips
newnames = {'net_entries':'Subway (net_entries)', 'PU_total_trip_count': 'Taxi (total trip count)'}

fig = px.bar(df_taxi_subway_aggr, 
             x='borough_x', 
             y=['net_entries','PU_total_trip_count'], 
             color_discrete_sequence=["green", "blue"], 
             barmode="group", log_y = True,
             hover_name="borough_x", 
             title="Subway is predominantly the most used mode of trasit in NYC",
             labels={
                     "variable": "Transit type",
                     "borough_x": 'borough',
                     "value" : "value (log)"
                 }
            )

fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
                                      legendgroup = newnames[t.name],
                                      hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
                                     )
                  )

fig.update_layout(
    autosize=False,
    width=700,
    height=500)

# Visualization 2: Subway and Taxi aggregate data analysis as a percentage of total traffic

In [106]:
taxi_trips = df_taxi_subway_aggr['PU_total_trip_count']
subway_entries = df_taxi_subway_aggr['net_entries']
borough_total = df_taxi_subway_aggr['total']

# Share of each mode within a borough's own traffic.
df_taxi_subway_aggr['taxi_percent_borough'] = taxi_trips.div(borough_total).mul(100)
df_taxi_subway_aggr['subway_percent_borough'] = subway_entries.div(borough_total).mul(100)

# Share of each borough within the city-wide total of a mode.
df_taxi_subway_aggr['taxi_percent'] = taxi_trips.div(taxi_trips.sum()).mul(100)
df_taxi_subway_aggr['subway_percent'] = subway_entries.div(subway_entries.sum()).mul(100)

In [107]:
# Each borough's share of all taxi pickups.
fig = px.pie(
    df_taxi_subway_aggr,
    names='borough_x',
    values='taxi_percent',
    title="Taxi usage by Boroughs",
)
fig.update_layout(dict(autosize=False, width=400, height=400))
fig.show()

In [108]:
# Each borough's share of all subway net entries.
fig = px.pie(
    df_taxi_subway_aggr,
    names='borough_x',
    values='subway_percent',
    title="Subway usage by Boroughs",
)
fig.update_layout(dict(autosize=False, width=400, height=400))
fig.show()

# Visualization 3: Poorer community districts have higher taxi fares

In [109]:
# x: poverty rate, y: unemployment rate, bubble area: average taxi fare.
s1, s2, cd = 'Poverty rate', 'Unemployment rate', 'PU_average_fare'

# Display names for legend, hover text and axis titles.
l = {
    "color": "Borough",
    "PU_average_fare": 'Taxi (average_fare $)',
    "Poverty rate": "Poverty rate (%)",
    "Unemployment rate": "Unemployment rate (%)",
}
a_text = "high <br> unemployment rate <br> and poverty rate"

create_scatter(
    s1, s2, cd,
    title="Poorer community districts have<br>higher taxi fares",
    labels=l,
    a=a_text,
    x1=40.3,
    y1=11.49,
)



# Visualization 4: Poorer community districts have longer commutes

In [110]:
# x: poverty rate, y: unemployment rate, bubble area: average taxi trip duration.
s1, s2, cd = 'Poverty rate', 'Unemployment rate', 'PU_average_duration (min)'

# Display names for legend, hover text and axis titles.
l = {
    "color": "Borough",
    "PU_average_duration (min)": 'Taxi (average duration min)',
    "Poverty rate": "Poverty rate (%)",
    "Unemployment rate": "Unemployment rate (%)",
}
a_text = "high <br> unemployment rate <br> and poverty rate"

create_scatter(
    s1, s2, cd,
    title="Poorer community districts have<br>longer commutes",
    labels=l,
    a=a_text,
    x1=40.3,
    y1=11.49,
)



# Visualization 5: Densely populated districts with high unemployment rate have fewer subway stations

In [111]:
# x: population density, y: unemployment rate, bubble area: number of subway stations.
s1 = 'Population density (1,000 persons per square mile)'
s2 = 'Unemployment rate'
cd = 'station_counts'

# Display names for legend, hover text and axis titles.
l = {
    "color": "Borough",
    "station_counts": 'Number of subway stations',
    "Unemployment rate": "Unemployment rate (%)",
}

# Spelling of "Densely" corrected in the displayed title.
title = "Densely populated districts with high unemployment rate <br>have fewer subway stations"

# The callout describes the point's position on this chart, whose x axis is
# population density (not poverty rate).
a_text = "BX07 high <br> unemployment rate <br> and population density"

create_scatter(
    s1, s2, cd,
    title=title,
    labels=l,
    a=a_text,
    x1=96.7, y1=11.73,
    b_size=30.,
    x2=50.6, y2=4.53, t2='BK11',
    x3=95.8, y3=5.24, t3='MN03',
)





# Visualization 6: Commuter traffic similar to CDs with more stations

In [112]:
# x: population density, y: unemployment rate, bubble area: subway net entries.
s1, s2, cd = (
    'Population density (1,000 persons per square mile)',
    'Unemployment rate',
    'net_entries',
)

# Display names for legend, hover text and axis titles.
l = {
    "color": "Borough",
    "net_entries": 'Subway (net_entries)',
    "Unemployment rate": "Unemployment rate (%)",
}

title = "Commuter traffic similar to CDs with more stations"
a_text = "BX07 high <br> unemployment rate <br> and population density"

# Larger bubble scale than the other charts.
b_size = 40.

create_scatter(
    s1, s2, cd,
    title=title,
    labels=l,
    a=a_text,
    x1=96.7, y1=11.73,
    b_size=b_size,
    x2=50.6, y2=4.53, t2='BK11',
    x3=95.8, y3=5.24, t3='MN03',
)




# Visualization 7: Disabled populations have limited options for transit


In [113]:

# x: disabled population, y: number of subway stations, bubble area: average taxi fare.
s1, s2, cd = 'Disabled population', 'station_counts', 'PU_average_fare'

a_text = "high disabled population<br>,few station counts<br>and higher fares"

# Display names for legend, hover text and axis titles.
l = {
    "color": "Borough",
    "station_counts": 'Number of subway stations',
    "Disabled population": "Disabled population (%)",
}
b_size = 30.
title = "Disabled populations have limited options for transit"

create_scatter(
    s1, s2, cd,
    title=title,
    labels=l,
    a=a_text,
    x1=18.2, y1=6,
    b_size=b_size,
    x2=15.9, y2=10, t2='QN14',
    x3=0, y3=0, t3='',
    # Alternative callout placement (text offset up and to the left).
    annotate=2,
)

# Visualization 8: Longer commute times in districts with high disabled population


In [114]:
# x: disabled population, y: poverty rate, bubble area: average taxi trip duration.
s1 = 'Disabled population'
s2 = 'Poverty rate'
cd = 'PU_average_duration (min)'

a_text = "BX02 high disabled population<br>,poverty rate and<br>more commute time"

# Display names for the columns this chart actually plots. The dict had been
# copied from the previous chart: it labelled `station_counts` (not plotted
# here) and left the y axis and the bubble size with raw column names.
l = {
    "color": "Borough",
    "Disabled population": "Disabled population (%)",
    "Poverty rate": "Poverty rate (%)",
    "PU_average_duration (min)": 'Taxi (average duration min)',
}
b_size = 30.
title = "Longer commute times in districts with <br>high disabled population"

create_scatter(
    s1, s2, cd,
    title=title,
    labels=l,
    a=a_text,
    x1=18.2, y1=39.6,
    b_size=b_size,
    x2=0, y2=0, t2='',
    x3=0, y3=0, t3='',
    annotate=1,
)

# Visualization 9: Interactive visualization that lets you pick any combination of variables to explore

In [115]:

print("Please select from the dropdowns")

# One dropdown per plot dimension (x axis, y axis, bubble size). The widgets
# are displayed once, through the VBox at the end of the cell; naming them on
# a line of their own in the middle of a cell displays nothing.
feature1_drop = widgets.Dropdown(options=gdf_cd_small.columns, value = 'Population density (1,000 persons per square mile)')
feature2_drop = widgets.Dropdown(options=gdf_cd_small.columns, value = 'Median household income (2021$)')
feature3_drop = widgets.Dropdown(options=gdf_cd_small.columns, value = 'PU_average_fare')

# Re-draw the scatter whenever one of the dropdown values changes.
out = widgets.interactive_output(create_scatter_interactive, {'s1': feature1_drop, 's2':feature2_drop,'cd':feature3_drop})

widgets.VBox([widgets.HBox([feature1_drop, feature2_drop, feature3_drop]), out])

Please select from the dropdowns


VBox(children=(HBox(children=(Dropdown(index=12, options=('Car-free commute (% of commuters)', 'Disabled popul…

# Other visualizations not included in the report that you might find useful. 

In [116]:
# Order districts by average taxi trip distance, then plot them on a log x axis.
df_taxi_subway = df_taxi_subway.sort_values(by='PU_average_trip_distance (mile)', ascending=True)
fig = px.scatter(
    df_taxi_subway,
    x=['PU_average_trip_distance (mile)'],
    y='CDTA',
    log_x=True,
)
fig.show()


In [117]:
# Order districts by average taxi fare, then plot them on a log x axis.
df_taxi_subway = df_taxi_subway.sort_values(by='PU_average_fare', ascending=True)
fig = px.scatter(
    df_taxi_subway,
    x=['PU_average_fare'],
    y='CDTA',
    log_x=True,
)
fig.show()

In [118]:

# Combined ridership per district: taxi pickups plus subway net entries.
df_taxi_subway['total'] = df_taxi_subway['PU_total_trip_count'] + df_taxi_subway['net_entries']

borough_names = df_taxi_subway_aggr['borough_x'].unique().tolist()

# Hierarchy: "Boroughs" (root, not listed as a label itself) -> borough -> CDTA.
# Borough entries come first, followed by one entry per district.
labels = borough_names + df_taxi_subway['CDTA'].tolist()
parents = ['Boroughs'] * len(borough_names) + df_taxi_subway['borough_x'].tolist()

# Sunburst defaults to branchvalues="remainder": a parent's own value is
# added on top of the sum of its children. Boroughs therefore carry 0 so a
# borough sector is exactly the sum of its districts' totals; giving it the
# borough's subway entries as before counted those entries twice and mixed
# two different measures (net_entries for parents, total for children).
values = [0] * len(borough_names) + df_taxi_subway['total'].tolist()

fig =go.Figure(go.Sunburst(
    labels= labels,
    parents=parents,
    values=values,
))
fig.update_layout(margin = dict(t=0, l=0, r=0, b=0))

fig.show()