## Installation and setup

In [None]:
import pandas as pd
import numpy as np
import altair as alt
import zipfile
from zipfile import ZipFile

import requests
import io

In [None]:
!pip install geopandas
import geopandas as gpd

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# %load_ext google.colab.data_table
# from google.colab import data_table
# %unload_ext google.colab.data_table

In [None]:
# Load the September 2020 trip records straight from the zipped CSV and,
# in the same step, swap every space in a column name for an underscore
# so the columns can be used as attributes (e.g. df_20.start_station_id).
df_20 = (
    pd.read_csv('https://s3.amazonaws.com/tripdata/202009-citibike-tripdata.csv.zip')
    .rename(columns=lambda col: col.replace(' ', '_'))
)

In [None]:
# Download the September 2021 archive. The printed listing below shows that
# this zip holds more than one entry, so it is unpacked by hand instead of
# being handed directly to pd.read_csv.
citibikeZipFile = requests.get('https://s3.amazonaws.com/tripdata/202109-citibike-tripdata.csv.zip')

# Stop here with a clear HTTP error if the download failed, rather than
# failing later while trying to open an error page as a zip file.
citibikeZipFile.raise_for_status()

# Open the downloaded bytes as an in-memory zip archive; the `with` block
# closes the archive once the CSV has been read.
with ZipFile(io.BytesIO(citibikeZipFile.content)) as zipFiles:
    # Print the names of the archive members so the CSV entry can be identified
    for name in zipFiles.namelist():
        print(name)

    # Finally, load just the trip CSV from the uncompressed package into a Pandas dataframe.
    # The station ID columns are read as text so that pandas does not guess a
    # different type for different parts of the file (NOTE(review): the saved
    # output of this cell looks like a mixed-type DtypeWarning - confirm) and so
    # the IDs keep exactly the spelling they have in the file.
    with zipFiles.open('202109-citibike-tripdata.csv') as tripCsv:
        df_21 = pd.read_csv(
            tripCsv,
            dtype={'start_station_id': str, 'end_station_id': str},
        )

202109-citibike-tripdata.csv
__MACOSX/._202109-citibike-tripdata.csv


  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Cast the start/end station ID columns of both years to strings,
# so the IDs have the same type in the 2020 and 2021 frames.
for trips in (df_20, df_21):
    for id_col in ('start_station_id', 'end_station_id'):
        trips[id_col] = trips[id_col].astype(str)

## Summary statistics

September 2020

In [None]:
# Total number of trips: every row is one trip, so the row count is the answer
df_20.shape[0]

2488225

In [None]:
# The five stations where the most trips started (2020)
df_20.start_station_name.value_counts().iloc[:5]

W 21 St & 6 Ave          13875
West St & Chambers St    13225
12 Ave & W 40 St         12869
1 Ave & E 68 St          12816
E 17 St & Broadway       11264
Name: start_station_name, dtype: int64

In [None]:
# The five stations where the most trips ended (2020)
df_20.end_station_name.value_counts().iloc[:5]

W 21 St & 6 Ave          13967
West St & Chambers St    13783
12 Ave & W 40 St         12922
1 Ave & E 68 St          12689
E 17 St & Broadway       11446
Name: end_station_name, dtype: int64

In [None]:
# Median number of trips starting at a station (2020):
# count the trips per start station name, then take the median of those counts
df_20.groupby('start_station_name').size().median()

1457.0

In [None]:
# Median number of trips ending at a station (2020):
# count the trips per end station name, then take the median of those counts
df_20.groupby('end_station_name').size().median()

1434.5

September 2021

In [None]:
# Total number of trips: every row is one trip, so the row count is the answer
df_21.shape[0]

3280560

In [None]:
# The five stations where the most trips started (2021)
df_21.start_station_name.value_counts().iloc[:5]

W 21 St & 6 Ave             14435
E 17 St & Broadway          14065
Broadway & E 14 St          13155
Cleveland Pl & Spring St    12987
W 20 St & 10 Ave            12918
Name: start_station_name, dtype: int64

In [None]:
# The five stations where the most trips ended (2021)
df_21.end_station_name.value_counts().iloc[:5]

W 21 St & 6 Ave             14420
E 17 St & Broadway          14092
Cleveland Pl & Spring St    13114
Broadway & E 14 St          12918
W 20 St & 10 Ave            12912
Name: end_station_name, dtype: int64

In [None]:
# Median number of trips starting at a station (2021):
# count the trips per start station name, then take the median of those counts
df_21.groupby('start_station_name').size().median()

1133.5

In [None]:
# median number of trips that end at a station
# (counts trips per end station name, then takes the median of those counts)
df_21['end_station_name'].value_counts().median()

1078.0

In [None]:
# Bonus points for highlighting the top 5 stations that have seen the largest increase or decrease in number of start/stops between the two time periods

## Preparing data for mapping

In [None]:
# Load the NYC 2020 Neighborhood Tabulation Area boundaries from the ArcGIS
# feature service. The query string asks for all fields (outFields=*) in
# EPSG:4326 (outSR=4326), returned as GeoJSON (f=pgeojson).
nta_query_url = 'https://services5.arcgis.com/GfwWNkhOj9bNBqoJ/arcgis/rest/services/NYC_Neighborhood_Tabulation_Areas_2020/FeatureServer/0/query?where=1=1&outFields=*&outSR=4326&f=pgeojson'
ntaData = gpd.read_file(nta_query_url)
# ntaData.head()

In [None]:
# 2020

# Helper column holding 1 on every row; counting it per station gives the
# number of trips for that station
df_20['trip_count'] = 1

# starts: one row per start station name with the smallest recorded
# latitude/longitude for that name and the number of trips that began there
df_20_starts = df_20.groupby('start_station_name', as_index=False).agg(
    start_station_latitude=('start_station_latitude', 'min'),
    start_station_longitude=('start_station_longitude', 'min'),
    trip_count=('trip_count', 'count'),
)

# ends: the same summary keyed on the end station name
df_20_ends = df_20.groupby('end_station_name', as_index=False).agg(
    end_station_latitude=('end_station_latitude', 'min'),
    end_station_longitude=('end_station_longitude', 'min'),
    trip_count=('trip_count', 'count'),
)

# merge start and end data - if wanting to map both on the same map
# NOTE: the two summaries above are keyed by station name and do not contain
# the station ID columns, so the keys below would need to be changed to
# 'start_station_name' / 'end_station_name' before this line is enabled.
# df_20_merge = df_20_starts.merge(df_20_ends, left_on='start_station_id', right_on='end_station_id', suffixes=('_start', '_end'))

In [None]:
# 2021

# Helper column holding 1 on every row; counting it per station gives the
# number of trips for that station
df_21['trip_count'] = 1

# starts: one row per start station name with the smallest recorded
# latitude/longitude for that name and the number of trips that began there
df_21_starts = df_21.groupby('start_station_name', as_index=False).agg(
    start_lat=('start_lat', 'min'),
    start_lng=('start_lng', 'min'),
    trip_count=('trip_count', 'count'),
)

# ends: the same summary keyed on the end station name
df_21_ends = df_21.groupby('end_station_name', as_index=False).agg(
    end_lat=('end_lat', 'min'),
    end_lng=('end_lng', 'min'),
    trip_count=('trip_count', 'count'),
)

# merge start and end data - if wanting to map both on the same map
# NOTE: the two summaries above are keyed by station name and do not contain
# the station ID columns, so the keys below would need to be changed to
# 'start_station_name' / 'end_station_name' before this line is enabled.
# df_21_merge = df_21_starts.merge(df_21_ends, left_on='start_station_id', right_on='end_station_id', suffixes=('_start', '_end'))

## Create geojsons

In [None]:
# 2020 starts: build point geometries from the aggregated longitude (x) and
# latitude (y) columns, then attach them to the summary table in EPSG:4326
start_points_20 = gpd.points_from_xy(
    df_20_starts['start_station_longitude'],
    df_20_starts['start_station_latitude'],
)
citiGeo_20_starts = gpd.GeoDataFrame(df_20_starts, geometry=start_points_20, crs='epsg:4326')

In [None]:
# 2020 ends: build point geometries from the aggregated longitude (x) and
# latitude (y) columns, then attach them to the summary table in EPSG:4326
end_points_20 = gpd.points_from_xy(
    df_20_ends['end_station_longitude'],
    df_20_ends['end_station_latitude'],
)
citiGeo_20_ends = gpd.GeoDataFrame(df_20_ends, geometry=end_points_20, crs='epsg:4326')

In [None]:
# 2021 starts: build point geometries from the aggregated longitude (x) and
# latitude (y) columns, then attach them to the summary table in EPSG:4326
start_points_21 = gpd.points_from_xy(
    df_21_starts['start_lng'],
    df_21_starts['start_lat'],
)
citiGeo_21_starts = gpd.GeoDataFrame(df_21_starts, geometry=start_points_21, crs='epsg:4326')

In [None]:
# 2021 ends: build point geometries from the aggregated longitude (x) and
# latitude (y) columns, then attach them to the summary table in EPSG:4326
end_points_21 = gpd.points_from_xy(
    df_21_ends['end_lng'],
    df_21_ends['end_lat'],
)
citiGeo_21_ends = gpd.GeoDataFrame(df_21_ends, geometry=end_points_21, crs='epsg:4326')

In [None]:
# Write each station layer to its own GeoJSON file
# (output file name -> layer, written in the order listed)
geojson_outputs = {
    'citibikeData_09_20_starts.geojson': citiGeo_20_starts,
    'citibikeData_09_20_ends.geojson': citiGeo_20_ends,
    'citibikeData_09_21_starts.geojson': citiGeo_21_starts,
    'citibikeData_09_21_ends.geojson': citiGeo_21_ends,
}
for out_path, station_layer in geojson_outputs.items():
    station_layer.to_file(out_path, driver='GeoJSON')

In [None]:
# Download the four GeoJSON files written above through the Colab file helper
from google.colab import files

for geojson_name in (
    'citibikeData_09_20_starts.geojson',
    'citibikeData_09_20_ends.geojson',
    'citibikeData_09_21_starts.geojson',
    'citibikeData_09_21_ends.geojson',
):
    files.download(geojson_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### The following cells are only needed when mapping both starts and ends on the same map

In [None]:
# # create a geojson for both starts and ends - 2020
# citiGeo_20 = gpd.GeoDataFrame(data=df_20_merge, geometry=gpd.points_from_xy(x=df_20_merge['start_station_longitude'], y=df_20_merge['start_station_latitude']), crs='epsg:4326')
# citiGeo_20.head()

In [None]:
# # create a geojson for both starts and ends - 2021
# citiGeo_21 = gpd.GeoDataFrame(data=df_21_merge, geometry=gpd.points_from_xy(x=df_21_merge['start_lng'], y=df_21_merge['start_lat']), crs='epsg:4326')
# citiGeo_21.head()

In [None]:
# # save as geojson
# citiGeo_20.to_file('citibikeData_09_20.geojson', driver='GeoJSON')
# citiGeo_21.to_file('citibikeData_09_21.geojson', driver='GeoJSON')

In [None]:
# # download geojson
# from google.colab import files
# files.download('citibikeData_09_20.geojson') 
# files.download('citibikeData_09_21.geojson') 