In [None]:
import pandas as pd

# Load the Bluebikes trip data CSV into a DataFrame.
trips = pd.read_csv(
    filepath_or_buffer="datasets/202301-bluebikes-tripdata.csv",
)

In [None]:
# Load the Bluebikes station list. header=1 tells pandas to take the column
# names from row index 1 and to ignore the row that precedes it.
stations = pd.read_csv(
    filepath_or_buffer="datasets/current_bluebikes_stations.csv",
    header=1,
)

In [None]:
# Attach station attributes to each trip by matching the trip's
# "start station name" against the station table's "Name" column.
# The join is an inner join (pandas' default), so trips whose start station
# has no match in the station table are dropped.
merged_df = pd.merge(
    trips,
    stations,
    how="inner",
    left_on="start station name",
    right_on="Name",
)

In [None]:
# Drop rows whose District is "Salem": when plotted geospatially those
# stations sit too far away from the rest of the network.
merged_df = merged_df.loc[merged_df["District"].ne("Salem")]

In [None]:
import geopandas as gpd

# Read the MBTA bus stop shapefile into a GeoDataFrame.
gdf = gpd.read_file("datasets/mbtabus/MBTABUSSTOPS_PT.shp")

# Report the CRS the shapefile was stored in before it is changed below.
print("Original CRS:", gdf.crs)

# Reproject to EPSG:4326 and expose the point coordinates as plain columns:
# the geometry's y value becomes Latitude and its x value becomes Longitude.
gdf = gdf.to_crs("EPSG:4326").assign(
    Latitude=lambda frame: frame.geometry.y,
    Longitude=lambda frame: frame.geometry.x,
)


In [None]:
# Read the MBTA rapid transit station (node) shapefile into a GeoDataFrame.
train_gdf = gpd.read_file("datasets/mbta_rapid_transit/MBTA_NODE.shp")

# Report the CRS the shapefile was stored in before it is changed below.
print("Original CRS:", train_gdf.crs)

# Reproject to EPSG:4326 and expose the point coordinates as plain columns:
# the geometry's y value becomes Latitude and its x value becomes Longitude.
train_gdf = train_gdf.to_crs("EPSG:4326").assign(
    Latitude=lambda frame: frame.geometry.y,
    Longitude=lambda frame: frame.geometry.x,
)

In [None]:
# Keep only the columns used later: station name, line, and coordinates.
train_gdf = train_gdf[
    ["STATION", "LINE", "Latitude", "Longitude"]
]

In [None]:
# Fold the line name (Orange, Red, Green, etc.) into the station label,
# giving "<station> on <line> Line", then discard the separate LINE column.
train_gdf = train_gdf.assign(
    STATION=train_gdf["STATION"] + " on " + train_gdf["LINE"] + " Line"
).drop(columns=["LINE"])

In [None]:
# Keep only the columns used later: stop name and coordinates.
bus_gdf = gdf[
    ["STOP_NAME", "Latitude", "Longitude"]
]

In [None]:
# Give the bus stop coordinate columns distinct names so they are not
# confused with the bike station Latitude/Longitude columns.
bus_gdf = bus_gdf.rename(
    columns={
        "Latitude": "Bus_Stop_Lat",
        "Longitude": "Bus_Stop_Long",
    }
)

In [None]:
from scipy.spatial import cKDTree
import pandas as pd
import numpy as np

# Convert station and bus stop coordinates into 2D NumPy arrays of
# (latitude, longitude) pairs in degrees.
stations_coordinates = np.array(merged_df[['Latitude', 'Longitude']])
bus_stops_coordinates = np.array(bus_gdf[['Bus_Stop_Lat', 'Bus_Stop_Long']])

# Mean Earth radius, used to turn a central angle (radians) into kilometres.
earth_radius_km = 6371.0088

# A KD-tree built directly on (lat, lon) degrees measures Euclidean distance
# in degrees: that is not kilometres, and it also distorts which neighbour is
# "nearest" because a degree of longitude is shorter than a degree of
# latitude away from the equator. Instead, place every point on the unit
# sphere as an (x, y, z) vector; straight-line (chord) distance between such
# vectors grows monotonically with great-circle distance, so the nearest
# neighbour by chord is also the nearest neighbour on the ground.
station_lat_rad, station_lon_rad = np.radians(stations_coordinates.astype(float)).T
bus_lat_rad, bus_lon_rad = np.radians(bus_stops_coordinates.astype(float)).T

stations_xyz = np.column_stack((
    np.cos(station_lat_rad) * np.cos(station_lon_rad),
    np.cos(station_lat_rad) * np.sin(station_lon_rad),
    np.sin(station_lat_rad),
))
bus_stops_xyz = np.column_stack((
    np.cos(bus_lat_rad) * np.cos(bus_lon_rad),
    np.cos(bus_lat_rad) * np.sin(bus_lon_rad),
    np.sin(bus_lat_rad),
))

# Create a cKDTree object for bus stops for efficient nearest neighbor search
tree = cKDTree(bus_stops_xyz)

# Query the tree for the nearest bus stop to each station
# chord_lengths: unit-sphere chord length to the nearest bus stop.
# indices: positional row numbers of those bus stops in bus_gdf.
chord_lengths, indices = tree.query(stations_xyz, k=1)  # k=1 for the closest bus stop

# Two unit vectors separated by central angle theta are a chord of length
# 2*sin(theta/2) apart, so theta = 2*arcsin(chord/2) and the great-circle
# distance is radius * theta. The clip guards against round-off pushing the
# arcsin argument marginally outside [0, 1].
distances = 2.0 * earth_radius_km * np.arcsin(np.clip(chord_lengths / 2.0, 0.0, 1.0))

# Making sure bus_gdf has a consistent index for direct row access
bus_gdf_reset = bus_gdf.reset_index()

# Use indices to fetch and add closest bus stop info into merged_df
closest_bus_stops = bus_gdf_reset.iloc[indices]
merged_df['closest_bus_stop_lat'] = closest_bus_stops['Bus_Stop_Lat'].values
merged_df['closest_bus_stop_long'] = closest_bus_stops['Bus_Stop_Long'].values
merged_df['closest_bus_stop_name'] = closest_bus_stops['STOP_NAME'].values
merged_df['distance_to_closest_bus_stop_km'] = distances  # great-circle distance in km


In [None]:
# Convert station and train stop coordinates into 2D NumPy arrays of
# (latitude, longitude) pairs in degrees.
stations_coordinates = np.array(merged_df[['Latitude', 'Longitude']])
train_stops_coordinates = np.array(train_gdf[['Latitude', 'Longitude']])

# Mean Earth radius, used to turn a central angle (radians) into kilometres.
earth_radius_km = 6371.0088

# A KD-tree built directly on (lat, lon) degrees measures Euclidean distance
# in degrees: that is not kilometres, and it also distorts which neighbour is
# "nearest" because a degree of longitude is shorter than a degree of
# latitude away from the equator. Instead, place every point on the unit
# sphere as an (x, y, z) vector; straight-line (chord) distance between such
# vectors grows monotonically with great-circle distance, so the nearest
# neighbour by chord is also the nearest neighbour on the ground.
station_lat_rad, station_lon_rad = np.radians(stations_coordinates.astype(float)).T
train_lat_rad, train_lon_rad = np.radians(train_stops_coordinates.astype(float)).T

stations_xyz = np.column_stack((
    np.cos(station_lat_rad) * np.cos(station_lon_rad),
    np.cos(station_lat_rad) * np.sin(station_lon_rad),
    np.sin(station_lat_rad),
))
train_stops_xyz = np.column_stack((
    np.cos(train_lat_rad) * np.cos(train_lon_rad),
    np.cos(train_lat_rad) * np.sin(train_lon_rad),
    np.sin(train_lat_rad),
))

# Create a cKDTree object for train stops for efficient nearest neighbor search
tree = cKDTree(train_stops_xyz)

# Query the tree for the nearest train stop to each station
# chord_lengths: unit-sphere chord length to the nearest train stop.
# indices: positional row numbers of those train stops in train_gdf.
chord_lengths, indices = tree.query(stations_xyz, k=1)  # k=1 for the closest train stop

# Two unit vectors separated by central angle theta are a chord of length
# 2*sin(theta/2) apart, so theta = 2*arcsin(chord/2) and the great-circle
# distance is radius * theta. The clip guards against round-off pushing the
# arcsin argument marginally outside [0, 1].
distances = 2.0 * earth_radius_km * np.arcsin(np.clip(chord_lengths / 2.0, 0.0, 1.0))

# Making sure train_gdf has a consistent index for direct row access
train_gdf_reset = train_gdf.reset_index()

# Use indices to fetch and add closest train stop info into merged_df
closest_train_stops = train_gdf_reset.iloc[indices]
merged_df['closest_train_stop_lat'] = closest_train_stops['Latitude'].values
merged_df['closest_train_stop_long'] = closest_train_stops['Longitude'].values
merged_df['closest_train_stop_name'] = closest_train_stops['STATION'].values
merged_df['distance_to_closest_train_stop_km'] = distances  # great-circle distance in km


In [None]:
import altair as alt

# Count trips per bike station. The station coordinates and the names of the
# closest bus and train stops are part of the grouping key so that they are
# carried through to the plotted table.
station_trip_counts = (
    merged_df
    .groupby(['Name', 'Latitude', 'Longitude', 'closest_bus_stop_name', "closest_train_stop_name"])
    .size()
    .reset_index(name='Trip Count')
)

# Scatter plot of bike stations positioned by longitude/latitude, with the
# circle size proportional to the number of trips from that station.
chart = (
    alt.Chart(station_trip_counts)
    .properties(
        title='Most Popular BlueBike Stations with Closest Bus and Train Stops',
        width=600,
        height=400,
    )
    .mark_circle()
    .encode(
        # Bike station longitude on the horizontal axis, fixed viewing window
        x=alt.X(
            'Longitude:Q',
            scale=alt.Scale(domain=(-71.248, -70.98)),
            title='Longitude',
        ),
        # Bike station latitude on the vertical axis, fixed viewing window
        y=alt.Y(
            'Latitude:Q',
            scale=alt.Scale(domain=(42.255, 42.43)),
            title='Latitude',
        ),
        # Circle size scales with trip count
        size=alt.Size('Trip Count:Q', title='Number of Trips'),
        # Same fixed color for every point
        color=alt.value('steelblue'),
        tooltip=['Name:N', 'Trip Count:Q', 'closest_bus_stop_name:N', 'closest_train_stop_name:N'],
    )
)

chart.display()