# Table of Contents

## 01. Library and Data Import
## 02. Aggregated dataframe for map creation
## 03. Initialize an instance of a kepler.gl map
## 04. Map Customization
## 05. Map Filter
        Find the Top Starting Stations
        Find the Top Ending Stations
        Find the Top Station Pairs (Most Common Routes)
        Insights
## 06. Map Export

### 01. Library and Data Import

In [1]:
# Import necessary libraries

# Pandas for data manipulation and analysis
import pandas as pd  

# OS module for handling file paths and system operations
import os  

# KeplerGl for interactive geospatial visualizations
from keplergl import KeplerGl  

# Pyproj for working with coordinate reference systems (CRS)
from pyproj import CRS  

# NumPy for numerical operations and handling arrays
import numpy as np  

# Matplotlib for static data visualization and plotting
from matplotlib import pyplot as plt

In [2]:
# Define the file path for the 'newyork_data_cleaned_v3.pkl' dataset
file_path = r"C:\Users\HP\Citi_Bike_Dashboard\02 Data\Prepared Data\newyork_data_cleaned_v3.pkl"

# Load the dataset into a Pandas DataFrame
df = pd.read_pickle(file_path)

# Display the first few rows to confirm successful loading (optional)
df.head()

Unnamed: 0_level_0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,avgTemp,bike_rides_daily,trip_duration
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2022-01-01,88237B34DAFDB069,electric_bike,2022-01-01 00:00:13.532,2022-01-01 00:30:28.482,Broadway & W 160 St,8157.06,Broadway & W 160 St,8157.06,40.835773,-73.943398,40.835773,-73.943398,casual,11.6,20198,30.249167
2022-01-01,4545E384F601A7E9,classic_bike,2022-01-01 00:00:32.146,2022-01-01 00:02:15.504,Forsyth St & Broome St,5453.05,Forsyth St & Broome St,5453.05,40.718941,-73.992661,40.718941,-73.992661,member,11.6,20198,1.722633
2022-01-01,7CE9B56332A37463,electric_bike,2022-01-01 00:00:42.801,2022-01-01 00:11:38.969,Myrtle Ave & Marcy Ave,4707.03,Willoughby St & Fleet St,4628.05,40.695396,-73.949547,40.691967,-73.9813,member,11.6,20198,10.936133
2022-01-01,AEAF2FFEBB00845B,electric_bike,2022-01-01 00:00:50.604,2022-01-01 02:39:37.688,W 15 St & 7 Ave,6030.06,W 33 St & 7 Ave,6407.07,40.739357,-73.999321,40.750198,-73.990929,casual,11.6,20198,158.784733
2022-01-01,9B01074C8CABA851,classic_bike,2022-01-01 00:01:05.031,2022-01-01 00:11:53.032,W 15 St & 7 Ave,6030.06,W 36 St & 7 Ave,6483.06,40.739357,-73.999321,40.752148,-73.98954,member,11.6,20198,10.800017


In [3]:
# Resetting the index
df = df.reset_index().rename(columns={"index": "date"})

In [4]:
df.head()

Unnamed: 0,date,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,avgTemp,bike_rides_daily,trip_duration
0,2022-01-01,88237B34DAFDB069,electric_bike,2022-01-01 00:00:13.532,2022-01-01 00:30:28.482,Broadway & W 160 St,8157.06,Broadway & W 160 St,8157.06,40.835773,-73.943398,40.835773,-73.943398,casual,11.6,20198,30.249167
1,2022-01-01,4545E384F601A7E9,classic_bike,2022-01-01 00:00:32.146,2022-01-01 00:02:15.504,Forsyth St & Broome St,5453.05,Forsyth St & Broome St,5453.05,40.718941,-73.992661,40.718941,-73.992661,member,11.6,20198,1.722633
2,2022-01-01,7CE9B56332A37463,electric_bike,2022-01-01 00:00:42.801,2022-01-01 00:11:38.969,Myrtle Ave & Marcy Ave,4707.03,Willoughby St & Fleet St,4628.05,40.695396,-73.949547,40.691967,-73.9813,member,11.6,20198,10.936133
3,2022-01-01,AEAF2FFEBB00845B,electric_bike,2022-01-01 00:00:50.604,2022-01-01 02:39:37.688,W 15 St & 7 Ave,6030.06,W 33 St & 7 Ave,6407.07,40.739357,-73.999321,40.750198,-73.990929,casual,11.6,20198,158.784733
4,2022-01-01,9B01074C8CABA851,classic_bike,2022-01-01 00:01:05.031,2022-01-01 00:11:53.032,W 15 St & 7 Ave,6030.06,W 36 St & 7 Ave,6483.06,40.739357,-73.999321,40.752148,-73.98954,member,11.6,20198,10.800017


In [5]:
df.shape

(29767925, 17)

### 02. Aggregated dataframe for map creation

In [6]:
# Create a value column
df['trips'] = 1  

# Group by start and end station names and count trips
df_kepler = df.groupby(['start_station_name', 'end_station_name'], observed=True)['trips'].count().reset_index()

In [7]:
df_kepler

Unnamed: 0,start_station_name,end_station_name,trips
0,1 Ave & E 110 St,1 Ave & E 110 St,791
1,1 Ave & E 110 St,1 Ave & E 18 St,2
2,1 Ave & E 110 St,1 Ave & E 30 St,4
3,1 Ave & E 110 St,1 Ave & E 39 St,1
4,1 Ave & E 110 St,1 Ave & E 44 St,12
...,...,...,...
1013374,Yankee Ferry Terminal,Water St & Main St,4
1013375,Yankee Ferry Terminal,West St & Chambers St,6
1013376,Yankee Ferry Terminal,West St & Liberty St,4
1013377,Yankee Ferry Terminal,West Thames St,1


In [8]:
print(df_kepler['trips'].sum())
print(df.shape)

29767925
(29767925, 18)


In [9]:
# Extract unique station coordinates
# In order to minimize memory, two separate datasets will be created for the station coordinates

# Extract unique start station coordinates
df_start_coords = df[['start_station_name', 'start_lat', 'start_lng']].drop_duplicates(subset=['start_station_name'])

# Extract unique end station coordinates
df_end_coords = df[['end_station_name', 'end_lat', 'end_lng']].drop_duplicates(subset=['end_station_name'])

In [10]:
# Reducing the dataframe size in hopes of reducing memory
df_kepler = df_kepler[['start_station_name', 'end_station_name', 'trips']]

In [11]:
# Optimize data types for memory efficiency

# Convert station names to categorical to reduce memory
df_kepler['start_station_name'] = df_kepler['start_station_name'].astype('category')
df_kepler['end_station_name'] = df_kepler['end_station_name'].astype('category')

# Convert trip count to int32 to reduce memory
df_kepler['trips'] = df_kepler['trips'].astype('int32')

# Convert lat/lng to float32 to save memory
df_start_coords['start_lat'] = df_start_coords['start_lat'].astype('float32')
df_start_coords['start_lng'] = df_start_coords['start_lng'].astype('float32')
df_end_coords['end_lat'] = df_end_coords['end_lat'].astype('float32')
df_end_coords['end_lng'] = df_end_coords['end_lng'].astype('float32')

In [12]:
# Merge coordinates with aggregated trip data

# Set index on coordinates for faster lookup
df_start_coords.set_index('start_station_name', inplace=True)
df_end_coords.set_index('end_station_name', inplace=True)

# Use join instead of merge for efficiency
df_kepler = df_kepler.join(df_start_coords, on='start_station_name', how='left')
df_kepler = df_kepler.join(df_end_coords, on='end_station_name', how='left')

# Reset index after join
df_kepler.reset_index(drop=True, inplace=True)

In [13]:
# Display memory usage
print(df_kepler.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1013379 entries, 0 to 1013378
Data columns (total 7 columns):
 #   Column              Non-Null Count    Dtype   
---  ------              --------------    -----   
 0   start_station_name  1013379 non-null  category
 1   end_station_name    1013379 non-null  category
 2   trips               1013379 non-null  int32   
 3   start_lat           1013379 non-null  float32 
 4   start_lng           1013379 non-null  float32 
 5   end_lat             1013379 non-null  float32 
 6   end_lng             1013379 non-null  float32 
dtypes: category(2), float32(4), int32(1)
memory usage: 23.3 MB
None


In [14]:
df_kepler.head()

Unnamed: 0,start_station_name,end_station_name,trips,start_lat,start_lng,end_lat,end_lng
0,1 Ave & E 110 St,1 Ave & E 110 St,791,40.792328,-73.938301,40.792328,-73.938301
1,1 Ave & E 110 St,1 Ave & E 18 St,2,40.792328,-73.938301,40.73381,-73.980545
2,1 Ave & E 110 St,1 Ave & E 30 St,4,40.792328,-73.938301,40.741444,-73.975357
3,1 Ave & E 110 St,1 Ave & E 39 St,1,40.792328,-73.938301,40.747139,-73.97113
4,1 Ave & E 110 St,1 Ave & E 44 St,12,40.792328,-73.938301,40.750019,-73.969055


In [15]:
# Define the full file path
file_path_2 = r"C:\Users\HP\Citi_Bike_Dashboard\02 Data\Prepared Data\df_final_locations_for_map.csv"

# Save the dataframe as a CSV file
df_kepler.to_csv(file_path_2, index=False)

# Confirm save location
print(f"File saved to: {file_path_2}")

File saved to: C:\Users\HP\Citi_Bike_Dashboard\02 Data\Prepared Data\df_final_locations_for_map.csv


### 03. Initialize an instance of a kepler.gl map.

In [15]:
# Create KeplerGl instance and add data
m = KeplerGl(height=700)
m.add_data(data=df_kepler, name="Citi Bike Trips")

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


I am unable to load the map via jupyter lab, so I will be customizing the map via browser and documenting the changes.

### 04. Map Customization

#### 1. Point Colors for Start & End Stations
To clearly distinguish between starting and ending bike stations:
- **🟢 Start Stations:** `#1a9850` (Green) → Represents **departure points**.
- **🔴 End Stations:** `#d73027` (Red) → Represents **destination points**.

**Why these colors?**  
Green signals **"go/start"**, while red signals **"stop/end"**, making it intuitive for users.

---

#### 2. Arc Colors for Trip Connections
The arc layer represents **trips between stations**, with colors indicating **trip density**:
- **🔵 Light Blue (`#91bfdb`)** → Low trip volume
- **🟡 Yellow (`#ffbf00`)** → Medium trip volume
- **🔴 Red (`#d73027`)** → High trip volume

**Why these colors?**  
A sequential color scale makes it **easy to see high-traffic routes**, with **red highlighting the busiest connections**.

---

#### 3. Using "Quantile" for trip arc
- Ensures equal distribution of trips across colors.  
- Highlights **high-density routes** in **red** while preserving detail in **low-trip connections**.  
- Creates a more visually informative map.

---

####  4. Additional Customizations
- **Dark base map** → Improves contrast with colored arcs.

---

#### Final Insights from the Map
- Red arcs indicate the busiest Citi Bike routes.
- **Frequent connections appear in clusters**, showing **popular commuting patterns**.

This visualization effectively highlights high-demand routes and provides insights for bike station optimization.

### 05. Map Filter

While messing around with the filters option, since there were so many trips to view it made it difficult to develop insights so I utilized python to pull the top starting stations, top ending stations, and top routes. After pulling the top results, I was able to confirm the stations and routes on the kepler map.

#### Find the Top Starting Stations

In [19]:
# Top 10 starting stations by total trips
top_start_stations = df_kepler.groupby('start_station_name')['trips'].sum().reset_index()
top_start_stations = top_start_stations.sort_values(by='trips', ascending=False).head(10)

# Display Top Starting Stations
print(top_start_stations)

           start_station_name   trips
1587          W 21 St & 6 Ave  128822
1718    West St & Chambers St  123045
495        Broadway & W 58 St  114040
286           6 Ave & W 33 St  106236
8             1 Ave & E 68 St  104685
461        Broadway & E 14 St   98656
485        Broadway & W 25 St   98237
1511  University Pl & E 14 St   96938
463        Broadway & E 21 St   95532
1603          W 31 St & 7 Ave   94030


  top_start_stations = df_kepler.groupby('start_station_name')['trips'].sum().reset_index()


#### Find the Top Ending Stations

In [20]:
# Top 10 ending stations by total trips
top_end_stations = df_kepler.groupby('end_station_name')['trips'].sum().reset_index()
top_end_stations = top_end_stations.sort_values(by='trips', ascending=False).head(10)

# Display results
print(top_end_stations)

             end_station_name   trips
1663          W 21 St & 6 Ave  130178
1796    West St & Chambers St  124335
514        Broadway & W 58 St  110312
8             1 Ave & E 68 St  105121
291           6 Ave & W 33 St  104935
1586  University Pl & E 14 St   99171
504        Broadway & W 25 St   98745
480        Broadway & E 14 St   98272
482        Broadway & E 21 St   95512
1679          W 31 St & 7 Ave   94110


  top_end_stations = df_kepler.groupby('end_station_name')['trips'].sum().reset_index()


#### Find the Top Station Pairs (Most Common Routes)

In [25]:
# Top 10 most common routes (start → end)
top_routes = df_kepler.groupby(['start_station_name', 'end_station_name'])['trips'].sum().reset_index()
top_routes = top_routes.sort_values(by='trips', ascending=False).head(10)

# Display results
top_routes.head(10)

  top_routes = df_kepler.groupby(['start_station_name', 'end_station_name'])['trips'].sum().reset_index()


Unnamed: 0,start_station_name,end_station_name,trips
1035223,Central Park S & 6 Ave,Central Park S & 6 Ave,12041
582079,7 Ave & Central Park South,7 Ave & Central Park South,8541
2530977,Roosevelt Island Tramway,Roosevelt Island Tramway,8213
1777563,Grand Army Plaza & Central Park S,Grand Army Plaza & Central Park S,7287
2599133,Soissons Landing,Soissons Landing,7275
2922013,W 21 St & 6 Ave,9 Ave & W 22 St,6345
445768,5 Ave & E 72 St,5 Ave & E 72 St,6037
12895,1 Ave & E 62 St,1 Ave & E 68 St,5826
3241999,Yankee Ferry Terminal,Yankee Ferry Terminal,5759
911809,Broadway & W 58 St,Broadway & W 58 St,5509


#### Insights

1. **Overlapping Stations suggest round trips**
   - `W 21 St & 6 Ave` appears in the **top start and end stations** suggesting many users start and end from the same place.
   - This could imply casual riders will return to the same spot, and commuters utilize these loops for getting to and from work.
2. **Central Park Dominance**
   - **4 of the top 5** routes are near Central Park
   - This suggests Citi bikes are heavily used for scenic rides and casual biking.
3. **Roosevelt Island**
   - Roosevelt Island is the third highest route, suggesting tourists and locals use Citi Bikes in this area.
4. **Top Communiter Corridors**
   - `W 21 St & 6 Ave → 9 Ave & W 22 St` likely represents **short work commutes** in Chelsea.
   - `1 Ave & E 62 St → 1 Ave & E 68 St` suggests **commutes between subway stations**.
   - Routes along **Broadway & Central Park** are heavily used.

---

**Insights**
- Round-trip rides occur heavily in Central Park and Roosevelt Island.
    - Increase bike availability in parks and greenways to support tourism.
- Commuter-friendly routes happen around Chelsea and Midtown Manhattan.
    - Ensure there are enough bikes at these commuter hubs.

### 06. Map Export

In [26]:
# Save the configuration
config = m.config

In [27]:
# Save configuration to a JSON file
import json

config_path = "config.json"
with open(config_path, "w") as outfile:
    json.dump(config, outfile)

In [28]:
# Save Kepler map as an interactive HTML file
m.save_to_html(file_name="Citi_Bike_Trips_Aggregated.html", read_only=False)

# Confirmation message
print(f"Kepler map saved as 'Citi_Bike_Trips_Aggregated.html' with configuration in '{config_path}'")

Map saved to Citi_Bike_Trips_Aggregated.html!
Kepler map saved as 'Citi_Bike_Trips_Aggregated.html' with configuration in 'config.json'
