## Configuration
The kepler.gl map configuration for this notebook is stored in `../config.json`; the final cell writes it after the map has been built.

# 2.5: Advanced Geospatial Plotting

This notebook aggregates trips between stations and visualizes the most popular flows using kepler.gl.

In [1]:
# 1. Import libraries
import sys
from pathlib import Path
import os
import numpy as np
import pandas as pd
import matplotlib
import geopandas as gpd
import json
import shapely
import keplergl
from keplergl import KeplerGl

In [3]:
# 2. Path resolution and setup
# Relative locations: both directories sit one level above the working directory.
DATA_DIR = "../data"
OUTPUT_DIR = "../visualizations"

# Input files: the two source CSVs, each joined onto DATA_DIR
TRIPS_FP, WEATHER_FP = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ("citibike_weather_2022.csv", "laguardia_2022_weather.csv")
)




## 3. Load, aggregate and prepare data

In [4]:
# 3. Load, aggregate and prepare data
# Reads the trip and weather CSVs, counts trips per origin-destination pair
# (station names plus coordinates), and keeps the TOP_N busiest pairs in
# `top_arcs` for the kepler.gl map.

# Load trips and weather
trips = pd.read_csv(TRIPS_FP, parse_dates=['started_at', 'ended_at', 'date'], low_memory=False)
weather = pd.read_csv(WEATHER_FP, parse_dates=['date'])

# Normalize station column names if needed: an alternative name is renamed only
# when the expected column is absent, so an existing column is never shadowed.
alt_station_cols = {
    'from_station_name': 'start_station_name',
    'to_station_name': 'end_station_name',
}
trips = trips.rename(columns={
    alt: expected
    for alt, expected in alt_station_cols.items()
    if expected not in trips.columns and alt in trips.columns
})

# Ensure station and coordinate columns exist
required_trip_cols = ['start_station_name', 'end_station_name', 'start_lat', 'start_lng', 'end_lat', 'end_lng']
missing = [c for c in required_trip_cols if c not in trips.columns]
if missing:
    raise RuntimeError(f'Missing required trip columns: {missing}')

# Add value for aggregation: every row counts as one trip
trips['value'] = 1

# groupby() discards rows that have a missing value in any key column, which is
# why the aggregated sum can be smaller than the raw row count. Separate those
# rows explicitly so the gap is measured and reported instead of silent.
trips_complete = trips.dropna(subset=required_trip_cols)
n_incomplete = len(trips) - len(trips_complete)

# Group by origin-destination and coordinates
df_grouped = (
    trips_complete.groupby(required_trip_cols)['value']
    .sum()
    .rename('trips')
    .reset_index()
)

print("Original trip count:", len(trips))
print("Trips without a complete origin-destination record:", n_incomplete)
print("Aggregated trip sum:", int(df_grouped['trips'].sum()))

# Every complete trip must land in exactly one origin-destination group.
assert int(df_grouped['trips'].sum()) == len(trips_complete), \
    "aggregation lost or duplicated trips"

# Prepare arcs DataFrame for kepler.gl (coordinates first, then labels and volume)
arc_columns = [
    'start_lat', 'start_lng',
    'end_lat', 'end_lng',
    'start_station_name', 'end_station_name',
    'trips',
]
arcs_df = df_grouped[arc_columns].copy()

# limit to top N flows
TOP_N = 500
top_arcs = arcs_df.nlargest(TOP_N, 'trips').copy()

Original trip count: 786983
Aggregated trip sum: 784166


In [5]:
# 4. Visualize with kepler.gl
# Reuse a map configuration exported by an earlier run, when one is on disk, so
# layer styling does not have to be redone by hand after Restart & Run All.
# With no usable file the map falls back to an empty configuration.
MAP_CONFIG_FP = '../config.json'
map_config = {}
if os.path.exists(MAP_CONFIG_FP):
    with open(MAP_CONFIG_FP) as saved_file:
        saved_config = json.load(saved_file)
    # NOTE(review): kepler.gl exports are expected to be a dict with a top-level
    # 'config' key; anything else is ignored rather than passed to the widget.
    if isinstance(saved_config, dict) and 'config' in saved_config:
        map_config = saved_config

# Instantiate kepler.gl map
m = KeplerGl(height=700, data={'popular_trips': top_arcs}, config=map_config)

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


## Customization notes
- Limited to the top 500 station-to-station flows to reduce clutter.
- Arc thickness and color encode trip volume (the `trips` column).
- Filters on `trips` can isolate high-frequency flows.

In [6]:
# Display the map: a bare last expression is rendered as the cell's output.
m

KeplerGl(data={'popular_trips':         start_lat  start_lng    end_lat    end_lng  \
63412   40.735938 -74.03…

In [7]:
# 6. Save current configuration and export interactive HTML

# Take the map's configuration as it stands right now.
config = m.config

# Destinations for the two artifacts written by this cell.
config_path = '../config.json'
html_path = '../kepler_popular_trips_map.html'

# Persist the configuration as indented JSON.
with open(config_path, 'w') as config_file:
    json.dump(config, config_file, indent=2)
print('Saved config to', config_path)

# Export the map to a standalone HTML file with the same configuration.
m.save_to_html(file_name=html_path, read_only=False, config=config)
print('Saved HTML map to', html_path)

Saved config to ../config.json
Map saved to ../kepler_popular_trips_map.html!
Saved HTML map to ../kepler_popular_trips_map.html
