In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Read the cleaned trip table from CSV; the cells below all work off df1.
df1 = pd.read_csv(filepath_or_buffer='trips_2018_cleaned.csv')

In [None]:
# Folder prefix that the plotting cells prepend to every saved figure name.
save_path = 'figures/'
# savefig does not create missing directories, so make sure the folder exists
# before any cell tries to write into it (no-op when it is already there).
os.makedirs(save_path, exist_ok=True)

In [None]:
# Mean pickups for each hour of the day: trips per hour bucket divided by the
# number of distinct dates found in the data.
n_days = len(df1['date'].unique())
hourly = df1.groupby('hour').size() / n_days

fig, ax = plt.subplots(figsize=(10, 5))
hourly.plot(kind='bar', ax=ax)
ax.set_title("Average Pickups per Hour (2018)")
ax.set_xlabel("Hour of Day")
ax.set_ylabel("Number of Trips")
# Save before show() so the written file contains the rendered figure.
fig.savefig(save_path + 'average_pickups_per_hour_2018.png')
plt.show()

In [None]:
# Monday-first ordering for the x-axis (groupby would sort the names alphabetically).
weekday_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# Average trips per weekday: total trips on that weekday divided by the number
# of distinct calendar dates of that weekday present in the data.
# Dividing by the count of distinct `week` values instead would assume every
# weekday occurs exactly once per distinct week value, which does not hold at
# year boundaries (a year can contain 53 occurrences of one weekday) or when
# some days have no recorded trips.
trips_by_weekday = df1.groupby('day_of_week').size()
dates_by_weekday = df1.groupby('day_of_week')['date'].nunique()
weekly = (trips_by_weekday / dates_by_weekday).reindex(weekday_order)

fig, ax = plt.subplots(figsize=(10, 5))
weekly.plot(kind='bar', ax=ax)
ax.set_title("Trips per Day of Week")
ax.set_ylabel("Trips")
ax.set_xlabel("Day of Week")
# Save before show() so the written file contains the rendered figure.
fig.savefig(save_path + 'trips_per_day_of_week.png')
plt.show()


In [None]:
# Total number of trips recorded in each month (raw counts, not averaged).
monthly = df1.groupby('month').size()

fig, ax = plt.subplots(figsize=(10, 5))
monthly.plot.line(marker='o', ax=ax)
ax.set_title("Monthly Demand Pattern")
ax.set_xlabel("Month")
ax.set_ylabel("Total Trips")
# Save before show() so the written file contains the rendered figure.
fig.savefig(save_path + 'monthly_demand_pattern.png')
plt.show()

In [None]:
# Rows of the heatmap, Monday first.
weekday_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# Trip counts per (weekday, hour) cell; 'count' tallies non-null bikeid values,
# and the reindex puts the weekday rows in calendar order.
pivot = (
    df1.pivot_table(index='day_of_week', columns='hour', values='bikeid', aggfunc='count')
    .reindex(weekday_order)
)

fig, ax = plt.subplots(figsize=(14, 6))
sns.heatmap(pivot, cmap="viridis", ax=ax)
ax.set_title("Heatmap of Trips: Day of Week vs Hour")
plt.show()


In [None]:
# Build one table of stations from both ends of every trip. The end-station
# columns are renamed to the start-station names so the two halves stack into
# the same three columns (downstream cells read the 'start_station_*' names).
start_side = df1[['start_station_id', 'start_station_latitude', 'start_station_longitude']]
end_side = df1[['end_station_id', 'end_station_latitude', 'end_station_longitude']].rename(
    columns={'end_station_id': 'start_station_id',
             'end_station_latitude': 'start_station_latitude',
             'end_station_longitude': 'start_station_longitude'}
)

# Drop incomplete rows FIRST, then keep a single row per station id.
# De-duplicating on the full (id, lat, lon) triple would keep several rows for
# a station whose recorded coordinates differ between trips, which inflates
# the station count and feeds the same station to the clustering repeatedly.
all_stations = (
    pd.concat([start_side, end_side])
    .dropna()
    .drop_duplicates(subset='start_station_id')
)

print(f"Total unique stations: {len(all_stations)}")

In [None]:

from sklearn.cluster import KMeans

# Approximate degree -> meter scaling for the New York area (about 40.7°N):
# one degree of latitude is roughly 111,000 m and one degree of longitude is
# roughly 85,000 m at about 40°N. Scaling both axes to meters keeps the
# Euclidean distances used by KMeans comparable in both directions.
METERS_PER_DEG_LAT = 111000
METERS_PER_DEG_LON = 85000

lat = all_stations['start_station_latitude'].values
lon = all_stations['start_station_longitude'].values

lat_meters = lat * METERS_PER_DEG_LAT
lon_meters = lon * METERS_PER_DEG_LON

# Two-column feature matrix: (latitude in meters, longitude in meters).
X_meters = np.column_stack((lat_meters, lon_meters))

# Fixed random_state so the cluster labels are reproducible between runs.
kmeans = KMeans(n_clusters=20, random_state=67, n_init=20)
all_stations['cluster'] = kmeans.fit_predict(X_meters)

In [None]:
# Lookup table: station id -> cluster label assigned by KMeans.
station_to_cluster = {
    station_id: cluster_label
    for station_id, cluster_label in zip(all_stations['start_station_id'],
                                         all_stations['cluster'])
}

# Tag both ends of every trip in df1 with its cluster; ids that are not in the
# lookup come out as NaN.
for side in ('start', 'end'):
    df1[side + '_cluster'] = df1[side + '_station_id'].map(station_to_cluster)


In [None]:
def aggregate_hourly_demand(df, cluster_id, target='pickups'):
    """Count trips per hour for one cluster as a gap-free hourly series.

    Parameters
    ----------
    df : pandas.DataFrame
        Trip table. Must contain 'date' and 'hour' columns plus
        'start_cluster' (used when ``target == 'pickups'``) or 'end_cluster'
        (used for any other ``target`` value).
    cluster_id :
        Cluster label to select; compared with ``==`` against the cluster column.
    target : str, default 'pickups'
        'pickups' counts trips starting in the cluster. Any other value counts
        trips ending in the cluster. The value is also used as the name of the
        single output column.

    Returns
    -------
    pandas.DataFrame
        One column named ``target`` holding the trip count per hour, indexed by
        an hourly DatetimeIndex that runs from the first to the last hour in
        which this cluster has at least one trip; hours without trips are 0.
        When the cluster has no trips at all, an empty frame with the same
        column is returned.
    """
    cluster_col = 'start_cluster' if target == 'pickups' else 'end_cluster'

    # Only the two time columns are needed, so avoid copying the whole frame.
    in_cluster = df.loc[df[cluster_col] == cluster_id, ['date', 'hour']]

    # Hour-resolution timestamp of every trip: calendar date + hour offset.
    # Rows with a missing date or hour become NaT and are not counted.
    timestamps = (
        pd.to_datetime(in_cluster['date'])
        + pd.to_timedelta(in_cluster['hour'], unit='h')
    ).dropna()

    if timestamps.empty:
        # No trips for this cluster: min()/max() would be NaT and date_range
        # would raise, so hand back an empty result of the usual layout.
        return pd.DataFrame({target: pd.Series(dtype='int64')},
                            index=pd.DatetimeIndex([]))

    hourly_counts = timestamps.groupby(timestamps).size().to_frame(name=target)

    # Fill the hours that had no trips with 0 so the series has no gaps.
    # Lowercase 'h' is the hourly alias; uppercase 'H' is deprecated in pandas >= 2.2.
    full_range = pd.date_range(start=hourly_counts.index.min(),
                               end=hourly_counts.index.max(),
                               freq='h')
    return hourly_counts.reindex(full_range, fill_value=0)

In [None]:
# Hourly pickup counts for cluster 0, drawn as a time series.
hourly_data = aggregate_hourly_demand(df1, cluster_id=0, target='pickups')

fig, ax = plt.subplots(figsize=(12, 6))
hourly_data['pickups'].plot(ax=ax)
ax.set_title("Hourly Pickups for Cluster 0")
ax.set_xlabel("Time")
ax.set_ylabel("Number of Pickups")
plt.show()

In [None]:
#acf and pacf of the hourly pickups for cluster 0
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Autocorrelation and partial autocorrelation of the cluster-0 hourly pickups,
# each drawn on its own figure with 50 lags.
pickups_cluster0 = hourly_data['pickups']
for draw_correlogram, figure_title in (
    (plot_acf, "ACF of Hourly Pickups for Cluster 0"),
    (plot_pacf, "PACF of Hourly Pickups for Cluster 0"),
):
    draw_correlogram(pickups_cluster0, lags=50)
    # Replace the default title on the axes the plot function just drew.
    plt.title(figure_title)
    plt.show()
