In [1]:
import glob
import pandas as pd

## Load Data

In [2]:
trip_files = glob.glob("./202510-citibike-tripdata/*")

In [3]:
trip_files

['./202510-citibike-tripdata/202510-citibike-tripdata_3.csv',
 './202510-citibike-tripdata/202510-citibike-tripdata_2.csv',
 './202510-citibike-tripdata/202510-citibike-tripdata_1.csv',
 './202510-citibike-tripdata/202510-citibike-tripdata_5.csv',
 './202510-citibike-tripdata/202510-citibike-tripdata_4.csv']

In [4]:
df = pd.concat([ 
    pd.read_csv(
        tf,
        parse_dates=["started_at", "ended_at"],
        dtype={
            "start_station_name": str,
            "end_station_name": str,
            "start_station_id": str,
            "end_station_id": str
        }
    ) for tf in trip_files
])

In [5]:
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,557FE56828BC7B72,electric_bike,2025-10-02 14:01:28.598,2025-10-02 14:06:46.526,E 2 St & 2 Ave,5593.02,Pike St & E Broadway,5270.05,40.725029,-73.990697,40.714067,-73.992939,member
1,0C147B351D46C123,electric_bike,2025-10-04 19:10:31.195,2025-10-04 19:11:42.659,4 Ave & 9 St,3955.05,4 Ave & 9 St,3955.05,40.670513,-73.988766,40.670513,-73.988766,member
2,F95C5036724DB9E8,electric_bike,2025-10-01 16:33:41.956,2025-10-01 16:47:40.976,Pier 40 - Hudson River Park,5696.03,Pike St & E Broadway,5270.05,40.727714,-74.011296,40.714067,-73.992939,member
3,2E0404A1CDF25730,electric_bike,2025-10-06 01:00:01.922,2025-10-06 01:25:45.023,1 Ave & E 30 St,6079.03,1 Ave & E 30 St,6079.03,40.741444,-73.975361,40.741444,-73.975361,member
4,33F394645EF291E3,classic_bike,2025-10-06 18:47:27.905,2025-10-06 18:54:54.480,74 St & 37 Ave,6332.06,77 St & 31 Ave,6718.02,40.74908,-73.89172,40.75886,-73.89081,member


In [6]:
df.shape

(4731900, 13)

In [7]:
df.dtypes

ride_id                       object
rideable_type                 object
started_at            datetime64[ns]
ended_at              datetime64[ns]
start_station_name            object
start_station_id              object
end_station_name              object
end_station_id                object
start_lat                    float64
start_lng                    float64
end_lat                      float64
end_lng                      float64
member_casual                 object
dtype: object

## Data Basics

In [11]:
df["rideable_type"].value_counts()

rideable_type
electric_bike    3338094
classic_bike     1393806
Name: count, dtype: int64

In [12]:
df["member_casual"].value_counts()

member_casual
member    3932027
casual     799873
Name: count, dtype: int64

In [13]:
df["start_station_name"].nunique()

2166

In [14]:
df["start_station_name"].value_counts()

start_station_name
W 21 St & 6 Ave             18123
Lafayette St & E 8 St       15836
W 31 St & 7 Ave             14641
Broadway & E 14 St          14509
9 Ave & W 33 St             14481
                            ...  
Colonial Rd & 89 St             5
53 St & Roosevelt Ave           4
Whitney Ave & Ketcham Pl        3
84 St & 3 Ave                   3
Valentine Ave & E 198 St        1
Name: count, Length: 2166, dtype: int64

In [15]:
df["end_station_name"].value_counts()

end_station_name
W 21 St & 6 Ave                           18195
Lafayette St & E 8 St                     15899
W 31 St & 7 Ave                           14669
9 Ave & W 33 St                           14643
Broadway & E 14 St                        14533
                                          ...  
City Hall - Washington St & 1 St              1
6 St & Grand St                               1
York St & Marin Blvd                          1
Mama Johnson Field - 4 St & Jackson St        1
Bergen Ave                                    1
Name: count, Length: 2223, dtype: int64

In [16]:
df["end_station_name"].nunique()

2223

In [17]:
df["started_at"]

0        2025-10-02 14:01:28.598
1        2025-10-04 19:10:31.195
2        2025-10-01 16:33:41.956
3        2025-10-06 01:00:01.922
4        2025-10-06 18:47:27.905
                   ...          
999995   2025-10-25 08:20:01.282
999996   2025-10-16 15:10:52.610
999997   2025-10-21 13:19:14.311
999998   2025-10-25 16:09:24.185
999999   2025-10-24 17:40:45.835
Name: started_at, Length: 4731900, dtype: datetime64[ns]

In [18]:
df["started_at"].apply(lambda x: f"{x.day}-{x.month}-{x.year}").value_counts()

started_at
4-10-2025     188107
7-10-2025     186879
3-10-2025     186772
1-10-2025     186152
6-10-2025     173796
9-10-2025     173367
15-10-2025    172792
10-10-2025    170692
2-10-2025     170113
17-10-2025    169982
18-10-2025    169240
5-10-2025     169204
21-10-2025    168812
24-10-2025    164319
16-10-2025    163938
22-10-2025    163714
23-10-2025    162978
29-10-2025    156560
31-10-2025    156195
19-10-2025    155422
14-10-2025    148766
25-10-2025    148611
28-10-2025    147286
8-10-2025     147259
27-10-2025    141736
26-10-2025    133479
20-10-2025    121058
11-10-2025    120557
30-10-2025     76426
12-10-2025     75938
13-10-2025     61111
30-9-2025        639
Name: count, dtype: int64

In [19]:
df.loc[lambda x: x["end_station_name"].isnull()]["member_casual"].value_counts(normalize=True)

member_casual
member    0.644787
casual    0.355213
Name: proportion, dtype: float64

## Discussion

What are some data points we could develop that could help us find similar stations?

----

----

----