In [1]:
import glob
import pandas as pd

## Load Data

In [2]:
trip_files = glob.glob("./202409-citibike-tripdata/*")

In [3]:
trip_files

['./202409-citibike-tripdata/202409-citibike-tripdata_2.csv.zip',
 './202409-citibike-tripdata/202409-citibike-tripdata_3.csv.zip',
 './202409-citibike-tripdata/202409-citibike-tripdata_1.csv.zip',
 './202409-citibike-tripdata/202409-citibike-tripdata_5.csv.zip',
 './202409-citibike-tripdata/202409-citibike-tripdata_4.csv.zip']

In [4]:
df = pd.concat([ 
    pd.read_csv(
        tf,
        parse_dates=["started_at", "ended_at"],
        dtype={
            "start_station_name": str,
            "end_station_name": str,
            "start_station_id": str,
            "end_station_id": str
        }
    ) for tf in trip_files
])

In [5]:
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,BD5299CB404DEC67,classic_bike,2024-09-06 18:53:56.193,2024-09-06 18:57:08.782,Forest Ave & Summerfield St,4798.02,St Felix Ave & 61 St,4719.04,40.69922,-73.89795,40.69715,-73.8936,member
1,743B1F43BCC71F11,electric_bike,2024-09-02 21:09:09.862,2024-09-02 21:13:16.881,E 7 St & Ave B,5584.05,Rivington St & Ridge St,5406.02,40.725129,-73.981317,40.718502,-73.983299,member
2,1642CEBE805FA026,electric_bike,2024-09-17 07:29:14.311,2024-09-17 08:03:10.812,7 Ave & 20 St,3588.01,Central Ave & Woodbine St,4632.1,40.659053,-73.98854,40.69296,-73.91605,member
3,B4D4FA8F4D1B7A79,electric_bike,2024-09-13 12:39:51.392,2024-09-13 12:59:46.772,E 53 St & 3 Ave,6617.02,Rivington St & Ridge St,5406.02,40.757632,-73.969306,40.718502,-73.983299,member
4,B6A18EDFBB29C904,classic_bike,2024-09-23 08:52:08.749,2024-09-23 08:55:59.812,Cadman Plaza E & Red Cross Pl,4821.06,Main St & Plymouth St,4936.21,40.699918,-73.989718,40.703782,-73.990734,member


In [6]:
df.dtypes

ride_id                       object
rideable_type                 object
started_at            datetime64[ns]
ended_at              datetime64[ns]
start_station_name            object
start_station_id              object
end_station_name              object
end_station_id                object
start_lat                    float64
start_lng                    float64
end_lat                      float64
end_lng                      float64
member_casual                 object
dtype: object

## Data Basics

In [7]:
df["rideable_type"].value_counts()

rideable_type
electric_bike    3321206
classic_bike     1676692
Name: count, dtype: int64

In [8]:
df["member_casual"].value_counts()

member_casual
member    3954888
casual    1043010
Name: count, dtype: int64

In [9]:
df["start_station_name"].value_counts()

start_station_name
W 21 St & 6 Ave                    17445
Broadway & E 14 St                 16525
Broadway & W 58 St                 16509
8 Ave & W 31 St                    16109
West St & Chambers St              15910
                                   ...  
Andrew Ave N & Hall of Fame Tce       28
Brooklyn Ave & Snyder Ave             25
E 182 St & Park Ave                   15
Pier 40 X2                             5
Lab - NYC - M                          1
Name: count, Length: 2140, dtype: int64

In [10]:
df["end_station_name"].value_counts()

end_station_name
W 21 St & 6 Ave          17455
Broadway & E 14 St       16269
8 Ave & W 31 St          16257
Broadway & W 58 St       16044
West St & Chambers St    16032
                         ...  
Brunswick St                 1
Newport Pkwy                 1
Madison St & 10 St           1
Washington St                1
Astor Place                  1
Name: count, Length: 2188, dtype: int64

In [11]:
df["started_at"].apply(lambda x: f"{x.day}-{x.month}-{x.year}").value_counts()

started_at
20-9-2024    194315
19-9-2024    192615
21-9-2024    189916
13-9-2024    189697
12-9-2024    189245
11-9-2024    189173
18-9-2024    185879
10-9-2024    185296
14-9-2024    185270
17-9-2024    183013
27-9-2024    181046
4-9-2024     179690
24-9-2024    178192
6-9-2024     176778
5-9-2024     173029
25-9-2024    172867
16-9-2024    169636
9-9-2024     167796
30-9-2024    167474
3-9-2024     165431
22-9-2024    165242
15-9-2024    164711
26-9-2024    160454
8-9-2024     160120
23-9-2024    152069
2-9-2024     144419
7-9-2024     143523
1-9-2024     128478
28-9-2024     90069
29-9-2024     71332
31-8-2024      1123
Name: count, dtype: int64

## Discussion

What are some data points we could develop that could help us find similar stations?