# Preprocessing Data

[Citi Bike Data](https://citibikenyc.com/system-data)

In [1]:
import pandas as pd

In [2]:
# Load the raw ride export. Every column is read as text (dtype='str') and
# the first CSV column is used as the row index.
df = pd.read_csv(
    filepath_or_buffer='../data/2025-citibike.csv',
    dtype='str',
    index_col=[0],
)
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,56BD148A05E26915,electric_bike,2025-01-01 22:19:06.324,2025-01-01 22:23:32.899,W 36 St & 7 Ave,6483.06,W 24 St & 7 Ave,6257.03,40.752149,-73.989539,40.74487634,-73.99529885,member
1,700CCCDF00C08077,electric_bike,2025-01-09 15:20:43.991,2025-01-09 15:26:19.137,Broadway & E 19 St,5980.11,W 24 St & 7 Ave,6257.03,40.73829,-73.99006,40.74487634,-73.99529885,member
2,B1A3FAFA5FE651CE,electric_bike,2025-01-01 12:54:26.571,2025-01-01 13:03:03.565,Jackson Ave & 46 Rd,6203.02,Crescent St & Broadway,6827.11,40.74524768,-73.94733276,40.7633589,-73.9286471,member
3,9CB0F28054BA5EBC,electric_bike,2025-01-14 13:43:59.151,2025-01-14 13:49:04.543,Broadway & E 19 St,5980.11,W 24 St & 7 Ave,6257.03,40.73829,-73.99006,40.74487634,-73.99529885,member
4,A1B8FDD4FAF0C2B2,electric_bike,2025-01-13 07:20:37.176,2025-01-13 07:29:18.015,Willis Ave & E 143 St,7798.02,1 Ave & E 110 St,7522.02,40.812299,-73.92037,40.7923272,-73.9383,member


In [3]:
# Display the (rows, columns) size of the freshly loaded frame.
df.shape

(15374152, 13)

In [4]:
# Number of missing values in each column.
df.isnull().sum()

ride_id                   0
rideable_type             0
started_at                0
ended_at                  0
start_station_name     5375
start_station_id       5375
end_station_name      38086
end_station_id        39983
start_lat                 0
start_lng                 0
end_lat                2605
end_lng                2605
member_casual             0
dtype: int64

In [5]:
def fix_ids(column_name, frame=None):
    """Convert a station-id column to numeric values.

    Underscores are stripped from every id string and the result is parsed
    with ``pd.to_numeric``. Values that still cannot be parsed as a number
    become NaN (``errors='coerce'``).

    Parameters
    ----------
    column_name : str
        Name of the column holding the ids.
    frame : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``df`` is used, so existing ``fix_ids('col')`` calls keep working.

    Returns
    -------
    pandas.Series
        The parsed ids, with NaN where parsing failed.
    """
    if frame is None:
        # Fall back to the notebook's global ride frame.
        frame = df
    # regex=False: '_' is a literal character here, not a pattern.
    ids_series = frame[column_name].str.replace('_', '', regex=False)
    return pd.to_numeric(ids_series, errors='coerce')

# Replace both station-id columns with their numeric form.
for id_column in ('start_station_id', 'end_station_id'):
    df[id_column] = fix_ids(id_column)

In [6]:
# Station attributes as seen at the start of each ride, under neutral names.
start_stations = df[['start_station_name', 'start_station_id', 'start_lat', 'start_lng']]
start_stations = start_stations.rename(
    columns={
        'start_station_name': 'station_name',
        'start_station_id': 'station_id',
        'start_lat': 'latitude',
        'start_lng': 'longitude',
    }
)

# The same attributes as seen at the end of each ride.
end_stations = df[['end_station_name', 'end_station_id', 'end_lat', 'end_lng']]
end_stations = end_stations.rename(
    columns={
        'end_station_name': 'station_name',
        'end_station_id': 'station_id',
        'end_lat': 'latitude',
        'end_lng': 'longitude',
    }
)

# Stack both views, then discard incomplete rows and exact repeats.
stations_df = pd.concat(objs=[start_stations, end_stations], ignore_index=True)
stations_df = stations_df.dropna()
stations_df = stations_df.drop_duplicates()

# Lookup from station name to station id; a name that occurs on several
# rows ends up with a single entry in the dict.
station_ids_by_name = stations_df.set_index('station_name')['station_id']
stations_mapper = station_ids_by_name.to_dict()

print(f'Stations: {len(stations_mapper)}')

Stations: 2159


In [7]:
# Persist the station table as CSV (the row index is written as well).
stations_df.to_csv(path_or_buf='../data/citibike-stations.csv')

In [8]:
# Re-derive each station id from its station name through the lookup dict;
# names that are not in the dict come out as NaN.
for name_column, id_column in (
    ('start_station_name', 'start_station_id'),
    ('end_station_name', 'end_station_id'),
):
    df[id_column] = df[name_column].map(stations_mapper)

In [9]:
# Remove, in place, every ride that has a missing value in any column.
df.dropna(inplace=True)

In [10]:
# Parse the timestamp strings into datetime columns.
df['started_at'] = pd.to_datetime(df['started_at'])
df['ended_at'] = pd.to_datetime(df['ended_at'])

# Keep only rides that started after 2024. The explicit .copy() makes the
# filtered frame independent of the original, so the column assignments
# below do not raise SettingWithCopyWarning.
df = df[df['started_at'].dt.year > 2024].copy()

# Calendar features derived from the start time.
df['month'] = df['started_at'].dt.month
df['day_name'] = df['started_at'].dt.day_name()
# Ride length in seconds (end minus start).
df['trip_duration'] = (df['ended_at'] - df['started_at']).dt.total_seconds()

In [11]:
# Turn the categorical codes into display labels: underscores in the bike
# type become spaces, and both columns are title-cased.
bike_type_spaced = df['rideable_type'].str.replace('_', ' ')
df['rideable_type'] = bike_type_spaced.str.title()

rider_kind = df['member_casual']
df['member_casual'] = rider_kind.str.title()

In [12]:
# reset_index returns a new frame rather than modifying df, so the result
# must be assigned back; otherwise the gaps left by the earlier row drops
# stay in the index and are carried into the exported files.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,month,day_name,trip_duration
0,56BD148A05E26915,Electric Bike,2025-01-01 22:19:06.324,2025-01-01 22:23:32.899,W 36 St & 7 Ave,6483.06,W 24 St & 7 Ave,6257.03,40.752149,-73.989539,40.74487634,-73.99529885,Member,1,Wednesday,266.575
1,700CCCDF00C08077,Electric Bike,2025-01-09 15:20:43.991,2025-01-09 15:26:19.137,Broadway & E 19 St,5980.11,W 24 St & 7 Ave,6257.03,40.73829,-73.99006,40.74487634,-73.99529885,Member,1,Thursday,335.146
2,B1A3FAFA5FE651CE,Electric Bike,2025-01-01 12:54:26.571,2025-01-01 13:03:03.565,Jackson Ave & 46 Rd,6203.02,Crescent St & Broadway,6827.11,40.74524768,-73.94733276,40.7633589,-73.9286471,Member,1,Wednesday,516.994
3,9CB0F28054BA5EBC,Electric Bike,2025-01-14 13:43:59.151,2025-01-14 13:49:04.543,Broadway & E 19 St,5980.11,W 24 St & 7 Ave,6257.03,40.73829,-73.99006,40.74487634,-73.99529885,Member,1,Tuesday,305.392
4,A1B8FDD4FAF0C2B2,Electric Bike,2025-01-13 07:20:37.176,2025-01-13 07:29:18.015,Willis Ave & E 143 St,7798.02,1 Ave & E 110 St,7522.02,40.812299,-73.92037,40.7923272,-73.9383,Member,1,Monday,520.839


In [13]:
# Export the cleaned rides as CSV.
df.to_csv(path_or_buf='../data/2025-citibike-clean.csv')

In [14]:
# Export the cleaned rides as Parquet.
df.to_parquet(path='../data/2025-citibike.parquet')