In [115]:
'''
This notebook cleans csv files downloaded from Citi Bike. 
'''

'\nThis notebook cleans csv files downloaded from Citi Bike. \n'

In [116]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# plotting-- for future use
plt.style.use('default')
sns.set_palette("husl")

# loading
data_dir = Path("../data/raw")
csv_files = list(data_dir.glob("*.csv"))
print(f"Available data files: {[f.name for f in csv_files]}")

# load first file, explicit index
df = pd.read_csv(csv_files[0], index_col='ride_id', low_memory=False)
print(f"Dataset shape: {df.shape}")
df.head()

Available data files: ['202503-citibike-tripdata.csv', '202401-citibike-tripdata.csv']
Dataset shape: (3168271, 12)


Unnamed: 0_level_0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
ride_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
A60B1C35CE5E45FD,electric_bike,2025-03-05 09:08:03.136,2025-03-05 09:15:52.755,E 55 St & 2 Ave,6650.07,E 40 St & Park Ave,6432.11,40.757973,-73.966033,40.750756,-73.978326,member
469B08AA25EDCB64,electric_bike,2025-03-06 07:40:49.163,2025-03-06 07:48:29.447,E 55 St & 2 Ave,6650.07,E 43 St & Madison Ave,6551.11,40.757973,-73.966033,40.753547,-73.978966,member
7C5B0D17ADC24243,electric_bike,2025-03-14 09:20:10.576,2025-03-14 09:27:00.757,Sterling Pl & 5 Ave,4208.01,Nevins St & Schermerhorn St,4437.09,40.67897,-73.978553,40.687372,-73.981761,member
5D31A24EC019BA0D,electric_bike,2025-03-10 08:01:32.317,2025-03-10 08:20:22.409,Lexington Ave & E 111 St,7567.06,E 40 St & Park Ave,6432.11,40.795412,-73.944123,40.750756,-73.978326,member
8A65DD5EAB71F1D5,electric_bike,2025-03-14 08:22:50.328,2025-03-14 08:36:35.953,W 90 St & Amsterdam Ave,7458.09,E 43 St & Madison Ave,6551.11,40.79018,-73.97289,40.753547,-73.978966,member


In [117]:
# overview data

# column names for reference
print("Column names:")
print(df.columns.tolist())

# review. correct data types next if necessary.
print("\nData types:")
print(df.dtypes)

# determine if these are necessary to keep for analytics-- volume, granularity
print("\nMissing values:")
print(df.isnull().sum())

Column names:
['rideable_type', 'started_at', 'ended_at', 'start_station_name', 'start_station_id', 'end_station_name', 'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng', 'member_casual']

Data types:
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object

Missing values:
rideable_type            0
started_at               0
ended_at                 0
start_station_name    1053
start_station_id      1053
end_station_name      8069
end_station_id        8454
start_lat                0
start_lng                0
end_lat                527
end_lng                527
member_casual            0
dtype: int64


In [118]:
# change data types for dates and objects
# use coerce to avoid raising failures 
df['started_at'] = pd.to_datetime(df['started_at'], errors='coerce') 
df['ended_at'] = pd.to_datetime(df['ended_at'], errors='coerce') 

# change objects to string
df[['rideable_type', 'start_station_name', 'end_station_name', 'member_casual']] = df[['rideable_type', 'start_station_name', 'end_station_name', 'member_casual']].astype('string')

In [119]:
# count missing values & percentages & flag with text
missing_data = pd.DataFrame({
    'Missing_Count': df.isnull().sum(),
    'Missing_Percentage': (df.isnull().sum() / len(df)) * 100
})
missing_data = missing_data[missing_data['Missing_Count'] > 0]  # only show columns with missing counts
print(missing_data)

max_missing_pct = missing_data['Missing_Percentage'].max() # percentages not counts
print(f"Maximum missing data percentage: {max_missing_pct:.3f}%")

if max_missing_pct < 1.0:
    print("✓ Excellent data quality: All missing data is below 1% threshold.")
    print("  Missing data should not affect analysis results.")

                    Missing_Count  Missing_Percentage
start_station_name           1053            0.033236
start_station_id             1053            0.033236
end_station_name             8069            0.254681
end_station_id               8454            0.266833
end_lat                       527            0.016634
end_lng                       527            0.016634
Maximum missing data percentage: 0.267%
✓ Excellent data quality: All missing data is below 1% threshold.
  Missing data should not affect analysis results.


In [120]:
# drop rows with nulls & confirm counts
df_clean = df.dropna().copy()
df_clean.reset_index(drop=False, inplace=True)
df_clean.isnull().sum()

ride_id               0
rideable_type         0
started_at            0
ended_at              0
start_station_name    0
start_station_id      0
end_station_name      0
end_station_id        0
start_lat             0
start_lng             0
end_lat               0
end_lng               0
member_casual         0
dtype: int64

In [122]:
# export cleaned data for future use
# using parquet to maintain data types 
df_clean.to_parquet('../data/processed/cleaned_citibike_data.parquet')
