In [8]:
print('''
This notebook cleans csv files downloaded from Citi Bike. 
''')


This notebook cleans csv files downloaded from Citi Bike. 



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# plotting-- for future use
plt.style.use('default')
sns.set_palette("husl")

# loading
data_dir = Path("../data/raw")
csv_files = list(data_dir.glob("*.csv"))
print(f"Available data files: {[f.name for f in csv_files]}")

# load first file, explicit index
df = pd.read_csv(csv_files[0], index_col='ride_id', low_memory=False)
print(f"Dataset shape: {df.shape}")
df.head()

Available data files: ['202404-citibike-tripdata.csv', '202405-citibike-tripdata_1.csv', '202412-citibike-tripdata_1.csv', '202503-citibike-tripdata.csv', '202504-citibike-tripdata_3.csv', '202411-citibike-tripdata_3.csv', '202406-citibike-tripdata_5.csv', '202409-citibike-tripdata_1.csv', '202402-citibike-tripdata.csv', '202502-citibike-tripdata_3.csv', '202408-citibike-tripdata_3.csv', '202410-citibike-tripdata_6.csv', '202501-citibike-tripdata_1.csv', '202407-citibike-tripdata_1.csv', '202505-citibike-tripdata_4.csv', '202401-citibike-tripdata.csv', '202403-citibike-tripdata.csv']
Dataset shape: (3217063, 12)


Unnamed: 0_level_0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
ride_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
F561526822C9D60B,electric_bike,2024-04-27 13:56:13.940,2024-04-27 14:05:23.629,FDR Drive & E 35 St,6230.04,E 10 St & 2 Ave,5746.02,40.743955,-73.971391,40.729708,-73.986598,member
359BAF91507F4998,electric_bike,2024-04-25 15:23:14.529,2024-04-25 15:27:52.895,Forsyth St & Grand St,5382.07,E 10 St & 2 Ave,5746.02,40.717741,-73.993388,40.729708,-73.986598,member
AAEE95A1C0106C97,electric_bike,2024-04-06 11:15:18.132,2024-04-06 11:22:10.081,E 20 St & 2 Ave,5971.08,Mott St & Prince St,5561.04,40.73579,-73.981693,40.72318,-73.9948,member
95B077C9C619D404,electric_bike,2024-04-06 16:19:25.749,2024-04-06 16:21:43.098,Eastern Pkwy & Washington Ave,3928.08,Eastern Pkwy & Franklin Ave (SW Corner),3919.12,40.671649,-73.963115,40.670529,-73.958222,member
1A33C864454C4692,electric_bike,2024-04-10 17:40:14.700,2024-04-10 17:48:11.571,W 27 St & 6 Ave,6215.07,E 25 St & 1 Ave,6004.07,40.745446,-73.990591,40.738177,-73.977387,member


In [3]:
# overview data

# column names for reference
print("Column names:")
print(df.columns.tolist())

# review. correct data types next if necessary.
print("\nData types:")
print(df.dtypes)

# determine if these are necessary to keep for analytics-- volume, granularity
print("\nMissing values:")
print(df.isnull().sum())

Column names:
['rideable_type', 'started_at', 'ended_at', 'start_station_name', 'start_station_id', 'end_station_name', 'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng', 'member_casual']

Data types:
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object

Missing values:
rideable_type            0
started_at               0
ended_at                 0
start_station_name    2506
start_station_id      2506
end_station_name      6650
end_station_id        6974
start_lat                0
start_lng                0
end_lat                836
end_lng                836
member_casual            0
dtype: int64


In [4]:
# change data types for dates and objects
# use coerce to avoid raising failures 
df['started_at'] = pd.to_datetime(df['started_at'], errors='coerce') 
df['ended_at'] = pd.to_datetime(df['ended_at'], errors='coerce') 

# change objects to string
df[['rideable_type', 'start_station_name', 'end_station_name', 'member_casual']] = df[['rideable_type', 'start_station_name', 'end_station_name', 'member_casual']].astype('string')

In [5]:
# count missing values & percentages & flag with text
missing_data = pd.DataFrame({
    'Missing_Count': df.isnull().sum(),
    'Missing_Percentage': (df.isnull().sum() / len(df)) * 100
})
missing_data = missing_data[missing_data['Missing_Count'] > 0]  # only show columns with missing counts
print(missing_data)

max_missing_pct = missing_data['Missing_Percentage'].max() # percentages not counts
print(f"Maximum missing data percentage: {max_missing_pct:.3f}%")

if max_missing_pct < 1.0:
    print("✓ Excellent data quality: All missing data is below 1% threshold.")
    print("  Missing data should not affect analysis results.")

                    Missing_Count  Missing_Percentage
start_station_name           2506            0.077897
start_station_id             2506            0.077897
end_station_name             6650            0.206710
end_station_id               6974            0.216782
end_lat                       836            0.025986
end_lng                       836            0.025986
Maximum missing data percentage: 0.217%
✓ Excellent data quality: All missing data is below 1% threshold.
  Missing data should not affect analysis results.


In [6]:
# drop rows with nulls & confirm counts
df_clean = df.dropna().copy()
df_clean.reset_index(drop=False, inplace=True)
df_clean.isnull().sum()

ride_id               0
rideable_type         0
started_at            0
ended_at              0
start_station_name    0
start_station_id      0
end_station_name      0
end_station_id        0
start_lat             0
start_lng             0
end_lat               0
end_lng               0
member_casual         0
dtype: int64

In [7]:
# export cleaned data for future use
# using parquet to maintain data types 
df_clean.to_parquet('../data/processed/cleaned_citibike_data.parquet')
