In [7]:
# Helper function: convert only if not already datetime
def safe_convert_to_datetime(series):
    if not pd.api.types.is_datetime64_any_dtype(series):
        # Clean multiple spaces, strip whitespace
        series = series.astype(str).str.replace(r"\s+", " ", regex=True).str.strip()
        return pd.to_datetime(series, errors='coerce', infer_datetime_format=True)
    return series

# === Step 2: Normalize different schemas to common format ===
def normalize_columns(df):
    col_map = {
        # Time columns
        'starttime': 'starttime',
        'Start Time': 'starttime',
        'started_at': 'starttime',

        'stoptime': 'stoptime',
        'Stop Time': 'stoptime',
        'ended_at': 'stoptime',

        # Optional: unify naming for trip duration or other fields
        'tripduration': 'tripduration',
        'Trip Duration': 'tripduration',

        # User type
        'usertype': 'usertype',
        'User Type': 'usertype',
        'member_casual': 'usertype',

        # Start station ID
        'start station id': 'start_station_id',
        'Start Station ID': 'start_station_id',
        'start_station_id': 'start_station_id',

        # End station ID
        'end station id': 'end_station_id',
        'End Station ID': 'end_station_id',
        'end_station_id': 'end_station_id',

        # Optional: bike id
        'bikeid': 'bike_id',
        'Bike ID': 'bike_id',

        # Ride ID (Divvy)
        'ride_id': 'ride_id',
        'rideable_type': 'rideable_type',
        
        'start station name': 'start station name' ,
        'start_station_name': 'start station name' ,
        'end station name':  'end station name',
        'end_station_name':  'end station name',
        
        'end station latitude': 'end station latitude',
        'end_lat':'end station latitude',
        'end station longitude': 'end station longitude',
        'end_lng':'end station longitude',
        
        
        
        'bike_type': 'bike_type',
        'rideable_type': 'bike_type',
        
        'start station latitude': 'start station latitude',
        'start_lat':'start station latitude',
        'start station longitude': 'start station longitude',
        'start_lng':'start station longitude'

        
    }

    # Rename columns based on known mappings
    df = df.rename(columns={k: v for k, v in col_map.items() if k in df.columns})

    return df


SyntaxError: invalid syntax (683482057.py, line 63)

In [5]:
import os
import pandas as pd

def merge_csv_files(input_dir, output_file):
    all_csv_files = [
        os.path.join(input_dir, f)
        for f in os.listdir(input_dir)
        if f.lower().endswith('.csv')
    ]

    print(f"\n📁 Found {len(all_csv_files)} CSV files in {input_dir}")

    df_list = []
    for file in all_csv_files:
        print(f"  📖 Reading: {file}")
        try:
            df = pd.read_csv(file)
            # Normalize columns
            df = normalize_columns(df)
            # Apply safe conversion
            df['starttime'] = safe_convert_to_datetime(df['starttime'])
            df['stoptime'] = safe_convert_to_datetime(df['stoptime'])
            df_list.append(df)
        except Exception as e:
            print(f"  ❌ Error reading {file}: {e}")

    if df_list:
        
        merged_df = pd.concat(df_list, ignore_index=True)
        #cleaning
        print(f"Original rows: {len(merged_df)}")
        # Remove duplicates (based on all columns)
        merged_df = merged_df.drop_duplicates()
        merged_df = merged_df.sort_values(by='starttime')

    	#print(f"Rows after removing duplicates: {len(merged_df)}")
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        merged_df.to_csv(output_file, index=False)
        print(f"  ✅ Merged CSV saved to: {output_file}")
    else:
        print("  ⚠️ No valid CSV files to merge.")

# === CONFIG ===
input_root = r"extracted_tripdata"              # Parent folder with year subfolders
output_root = r"merged_tripdata3"                # Where to save merged CSVs
os.makedirs(output_root, exist_ok=True)
selected_years = ['2022','2023','2024','2025']

for year in selected_years:
# Loop over all subfolders (years)
#for folder_name in os.listdir(input_root):
    folder_path = os.path.join(input_root, year)
    
    if os.path.isdir(folder_path) and year.isdigit():
        year = year
        output_file = os.path.join(output_root, f"{year}_merged.csv")
        merge_csv_files(folder_path, output_file)



📁 Found 12 CSV files in extracted_tripdata\2022
  📖 Reading: extracted_tripdata\2022\JC-202201-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2022\JC-202202-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2022\JC-202203-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2022\JC-202204-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2022\JC-202205-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2022\JC-202206-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2022\JC-202207-citbike-tripdata.csv
  📖 Reading: extracted_tripdata\2022\JC-202208-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2022\JC-202209-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2022\JC-202210-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2022\JC-202211-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2022\JC-202212-citibike-tripdata.csv
Original rows: 895485
  ✅ Merged CSV saved to: merged_tripdata3\2022_merged.csv

📁 Found 12 CSV files in extracted_tripdata\

  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202401-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202402-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202402-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202402-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202403-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202403-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202403-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202404-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202404-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202404-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202404-citibike-tripdata_4.csv
  📖 Reading: extracted_tripdata\2024\202405-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202405-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202405-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202405-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202405-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202406-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202406-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202406-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202406-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202406-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202407-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202407-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202407-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202407-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202407-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202408-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202408-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202408-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202408-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202408-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202409-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202409-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202409-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202409-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202409-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202410-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202410-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202410-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202410-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202410-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202410-citibike-tripdata_6.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202411-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202411-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202411-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202411-citibike-tripdata_4.csv
  📖 Reading: extracted_tripdata\2024\202412-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202412-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202412-citibike-tripdata_3.csv
  📖 Reading: extracted_tripdata\2024\JC-202401-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202402-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202403-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202404-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202405-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202406-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202407-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202408-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202409-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202410-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202411-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202412-citibike-tripdata.csv
Original rows: 45355660
  ✅ Merged CSV saved to: merged_tripdata3\2024_merged.csv

📁 Found 50 CSV files 

  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202501-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202501-citibike-tripdata_3.csv
  📖 Reading: extracted_tripdata\2025\202502-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202502-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202502-citibike-tripdata_3.csv
  📖 Reading: extracted_tripdata\2025\202503-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202503-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202503-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202503-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202504-citibike-tripdata_1.csv
  📖 Reading: extracted_tripdata\2025\202504-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202504-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202504-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202505-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202505-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202505-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202505-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202505-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202506-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202506-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202506-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202506-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202506-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202507-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202507-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202507-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202507-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202507-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202508-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202508-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202508-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202508-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202508-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202508-citibike-tripdata_6.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202509-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202509-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202509-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202509-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202509-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202509-citibike-tripdata_6.csv
  📖 Reading: extracted_tripdata\2025\JC-202501-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2025\JC-202502-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2025\JC-202503-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2025\JC-202504-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2025\JC-202505-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2025\JC-202506-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2025\JC-202507-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2025\JC-202508-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2025\JC-202509-citibike-tripdata.csv
Original rows: 36303831
  ✅ Merged CSV saved to: merged_tripdata3\2025_merged.csv
