In [1]:
from pathlib import Path

# Base folder: where this script is located
BASE_DIR = Path.cwd()

In [None]:
import os
import pandas as pd

def merge_csv_files(input_dir, output_file):
    all_csv_files = [
        os.path.join(input_dir, f)
        for f in os.listdir(input_dir)
        if f.lower().endswith('.csv')
    ]

    print(f"Found {len(all_csv_files)} CSV files in {input_dir}")

    # Read and concatenate all DataFrames
    df_list = []
    for file in all_csv_files:
        print(f"Reading: {file}")
        try:
            df = pd.read_csv(file)
            df_list.append(df)
        except Exception as e:
            print(f"Error reading {file}: {e}")

    if df_list:
        merged_df = pd.concat(df_list, ignore_index=True)
        merged_df.to_csv(output_file, index=False)
        print(f"\n✅ Merged CSV saved to: {output_file}")
    else:
        print("⚠️ No valid CSV files to merge.")

# Example usage
input_folder = "clean_data_bike/2013"
output_csv = "clean_data_bike/2013_merged.csv"

merge_csv_files(input_folder, output_csv)


In [9]:

import re
# Define possible formats
formats = [
    "%Y-%m-%d %H:%M:%S",
    "%Y-%m-%d %H:%M",
    "%m/%d/%Y %I:%M:%S %p"
]

In [12]:
import pandas as pd
import numpy as np

# Load your DataFrame (example)
# df = pd.read_csv("your_file.csv")

# Helper function: convert only if not already datetime
def safe_convert_to_datetime(series):
    if not pd.api.types.is_datetime64_any_dtype(series):
        # Clean multiple spaces, strip whitespace
        series = series.astype(str).str.replace(r"\s+", " ", regex=True).str.strip()
        return pd.to_datetime(series, errors='coerce', infer_datetime_format=True)
    return series

# this one


In [14]:
import os
import pandas as pd

def merge_csv_files(input_dir, output_file):
    all_csv_files = [
        os.path.join(input_dir, f)
        for f in os.listdir(input_dir)
        if f.lower().endswith('.csv')
    ]

    print(f"\n📁 Found {len(all_csv_files)} CSV files in {input_dir}")

    df_list = []
    for file in all_csv_files:
        print(f"  📖 Reading: {file}")
        try:
            df = pd.read_csv(file)
            # Apply safe conversion
            df['starttime'] = safe_convert_to_datetime(df['starttime'])
            df['stoptime'] = safe_convert_to_datetime(df['stoptime'])
            df_list.append(df)
        except Exception as e:
            print(f"  ❌ Error reading {file}: {e}")

    if df_list:
        
        merged_df = pd.concat(df_list, ignore_index=True)
        #cleaning
        print(f"Original rows: {len(merged_df)}")
        # Remove duplicates (based on all columns)
        merged_df = merged_df.drop_duplicates()
        merged_df = merged_df.sort_values(by='starttime')

    	#print(f"Rows after removing duplicates: {len(merged_df)}")
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        merged_df.to_csv(output_file, index=False)
        print(f"  ✅ Merged CSV saved to: {output_file}")
    else:
        print("  ⚠️ No valid CSV files to merge.")

# === CONFIG ===
input_root = r"extracted_tripdata"              # Parent folder with year subfolders
output_root = r"merged_tripdata3"                # Where to save merged CSVs
os.makedirs(output_root, exist_ok=True)

# Loop over all subfolders (years)
for folder_name in os.listdir(input_root):
    folder_path = os.path.join(input_root, folder_name)
    
    if os.path.isdir(folder_path) and folder_name.isdigit():
        year = folder_name
        output_file = os.path.join(output_root, f"{year}_merged.csv")
        merge_csv_files(folder_path, output_file)



📁 Found 9 CSV files in extracted_tripdata\2013
  📖 Reading: extracted_tripdata\2013\201306-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2013\201306-citibike-tripdata_1.csv
  📖 Reading: extracted_tripdata\2013\201307-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2013\201307-citibike-tripdata_1.csv
  📖 Reading: extracted_tripdata\2013\201308-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2013\201308-citibike-tripdata_1.csv
  📖 Reading: extracted_tripdata\2013\201308-citibike-tripdata_2.csv
  📖 Reading: extracted_tripdata\2013\201309-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2013\201309-citibike-tripdata_1.csv
Original rows: 6880513
  ✅ Merged CSV saved to: merged_tripdata3\2013_merged.csv

📁 Found 0 CSV files in extracted_tripdata\2014
  ⚠️ No valid CSV files to merge.

📁 Found 0 CSV files in extracted_tripdata\2015
  ⚠️ No valid CSV files to merge.

📁 Found 0 CSV files in extracted_tripdata\2016
  ⚠️ No valid CSV files to merge.

📁 Found 0 CSV files 

In [8]:
# for year 2022 -2025 fixed
import os
import pandas as pd

def merge_csv_files2022(input_dir, output_file, year):
    all_csv_files = [
        os.path.join(input_dir, f)
        for f in os.listdir(input_dir)
        if f.lower().endswith('.csv')
    ]

    print(f"\n📁 Found {len(all_csv_files)} CSV files in {input_dir}")

    df_list = []
    for file in all_csv_files:
        print(f"  📖 Reading: {file}")
        try:
            df = pd.read_csv(file)

            if int(year) >= 2022:
                # Rename new format to match old format
                df = df.rename(columns={
                    'ride_id': 'trip_id',
                    'rideable_type': 'bike_type',
                    'started_at': 'starttime',
                    'ended_at': 'stoptime',
                    'start_station_name': 'start station name',
                    'start_station_id': 'start station id',
                    'end_station_name': 'end station name',
                    'end_station_id': 'end station id',
                    'start_lat': 'start station latitude',
                    'start_lng': 'start station longitude',
                    'end_lat': 'end station latitude',
                    'end_lng': 'end station longitude',
                    'member_casual': 'usertype',
                })

                # Parse datetimes
                # Apply to starttime
                #df['starttime'] = df['starttime'].apply(parse_datetime)
            # (Optional) Drop helper columns if you don't need them
                #df['stoptime'] = df['stoptime'].apply(parse_datetime)
                df['starttime'] = pd.to_datetime(df['starttime'], errors='coerce')
                df['stoptime'] = pd.to_datetime(df['stoptime'], errors='coerce')

                # Calculate tripduration in seconds
                df['tripduration'] = (df['stoptime'] - df['starttime']).dt.total_seconds()

                # Add missing old columns with NaN
                for col in ['birth year', 'gender', 'bikeid']:
                    if col not in df.columns:
                        df[col] = pd.NA

            df_list.append(df)
        except Exception as e:
            print(f"  ❌ Error reading {file}: {e}")

    if df_list:
        merged_df = pd.concat(df_list, ignore_index=True)
        print(f"  🔍 Original rows: {len(merged_df)}")

        # Remove duplicates
        merged_df = merged_df.drop_duplicates()

        # Sort by start time if available
        if 'starttime' in merged_df.columns:
            merged_df = merged_df.sort_values(by='starttime')

        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        merged_df.to_csv(output_file, index=False)
        print(f"  ✅ Merged CSV saved to: {output_file}")
    else:
        print("  ⚠️ No valid CSV files to merge.")

# === CONFIG ===
input_root = r"extracted_tripdata"              # Folder with year subfolders
output_root = r"merged_tripdata2"               # Where to save merged files
os.makedirs(output_root, exist_ok=True)

# Only merge 2022–2025
years_to_process = {'2022', '2023', '2024', '2025'}

for folder_name in os.listdir(input_root):
    folder_path = os.path.join(input_root, folder_name)
    
    if os.path.isdir(folder_path) and folder_name.isdigit() and folder_name in years_to_process:
        year = folder_name
        output_file = os.path.join(output_root, f"{year}_merged.csv")
        merge_csv_files2022(folder_path, output_file, year)



📁 Found 12 CSV files in extracted_tripdata\2022
  📖 Reading: extracted_tripdata\2022\JC-202201-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2022\JC-202202-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2022\JC-202203-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2022\JC-202204-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2022\JC-202205-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2022\JC-202206-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2022\JC-202207-citbike-tripdata.csv
  📖 Reading: extracted_tripdata\2022\JC-202208-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2022\JC-202209-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2022\JC-202210-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2022\JC-202211-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2022\JC-202212-citibike-tripdata.csv
  🔍 Original rows: 895485
  ✅ Merged CSV saved to: merged_tripdata2\2022_merged.csv

📁 Found 12 CSV files in extracted_tripd

  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202401-citibike-tripdata_1.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202401-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202401-citibike-tripdata_2.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202402-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202402-citibike-tripdata_1.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202402-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202402-citibike-tripdata_2.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202402-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202402-citibike-tripdata_3.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202403-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202403-citibike-tripdata_1.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202403-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202403-citibike-tripdata_2.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202403-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202403-citibike-tripdata_3.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202404-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202404-citibike-tripdata_1.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202404-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202404-citibike-tripdata_2.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202404-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202404-citibike-tripdata_3.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202404-citibike-tripdata_4.csv
  ❌ Error reading extracted_tripdata\2024\202404-citibike-tripdata_4.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202405-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202405-citibike-tripdata_1.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202405-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202405-citibike-tripdata_2.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202405-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202405-citibike-tripdata_3.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202405-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202405-citibike-tripdata_4.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202405-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202405-citibike-tripdata_5.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202406-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202406-citibike-tripdata_1.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202406-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202406-citibike-tripdata_2.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202406-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202406-citibike-tripdata_3.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202406-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202406-citibike-tripdata_4.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202406-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202406-citibike-tripdata_5.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202407-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202407-citibike-tripdata_1.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202407-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202407-citibike-tripdata_2.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202407-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202407-citibike-tripdata_3.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202407-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202407-citibike-tripdata_4.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202407-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202407-citibike-tripdata_5.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202408-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202408-citibike-tripdata_1.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202408-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202408-citibike-tripdata_2.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202408-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202408-citibike-tripdata_3.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202408-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202408-citibike-tripdata_4.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202408-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202408-citibike-tripdata_5.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202409-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202409-citibike-tripdata_1.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202409-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202409-citibike-tripdata_2.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202409-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202409-citibike-tripdata_3.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202409-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202409-citibike-tripdata_4.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202409-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202409-citibike-tripdata_5.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202410-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202410-citibike-tripdata_1.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202410-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202410-citibike-tripdata_2.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202410-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202410-citibike-tripdata_3.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202410-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202410-citibike-tripdata_4.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202410-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202410-citibike-tripdata_5.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202410-citibike-tripdata_6.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202410-citibike-tripdata_6.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202411-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202411-citibike-tripdata_1.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202411-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202411-citibike-tripdata_2.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202411-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202411-citibike-tripdata_3.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202411-citibike-tripdata_4.csv
  ❌ Error reading extracted_tripdata\2024\202411-citibike-tripdata_4.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202412-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202412-citibike-tripdata_1.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202412-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2024\202412-citibike-tripdata_2.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\202412-citibike-tripdata_3.csv
  ❌ Error reading extracted_tripdata\2024\202412-citibike-tripdata_3.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\JC-202401-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202402-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202403-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202404-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202405-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202406-citibike-tripdata.csv
  ❌ Error reading extracted_tripdata\2024\JC-202406-citibike-tripdata.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2024\JC-202407-citibike-tripdata.csv
  ❌ Error reading extracted_tripdata\2024\JC-202407-citibike-tripdata.csv: c

  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202501-citibike-tripdata_1.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202501-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202501-citibike-tripdata_2.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202501-citibike-tripdata_3.csv
  ❌ Error reading extracted_tripdata\2025\202501-citibike-tripdata_3.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202502-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202502-citibike-tripdata_1.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202502-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202502-citibike-tripdata_2.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202502-citibike-tripdata_3.csv
  ❌ Error reading extracted_tripdata\2025\202502-citibike-tripdata_3.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202503-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202503-citibike-tripdata_1.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202503-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202503-citibike-tripdata_2.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202503-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202503-citibike-tripdata_3.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202503-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202503-citibike-tripdata_4.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202504-citibike-tripdata_1.csv
  ❌ Error reading extracted_tripdata\2025\202504-citibike-tripdata_1.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202504-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202504-citibike-tripdata_2.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202504-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202504-citibike-tripdata_3.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202504-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202504-citibike-tripdata_4.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202505-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202505-citibike-tripdata_1.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202505-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202505-citibike-tripdata_2.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202505-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202505-citibike-tripdata_3.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202505-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202505-citibike-tripdata_4.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202505-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202505-citibike-tripdata_5.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202506-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202506-citibike-tripdata_1.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202506-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202506-citibike-tripdata_2.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202506-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202506-citibike-tripdata_3.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202506-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202506-citibike-tripdata_4.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202506-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202506-citibike-tripdata_5.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202507-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202507-citibike-tripdata_1.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202507-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202507-citibike-tripdata_2.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202507-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202507-citibike-tripdata_3.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202507-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202507-citibike-tripdata_4.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202507-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202507-citibike-tripdata_5.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202508-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202508-citibike-tripdata_1.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202508-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  ❌ Error reading extracted_tripdata\2025\202508-citibike-tripdata_2.csv: cannot safely cast non-equivalent object to int64
  📖 Reading: extracted_tripdata\2025\202508-citibike-tripdata_3.csv


KeyboardInterrupt: 

In [20]:
import os
import pandas as pd

def merge_csv_files(input_dir, output_file, year):
    all_csv_files = [
        os.path.join(input_dir, f)
        for f in os.listdir(input_dir)
        if f.lower().endswith('.csv')
    ]

    print(f"\n📁 Found {len(all_csv_files)} CSV files in {input_dir}")

    df_list = []
    for file in all_csv_files:
        print(f"  📖 Reading: {file}")
        try:
            df = pd.read_csv(file)
            #df.head(5)

            year_int = int(year)

            if year_int >= 2024:
                # Columns for 2024–2025 format
                df = df.rename(columns={
                    'ride_id': 'trip_id',
                    'rideable_type': 'bike_type',
                    'started_at': 'starttime',
                    'ended_at': 'stoptime',
                    'start_station_name': 'start station name',
                    'start_station_id': 'start station id',
                    'end_station_name': 'end station name',
                    'end_station_id': 'end station id',
                    'start_lat': 'start station latitude',
                    'start_lng': 'start station longitude',
                    'end_lat': 'end station latitude',
                    'end_lng': 'end station longitude',
                    'member_casual': 'usertype',
                })

            elif year_int >= 2022:
                # 2022–2023 format
                df = df.rename(columns={
                    'ride_id': 'trip_id',
                    'rideable_type': 'bike_type',
                    'started_at': 'starttime',
                    'ended_at': 'stoptime',
                    'start_station_name': 'start station name',
                    'start_station_id': 'start station id',
                    'end_station_name': 'end station name',
                    'end_station_id': 'end station id',
                    'start_lat': 'start station latitude',
                    'start_lng': 'start station longitude',
                    'end_lat': 'end station latitude',
                    'end_lng': 'end station longitude',
                    'member_casual': 'usertype',
                })

            # Parse datetime and calculate duration
            df['starttime'] = pd.to_datetime(df['starttime'], errors='coerce')
            df['stoptime'] = pd.to_datetime(df['stoptime'], errors='coerce')
            #df['tripduration'] = (df['stoptime'] - df['starttime']).dt.total_seconds().astype('Int64')
            # Compute tripduration in seconds, safely
            df['tripduration'] = (df['stoptime'] - df['starttime']).dt.total_seconds()



            # Add missing legacy columns
            for col in ['birth year', 'gender', 'bikeid']:
                if col not in df.columns:
                    df[col] = pd.NA

            df_list.append(df)

        except Exception as e:
            print(f"  ❌ Error reading {file}: {e}")

    if df_list:
        merged_df = pd.concat(df_list, ignore_index=True)
        print(f"  🔍 Original rows: {len(merged_df)}")

        # Drop duplicates
        merged_df = merged_df.drop_duplicates()

        # Sort by starttime
        if 'starttime' in merged_df.columns:
            merged_df = merged_df.sort_values(by='starttime')

        # Save
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        merged_df.to_csv(output_file, index=False)
        print(f"  ✅ Merged CSV saved to: {output_file}")
    else:
        print("  ⚠️ No valid CSV files to merge.")

# === CONFIG ===
input_root = r"extracted_tripdata"
output_root = r"merged_tripdata2"
os.makedirs(output_root, exist_ok=True)

# Only merge 2022–2025
years_to_process = {'2024', '2025'}

for folder_name in os.listdir(input_root):
    folder_path = os.path.join(input_root, folder_name)
    
    if os.path.isdir(folder_path) and folder_name.isdigit() and folder_name in years_to_process:
        year = folder_name
        output_file = os.path.join(output_root, f"{year}_merged.csv")
        merge_csv_files(folder_path, output_file, year)



📁 Found 62 CSV files in extracted_tripdata\2024
  📖 Reading: extracted_tripdata\2024\202401-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202401-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202402-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202402-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202402-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202403-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202403-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202403-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202404-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202404-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202404-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202404-citibike-tripdata_4.csv
  📖 Reading: extracted_tripdata\2024\202405-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202405-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202405-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202405-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202405-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202406-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202406-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202406-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202406-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202406-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202407-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202407-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202407-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202407-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202407-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202408-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202408-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202408-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202408-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202408-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202409-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202409-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202409-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202409-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202409-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202410-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202410-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202410-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202410-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202410-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202410-citibike-tripdata_6.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202411-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202411-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202411-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202411-citibike-tripdata_4.csv
  📖 Reading: extracted_tripdata\2024\202412-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202412-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2024\202412-citibike-tripdata_3.csv
  📖 Reading: extracted_tripdata\2024\JC-202401-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202402-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202403-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202404-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202405-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202406-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202407-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202408-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202409-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202410-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202411-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2024\JC-202412-citibike-tripdata.csv
  🔍 Original rows: 45355660
  ✅ Merged CSV saved to: merged_tripdata2\2024_merged.csv

📁 Found 50 CSV fi

  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202501-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202501-citibike-tripdata_3.csv
  📖 Reading: extracted_tripdata\2025\202502-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202502-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202502-citibike-tripdata_3.csv
  📖 Reading: extracted_tripdata\2025\202503-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202503-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202503-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202503-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202504-citibike-tripdata_1.csv
  📖 Reading: extracted_tripdata\2025\202504-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202504-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202504-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202505-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202505-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202505-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202505-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202505-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202506-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202506-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202506-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202506-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202506-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202507-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202507-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202507-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202507-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202507-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202508-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202508-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202508-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202508-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202508-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202508-citibike-tripdata_6.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202509-citibike-tripdata_1.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202509-citibike-tripdata_2.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202509-citibike-tripdata_3.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202509-citibike-tripdata_4.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202509-citibike-tripdata_5.csv


  df = pd.read_csv(file)


  📖 Reading: extracted_tripdata\2025\202509-citibike-tripdata_6.csv
  📖 Reading: extracted_tripdata\2025\JC-202501-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2025\JC-202502-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2025\JC-202503-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2025\JC-202504-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2025\JC-202505-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2025\JC-202506-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2025\JC-202507-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2025\JC-202508-citibike-tripdata.csv
  📖 Reading: extracted_tripdata\2025\JC-202509-citibike-tripdata.csv
  🔍 Original rows: 36303831
  ✅ Merged CSV saved to: merged_tripdata2\2025_merged.csv


In [15]:
filepath = r"C:\Users\zhao\Documents\workspace\task\extracted_tripdata\2024\202401-citibike-tripdata_1.csv"
df = pd.read_csv(filepath)

  df = pd.read_csv(filepath)


In [16]:
df

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,8E865410DBDE0CA9,electric_bike,2024-01-01 13:00:04.563,2024-01-01 13:04:04.652,3 St & 3 Ave,4028.03,Carroll St & Smith St,4225.14,40.675070,-73.987752,40.680611,-73.994758,casual
1,0403D0B3FC9CA77D,electric_bike,2024-01-08 19:36:43.520,2024-01-08 19:53:16.266,Franklin Ave & St Marks Ave,4107.05,Bedford Ave & Bergen St,4066.15,40.675832,-73.956168,40.676368,-73.952918,casual
2,F6DE7BB42FF550BE,electric_bike,2024-01-12 15:00:41.580,2024-01-12 15:36:29.622,W 67 St & Broadway,7116.04,Central Park W & W 103 St,7577.27,40.774925,-73.982666,40.795590,-73.961884,casual
3,84A995BFD98030D4,classic_bike,2024-01-12 16:52:19.025,2024-01-12 17:17:29.773,Central Park West & W 68 St,7079.06,E 5 St & Ave C,5545.04,40.773407,-73.977825,40.722992,-73.979955,member
4,7BBEAD4F2B535813,electric_bike,2024-01-05 19:50:19.202,2024-01-05 20:34:42.517,W 67 St & Broadway,7116.04,Ave A & E 14 St,5779.11,40.774925,-73.982666,40.730311,-73.980472,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,53C5A7DDC930FE68,electric_bike,2024-01-19 08:11:31.812,2024-01-19 08:18:23.659,Devoe St & Lorimer St,5259.06,Meserole Ave & Manhattan Ave,5666.04,40.713352,-73.949103,40.727086,-73.952991,member
999996,80C14103C204CD47,electric_bike,2024-01-23 08:50:57.782,2024-01-23 09:12:37.637,Fulton St & Waverly Ave,4345.11,Broad St & Bridge St,4962.08,40.683239,-73.965996,40.703652,-74.011678,member
999997,E1EE9037A8CB6101,electric_bike,2024-01-21 14:16:24.984,2024-01-21 14:22:14.498,Driggs Ave & N 9 St,5411.08,Meserole Ave & Manhattan Ave,5666.04,40.718170,-73.955201,40.727086,-73.952991,member
999998,DA5E681B117190A0,electric_bike,2024-01-23 09:02:49.848,2024-01-23 09:21:19.513,Driggs Ave & N 9 St,5411.08,Pearl St & Hanover Square,4993.02,40.718170,-73.955201,40.704718,-74.009260,member


In [17]:
df = df.rename(columns={
                    'ride_id': 'trip_id',
                    'rideable_type': 'bike_type',
                    'started_at': 'starttime',
                    'ended_at': 'stoptime',
                    'start_station_name': 'start station name',
                    'start_station_id': 'start station id',
                    'end_station_name': 'end station name',
                    'end_station_id': 'end station id',
                    'start_lat': 'start station latitude',
                    'start_lng': 'start station longitude',
                    'end_lat': 'end station latitude',
                    'end_lng': 'end station longitude',
                    'member_casual': 'usertype',
                })

            # Parse datetime and calculate duration
df['starttime'] = pd.to_datetime(df['starttime'], errors='coerce')
df['stoptime'] = pd.to_datetime(df['stoptime'], errors='coerce')

In [18]:
df['tripduration'] = (df['stoptime'] - df['starttime']).dt.total_seconds()


In [19]:
df['tripduration'] 

0          240.089
1          992.746
2         2148.042
3         1510.748
4         2663.315
            ...   
999995     411.847
999996    1299.855
999997     349.514
999998    1109.665
999999     210.209
Name: tripduration, Length: 1000000, dtype: float64