In [23]:
import pandas as pd
import os

def merge_usage_with_matched_crashes_2013():
    # File paths
    usage_file = r"results\dataAnalysis\2013\bike_usage_per_month.csv"
    matched_crash_file = r"results\matched_trips_2013.csv"
    output_file = r"results\dataAnalysis\2013\monthly_usage_with_crashes.csv"

    # Step 1: Load matched crash trips
    try:
        crashes = pd.read_csv(matched_crash_file)
        if 'starttime' not in crashes.columns:
            print("❌ 'starttime' column missing in matched_trips_2013.csv")
            return

        crashes['starttime'] = pd.to_datetime(crashes['starttime'], errors='coerce')
        crashes = crashes.dropna(subset=['starttime'])

        crashes['year'] = crashes['starttime'].dt.year
        crashes['month'] = crashes['starttime'].dt.month

        # Step 2: Count crashes per month
        crash_counts = crashes.groupby(['year', 'month']).size().reset_index(name='crash_count')
        
    except Exception as e:
        print(f"❌ Failed to process matched_trips_2013.csv: {e}")
        return

    # Step 3: Load bike usage
    try:
        usage = pd.read_csv(usage_file)
        print(usage)
        usage.columns = ['time', 'trip_count']
        usage['time'] = pd.to_datetime(usage['time'], errors='coerce')
        usage['year'] = usage['time'].dt.year
        usage['month'] = usage['time'].dt.month
        usage = usage.drop(columns=['time'])
        #usage['trip_count'] = usage['trip_count']

    except Exception as e:
        print(f"❌ Failed to load bike usage data: {e}")
        return
    print(crashes)
    # Step 4: Merge usage and crash counts
    merged = pd.merge(usage, crash_counts, on=['year', 'month'], how='left')
    merged['crash_count'] = merged['crash_count'].fillna(0).astype(int)

    # After merging
    merged['trip_count'] = pd.to_numeric(merged['trip_count'], errors='coerce')
    merged['crash_count'] = pd.to_numeric(merged['crash_count'], errors='coerce')

    # Fill any missing values just in case
    merged['trip_count'] = merged['trip_count'].fillna(0).astype(int)
    merged['crash_count'] = merged['crash_count'].fillna(0).astype(int)

    # Now safe to calculate percentage
    merged['percentage'] = (merged['crash_count'] / merged['trip_count'].replace(0, pd.NA)) * 100
    merged['percentage'] = merged['percentage'].fillna(0)#.round(2)

    # Step 6: Reorder columns
    merged = merged[['year', 'month', 'trip_count', 'crash_count', 'percentage']]
    merged = merged.sort_values(by=['year', 'month'])

    # Step 7: Save
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    merged.to_csv(output_file, index=False)
    print(f"✅ Saved: {output_file}")

# Run it
merge_usage_with_matched_crashes_2013()


  Unnamed: 0  trip_count
0    2013-06      577702
1    2013-07      843417
2    2013-08     1001958
3    2013-09     1034359
4    2013-10     1037712
5    2013-11      675774
6    2013-12      443966
     tripduration           starttime             stoptime  start_station_id  \
0            1354 2013-06-03 17:08:18  2013-06-03 17:30:52               311   
1            1525 2013-06-05 17:34:04  2013-06-05 17:59:29               352   
2             627 2013-06-05 18:00:34  2013-06-05 18:11:01               352   
3            1154 2013-06-06 12:19:17  2013-06-06 12:38:31               152   
4             866 2013-06-06 18:33:21  2013-06-06 18:47:47               375   
..            ...                 ...                  ...               ...   
197           396 2013-12-07 17:00:26  2013-12-07 17:07:02               410   
198           761 2013-12-07 20:16:33  2013-12-07 20:29:14               507   
199           396 2013-12-07 20:30:42  2013-12-07 20:37:18               290   


# Final

In [25]:
import pandas as pd
import os

def merge_usage_with_matched_crashes(year):
    usage_file = f"results/dataAnalysis/{year}/bike_usage_per_month.csv"
    matched_crash_file = f"results/matched_trips_{year}.csv"
    output_file = f"results/dataAnalysis/{year}/monthly_usage_with_crashes.csv"

    # Step 1: Load matched crash trips
    try:
        crashes = pd.read_csv(matched_crash_file)
        if 'starttime' not in crashes.columns:
            print(f"❌ 'starttime' missing in {matched_crash_file}")
            return

        crashes['starttime'] = pd.to_datetime(crashes['starttime'], errors='coerce')
        crashes = crashes.dropna(subset=['starttime'])
        crashes['year'] = crashes['starttime'].dt.year
        crashes['month'] = crashes['starttime'].dt.month
        crash_counts = crashes.groupby(['year', 'month']).size().reset_index(name='crash_count')

    except Exception as e:
        print(f"❌ Failed to process crash file for {year}: {e}")
        return

    # Step 2: Load bike usage
    try:
        usage = pd.read_csv(usage_file, header=None if year == '2013' else 'infer')
        
        # Handle missing headers (e.g., for 2013 file)
        if usage.shape[1] == 2:
            usage.columns = ['time', 'trip_count']
        usage['time'] = pd.to_datetime(usage['time'], errors='coerce')
        usage['year'] = usage['time'].dt.year
        usage['month'] = usage['time'].dt.month
        usage = usage.drop(columns=['time'])
        usage['trip_count'] = pd.to_numeric(usage['trip_count'], errors='coerce').fillna(0).astype(int)

    except Exception as e:
        print(f"❌ Failed to load usage data for {year}: {e}")
        return

    # Step 3: Merge
    merged = pd.merge(usage, crash_counts, on=['year', 'month'], how='left')
    merged['crash_count'] = pd.to_numeric(merged['crash_count'], errors='coerce').fillna(0).astype(int)
    merged = merged[merged['trip_count'] > 0]  # Remove any bad rows with 0 trips

    # Step 4: Calculate percentage
    merged['percentage'] = (merged['crash_count'] / merged['trip_count'].replace(0, pd.NA)) * 100
    merged['percentage'] = merged['percentage'].fillna(0)#.round(2)

    # Step 5: Order and Save
    merged = merged[['year', 'month', 'trip_count', 'crash_count', 'percentage']]
    merged = merged.sort_values(by=['year', 'month'])

    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    merged.to_csv(output_file, index=False)
    print(f"✅ Saved: {output_file}")


# === Run for all available years ===
def run_for_all_years():
    base_dir = "results/dataAnalysis"
    years = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d)) and d.isdigit()]
    years.sort()  # Optional: sort years chronologically

    for year in years:
        print(f"\n📅 Processing year: {year}")
        merge_usage_with_matched_crashes(year)


# Run it
run_for_all_years()



📅 Processing year: 2013
✅ Saved: results/dataAnalysis/2013/monthly_usage_with_crashes.csv

📅 Processing year: 2014
✅ Saved: results/dataAnalysis/2014/monthly_usage_with_crashes.csv

📅 Processing year: 2015
✅ Saved: results/dataAnalysis/2015/monthly_usage_with_crashes.csv

📅 Processing year: 2016
✅ Saved: results/dataAnalysis/2016/monthly_usage_with_crashes.csv

📅 Processing year: 2017
✅ Saved: results/dataAnalysis/2017/monthly_usage_with_crashes.csv

📅 Processing year: 2018
✅ Saved: results/dataAnalysis/2018/monthly_usage_with_crashes.csv

📅 Processing year: 2019
✅ Saved: results/dataAnalysis/2019/monthly_usage_with_crashes.csv

📅 Processing year: 2020
✅ Saved: results/dataAnalysis/2020/monthly_usage_with_crashes.csv

📅 Processing year: 2021
✅ Saved: results/dataAnalysis/2021/monthly_usage_with_crashes.csv

📅 Processing year: 2022


  if not (lk == lk.astype(rk.dtype))[~np.isnan(lk)].all():


✅ Saved: results/dataAnalysis/2022/monthly_usage_with_crashes.csv

📅 Processing year: 2023
✅ Saved: results/dataAnalysis/2023/monthly_usage_with_crashes.csv

📅 Processing year: 2024
❌ Failed to process crash file for 2024: [Errno 2] No such file or directory: 'results/matched_trips_2024.csv'

📅 Processing year: 2025
❌ Failed to process crash file for 2025: [Errno 2] No such file or directory: 'results/matched_trips_2025.csv'
