In [2]:
import os
import pandas as pd

# Both folders are given relative to the notebook's working directory.
DATA_FOLDER = "..//merged_tripdata3"  # input CSVs; the file loop picks names like "<YYYY>..._merged....csv"
BASE_OUTPUT_FOLDER = "..//results//dataAnalysis"  # one sub-folder per processed year is created below this

# Make sure the output root exists before any per-year results are written.
os.makedirs(name=BASE_OUTPUT_FOLDER, exist_ok=True)

def _long_trip_station_tables(station_names, duration_min, long_trip_minutes=60, top_n=10):
    """Build the two per-station "long trip" tables from aligned Series.

    Parameters
    ----------
    station_names : pandas.Series
        Start-station name of every trip.
    duration_min : pandas.Series
        Trip duration in minutes, aligned with ``station_names``.
    long_trip_minutes : float, default 60
        A trip counts as "long" when its duration is strictly greater than this.
    top_n : int, default 10
        Number of stations kept in each table.

    Returns
    -------
    (busiest_df, long_summary_df) : tuple of pandas.DataFrame
        ``busiest_df`` is indexed by the ``top_n`` stations with the most trips.
        ``long_summary_df`` has a ``station`` column holding the ``top_n``
        stations with the most long trips. Both carry ``total_trips``,
        ``long_trips`` and ``percent_long_trips`` (rounded to 2 decimals).
    """
    # One pass over the data gives the totals for every station, so no
    # per-station re-filtering of the full frame is needed afterwards.
    station_counts = station_names.value_counts()
    long_counts = station_names[duration_min > long_trip_minutes].value_counts()

    # Table 1: busiest start stations and their share of long trips.
    top_stations = station_counts.head(top_n)
    # Stations without any long trip are absent from long_counts -> fill with 0.
    long_at_top = long_counts.reindex(top_stations.index).fillna(0).astype(int)
    busiest_df = pd.DataFrame({
        'total_trips': top_stations,
        'long_trips': long_at_top,
        'percent_long_trips': (long_at_top / top_stations * 100).round(2),
    })

    # Table 2: stations with the most long trips and their share of long trips.
    most_long = long_counts.head(top_n)
    totals = station_counts.reindex(most_long.index)  # every key exists, so no NaN
    long_summary_df = pd.DataFrame({
        'station': most_long.index.to_numpy(),
        'total_trips': totals.to_numpy(),
        'long_trips': most_long.to_numpy(),
        'percent_long_trips': (most_long / totals * 100).round(2).to_numpy(),
    })
    return busiest_df, long_summary_df


def save_age_distribution(file_path, year, output_folder=None):
    """Compute per-year trip statistics from one CSV and save them as CSV files.

    Despite its name, this function writes several summaries into
    ``<output_folder>/<year>/``:

    * ``age_distribution.csv`` (only if a ``birth year`` column exists)
    * ``gender_distribution.csv`` (empty when there is no ``gender`` column)
    * ``trips_by_hour.csv``, ``trips_by_weekday.csv``, ``bike_usage_per_month.csv``
    * ``usertype_counts.csv`` (only if a ``usertype`` column exists)
    * ``long_trip_station_stats_top10.csv`` and ``long_trip_station_summary.csv``
      (only if ``start station name`` exists and a trip duration is available)

    Optional columns that are missing are reported and their outputs skipped
    instead of aborting the whole run.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read. Column names are stripped of
        surrounding whitespace after loading.
    year : int
        Reference year; used to derive ages (``year - birth year``) and as
        the name of the output sub-folder.
    output_folder : str, optional
        Root folder for the results. Defaults to the module-level
        ``BASE_OUTPUT_FOLDER``.

    Raises
    ------
    KeyError
        If the file has no ``starttime`` column.
    """
    print(f"Processing age distribution for {year} from file {file_path}...")
    df = pd.read_csv(file_path)
    df.columns = [col.strip() for col in df.columns]

    # --- Age distribution ---
    age_dist = None
    if 'birth year' in df.columns:
        # Unparseable/missing birth years become 0, which yields age == year
        # and is then discarded by the 1..120 plausibility filter.
        birth_year = pd.to_numeric(df['birth year'], errors='coerce').fillna(0).astype(int)
        age = year - birth_year
        age_plot = age.where(age.between(1, 120)).astype('Int64').rename('age_plot')
        age_dist = age_plot.value_counts().sort_index()
    else:
        print(f"No 'birth year' column in {file_path}, skipping age distribution.")

    # --- Gender distribution ---
    if 'gender' in df.columns:
        # NOTE(review): this code->label mapping is kept exactly as it was.
        # If the input follows the public Citi Bike schema (presumably
        # 0=unknown, 1=male, 2=female), the labels for 0 and 2 are swapped,
        # and 'Unknow' looks like a typo - confirm before relying on this file.
        gender_map = {0: 'Female', 1: 'Male', 2: 'Unknow'}
        gender_counts = df['gender'].map(gender_map).value_counts()
    else:
        gender_counts = pd.Series(dtype=int)
        print(f"No 'gender' column in {file_path} — skipping gender distribution.")

    # --- Usage distribution (parse the start time once and reuse it) ---
    start_times = pd.to_datetime(df['starttime'], errors='coerce')

    # Trips per month; the Period index is converted to plain strings for CSV.
    monthly_counts = start_times.dt.to_period('M').rename('month').value_counts().sort_index()
    monthly_counts.index = monthly_counts.index.astype(str)

    # Trips by hour of day.
    hourly_usage = start_times.dt.hour.rename('start_hour').value_counts().sort_index()

    # Trips by weekday, in calendar order; weekdays without trips get 0.
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    weekly_usage = (
        start_times.dt.day_name().rename('weekday')
        .value_counts()
        .reindex(weekday_order)
        .fillna(0)
        .astype(int)
    )

    # --- User type ---
    usertype_counts = None
    if 'usertype' in df.columns:
        usertype_counts = df['usertype'].value_counts()
    else:
        print(f"❌ No 'usertype' column found in {file_path}")

    # --- Trip duration (seconds): taken from the file or derived from timestamps ---
    if 'tripduration' in df.columns:
        duration_seconds = df['tripduration']
    else:
        print("❌ 'tripduration' column missing.")
        if 'stoptime' in df.columns:
            stop_times = pd.to_datetime(df['stoptime'], errors='coerce')
            duration_seconds = (stop_times - start_times).dt.total_seconds()
        else:
            duration_seconds = None

    # --- Long-trip statistics per start station ---
    station_tables = None
    if 'start station name' not in df.columns:
        print("❌ 'start station name' column missing.")
    elif duration_seconds is None:
        print("❌ 'stoptime' column missing, cannot derive trip durations.")
    else:
        station_tables = _long_trip_station_tables(
            df['start station name'], duration_seconds / 60
        )

    # --- Save everything that could be computed ---
    if output_folder is None:
        output_folder = BASE_OUTPUT_FOLDER
    year_folder = os.path.join(output_folder, str(year))
    os.makedirs(year_folder, exist_ok=True)

    if age_dist is not None:
        age_dist.to_csv(os.path.join(year_folder, 'age_distribution.csv'), header=['count'])
        print(f"Saved age distribution for {year} to {year_folder}")

    gender_counts.to_csv(os.path.join(year_folder, 'gender_distribution.csv'), header=['count'])
    print(f"Saved gender distribution for {year}.")

    hourly_usage.to_csv(os.path.join(year_folder, 'trips_by_hour.csv'), header=['count'])
    weekly_usage.to_csv(os.path.join(year_folder, 'trips_by_weekday.csv'), header=['count'])
    print(f"✅ Saved trips_by_hour.csv and trips_by_weekday.csv for {year}.\n")

    if usertype_counts is not None:
        usertype_counts.to_csv(os.path.join(year_folder, 'usertype_counts.csv'), header=['count'])
        print(f"✅ Saved usertype_counts.csv for {year}\n")

    if station_tables is not None:
        result_df, percent_summary = station_tables
        result_df.to_csv(os.path.join(year_folder, 'long_trip_station_stats_top10.csv'))
        print(f"✅ Saved long_trip_station_stats_top10.csv for {year}\n")
        percent_summary.to_csv(os.path.join(year_folder, "long_trip_station_summary.csv"), index=False)
        print(f"✅ Saved long_trip_station_summary.csv for {year}\n")

    output_path = os.path.join(year_folder, 'bike_usage_per_month.csv')
    monthly_counts.to_csv(output_path, header=['trip_count'])
    print(f"✅ Saved monthly bike usage for {year} to {output_path}\n")


# Process only the merged yearly files whose year lies in FIRST_YEAR..LAST_YEAR
# (inclusive). The year is taken from the first four characters of the file name.
FIRST_YEAR, LAST_YEAR = 2020, 2025

# Sort the listing so files are processed in a deterministic order.
for filename in sorted(os.listdir(DATA_FOLDER)):
    if not (filename.endswith('.csv') and '_merged' in filename):
        continue
    try:
        year = int(filename[:4])
    except ValueError:
        # File name does not start with a 4-digit year: not a yearly data file.
        continue

    if FIRST_YEAR <= year <= LAST_YEAR:
        file_path = os.path.join(DATA_FOLDER, filename)
        save_age_distribution(file_path, year)

print(f"Age distribution extraction for {FIRST_YEAR} to {LAST_YEAR} completed!")

Processing age distribution for 2022 from file ..//merged_tripdata3\2022_merged.csv...
No 'birth year' column in ..//merged_tripdata3\2022_merged.csv, skipping age distribution.
No 'gender' column in ..//merged_tripdata3\2022_merged.csv — skipping gender distribution.
❌ 'tripduration' column missing.
Saved gender distribution for 2022.
✅ Saved trips_by_hour.csv and trips_by_weekday.csv for 2022.

✅ Saved usertype_counts.csv for 2022

✅ Saved long_trip_station_stats_top10.csv for 2022

✅ Saved long_trip_station_summary.csv for 2022

✅ Saved monthly bike usage for 2022 to ..//results//dataAnalysis\2022\bike_usage_per_month.csv

Processing age distribution for 2023 from file ..//merged_tripdata3\2023_merged.csv...
No 'birth year' column in ..//merged_tripdata3\2023_merged.csv, skipping age distribution.
No 'gender' column in ..//merged_tripdata3\2023_merged.csv — skipping gender distribution.
❌ 'tripduration' column missing.
Saved gender distribution for 2023.
✅ Saved trips_by_hour.csv an

In [16]:
# Inspection cell: echoes the module-level `year` variable that the file loop assigns.
# NOTE(review): the recorded output (2015) lies outside the 2020-2025 range the loop
# currently accepts, so it appears to come from an earlier run of a different loop
# version - re-run after a kernel restart or drop this cell.
year

2015