In [1]:
import pandas as pd
import os

def combine_age_distributions(input_root, output_file):
    """Sum the per-folder age distributions under ``input_root`` into one CSV.

    Every immediate subfolder of ``input_root`` that contains a file named
    ``age_distribution.csv`` is read. Each such file must have exactly two
    columns, which are interpreted positionally as (age, count) regardless of
    their header names. Counts are summed per age label across all files and
    written to ``output_file`` with the columns ``age`` and
    ``total_usage_count``, sorted by age.

    Args:
        input_root: Directory whose subfolders are scanned.
        output_file: Destination CSV path. Missing parent directories are
            created; a bare file name (no directory part) is also accepted.

    Notes:
        * Rows with a missing age or a missing / non-numeric count are dropped.
        * Age labels are intentionally NOT coerced to numbers, so textual
          labels survive (presumably some inputs use binned labels such as
          "20-29" -- TODO confirm against the data).
        * A file that cannot be processed is reported on stdout and skipped;
          it does not abort the run.
        * Totals are cast to ``int`` before being written.
    """
    total_counts = pd.Series(dtype=int)

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # processing order (and therefore the log) deterministic.
    for folder_name in sorted(os.listdir(input_root)):
        age_file = os.path.join(input_root, folder_name, 'age_distribution.csv')
        if not os.path.isfile(age_file):
            continue

        print(f"📂 Processing: {age_file}")
        try:
            df = pd.read_csv(age_file)
            # Positional contract: first column is age, second is count.
            # Raises (and the file is skipped) if there are not exactly two.
            df.columns = ['age', 'count']

            # A single stray text cell turns the whole column into strings,
            # which would make the sum below concatenate instead of add.
            # Coerce such cells to NaN so only the offending rows are dropped.
            df['count'] = pd.to_numeric(df['count'], errors='coerce')
            df = df.dropna()

            age_counts = df.groupby('age')['count'].sum()
            # fill_value=0 keeps ages that appear in only one of the operands.
            total_counts = total_counts.add(age_counts, fill_value=0)

        except Exception as e:
            # Best effort: report the broken file and carry on with the rest.
            print(f"❌ Failed to process {age_file}: {e}")

    # Final combined DataFrame
    combined_df = total_counts.astype(int).sort_index().reset_index()
    combined_df.columns = ['age', 'total_usage_count']

    # Save to file. os.path.dirname() is '' for a bare file name, and
    # os.makedirs('') raises FileNotFoundError, so only create real directories.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    combined_df.to_csv(output_file, index=False)
    print(f"✅ Saved combined age distribution → {output_file}")

# === CONFIG ===
# Paths are assembled with os.path.join instead of backslash literals so the
# notebook also runs on non-Windows systems; on Windows the resulting strings
# are identical to the former r"results\..." values.
input_root = os.path.join("results", "dataAnalysis")
output_file = os.path.join("results", "age_distribution", "all_years_usage_age_distribution.csv")

# Scan every subfolder of input_root and write the combined distribution.
combine_age_distributions(input_root, output_file)


📂 Processing: results\dataAnalysis\2013\age_distribution.csv
📂 Processing: results\dataAnalysis\2014\age_distribution.csv
📂 Processing: results\dataAnalysis\2015\age_distribution.csv
📂 Processing: results\dataAnalysis\2016\age_distribution.csv
📂 Processing: results\dataAnalysis\2017\age_distribution.csv
📂 Processing: results\dataAnalysis\2018\age_distribution.csv
📂 Processing: results\dataAnalysis\2019\age_distribution.csv
📂 Processing: results\dataAnalysis\2020\age_distribution.csv
📂 Processing: results\dataAnalysis\2021\age_distribution.csv
✅ Saved combined age distribution → results\age_distribution\all_years_usage_age_distribution.csv


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Load the combined age distribution; the plot below reads its
# "age" and "total_usage_count" columns.
input_file = Path("results/age_distribution/all_years_usage_age_distribution.csv")
df = pd.read_csv(input_file)

# Presentation style: white grid with enlarged fonts
sns.set(style="whitegrid", font_scale=1.5)

# Draw on an explicit figure/axes pair rather than pyplot's implicit
# "current figure", so the cell does not depend on leftover plotting state.
fig, ax = plt.subplots(figsize=(16, 8))
sns.barplot(data=df, x="age", y="total_usage_count", color="skyblue", ax=ax)

ax.set_title("Total Bike Usage by Age Group (All Years)", fontsize=20)
ax.set_xlabel("Age", fontsize=16)
ax.set_ylabel("Total Usage Count", fontsize=16)
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()

# Save the plot. savefig() does not create missing directories, so make sure
# the target folder exists first (otherwise it raises FileNotFoundError).
output_path = Path("results/dataAnalysis/extra_analysis/age_distribution_presentation.png")
output_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(output_path, dpi=300)
plt.show()
