In [18]:
import os
import pandas as pd
from collections import defaultdict, Counter

DATA_FOLDER = r"C:\Users\zhao\Documents\workspace\task\merged_tripdata2"

# year -> age_group -> count
year_age_group_counts = defaultdict(Counter)

bins = [10, 20, 30, 40, 50, 60, 100]
labels = ['10s', '20s', '30s', '40s', '50s', '60+']

for filename in os.listdir(DATA_FOLDER):
    if filename.endswith('.csv') and '_merged' in filename:
        file_path = os.path.join(DATA_FOLDER, filename)
        print(f"Processing {filename}...")

        df = pd.read_csv(file_path)
        df.columns = [col.strip() for col in df.columns]  # keep original case, just strip spaces

        if 'starttime' not in df.columns:
            print("No starttime column, skipping file.")
            continue

        if 'birth year' not in df.columns:
            print(f"'birth year' column missing in {filename}, skipping age group extraction.")
            continue

        df['starttime'] = pd.to_datetime(df['starttime'], errors='coerce')
        df = df.dropna(subset=['starttime'])

        year = int(filename[:4])

        df['age'] = year - pd.to_numeric(df['birth year'], errors='coerce')
        df = df[df['age'].between(1, 150)]

        df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels)

        counts = df['age_group'].value_counts()
        year_age_group_counts[year].update(counts.to_dict())

# Convert to DataFrame
result_df = pd.DataFrame(year_age_group_counts).fillna(0).T  # years as rows, age groups as columns
result_df = result_df[labels]  # ensure correct column order

# Save to CSV
result_df.to_csv('age_group_counts_by_year.csv', index_label='year')
print("Saved age group counts by year to 'age_group_counts_by_year.csv'")


Processing 2013_merged.csv...


  df = pd.read_csv(file_path)


Processing 2014_merged.csv...


  df = pd.read_csv(file_path)


Processing 2015_merged.csv...


  df = pd.read_csv(file_path)


Processing 2016_merged.csv...


  df = pd.read_csv(file_path)


Processing 2017_merged.csv...


  df = pd.read_csv(file_path)


Processing 2018_merged.csv...
Processing 2019_merged.csv...
Processing 2020_merged.csv...
Processing 2021_merged.csv...


  df = pd.read_csv(file_path)


Processing 2022_merged.csv...
Processing 2023_merged.csv...
Processing 2024_merged.csv...


MemoryError: Unable to allocate 3.04 GiB for an array with shape (9, 45355660) and data type object

In [16]:
import os
import pandas as pd
from collections import Counter

DATA_FOLDER = r"C:\Users\zhao\Documents\workspace\task\merged_tripdata2"

age_group_counts = Counter()

bins = [10, 20, 30, 40, 50, 60, 100]
labels = ['10s', '20s', '30s', '40s', '50s', '60+']

for filename in os.listdir(DATA_FOLDER):
    if filename.endswith('.csv') and '_merged' in filename:
        file_path = os.path.join(DATA_FOLDER, filename)
        print(f"Processing {filename}...")

        df = pd.read_csv(file_path)
        df.columns = [col.strip() for col in df.columns]  # keep original case, just strip spaces

        if 'starttime' not in df.columns:
            print("No starttime column, skipping file.")
            continue

        # Only proceed if exact 'birth year' column exists (case-sensitive)
        if 'birth year' not in df.columns:
            print(f"'birth year' column missing in {filename}, skipping age group extraction.")
            continue

        df['starttime'] = pd.to_datetime(df['starttime'], errors='coerce')
        df = df.dropna(subset=['starttime'])

        year = int(filename[:4])

        df['age'] = year - pd.to_numeric(df['birth year'], errors='coerce')
        df = df[df['age'].between(1, 150)]

        df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels)

        counts = df['age_group'].value_counts()
        age_group_counts.update(counts.to_dict())

# Save aggregated results
age_group_df = pd.Series(age_group_counts).sort_index().reset_index()
age_group_df.columns = ['age_group', 'count']

age_group_df.to_csv('all_years_age_group_counts.csv', index=False)
print("Age group counts saved to 'all_years_age_group_counts.csv'")


Processing 2013_merged.csv...


  df = pd.read_csv(file_path)


Processing 2014_merged.csv...


  df = pd.read_csv(file_path)


Processing 2015_merged.csv...


  df = pd.read_csv(file_path)


Processing 2016_merged.csv...


  df = pd.read_csv(file_path)


Processing 2017_merged.csv...


  df = pd.read_csv(file_path)


Processing 2018_merged.csv...
Processing 2019_merged.csv...
Processing 2020_merged.csv...
Processing 2021_merged.csv...


  df = pd.read_csv(file_path)


Processing 2022_merged.csv...
Processing 2023_merged.csv...
Processing 2024_merged.csv...
Processing 2025_merged.csv...
Age group counts saved to 'all_years_age_group_counts.csv'


In [17]:
age_group_df

Unnamed: 0,age_group,count
0,10s,1107986
1,20s,20610387
2,30s,21122588
3,40s,15463195
4,50s,8419908
5,60+,3072549


In [15]:
df

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,...,start station latitude.1,start station longitude.1,end station id.1,end station name.1,end station latitude.1,end station longitude,bike id,user type,birth year,gender
0,1346.0,2015-01-01 00:01:00,1/1/2015 0:24,455.0,1 Ave & E 44 St,40.750020,-73.969053,265.0,Stanton St & Chrystie St,40.722293,...,,,,,,,,,,
1,363.0,2015-01-01 00:02:00,1/1/2015 0:08,434.0,9 Ave & W 18 St,40.743174,-74.003664,482.0,W 15 St & 7 Ave,40.739355,...,,,,,,,,,,
2,346.0,2015-01-01 00:04:00,1/1/2015 0:10,491.0,E 24 St & Park Ave S,40.740964,-73.986022,505.0,6 Ave & W 33 St,40.749013,...,,,,,,,,,,
3,182.0,2015-01-01 00:04:00,1/1/2015 0:07,384.0,Fulton St & Waverly Ave,40.683178,-73.965964,399.0,Lafayette Ave & St James Pl,40.688515,...,,,,,,,,,,
4,969.0,2015-01-01 00:05:00,1/1/2015 0:21,474.0,5 Ave & E 29 St,40.745168,-73.986831,432.0,E 7 St & Avenue A,40.726218,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6383544,361.0,2015-06-09 09:59:00,6/9/2015 10:05,327.0,Vesey Pl & River Terrace,40.715338,-74.016584,248.0,Laight St & Hudson St,40.721854,...,,,,,,,,,,
6383545,568.0,2015-06-09 09:59:00,6/9/2015 10:08,519.0,Pershing Square North,40.751873,-73.977706,497.0,E 17 St & Broadway,40.737050,...,,,,,,,,,,
6383546,817.0,2015-06-09 09:59:00,6/9/2015 10:12,151.0,Cleveland Pl & Spring St,40.721816,-73.997203,307.0,Canal St & Rutgers St,40.714275,...,,,,,,,,,,
6383547,690.0,2015-06-09 09:59:00,6/9/2015 10:10,2021.0,W 45 St & 8 Ave,40.759291,-73.988597,522.0,E 51 St & Lexington Ave,40.757148,...,,,,,,,,,,


In [9]:
df['birth year']

Unnamed: 0,birth year,birth year.1
0,1960.0,
1,1963.0,
2,1974.0,
3,1969.0,
4,1977.0,
...,...,...
6383544,1970.0,
6383545,1982.0,
6383546,1981.0,
6383547,1984.0,


In [11]:
df.columns

Index(['tripduration', 'starttime', 'stoptime', 'start station id',
       'start station name', 'start station latitude',
       'start station longitude', 'end station id', 'end station name',
       'end station latitude', 'end station longitude', 'bikeid', 'usertype',
       'birth year', 'gender', 'trip duration', 'start time', 'stop time',
       'start station id', 'start station name', 'start station latitude',
       'start station longitude', 'end station id', 'end station name',
       'end station latitude', 'end station longitude', 'bike id', 'user type',
       'birth year', 'gender'],
      dtype='object')