In [1]:
import os
import re
import pandas as pd

In [2]:
def extract_year(file_name):
    """Return the four-digit year embedded in ``file_name`` as a string.

    A standalone group of exactly four digits (not touching any other digit)
    is preferred, so that longer numbers such as IDs do not get mistaken for
    a year: ``"sensor_123456_2021.csv"`` yields ``"2021"``, not ``"1234"``.
    If no standalone group exists, the first four consecutive digits are used
    instead, which keeps date stamps like ``"export_20230115.csv"`` resolving
    to ``"2023"``.

    Args:
        file_name: File name (or any string) to search.

    Returns:
        The matched four-digit string, or ``None`` when the name contains no
        run of at least four digits.
    """
    # Lookbehind/lookahead reject digits on either side, i.e. the four digits
    # must form a complete number on their own.
    standalone = re.search(r'(?<!\d)\d{4}(?!\d)', file_name)
    if standalone:
        return standalone.group(0)

    # Fallback: first four digits anywhere, even inside a longer digit run.
    leading = re.search(r'\d{4}', file_name)
    return leading.group(0) if leading else None

In [3]:
def merge_csvs_by_year(csv_dir, merged_csv_dir):
    """Concatenate the CSV files in ``csv_dir`` per year.

    Every ``*.csv`` file whose name contains a year (as determined by
    ``extract_year``) is read with ``pd.read_csv``. Files sharing a year are
    concatenated and written to ``<merged_csv_dir>/merged_data_<year>.csv``.

    Files are processed in sorted name order so the row order of the output
    is reproducible. A file that is itself the output for its year (possible
    when ``merged_csv_dir`` and ``csv_dir`` are the same directory) is skipped
    so that re-running the function does not duplicate rows.

    Args:
        csv_dir: Directory scanned (non-recursively) for input CSV files.
        merged_csv_dir: Directory receiving the per-year files; created if
            it does not exist.

    Returns:
        None. Progress and read errors are reported via ``print``.
    """
    os.makedirs(merged_csv_dir, exist_ok=True)

    def _canonical(path):
        # Resolve symlinks / relative segments so that two spellings of the
        # same file compare equal.
        return os.path.normcase(os.path.realpath(path))

    dataframes_by_year = {}

    # os.listdir returns entries in arbitrary order; sort for determinism.
    for file_name in sorted(os.listdir(csv_dir)):
        if not file_name.endswith(".csv"):
            continue
        year = extract_year(file_name)
        if not year:
            continue

        file_path = os.path.join(csv_dir, file_name)
        year_output_path = os.path.join(merged_csv_dir, f"merged_data_{year}.csv")
        if _canonical(file_path) == _canonical(year_output_path):
            # This is the file a previous run wrote for this year; reading it
            # back in would double every row.
            continue

        # Register the year before reading so that a year whose files all
        # fail to load is still reported below.
        year_frames = dataframes_by_year.setdefault(year, [])
        try:
            year_frames.append(pd.read_csv(file_path))
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole merge.
            print(f"Error reading {file_path}: {e}")

    for year, dfs in dataframes_by_year.items():
        if dfs:
            merged_df = pd.concat(dfs, ignore_index=True)
            output_path = os.path.join(merged_csv_dir, f"merged_data_{year}.csv")
            merged_df.to_csv(output_path, index=False)
            print(f"Merged CSV for year {year} created at '{output_path}'")
        else:
            print(f"No valid data for year {year}.")

In [4]:
def merge_all_yearly_csvs(csv_dir, merged_output_path):
    """Concatenate the CSV files in ``csv_dir`` into one CSV file.

    Selection rules:

    * If ``csv_dir`` contains files named ``merged_data_<4 digits>.csv`` (the
      naming used by ``merge_csvs_by_year``), only those are merged. This
      avoids counting rows twice when the per-year files sit in the same
      directory as the raw files they were built from.
    * Otherwise every ``*.csv`` file in the directory is merged.
    * The file at ``merged_output_path`` is never used as an input, so
      re-running with the output inside ``csv_dir`` does not duplicate rows.

    Files are read in sorted name order so the output row order is
    reproducible.

    Args:
        csv_dir: Directory scanned (non-recursively) for input CSV files.
        merged_output_path: Path of the CSV to write. Its parent directory is
            created if needed; a bare file name (no directory part) is
            written to the current working directory.

    Returns:
        None. Progress and read errors are reported via ``print``.
    """
    # os.path.dirname returns '' for a bare file name, and os.makedirs('')
    # raises, so only create the directory when there is one to create.
    output_dir = os.path.dirname(merged_output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    def _canonical(path):
        # Resolve symlinks / relative segments so that two spellings of the
        # same file compare equal.
        return os.path.normcase(os.path.realpath(path))

    output_canonical = _canonical(merged_output_path)

    # os.listdir returns entries in arbitrary order; sort for determinism.
    csv_names = sorted(
        name
        for name in os.listdir(csv_dir)
        if name.endswith(".csv")
        and _canonical(os.path.join(csv_dir, name)) != output_canonical
    )

    # Prefer the per-year merged files when present; fall back to every CSV.
    yearly_names = [
        name for name in csv_names
        if re.fullmatch(r"merged_data_\d{4}\.csv", name)
    ]
    selected_names = yearly_names or csv_names

    dataframes = []
    for file_name in selected_names:
        file_path = os.path.join(csv_dir, file_name)
        try:
            dataframes.append(pd.read_csv(file_path))
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole merge.
            print(f"Error reading {file_path}: {e}")

    if dataframes:
        merged_df = pd.concat(dataframes, ignore_index=True)
        merged_df.to_csv(merged_output_path, index=False)
        print(f"Final merged CSV created at '{merged_output_path}'")
    else:
        print("No CSVs found to merge.")

In [None]:
# Stage 1: group the interim CSVs by the year in their file name and write one
# merged_data_<year>.csv per year back into the same interim directory.
merge_csvs_by_year(
    csv_dir="../data/interim",
    merged_csv_dir="../data/interim",
)

# Stage 2: combine CSVs from the interim directory into the processed dataset.
merge_all_yearly_csvs(
    csv_dir="../data/interim",
    merged_output_path="../data/processed/dataset.csv",
)
