In [37]:
import os
import pandas as pd
from pandas.testing import assert_frame_equal
def print_side_by_side_differences(df_base, df_copy, diff_mask):
    """Print every row flagged in ``diff_mask`` with base and copy values interleaved.

    Args:
        df_base: Reference DataFrame; its columns drive the output layout.
        df_copy: DataFrame compared against ``df_base``; it is looked up with
            the same row labels and column names as ``df_base``.
        diff_mask: Boolean DataFrame; a row is reported when any cell in that
            row is True.

    For each reported row, a one-line table is printed containing, for every
    column of ``df_base`` (not only the differing ones), a ``<col>_base`` value
    followed by a ``<col>_copy`` value.
    """
    row_has_diff = diff_mask.any(axis=1)
    flagged_labels = row_has_diff.index[row_has_diff.to_numpy()]

    for row_label in flagged_labels:
        print(f"\nRow {row_label} differences:")

        # Dict insertion order yields base/copy pairs, column by column.
        paired_values = {
            f"{col}_{side}": [frame.at[row_label, col]]
            for col in df_base.columns
            for side, frame in (("base", df_base), ("copy", df_copy))
        }
        print(pd.DataFrame(paired_values).to_string(index=False))


def _describe_structure_mismatch(df_base, df_copy):
    """Return printable notes on how two frames differ in shape or column labels."""
    notes = []
    if df_base.shape != df_copy.shape:
        notes.append(f"Shape differs: base {df_base.shape} vs copy {df_copy.shape}")

    cols_only_in_base = [c for c in df_base.columns if c not in df_copy.columns]
    cols_only_in_copy = [c for c in df_copy.columns if c not in df_base.columns]
    if cols_only_in_base:
        notes.append(f"Columns only in base: {cols_only_in_base}")
    if cols_only_in_copy:
        notes.append(f"Columns only in copy: {cols_only_in_copy}")

    if not notes:
        # Same shape and same column names, yet the labels are not identical.
        notes.append("Row or column labels differ (for example, same columns in a different order).")
    return notes


def compare_csv_files(base_dir, copy_dir, only_files=("spent_all.csv",)):
    """Compare same-named CSV files from two directories and print a report.

    Args:
        base_dir: Directory holding the reference ``.csv`` files.
        copy_dir: Directory holding the ``.csv`` files to check.
        only_files: File names to compare. Defaults to ``("spent_all.csv",)``,
            which matches the previous hard-coded behavior. Pass ``None`` to
            compare every CSV present in both directories. A single string is
            treated as one file name.

    For each selected file present in both directories, one of these is printed:
      * ``[OK]`` when ``assert_frame_equal(..., check_dtype=False)`` passes
        (dtypes are not compared, and pandas' default tolerance applies to
        floating-point values);
      * ``[DIFFER]`` followed by either a structural summary (shape / columns)
        or the differing rows side by side;
      * ``[ERROR]`` when reading or comparing raised any other exception.

    Files present in only one of the two directories are listed at the end.
    Nothing is returned.
    """
    base_files = {f for f in os.listdir(base_dir) if f.endswith('.csv')}
    copy_files = {f for f in os.listdir(copy_dir) if f.endswith('.csv')}
    common_files = base_files & copy_files

    if only_files is None:
        selected = common_files
    else:
        if isinstance(only_files, str):
            # Avoid treating a bare string as an iterable of characters.
            only_files = (only_files,)
        selected = common_files & set(only_files)

    # Sorted so the report order is reproducible across runs.
    for filename in sorted(selected):
        base_path = os.path.join(base_dir, filename)
        copy_path = os.path.join(copy_dir, filename)

        try:
            df_base = pd.read_csv(base_path)
            df_copy = pd.read_csv(copy_path)

            try:
                assert_frame_equal(
                    df_base.reset_index(drop=True),
                    df_copy.reset_index(drop=True),
                    check_dtype=False,
                )
            except AssertionError:
                # Reporting happens inside the outer ``try`` so that a failure
                # while building the report is shown as [ERROR] instead of
                # aborting the whole run.
                print(f"[DIFFER] {filename} differs:")
                same_labels = (
                    df_base.index.equals(df_copy.index)
                    and df_base.columns.equals(df_copy.columns)
                )
                if not same_labels:
                    # Element-wise ``!=`` raises ValueError on frames whose
                    # labels are not identical, so describe the mismatch instead.
                    for note in _describe_structure_mismatch(df_base, df_copy):
                        print(note)
                else:
                    # NaN != NaN is True, so cells that are missing on both
                    # sides are masked out explicitly.
                    diff_mask = (df_base != df_copy) & ~(df_base.isna() & df_copy.isna())
                    if diff_mask.any().any():
                        print_side_by_side_differences(df_base, df_copy, diff_mask)
                    else:
                        print("Difference detected but unable to pinpoint differences clearly.")
            else:
                print(f"[OK] {filename} is identical.")
        except Exception as ex:
            print(f"[ERROR] Could not compare {filename}: {ex}")

    base_only = base_files - common_files
    copy_only = copy_files - common_files
    if base_only:
        print(f"Files only in base directory: {base_only}")
    if copy_only:
        print(f"Files only in copy directory: {copy_only}")
        
if __name__ == "__main__":
    # Folder holding the reference CSV files; replace with your base repo path.
    base_repo = "datasets"
    # The files to check live in a "COPY" subfolder of the base folder.
    copy_repo = os.path.join(base_repo, "COPY")
    compare_csv_files(base_dir=base_repo, copy_dir=copy_repo)


[DIFFER] spent_all.csv differs:

Row 4224 differences:
 year_base  year_copy  month_base  month_copy  day_base  day_copy category_base category_copy fix_variable_base fix_variable_copy Description_base                      Description_copy  Amount_base  Amount_copy Description_original_base             Description_original_copy
      2025       2025           1           1        27        27       housing       housing               fix               fix           yassou gaspard lhermitte and yasmine starein      -9783.0      -9783.0                    yassou Gaspard lhermitte and Yasmine starein
