<a href="https://colab.research.google.com/github/jmcconne100/Pandas_Notebook_Project/blob/main/data_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

def clean_data(config: dict):
    """
    Clean and preprocess a DataFrame using one configuration dictionary.

    The input frame is copied first, so the caller's DataFrame is never
    modified. Steps always run in this order (each one only when enabled or
    configured): standardize column names, rename columns, drop columns,
    replace values, convert numeric columns, convert datetime columns,
    fill with constants, fill with statistics, drop duplicate rows.

    Parameters:
        config (dict): {
            "df": <pd.DataFrame>,             # required
            "rename_map": {...},              # optional, {old_name: new_name}
            "drop_columns": [...],            # columns to remove (must exist)
            "replace_map": {...},             # passed to DataFrame.replace
            "numeric_cols": [...],            # coerced with pd.to_numeric
            "datetime_cols": [...],           # coerced with pd.to_datetime
            "fill_values": {...},             # passed to DataFrame.fillna
            "fill_stats": {...},              # {col: "mean"|"median"|"mode"}
            "drop_duplicates": True,          # default True
            "standardize_case": True,         # default True
            "verbose": True                   # default True, prints report
        }

        Because column standardization runs before every other step, the
        column names used in "rename_map", "drop_columns", "numeric_cols",
        "datetime_cols", "fill_values" and "fill_stats" must be the
        standardized (stripped, lower-cased) names when "standardize_case"
        is enabled.

    Returns:
        (cleaned_df, report) where report is a dict with the keys
        "steps" (list of str), "missing_values" ({column: null count})
        and "final_shape" (the cleaned frame's shape tuple).

    Raises:
        ValueError: if "df" is missing from config, if "drop_columns" names
            a column that does not exist, or if "fill_stats" uses an
            unsupported method.
        TypeError: if config["df"] is not a pandas DataFrame.
    """

    # --- Validate input ---
    if "df" not in config:
        raise ValueError("config must include a 'df' key with a pandas DataFrame")
    if not isinstance(config["df"], pd.DataFrame):
        raise TypeError(
            "config['df'] must be a pandas DataFrame, "
            f"got {type(config['df']).__name__}"
        )

    # Work on a copy so the caller's frame is left untouched.
    df = config["df"].copy()
    report = {"steps": []}

    # --- Pull config values with defaults ---
    rename_map = config.get("rename_map")
    drop_columns = config.get("drop_columns")
    replace_map = config.get("replace_map")
    numeric_cols = config.get("numeric_cols")
    datetime_cols = config.get("datetime_cols")
    fill_values = config.get("fill_values")
    fill_stats = config.get("fill_stats")
    drop_duplicates = config.get("drop_duplicates", True)
    standardize_case = config.get("standardize_case", True)
    verbose = config.get("verbose", True)

    # --- Cleaning steps ---
    if standardize_case:
        # Only string labels are normalized; non-string labels (e.g. integer
        # column names) are kept as they are instead of failing or being
        # turned into NaN by the .str accessor.
        df = df.rename(
            columns=lambda label: label.strip().lower()
            if isinstance(label, str)
            else label
        )
        report["steps"].append("Standardized column case and stripped whitespace")

    if rename_map:
        df = df.rename(columns=rename_map)
        report["steps"].append(f"Renamed columns: {rename_map}")

    if drop_columns:
        missing = [c for c in drop_columns if c not in df.columns]
        if missing:
            raise ValueError(f"Cannot drop missing columns: {missing}")
        df = df.drop(columns=drop_columns)
        report["steps"].append(f"Dropped columns: {drop_columns}")

    if replace_map:
        df = df.replace(replace_map)
        report["steps"].append(f"Replaced values: {replace_map}")

    if numeric_cols:
        for col in numeric_cols:
            # errors="coerce": unparseable values become NaN rather than raising.
            df[col] = pd.to_numeric(df[col], errors="coerce")
        report["steps"].append(f"Converted numeric cols: {numeric_cols}")

    if datetime_cols:
        for col in datetime_cols:
            # errors="coerce": unparseable values become NaT rather than raising.
            df[col] = pd.to_datetime(df[col], errors="coerce")
        report["steps"].append(f"Converted datetime cols: {datetime_cols}")

    if fill_values:
        df = df.fillna(fill_values)
        report["steps"].append(f"Filled missing with constants: {fill_values}")

    if fill_stats:
        for col, method in fill_stats.items():
            method = method.lower()
            if method == "mean":
                df[col] = df[col].fillna(df[col].mean())
            elif method == "median":
                df[col] = df[col].fillna(df[col].median())
            elif method == "mode":
                modes = df[col].mode()
                # mode() is empty when every value in the column is missing;
                # there is nothing to fill with, so leave the column as is.
                if not modes.empty:
                    df[col] = df[col].fillna(modes.iloc[0])
            else:
                raise ValueError(f"Unsupported fill method '{method}' for column '{col}'")
        report["steps"].append(f"Filled missing by stats: {fill_stats}")

    if drop_duplicates:
        before = len(df)
        df = df.drop_duplicates()
        after = len(df)
        report["steps"].append(f"Dropped duplicates: {before - after} rows removed")

    report["missing_values"] = df.isnull().sum().to_dict()
    report["final_shape"] = df.shape

    if verbose:
        print("=== Cleaning Report ===")
        for step in report["steps"]:
            print("-", step)
        print("\nMissing Values:")
        print(report["missing_values"])
        print("Final shape:", report["final_shape"])

    return df, report


In [None]:
import pandas as pd
import numpy as np

# Small, deliberately messy sample: padded names, a "?" placeholder in Age,
# missing salaries, and one unparseable date.
data = {
    " Name ": [" Alice ", "Bob", None, "Eve", "Frank", "Bob"],
    "Age": ["25", "30", "?", "45", "40", "30"],
    "Salary": [50000, None, 40000, None, 55000, 60000],
    "Joined": [
        "2021-01-01",
        "2021-05-07",
        "invalid",
        "2022-03-02",
        "2021-12-31",
        "2021-05-07",
    ],
}

# Build the raw frame once; clean_data works on its own copy, so this object
# still holds the untouched input when it is printed below.
raw_df = pd.DataFrame(data)

# NOTE: with standardize_case left at its default, clean_data strips and
# lower-cases the column names before renaming, so the " Name " key below no
# longer matches any column (it is already called "name" at that point).
config = dict(
    df=raw_df,
    rename_map={" Name ": "name"},
    replace_map={"?": np.nan},
    numeric_cols=["age"],
    datetime_cols=["joined"],
    fill_values={"name": "Unknown"},
    fill_stats={"salary": "mean"},
    verbose=True,
)

cleaned_df, report = clean_data(config)

# Show the input and the cleaned result one after the other for comparison.
print(raw_df)
print(cleaned_df)