<a href="https://colab.research.google.com/github/jburchfield76/datasharing/blob/master/Clean_Coerce_CSV_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Pipeline to load, clean, and type-coerce a CSV file
import pandas as pd

def _as_column_list(columns):
    """Normalize a column argument to a list of column labels.

    ``None`` becomes an empty list and a single string becomes a one-element
    list, so a bare column name is never iterated character by character.
    """
    if columns is None:
        return []
    if isinstance(columns, str):
        return [columns]
    return list(columns)


def clean_csv_data(
    filepath,
    numeric_columns=None,
    datetime_columns=None,
    fillna_numeric=None,
    drop_na_columns=None,
    verbose=True
):
    """
    Load a CSV file and apply a fixed sequence of cleaning steps.

    The steps run in this order: numeric coercion, datetime coercion,
    NaN filling, then row dropping. Values that cannot be converted during
    coercion become NaN/NaT (``errors='coerce'``), so they can be filled or
    dropped by the later steps.

    Parameters:
        filepath (str): Path to the CSV file; passed straight to
            ``pd.read_csv``.
        numeric_columns (list or str): Column(s) to convert to numeric.
        datetime_columns (list or str): Column(s) to convert to datetime.
        fillna_numeric (dict): Dictionary of {col: value} to fill NaNs.
        drop_na_columns (list or str): Drop rows with NaN in these column(s).
        verbose (bool): If True, print cleaning summary.

    Returns:
        pd.DataFrame: Cleaned DataFrame.

    Raises:
        KeyError: If any requested column is not present in the CSV. The
            check happens before any cleaning step runs and the message
            lists every missing column.
    """

    df = pd.read_csv(filepath)

    if verbose:
        print(f"\n📄 Loaded DataFrame with {df.shape[0]} rows and {df.shape[1]} columns.")

    # Accept a single column name as well as a list of names.
    numeric_columns = _as_column_list(numeric_columns)
    datetime_columns = _as_column_list(datetime_columns)
    drop_na_columns = _as_column_list(drop_na_columns)
    fillna_numeric = fillna_numeric or {}

    # Validate all requested columns up front so a typo is reported clearly
    # (and completely) instead of failing halfway through the pipeline.
    requested = [
        *numeric_columns,
        *datetime_columns,
        *fillna_numeric,
        *drop_na_columns,
    ]
    missing = [col for col in dict.fromkeys(requested) if col not in df.columns]
    if missing:
        raise KeyError(
            f"Column(s) not found in CSV: {missing}. "
            f"Available columns: {list(df.columns)}"
        )

    # Coerce numeric columns
    for col in numeric_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        if verbose:
            print(f"🔢 Coerced '{col}' to numeric. Nulls: {df[col].isna().sum()}")

    # Coerce datetime columns
    for col in datetime_columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')
        if verbose:
            print(f"🕒 Coerced '{col}' to datetime. Nulls: {df[col].isna().sum()}")

    # Fill NaNs (runs after coercion, so values that failed to parse are filled too)
    for col, val in fillna_numeric.items():
        df[col] = df[col].fillna(val)
        if verbose:
            print(f"💧 Filled NaNs in '{col}' with {val}.")

    # Drop rows with NaNs in specified columns
    if drop_na_columns:
        before = df.shape[0]
        df = df.dropna(subset=drop_na_columns)
        after = df.shape[0]
        if verbose:
            print(f"🗑️ Dropped {before - after} rows with NaNs in {drop_na_columns}.")

    if verbose:
        print("\n✅ Cleaning complete!\n")

    return df


In [None]:
# Example usage: replace "your_file.csv" with a real path. The cleaned
# DataFrame is kept in `df_clean` for use in later cells.
df_clean = clean_csv_data(
    filepath="your_file.csv",
    drop_na_columns=["rating"],
    fillna_numeric={"review_count": 0},
    datetime_columns=["created_at"],
    numeric_columns=["review_count", "rating"],
)
