In [2]:

def review_data_structure(df):
    """
    Build a per-column overview of a DataFrame.
    Parameters:
        - df: DataFrame to inspect
    Returns:
        - DataFrame with one row per column of `df` and two columns:
          "Column Name" (the column label) and "Dtype" (that column's dtype)
    """
    column_names = list(df.columns)
    # Look each column up individually (rather than using df.dtypes) so the
    # lookup semantics stay exactly those of df[<label>].dtype.
    column_dtypes = [df[name].dtype for name in column_names]
    return pd.DataFrame({"Column Name": column_names, "Dtype": column_dtypes})

In [3]:
def handle_missing_values(df, num_strategy="mean", cat_strategy="most_frequent"):
    """
    Handle missing values in numerical and non-numerical columns.

    Only columns that actually contain missing values are imputed; columns
    without gaps are left untouched, so fully populated integer columns keep
    their integer dtype instead of being recast to float by the imputer.
    Columns made up entirely of missing values are reported and skipped.

    Note: `df` is modified in place and the same object is returned.

    Parameters:
        - df: DataFrame
        - num_strategy: Strategy for numerical imputation ('mean', 'median', 'most_frequent', 'constant')
        - cat_strategy: Strategy for categorical imputation ('most_frequent', 'constant')
    Returns:
        - DataFrame with imputed values (the same object as `df`)
    """
    # Identify numerical and non-numerical columns.
    # NOTE(review): only int64/float64 and object dtypes are recognised here;
    # columns of any other dtype (e.g. float32, category) are left as they are.
    numerical_cols = [col for col in df.columns if df[col].dtype in ['int64', 'float64']]
    categorical_cols = [col for col in df.columns if df[col].dtype == 'object']

    def _partition(cols):
        """Split cols into (all-missing columns, columns with some gaps)."""
        all_missing, needs_imputation = [], []
        for col in cols:
            present = df[col].notna()
            if not present.any():
                # Nothing to learn a fill value from.
                all_missing.append(col)
            elif not present.all():
                needs_imputation.append(col)
            # Columns with no missing values need no work at all.
        return all_missing, needs_imputation

    num_skipped, num_to_impute = _partition(numerical_cols)
    cat_skipped, cat_to_impute = _partition(categorical_cols)

    # Warn about columns with all missing values
    if num_skipped:
        print(f"Skipping numerical columns with all missing values: {num_skipped}")
    if cat_skipped:
        print(f"Skipping categorical columns with all missing values: {cat_skipped}")

    # Impute numerical columns (statistics are computed per column, so
    # restricting the set of columns does not change any fill value).
    if num_to_impute:
        num_imputer = SimpleImputer(strategy=num_strategy)
        df[num_to_impute] = num_imputer.fit_transform(df[num_to_impute])

    # Impute categorical columns
    if cat_to_impute:
        cat_imputer = SimpleImputer(strategy=cat_strategy)
        df[cat_to_impute] = cat_imputer.fit_transform(df[cat_to_impute])

    return df