## Best Practices for Data Preprocessing

## Always Explore & Visualize Data First

In [None]:
# Task 1: Summary Statistics


import pandas as pd

def display_summary_statistics(df):
    """Print a header line followed by ``df.describe(include='all')``.

    df: pandas.DataFrame
        Frame whose descriptive statistics are printed to stdout.
    """
    header = "Summary Statistics:\n"
    print(header)
    # include='all' asks describe() to report on every column, not only numeric ones.
    stats_table = df.describe(include='all')
    print(stats_table)



# Task 2: Visualize Data Distribution


import matplotlib.pyplot as plt
import seaborn as sns

def plot_data_distribution(df, numeric_columns):
    """Show one histogram with a KDE overlay for each requested column.

    df: pandas.DataFrame
        Frame holding the data to plot.
    numeric_columns: iterable
        Column labels to plot; each one gets its own 8x4 figure.
    """
    for name in numeric_columns:
        _, axis = plt.subplots(figsize=(8, 4))
        # Missing entries are removed before the histogram is drawn.
        observed = df[name].dropna()
        sns.histplot(observed, kde=True, bins=30, ax=axis)
        axis.set_title(f'Distribution of {name}')
        axis.set_xlabel(name)
        axis.set_ylabel('Frequency')
        axis.grid(True)
        plt.show()



# Task 3: Checking for Null Values
def check_missing_values(df):
    """Print the number of null entries found in each column of ``df``.

    df: pandas.DataFrame
        Frame to inspect; it is not modified.
    """
    print("\nMissing Values:\n")
    null_counts = df.isna().sum()
    print(null_counts)




## Handle Missing & Inconsistent Data Before Applying ML Models

In [None]:
# Task 4: Drop Missing Values

def drop_missing_values(df):
    """Return a new frame containing only the rows of ``df`` with no missing value.

    df: pandas.DataFrame
        Input frame; it is left unchanged.
    """
    complete_rows = df.dropna(axis=0, how='any')
    print("Dropped rows with missing values.")
    return complete_rows




# Task 5: Fill Missing Values

from sklearn.impute import SimpleImputer

def fill_missing_values(df, strategy='mean'):
    """Impute missing values in the numeric columns of ``df`` with ``SimpleImputer``.

    df: pandas.DataFrame
        Frame to impute. Its numeric columns are overwritten in place;
        non-numeric columns are not touched.
    strategy: str
        Forwarded unchanged as ``SimpleImputer(strategy=...)``.

    Returns the same ``df`` object that was passed in.
    """
    numeric_cols = df.select_dtypes(include='number').columns
    if numeric_cols.empty:
        # The imputer cannot be fitted on a frame with zero columns, so treat
        # "nothing numeric" as a no-op instead of letting it raise.
        print("No numeric columns found; nothing to impute.")
        return df

    # NOTE(review): a numeric column with no observed values at all may be
    # dropped by the imputer, which would break the assignment below - confirm
    # against the installed scikit-learn version.
    imputer = SimpleImputer(strategy=strategy)
    df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
    print(f"Filled missing values using strategy: '{strategy}'")
    return df



# Task 6: Handling Outliers with Capping

def cap_outliers(df, column, factor=1.5):
    """Cap values of ``df[column]`` that fall outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of the column. Values beyond a
    fence are replaced by that fence; missing values are left as they are.

    df: pandas.DataFrame
        Frame holding the column; the column is overwritten in place.
    column: label
        Name of the column to cap.
    factor: float
        Multiplier applied to the IQR when building the fences (default 1.5).

    Returns the same ``df`` object that was passed in.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Vectorised clip replaces the per-element Python lambda.
    df[column] = values.clip(lower=lower_bound, upper=upper_bound)
    print(f"Capped outliers in column: {column}")
    return df




## Choose the Right Scaling Method

In [None]:
# Task 7: Min-Max Scaling





from sklearn.preprocessing import MinMaxScaler

def min_max_scaling(df, columns):
    """Rescale the selected columns of ``df`` with a default ``MinMaxScaler``.

    df: pandas.DataFrame
        Frame holding the columns; they are overwritten in place.
    columns: str or iterable of labels
        Column(s) to scale. A single column name is accepted as well as a list.

    Returns the same ``df`` object that was passed in.
    """
    # A bare name would select a 1-D Series, which the scaler rejects, so
    # always index with a list to keep the selection two-dimensional.
    selected = [columns] if isinstance(columns, str) else list(columns)
    if not selected:
        # The scaler cannot be fitted on zero columns; treat this as a no-op.
        print("No columns given for Min-Max Scaling; nothing to do.")
        return df

    scaler = MinMaxScaler()
    df[selected] = scaler.fit_transform(df[selected])
    print(f"Applied Min-Max Scaling on: {columns}")
    return df


# Task 8: Robust Scaling

from sklearn.preprocessing import RobustScaler

def robust_scaling(df, columns):
    """Rescale the selected columns of ``df`` with a default ``RobustScaler``.

    df: pandas.DataFrame
        Frame holding the columns; they are overwritten in place.
    columns: str or iterable of labels
        Column(s) to scale. A single column name is accepted as well as a list.

    Returns the same ``df`` object that was passed in.
    """
    # A bare name would select a 1-D Series, which the scaler rejects, so
    # always index with a list to keep the selection two-dimensional.
    selected = [columns] if isinstance(columns, str) else list(columns)
    if not selected:
        # The scaler cannot be fitted on zero columns; treat this as a no-op.
        print("No columns given for Robust Scaling; nothing to do.")
        return df

    scaler = RobustScaler()
    df[selected] = scaler.fit_transform(df[selected])
    print(f"Applied Robust Scaling on: {columns}")
    return df





# Task 9: MaxAbs Scaling


from sklearn.preprocessing import MaxAbsScaler

def maxabs_scaling(df, columns):
    """Rescale the selected columns of ``df`` with a default ``MaxAbsScaler``.

    df: pandas.DataFrame
        Frame holding the columns; they are overwritten in place.
    columns: str or iterable of labels
        Column(s) to scale. A single column name is accepted as well as a list.

    Returns the same ``df`` object that was passed in.
    """
    # A bare name would select a 1-D Series, which the scaler rejects, so
    # always index with a list to keep the selection two-dimensional.
    selected = [columns] if isinstance(columns, str) else list(columns)
    if not selected:
        # The scaler cannot be fitted on zero columns; treat this as a no-op.
        print("No columns given for MaxAbs Scaling; nothing to do.")
        return df

    scaler = MaxAbsScaler()
    df[selected] = scaler.fit_transform(df[selected])
    print(f"Applied MaxAbs Scaling on: {columns}")
    return df




## Keep Track of Data Transformations for Reproducibility

In [None]:
# Task 10: Log Data Preprocessing Steps


import logging

# Route root-logger records at INFO level and above to data_preprocessing.log,
# each line carrying a timestamp, the level name and the message.
logging.basicConfig(
    filename='data_preprocessing.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

def log_preprocessing_step(step_description):
    """Record ``step_description`` at INFO level and echo a confirmation to stdout.

    step_description: str
        Text describing the preprocessing step that was performed.
    """
    logging.info(step_description)
    confirmation = f"Logged: {step_description}"
    print(confirmation)




# Task 11: Store Transformation Parameters
import json

def _json_default(value):
    """Turn objects exposing ``tolist()`` (e.g. NumPy arrays/scalars) into plain Python data."""
    to_list = getattr(value, "tolist", None)
    if callable(to_list):
        return to_list()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def save_transformation_params(params_dict, filename='transformation_params.json'):
    """
    Save transformation parameters (e.g., scaler means, stds, or other params) to a JSON file.

    params_dict: dict
        Dictionary containing parameters to store. Values that are not natively
        JSON-serializable but provide a ``tolist()`` method (such as NumPy arrays
        and NumPy scalars) are stored as plain lists/numbers.
    filename: str
        File name to save parameters.

    Raises TypeError if a value cannot be converted to JSON; in that case the
    target file is not opened, so an existing file is left untouched.
    """
    # Serialize before opening the file so a failure cannot leave a truncated file behind.
    payload = json.dumps(params_dict, indent=4, default=_json_default)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(payload)
    print(f"Transformation parameters saved to {filename}")




