In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer, QuantileTransformer

In [7]:
def analyze_and_recommend_scaler(data):
    """Profile each numeric column of ``data`` and suggest a scikit-learn scaler.

    For every numeric column the report contains the Shapiro-Wilk p-value,
    the sample skewness, the sample (excess) kurtosis, the number of values
    outside the 1.5 * IQR fences, and a recommended scaler name. The
    recommendation follows these rules, first match wins:

    * no non-missing values                  -> "No scaling needed (no non-missing values)"
    * a single distinct value                -> "No scaling needed (constant value)"
    * p > 0.05, |skew| < 0.5 and no outliers -> "StandardScaler"
    * at least one outlier                   -> "RobustScaler"
    * skew >= 0.5                            -> "PowerTransformer"
    * skew <= -0.5                           -> "QuantileTransformer"
    * anything else                          -> "MinMaxScaler"

    Parameters
    ----------
    data : pandas.DataFrame
        Input table. Non-numeric columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per numeric column with the columns "Column",
        "Normality (p-value)", "Skewness", "Kurtosis", "Outliers" and
        "Recommended Scaler". The p-value cell holds the string "N/A"
        whenever the normality test could not be run (constant column,
        empty column, or fewer than three non-missing observations).
    """
    report_columns = [
        "Column",
        "Normality (p-value)",
        "Skewness",
        "Kurtosis",
        "Outliers",
        "Recommended Scaler",
    ]

    # Shapiro-Wilk is undefined for fewer than three observations; calling it
    # on a shorter sample raises instead of returning NaN.
    min_shapiro_obs = 3

    numeric_data = data.select_dtypes(include=[np.number])

    records = []
    for column in numeric_data.columns:
        # Drop missing values once; every statistic below ignores them anyway.
        values = numeric_data[column].dropna()
        distinct = values.nunique()

        # Stays NaN (reported as "N/A") unless the normality test actually runs.
        p_value = np.nan

        if distinct == 0:
            # Entirely missing column: there is nothing to measure or to scale.
            skewness = np.nan
            kurtosis = np.nan
            outliers = 0
            scaler = "No scaling needed (no non-missing values)"
        elif distinct == 1:
            # Zero variance: the shape statistics are reported as 0.
            skewness = 0
            kurtosis = 0
            outliers = 0
            scaler = "No scaling needed (constant value)"
        else:
            # NOTE: SciPy warns that the Shapiro-Wilk p-value may be inaccurate
            # for very large samples (N > 5000); the value is still reported.
            if len(values) >= min_shapiro_obs:
                p_value = stats.shapiro(values).pvalue

            # pandas returns NaN here when there are too few observations;
            # NaN fails every threshold below and falls through to MinMaxScaler.
            skewness = values.skew()
            kurtosis = values.kurtosis()

            # Tukey fences: flag values beyond 1.5 * IQR from the quartiles.
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            iqr = q3 - q1
            beyond_fences = (values < (q1 - 1.5 * iqr)) | (values > (q3 + 1.5 * iqr))
            outliers = int(beyond_fences.sum())

            if p_value > 0.05 and abs(skewness) < 0.5 and outliers == 0:
                scaler = "StandardScaler"
            elif outliers > 0:
                scaler = "RobustScaler"
            elif abs(skewness) >= 0.5:
                # Right-skewed -> PowerTransformer, left-skewed -> QuantileTransformer.
                scaler = "PowerTransformer" if skewness >= 0.5 else "QuantileTransformer"
            else:
                scaler = "MinMaxScaler"

        records.append({
            "Column": column,
            "Normality (p-value)": round(p_value, 4) if not np.isnan(p_value) else "N/A",
            "Skewness": round(skewness, 4),
            "Kurtosis": round(kurtosis, 4),
            "Outliers": outliers,
            "Recommended Scaler": scaler,
        })

    # Passing the column list keeps the header even when no numeric column exists.
    return pd.DataFrame(records, columns=report_columns)

def main():
    """Prompt for a CSV path, analyze its numeric columns, and print the report."""
    # Ask where the dataset lives and read it straight into a DataFrame
    dataset = pd.read_csv(input("Please enter the path to your dataset (CSV file): "))

    # Build the per-column scaler recommendation table
    report = analyze_and_recommend_scaler(dataset)

    # Emit the heading followed by the table, each terminated by a newline
    print("\nAnalysis of Numeric Features:", report, sep="\n")

# Start the interactive workflow only when this code is the entry point,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Analysis of Numeric Features:
                      Column Normality (p-value)  Skewness  Kurtosis  \
0                        Age                 0.0    0.4133   -0.4041   
1                  DailyRate                 0.0   -0.0035   -1.2038   
2           DistanceFromHome                 0.0    0.9581   -0.2248   
3                  Education                 0.0   -0.2897   -0.5591   
4              EmployeeCount                 N/A    0.0000    0.0000   
5             EmployeeNumber                 0.0    0.0166   -1.2232   
6    EnvironmentSatisfaction                 0.0   -0.3217   -1.2025   
7                 HourlyRate                 0.0   -0.0323   -1.1964   
8             JobInvolvement                 0.0   -0.4984    0.2710   
9                   JobLevel                 0.0    1.0254    0.3992   
10           JobSatisfaction                 0.0   -0.3297   -1.2222   
11             MonthlyIncome                 0.0    1.3698    1.0052   
12               MonthlyRate     