In [3]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from scipy.stats import skew

class MLModelEvaluator:
    def __init__(self, data_path, model_path):
        self.data = pd.read_csv(data_path)
        with open(model_path, 'rb') as file:
            self.model = pickle.load(file)
        self.scaler = StandardScaler()
    
    def check_missing_values(self):
        missing = self.data.isnull().sum()
        print("Missing Values:")
        print(missing[missing > 0])
    
    def check_skewness(self):
        numeric_cols = self.data.select_dtypes(include=['number'])
        skewed = numeric_cols.apply(skew, axis=0)
        print("Feature Skewness:")
        print(skewed)
    
    def check_outliers(self):
        numeric_cols = self.data.select_dtypes(include=['number'])
        outlier_counts = ((numeric_cols - numeric_cols.mean()).abs() > (3 * numeric_cols.std())).sum()
        print("Outliers per feature:")
        print(outlier_counts)
    
    def check_scaling(self):
        numeric_cols = self.data.select_dtypes(include=['number'])
        scaled_data = self.scaler.fit_transform(numeric_cols)
        print("Checking if data is scaled:")
        print(f"Mean: {np.mean(scaled_data, axis=0)}")
        print(f"Std: {np.std(scaled_data, axis=0)}")
    
    def evaluate_clustering(self):
        labels = self.model.labels_
        score = silhouette_score(self.data.select_dtypes(include=['number']), labels)
        print(f"Silhouette Score: {score:.4f}")
    
    def run_all_checks(self):
        print("Running ML Model Evaluations...")
        self.check_missing_values()
        self.check_skewness()
        self.check_outliers()
        self.check_scaling()
        self.evaluate_clustering()
        print("Evaluation Complete.")

# Example usage
evaluator = MLModelEvaluator('clusters.csv', 'KMeans_model.pickle')
evaluator.run_all_checks()


Running ML Model Evaluations...
Missing Values:
Series([], dtype: int64)
Feature Skewness:
Unnamed: 0                0.000000
CustomerID                0.000000
Gender                   -0.241747
Age                       0.481919
Annual Income (k$)        0.319424
Spending Score (1-100)   -0.046865
Cluster                  -0.233811
dtype: float64
Outliers per feature:
Unnamed: 0                0
CustomerID                0
Gender                    0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
Cluster                   0
dtype: int64
Checking if data is scaled:
Mean: [ 0.00000000e+00  0.00000000e+00 -7.54951657e-17 -1.02140518e-16
 -2.13162821e-16 -1.46549439e-16 -8.88178420e-17]
Std: [1. 1. 1. 1. 1. 1. 1.]
Silhouette Score: 0.1132
Evaluation Complete.
