In [1]:
import os 

In [2]:
# IPython magic: display the kernel's current working directory.
%pwd

'c:\\Users\\tchok\\OneDrive\\Bureau\\My_github\\clustering-insured-population\\research'

In [3]:
# The notebook is started from the research/ folder; the rest of the notebook
# uses paths relative to the project root, so step up one level.
# The guard keeps the cell idempotent: re-running it no longer climbs one
# directory higher on every execution.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")
os.getcwd()

'c:\\Users\\tchok\\OneDrive\\Bureau\\My_github\\clustering-insured-population'

In [4]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable bundle of the settings used by the data-transformation stage."""

    root_dir: Path  # directory created for this stage's artifacts
    transformed_data_path: Path  # destination of the pickled transformed data
    transforming_data_path: Path  # CSV file that gets read and transformed
    transformer_path: Path  # destination of the pickled fitted transformer
    metric_features: list  # columns routed to the numerical sub-pipeline
    non_metric_features: list  # columns routed to the categorical sub-pipeline
    non_metric_features : list


In [5]:
import os 
from insuredSegmenter.constants import * 
from insuredSegmenter.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    """
    Reads the project's YAML configuration and parameter files and turns
    their sections into the config objects used by the pipeline stages.
    """

    def __init__(
        self,
        config_path: Path = CONFIG_FILE_PATH,
        params_path: Path = PARAMS_FILE_PATH,
    ):
        """
        Load both YAML files and create the artifacts root directory.

        Args:
            config_path: location of the main configuration YAML file
            params_path: location of the parameters YAML file
        """
        self.config = read_yaml(str(config_path))
        self.params = read_yaml(str(params_path))

        # make sure the top-level artifacts directory is there
        create_directories([self.config.artifacts_root])

    def get_transformation_config(self) -> DataTransformationConfig:
        """
        Assemble the settings for the data-transformation stage.

        The input CSV path is taken from the ``data_ingestion`` section; all
        other values come from the ``data_transformation`` section. The
        stage's root directory is created as a side effect.

        Returns:
            A populated ``DataTransformationConfig``.
        """
        source_csv = self.config.data_ingestion.csv
        stage = self.config.data_transformation

        output_path = stage.transformed_data_path
        transformer_path = stage.transformer_path
        numeric_columns = stage.metric_features
        categorical_columns = stage.non_metric_features

        create_directories([stage.root_dir])

        return DataTransformationConfig(
            root_dir=Path(stage.root_dir),
            transforming_data_path=Path(source_csv),
            transformed_data_path=Path(output_path),
            transformer_path=Path(transformer_path),
            metric_features=numeric_columns,
            non_metric_features=categorical_columns,
        )

In [7]:
import pickle
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer, FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from scipy import stats

class OutlierRemover(BaseEstimator, TransformerMixin):
    """
    Mask outliers in numeric columns with NaN using the z-score rule.

    ``fit`` learns, for every numeric column, the mean and the population
    standard deviation (``ddof=0``, NaNs ignored). ``transform`` replaces each
    value whose absolute z-score against those learned statistics is greater
    than or equal to ``z_threshold`` with ``np.nan``. No rows are dropped, so
    the output keeps the shape of the input; the NaNs are typically filled by
    a later imputation step.

    Because the statistics are learned in ``fit``, a fitted (or pickled)
    instance treats new batches - including single rows - consistently,
    instead of z-scoring each batch against itself.

    Parameters
    ----------
    z_threshold : float, default=3.0
        Absolute z-score at or above which a value is masked.

    Attributes
    ----------
    statistics_ : dict
        Set by ``fit``. Maps a column label (DataFrame input) or a column
        position (array input) to its ``(mean, std)`` pair. Non-numeric,
        all-NaN and constant columns have no entry and pass through untouched.
    """

    def __init__(self, z_threshold=3.0):
        # Only hyper-parameters are stored here; learned state is created in
        # fit() so that cloning and fitted-state checks behave as expected.
        self.z_threshold = z_threshold

    @staticmethod
    def _is_numeric(dtype):
        """Return True for NumPy integer/float dtypes (bool, object and extension dtypes excluded)."""
        return isinstance(dtype, np.dtype) and dtype.kind in "iuf"

    def _column_statistics(self, X):
        """Return ``{column key: (mean, std)}`` for every usable numeric column of ``X``."""
        if isinstance(X, pd.DataFrame):
            columns = [
                (label, X[label].to_numpy(dtype=float))
                for label in X.columns
                if self._is_numeric(X[label].dtype)
            ]
        else:
            values = np.asarray(X, dtype=float)
            columns = [(idx, values[:, idx]) for idx in range(values.shape[1])]

        statistics = {}
        for key, column in columns:
            observed = column[~np.isnan(column)]
            if observed.size == 0:
                continue
            with np.errstate(invalid="ignore"):
                mean, std = observed.mean(), observed.std()
            # A zero or non-finite spread gives no meaningful z-score, so the
            # column is left alone (the batch-wise z-score masked nothing here either).
            if np.isfinite(std) and std > 0:
                statistics[key] = (mean, std)
        return statistics

    def _outlier_mask(self, values, mean, std):
        """Boolean mask of the entries whose |z-score| reaches the threshold (NaN entries are False)."""
        with np.errstate(invalid="ignore"):
            return np.abs((values - mean) / std) >= self.z_threshold

    def fit(self, X, y=None):
        """Learn the per-column mean and standard deviation used by ``transform``."""
        self.statistics_ = self._column_statistics(X)
        return self

    def transform(self, X):
        """
        Return a copy of ``X`` with outlying numeric values replaced by NaN.

        ``X`` should use the same column labels (DataFrame) or column order
        (array) as the data passed to ``fit``; columns without learned
        statistics are returned unchanged. Array input is returned with a
        floating dtype so that NaN can be stored.
        """
        statistics = getattr(self, "statistics_", None)
        if statistics is None:
            # Backward compatibility: instances that were never fitted (or were
            # pickled before statistics were learned) fall back to statistics
            # computed from the batch itself.
            statistics = self._column_statistics(X)

        if isinstance(X, pd.DataFrame):
            X_transformed = X.copy()
            for label in X_transformed.columns:
                column = X_transformed[label]
                if label not in statistics or not self._is_numeric(column.dtype):
                    continue
                mask = self._outlier_mask(column.to_numpy(dtype=float), *statistics[label])
                if not mask.any():
                    continue
                if column.dtype.kind != "f":
                    # Integer columns cannot hold NaN; upcast explicitly rather
                    # than relying on an implicit dtype change during assignment.
                    X_transformed[label] = column.astype(float)
                X_transformed.loc[mask, label] = np.nan
            return X_transformed

        X_transformed = np.array(X)
        if X_transformed.dtype.kind != "f":
            # Assigning NaN into an integer array raises, so work on floats.
            X_transformed = X_transformed.astype(float)
        for idx in range(X_transformed.shape[1]):
            if idx in statistics:
                mask = self._outlier_mask(X_transformed[:, idx], *statistics[idx])
                X_transformed[mask, idx] = np.nan
        return X_transformed

class SkewnessTransformer(BaseEstimator, TransformerMixin):
    """
    Log-transform numeric columns whose skewness exceeds a threshold.

    ``fit`` records, for every numeric column with
    ``abs(skewness) > skew_threshold``, that column's minimum (NaNs ignored).
    ``transform`` maps those columns through ``log1p(x - min + 0.01)`` and
    leaves every other column untouched.

    Caveat: at transform time a value that lies 1.01 or more below the fitted
    minimum makes the logarithm's argument <= -1 and therefore yields
    ``-inf``/``NaN``.

    Parameters
    ----------
    skew_threshold : float, default=1.0
        Absolute skewness above which a column is log-transformed.

    Attributes
    ----------
    skewed_features_ : dict
        Set by ``fit``. Maps a column label (DataFrame input) or a column
        position (array input) to the minimum observed during ``fit``.
    """

    def __init__(self, skew_threshold=1.0):
        # Only hyper-parameters are stored here; learned state is created in fit().
        self.skew_threshold = skew_threshold

    def fit(self, X, y=None):
        """Identify the skewed numeric columns of ``X`` and remember their minima."""
        # Start from an empty mapping on every call so that refitting on
        # different data cannot keep stale columns from an earlier fit.
        skewed = {}

        if isinstance(X, pd.DataFrame):
            for label in X.columns:
                series = X[label]
                is_numeric = isinstance(series.dtype, np.dtype) and series.dtype.kind in "iuf"
                if is_numeric and abs(series.skew()) > self.skew_threshold:
                    skewed[label] = series.min()
        else:
            values = np.asarray(X)
            if np.issubdtype(values.dtype, np.number):
                for idx in range(values.shape[1]):
                    series = pd.Series(values[:, idx])
                    if abs(series.skew()) > self.skew_threshold:
                        # Series.min() skips NaN; a plain np.min would return
                        # NaN and wipe out the whole column in transform().
                        skewed[idx] = series.min()

        self.skewed_features_ = skewed
        return self

    def transform(self, X):
        """
        Return a copy of ``X`` with the columns found in ``fit`` log-transformed.

        An instance that was never fitted has no skewed columns recorded and
        returns an unchanged copy.
        """
        skewed = getattr(self, "skewed_features_", {})

        if isinstance(X, pd.DataFrame):
            X_transformed = X.copy()
            for label, min_val in skewed.items():
                if label in X_transformed.columns:
                    # Shift so the fitted minimum maps to 0.01 before taking log1p.
                    X_transformed[label] = np.log1p(X_transformed[label] - min_val + 0.01)
            return X_transformed

        X_transformed = np.array(X)
        if skewed and X_transformed.dtype.kind in "iu":
            # Writing log values into an integer array would silently truncate them.
            X_transformed = X_transformed.astype(float)
        for idx, min_val in skewed.items():
            X_transformed[:, idx] = np.log1p(X_transformed[:, idx] - min_val + 0.01)
        return X_transformed

class DataTransformation:
    """
    Builds, fits and persists the preprocessing pipeline described by a
    ``DataTransformationConfig``.
    """

    def __init__(self, config: DataTransformationConfig):
        self.config = config

    def _save_object(self, obj, file_path: Path):
        """
        Pickle ``obj`` to ``file_path``.

        Missing parent directories are created first so that saving does not
        fail just because an artifact sub-folder has not been created yet.
        Errors from the filesystem or from pickling propagate unchanged.
        """
        file_path = Path(file_path)
        file_path.parent.mkdir(parents=True, exist_ok=True)
        with open(file_path, 'wb') as file:
            pickle.dump(obj, file)

    def get_data_transformer_object(self, remove_outliers: bool = True) -> Pipeline:
        """
        Build the (unfitted) preprocessing pipeline.

        Steps, in order:
        1. mask outliers in numerical columns with NaN (optional)
        2. log-transform highly skewed numerical columns
        3. numerical columns (``config.metric_features``): median imputation,
           ``PowerTransformer`` (without standardisation), ``StandardScaler``
        4. categorical columns (``config.non_metric_features``): most-frequent
           imputation, one-hot encoding, ``StandardScaler(with_mean=False)``

        Everything is wrapped in a single ``Pipeline`` so it can be saved and
        reused as one object.

        Args:
            remove_outliers: whether to include the outlier-masking step

        Returns:
            The assembled, unfitted ``Pipeline``.
        """
        preprocessing_steps = []

        # Step 1: optional outlier masking
        if remove_outliers:
            preprocessing_steps.append(("outlier_remover", OutlierRemover(z_threshold=3.0)))

        # Step 2: skewness handling
        preprocessing_steps.append(("skewness_transformer", SkewnessTransformer(skew_threshold=1.0)))

        # Step 3: per-type sub-pipelines combined by a ColumnTransformer
        num_pipeline = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),  # fill missing values with the median
            ("power_transform", PowerTransformer(standardize=False)),  # further reduce skew
            ("scaler", StandardScaler())  # zero mean, unit variance
        ])

        cat_pipeline = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),  # fill missing values with the mode
            ("onehotencoder", OneHotEncoder(handle_unknown='ignore')),  # unseen categories encode as all zeros
            ("scaler", StandardScaler(with_mean=False))  # with_mean=False keeps sparse input sparse
        ])

        column_transformer = ColumnTransformer(
            transformers=[
                ("num_pipeline", num_pipeline, self.config.metric_features),
                ("cat_pipeline", cat_pipeline, self.config.non_metric_features)
            ]
        )
        preprocessing_steps.append(("column_transformer", column_transformer))

        return Pipeline(preprocessing_steps)

    def initiate_data_transformation(
        self,
        transforming_data_path: Path,
        remove_outliers: bool = True,
        save_transformer: bool = True
        ):
        """
        Read a CSV file, fit the preprocessing pipeline on it and persist the results.

        The transformed data is pickled to ``config.transformed_data_path``;
        the fitted pipeline is pickled to ``config.transformer_path`` when
        ``save_transformer`` is true.

        Args:
            transforming_data_path: path to the CSV file to transform
            remove_outliers: whether to include the outlier-masking step (default: True)
            save_transformer: whether to save the fitted pipeline for later use (default: True)

        Returns:
            The output of the fitted pipeline: a NumPy array or a SciPy sparse
            matrix, as produced by the final ``ColumnTransformer`` - not a DataFrame.

        Raises:
            FileNotFoundError: if ``transforming_data_path`` does not exist.
        """
        # read the data from the specified path
        df = pd.read_csv(transforming_data_path)

        # build the pipeline (outlier masking is part of it when requested)
        data_transformer = self.get_data_transformer_object(remove_outliers=remove_outliers)

        # fit on the data and transform it in one pass
        transformed_data = data_transformer.fit_transform(df)

        # persist the transformed data
        self._save_object(transformed_data, self.config.transformed_data_path)

        # persist the fitted pipeline if requested
        if save_transformer:
            self._save_object(data_transformer, self.config.transformer_path)

        return transformed_data

In [8]:
# Run the data-transformation stage end to end.
# The CSV at `transformation_config.transforming_data_path` must already exist
# (presumably written by the data-ingestion stage - confirm before running).
config = ConfigurationManager()
transformation_config = config.get_transformation_config()
transformation_instance = DataTransformation(config=transformation_config)

# initiate_data_transformation builds the pipeline itself, so no separate
# get_data_transformer_object() call is needed here.
transformed_data = transformation_instance.initiate_data_transformation(
    transformation_config.transforming_data_path
)
transformed_data.shape

[2025-05-14 15:54:42,329: INFO: common: YAML file loaded successfully: C:\Users\tchok\OneDrive\Bureau\My_github\clustering-insured-population\config\config.yaml]
[2025-05-14 15:54:42,356: INFO: common: YAML file loaded successfully: C:\Users\tchok\OneDrive\Bureau\My_github\clustering-insured-population\params.yaml]
[2025-05-14 15:54:42,361: INFO: common: created directory at: artifacts]
[2025-05-14 15:54:42,364: INFO: common: created directory at: artifacts/data_transformation]


FileNotFoundError: [Errno 2] No such file or directory: 'artifacts\\data_ingestion\\data.csv'