In [1]:
import os
import logging 

In [2]:
%pwd

'c:\\Cancer-Prediction-\\research'

In [3]:
# Move the process working directory one level up, so that relative paths
# used below resolve against the parent folder.
# NOTE: the target is relative to the current directory, so running this
# statement again moves up one more level each time.
os.chdir("../")

In [4]:
%pwd

'c:\\Cancer-Prediction-'

In [5]:
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``
    from the ``data_transformation`` section of the loaded configuration.
    """
    # Directory of this stage; created when the config object is built.
    root_dir: Path
    # Input file that the driver code reads with ``pd.read_excel``.
    validated_data_file: Path
    # Output file for the transformed training split (written with ``to_excel``).
    transformed_train_data_path: Path
    # Output file for the transformed test split (written with ``to_excel``).
    transformed_test_data_path: Path
    # Name of the label column; it is label-encoded and kept out of the preprocessor.
    target_column: str
    # Columns routed to the ``OrdinalEncoder`` step of the preprocessor.
    ordinal_features: List[str]
    # Columns that are label-encoded and then routed to the ``OneHotEncoder`` step.
    nominal_features: List[str]
    

from CancerPrediction.utils.common import read_yaml, create_directories
from CancerPrediction.constants import *

# Class that manages the project configuration
class ConfigurationManager:
    """Load the project's YAML files and build per-stage configuration objects.

    On construction the config, params and schema files are read (in that
    order) and the directory named by ``artifacts_root`` in the config file
    is created.
    """

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH, schema_filepath=SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration file.
            params_filepath: Path of the parameters file.
            schema_filepath: Path of the schema file.
        """
        # The generator is consumed left to right, so the files are read in
        # the order config -> params -> schema.
        self.config, self.params, self.schema = (
            read_yaml(yaml_path)
            for yaml_path in (config_filepath, params_filepath, schema_filepath)
        )

        artifacts_root = self.config['artifacts_root']
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the settings object for the data-transformation stage.

        The stage's ``root_dir`` is created as a side effect.

        Returns:
            A ``DataTransformationConfig`` filled from the
            ``data_transformation`` section of the config file.
        """
        section = self.config['data_transformation']
        # The schema's COLUMNS entry is looked up but not used further here;
        # the lookup is kept so a schema without that key fails at this point.
        _columns = self.schema['COLUMNS']

        stage_root = Path(section['root_dir'])
        create_directories([stage_root])

        # Entries of the section that are wrapped in ``Path``.
        path_fields = (
            'validated_data_file',
            'transformed_train_data_path',
            'transformed_test_data_path',
        )
        settings = {field: Path(section[field]) for field in path_fields}
        settings.update(
            root_dir=stage_root,
            target_column=section['target_column'],
            ordinal_features=section['ordinal_features'],
            nominal_features=section['nominal_features'],
        )
        return DataTransformationConfig(**settings)

        
# src/CancerPrediction/pipeline/stages_03_data_transformation.py
from CancerPrediction import logger
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
import pandas as pd
import numpy as np


# src/CancerPrediction/components/data_transformation.py
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from CancerPrediction import logger


# NOTE: a class with the same name and the same fields is declared earlier in
# this file; this later declaration rebinds the name.
@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by ``DataTransformation``."""
    # Directory of the data-transformation stage.
    root_dir: Path
    # Input file that the driver code reads with ``pd.read_excel``.
    validated_data_file: Path
    # Output file for the transformed training split (written with ``to_excel``).
    transformed_train_data_path: Path
    # Output file for the transformed test split (written with ``to_excel``).
    transformed_test_data_path: Path
    # Name of the label column; it is label-encoded and kept out of the preprocessor.
    target_column: str
    # Columns routed to the ``OrdinalEncoder`` step of the preprocessor.
    ordinal_features: List[str]
    # Columns that are label-encoded and then routed to the ``OneHotEncoder`` step.
    nominal_features: List[str]

# DataTransformation implemented in a notebook cell for trial runs
class DataTransformation:
    """Encode, split, preprocess and save the validated dataset.

    The workflow run by ``transform`` is: label-encode the nominal features
    and the target, split into train/test, fit a ``ColumnTransformer`` on the
    training split, apply it to both splits and write each result to Excel.
    """

    def __init__(self, df: pd.DataFrame, config: DataTransformationConfig):
        """Store the dataset and the stage configuration.

        Args:
            df: Dataset to transform. ``encode_labels`` overwrites columns of
                this same object, so the caller's frame is modified.
            config: Paths, target column name and feature lists for the stage.
        """
        self.df = df
        self.config = config
        # Fitted LabelEncoder for every encoded column, keyed by column name.
        self.label_encoders = {}

    def encode_labels(self):
        """Replace the nominal features and the target column by integer codes.

        Each column gets its own ``LabelEncoder``, fitted on the full dataset
        and kept in ``self.label_encoders`` so the codes can be mapped back.
        """
        # Nominal features first, target column last (same order as before).
        columns = list(self.config.nominal_features or []) + [self.config.target_column]
        for column in columns:
            encoder = LabelEncoder()
            self.df[column] = encoder.fit_transform(self.df[column])
            self.label_encoders[column] = encoder

    def get_preprocessor(self):
        """Build the (unfitted) ``ColumnTransformer`` for the feature columns.

        Numeric columns are every float/int column of ``self.df`` that is not
        listed as ordinal, nominal or target. Columns that belong to none of
        the three groups are passed through unchanged.

        Returns:
            A ``ColumnTransformer`` with the transformers ``num``, ``ord`` and
            ``nom`` (in that order) and ``remainder='passthrough'``.
        """
        ordinal_features = list(self.config.ordinal_features or [])
        nominal_features = list(self.config.nominal_features or [])

        # Drop ordinal, nominal and target columns from the numeric candidates.
        excluded = set(ordinal_features) | set(nominal_features) | {self.config.target_column}
        numeric_features = [
            column
            for column in self.df.select_dtypes(include=[float, int]).columns
            if column not in excluded
        ]

        # Numeric features: median imputation followed by standardization.
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        # Ordinal categorical features: ordinal encoding.
        ordinal_transformer = Pipeline(steps=[
            ('ordinal', OrdinalEncoder(dtype=int))
        ])

        # Nominal categorical features: one-hot encoding, with a single
        # column for binary features.
        nominal_transformer = Pipeline(steps=[
            ('onehot', OneHotEncoder(drop='if_binary', dtype=int))
        ])

        return ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('ord', ordinal_transformer, ordinal_features),
                ('nom', nominal_transformer, nominal_features)
            ],
            remainder='passthrough'
        )

    @staticmethod
    def _to_dense(matrix):
        """Return ``matrix`` as a dense array (the transformer may emit sparse output)."""
        return matrix.toarray() if hasattr(matrix, "toarray") else matrix

    @staticmethod
    def _output_feature_names(preprocessor, input_columns):
        """List the output column names of a fitted preprocessor, in output order.

        Output order is numeric, ordinal, one-hot columns, then every input
        column that was passed through, in its original order.
        """
        selected = {
            name: list(columns)
            for name, _, columns in preprocessor.transformers_
            if name != 'remainder'
        }
        numeric, ordinal, nominal = selected['num'], selected['ord'], selected['nom']

        # With no nominal features the one-hot encoder is never fitted, so it
        # must not be queried for names.
        nominal_names = []
        if nominal:
            onehot = preprocessor.named_transformers_['nom']['onehot']
            nominal_names = list(onehot.get_feature_names_out(nominal))

        # Passed-through columns are appended after the transformed ones.
        used = set(numeric) | set(ordinal) | set(nominal)
        passthrough = [column for column in input_columns if column not in used]

        return numeric + ordinal + nominal_names + passthrough

    def transform(self, test_size=0.3, random_state=42):
        """Run the whole transformation and write both splits to Excel.

        Args:
            test_size: Fraction of rows assigned to the test split.
            random_state: Seed passed to ``train_test_split``.
        """
        train_path = self.config.transformed_train_data_path
        test_path = self.config.transformed_test_data_path
        target = self.config.target_column

        # Make sure the destination folder of each output file exists.
        for output_path in (train_path, test_path):
            output_path.parent.mkdir(parents=True, exist_ok=True)

        self.encode_labels()

        train_df, test_df = train_test_split(
            self.df, test_size=test_size, random_state=random_state
        )

        # Take the target out so it is neither scaled nor encoded again.
        y_train = train_df.pop(target)
        y_test = test_df.pop(target)

        # Fit on the training split only, then apply to both splits.
        preprocessor = self.get_preprocessor()
        train_matrix = self._to_dense(preprocessor.fit_transform(train_df))
        test_matrix = self._to_dense(preprocessor.transform(test_df))

        feature_names = self._output_feature_names(preprocessor, train_df.columns)

        train_df_transformed = pd.DataFrame(train_matrix, columns=feature_names)
        test_df_transformed = pd.DataFrame(test_matrix, columns=feature_names)

        # Put the target back; ``.values`` avoids index alignment with the
        # freshly created frames.
        train_df_transformed[target] = y_train.values
        test_df_transformed[target] = y_test.values

        train_df_transformed.to_excel(train_path, index=False)
        test_df_transformed.to_excel(test_path, index=False)

        print("Data transformation completed successfully")
        print(f"Training data saved to {self.config.transformed_train_data_path}")
        print(f"Test data saved to {self.config.transformed_test_data_path}")


# Driver: load the configuration, read the validated data and run the
# transformation stage end to end.
try:
    # Initialize the ConfigurationManager
    config = ConfigurationManager()
    
    # Get the data-transformation configuration
    data_transformation_config = config.get_data_transformation_config()
    
    # Load the data from the validated file
    df = pd.read_excel(data_transformation_config.validated_data_file)
    print("DataFrame loaded successfully:")
    print(df.head())
    
    # Create the DataTransformation instance
    data_transformation = DataTransformation(df, data_transformation_config)
    
    # Run the complete transformation
    data_transformation.transform()
    print("Transformation stage completed successfully")
    
except Exception as e:
    # Top-level boundary of the cell: report the failure together with its
    # traceback instead of only the exception message, and do not re-raise
    # so the surrounding session keeps running as before.
    logger.exception("An error occurred: %s", e)

[2024-06-27 20:42:10,261: INFO: common: YAML file: config\config.yaml loaded successfully]
[2024-06-27 20:42:10,265: INFO: common: YAML file: params.yaml loaded successfully]
[2024-06-27 20:42:10,270: INFO: common: YAML file: schema.yaml loaded successfully]
[2024-06-27 20:42:10,272: INFO: common: Created directory at: artifacts]
[2024-06-27 20:42:10,273: INFO: common: Created directory at: artifacts\data_transformation]
DataFrame loaded successfully:
   Tumor type AJCC Stage  AFP (pg/ml)  Angiopoietin-2 (pg/ml)  AXL (pg/ml)  \
0  Colorectum          I     1583.450                 5598.50      3621.04   
1  Colorectum          I      715.308                20936.35      2772.96   
2  Colorectum         II     4365.530                 2350.93      4120.77   
3  Colorectum         II      715.308                 1604.34      2029.96   
4  Colorectum         II      801.300                 2087.57      2069.17   

   CA-125 (U/ml)  CA 15-3 (U/ml)  CA19-9 (U/ml)  CD44 (ng/ml)  CEA (pg/ml) 