In [1]:
import os

In [2]:
%pwd

'c:\\Cancer-Prediction-\\research'

In [3]:
# Move from the notebook folder up to the project root so that the relative
# paths used later (config/, artifacts/, ...) resolve correctly.
# Guarded on the folder name so that re-running this cell does not climb one
# directory further on every execution.
# NOTE(review): assumes the notebook lives in a folder named "research", as
# shown by the %pwd output above -- adjust if the folder is renamed.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'c:\\Cancer-Prediction-'

In [5]:
from dataclasses import dataclass
from pathlib import Path
from typing import List

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings consumed by the model-training stage.

    Instances are built by ``ConfigurationManager.get_model_trainer_config``
    from the ``model_trainer`` section of the configuration file. The class
    is frozen, so attributes cannot be reassigned after construction.
    """

    # Directory for this stage's outputs; the trained model is saved inside it.
    root_dir: Path
    # Spreadsheet with the training rows (loaded with pandas.read_excel).
    train_data_path: Path
    # Spreadsheet with the test rows (loaded with pandas.read_excel).
    test_data_path: Path
    # File name of the serialized model, joined onto root_dir when saving.
    model_name: str
    # Names of the columns used as model inputs.
    important_features: List[str]
    # Name of the column holding the class label.
    target_column: str

In [6]:
from CancerPrediction.constants import *
from CancerPrediction.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Load the project's YAML settings and hand out per-stage config objects.

    On construction the three YAML files (config, params, schema) are read
    and the artifacts root directory named in the config file is created.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Read the YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = Path(self.config['artifacts_root'])
        create_directories([artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the ``ModelTrainerConfig`` from the ``model_trainer`` section.

        The stage's ``root_dir`` is created as a side effect before the
        remaining keys are read.

        Returns:
            A ``ModelTrainerConfig`` populated from the configuration file.
        """
        trainer_section = self.config['model_trainer']

        stage_root = Path(trainer_section['root_dir'])
        create_directories([stage_root])

        train_path = Path(trainer_section['train_data_path'])
        test_path = Path(trainer_section['test_data_path'])

        return ModelTrainerConfig(
            root_dir=stage_root,
            train_data_path=train_path,
            test_data_path=test_path,
            model_name=trainer_section['model_name'],
            important_features=trainer_section['important_features'],
            target_column=trainer_section['target_column'],
        )

In [12]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib
from imblearn.over_sampling import SMOTE
from ctgan import CTGAN
import lightgbm as lgb
import xgboost as xgb
from CancerPrediction import logger

class ModelTrainer:
    """Train, evaluate and persist a soft-voting ensemble classifier.

    Pipeline run by :meth:`train`:

    1. Load the train and test spreadsheets named in the configuration.
    2. Augment the under-represented classes with CTGAN-generated rows.
    3. Balance the augmented training data with SMOTE.
    4. Cross-validate and fit a voting ensemble (random forest, gradient
       boosting, LightGBM, XGBoost).
    5. Save the fitted ensemble with joblib and report metrics on the real,
       untouched test split.
    """

    def __init__(self, config: ModelTrainerConfig):
        """Store the configuration that drives :meth:`train`."""
        self.config = config

    def _split_features_target(self, frame):
        """Return ``(X, y)`` selected from ``frame`` using the configured columns."""
        features = frame[self.config.important_features]
        target = frame[self.config.target_column]
        return features, target

    def _synthesize_minority_rows(self, X_train, y_train):
        """Generate CTGAN rows for classes whose count is below the median count.

        The generator is fitted on features *and* label together (the label is
        declared as a discrete column), so every sampled row carries a label
        that is consistent with its features instead of a randomly drawn one.

        Returns:
            ``(X_synthetic, y_synthetic)``, or ``None`` when no class falls
            below the median count (e.g. perfectly balanced data).
        """
        target = self.config.target_column

        class_counts = y_train.value_counts()
        minority_classes = class_counts[class_counts < class_counts.median()].index
        minority_mask = y_train.isin(minority_classes)
        if not minority_mask.any():
            return None

        # Features and label in one frame; to_numpy() avoids index alignment.
        minority_rows = X_train.loc[minority_mask].copy()
        minority_rows[target] = y_train.loc[minority_mask].to_numpy()

        generator = CTGAN(epochs=300)
        generator.fit(minority_rows, discrete_columns=[target])
        synthetic_rows = generator.sample(len(minority_rows))

        return synthetic_rows[self.config.important_features], synthetic_rows[target]

    @staticmethod
    def _oversample(X, y):
        """Balance the classes with SMOTE, shrinking ``k_neighbors`` if needed.

        SMOTE needs more samples in every class than ``k_neighbors``; the
        default of 5 is therefore clamped to the smallest class size minus
        one. When even one neighbour is not available the data is returned
        unchanged.
        """
        smallest_class = int(y.value_counts().min())
        k_neighbors = min(5, smallest_class - 1)
        if k_neighbors < 1:
            logger.warning(
                "Skipping SMOTE: the smallest class has fewer than 2 samples"
            )
            return X, y

        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        return smote.fit_resample(X, y)

    @staticmethod
    def _build_ensemble():
        """Create the (unfitted) soft-voting ensemble of four tree-based models."""
        rf_clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
        gb_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
        lgbm_clf = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1, num_leaves=31, random_state=42)
        xgb_clf = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

        return VotingClassifier(
            estimators=[
                ('rf', rf_clf),
                ('gb', gb_clf),
                ('lgbm', lgbm_clf),
                ('xgb', xgb_clf)
            ],
            voting='soft'  # 'soft' uses predicted probabilities
        )

    def train(self):
        """Run the full training pipeline and save the fitted model.

        Side effects: prints cross-validation scores, the classification
        report and the confusion matrix; writes the model to
        ``root_dir / model_name``; logs completion.
        """
        # Load data
        train_data = pd.read_excel(self.config.train_data_path)
        test_data = pd.read_excel(self.config.test_data_path)

        # Select the configured feature and label columns
        X_train, y_train = self._split_features_target(train_data)
        X_test, y_test = self._split_features_target(test_data)

        # Augment minority classes with correctly labelled CTGAN samples
        synthetic = self._synthesize_minority_rows(X_train, y_train)
        if synthetic is None:
            X_combined, y_combined = X_train, y_train
        else:
            X_synthetic, y_synthetic = synthetic
            X_combined = pd.concat([X_train, X_synthetic], axis=0, ignore_index=True)
            y_combined = pd.concat([y_train, y_synthetic], axis=0, ignore_index=True)

        # Balance the (real + synthetic) training data
        X_resampled, y_resampled = self._oversample(X_combined, y_combined)

        voting_clf = self._build_ensemble()

        # Stratified cross-validation on the training data.
        # NOTE: the folds contain oversampled points, so these scores are
        # optimistic; the held-out report below is the honest estimate.
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        cv_scores = cross_val_score(voting_clf, X_resampled, y_resampled, cv=skf, scoring='accuracy')
        print(f'Cross-Validation Accuracy Scores: {cv_scores}')
        print(f'Mean Cross-Validation Accuracy: {cv_scores.mean()}')

        # Fit on all of the resampled training data
        voting_clf.fit(X_resampled, y_resampled)

        # Save the trained model
        model_path = os.path.join(self.config.root_dir, self.config.model_name)
        joblib.dump(voting_clf, model_path)

        # Evaluate on the real test split, which never went through
        # CTGAN/SMOTE and therefore cannot leak training information.
        y_pred = voting_clf.predict(X_test)
        print(classification_report(y_test, y_pred))
        print(confusion_matrix(y_test, y_pred))
        logger.info(f"Model training completed and saved to {model_path}")

In [13]:
# Driver cell: runs the model-training stage end to end and logs its start,
# completion, or failure.
STAGE_NAME = "Model Training"

try:
    logger.info(f">>>>> stage {STAGE_NAME} started <<<<<")
    
    # Initialize the ConfigurationManager (reads the YAML files)
    config_manager = ConfigurationManager()
    
    # Get the model-training configuration
    model_trainer_config = config_manager.get_model_trainer_config()
    
    # Create the ModelTrainer instance
    model_trainer = ModelTrainer(config=model_trainer_config)
    
    # Run model training
    model_trainer.train()
    logger.info(f">>>>> stage {STAGE_NAME} completed <<<<<\n\nx==========x")
    
except Exception as e:
    # Log the full traceback, then re-raise so the cell still fails visibly.
    logger.exception(e)
    raise e

[2024-06-27 08:45:01,838: INFO: 3124570509: >>>>> stage Model Training started <<<<<]
[2024-06-27 08:45:01,842: INFO: common: YAML file: config\config.yaml loaded successfully]
[2024-06-27 08:45:01,844: INFO: common: YAML file: params.yaml loaded successfully]
[2024-06-27 08:45:01,849: INFO: common: YAML file: schema.yaml loaded successfully]
[2024-06-27 08:45:01,851: INFO: common: Created directory at: artifacts]
[2024-06-27 08:45:01,851: INFO: common: Created directory at: artifacts\model_trainer]
[2024-06-27 08:45:02,506: INFO: null: Guidance: There are no missing values in column sFas (pg/ml). Extra column not created.]
[2024-06-27 08:45:03,035: INFO: null: Guidance: There are no missing values in column sHER2/sEGFR2/sErbB2 (pg/ml). Extra column not created.]
[2024-06-27 08:45:03,288: INFO: null: Guidance: There are no missing values in column CA 15-3 (U/ml). Extra column not created.]
[2024-06-27 08:45:03,333: INFO: null: Guidance: There are no missing values in column CA19-9 (U/m