In [2]:
import os

In [1]:
%pwd


'c:\\Cancer-Prediction-\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Cancer-Prediction-'

In [5]:
os.environ["MLFLOW_TRACKING_URI"]="https://dagshub.com/kevoa/Cancer-Prediction-.mlflow"
os.environ["MLFLOW_TRACKING_USERNAME"]="kevoa"
os.environ["MLFLOW_TRACKING_PASSWORD"]="05ede0b26068798db0131aa3a65a1251ceaa1a4b"

In [46]:
from dataclasses import dataclass
from pathlib import Path
from typing import List

@dataclass(frozen=True)
class ModelEvaluationConfig:
    root_dir: Path
    test_data_path: Path
    model_path: Path
    metric_file_name: Path
    target_column: str
    mlflow_uri: str
    mlflow_username: str
    mlflow_password: str
    important_features: List[str]


In [47]:
from CancerPrediction.constants import *
from CancerPrediction.utils.common import read_yaml, create_directories
class ConfigurationManager:
    def __init__(
        self, 
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH):
        
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)
        
        create_directories([Path(self.config['artifacts_root'])])

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        config = self.config['model_evaluation']
        
        create_directories([Path(config['root_dir'])])
        
        return ModelEvaluationConfig(
            root_dir=Path(config['root_dir']),
            test_data_path=Path(config['test_data_path']),
            model_path=Path(config['model_path']),
            metric_file_name=Path(config['metric_file_name']),
            mlflow_uri=config['mlflow_uri'],
            mlflow_username=config['mlflow_username'],
            mlflow_password=config['mlflow_password'],
            target_column=config['target_column'],
            important_features=config['important_features']  # Añadir esta línea
        )

In [52]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score
from urllib.parse import urlparse
import mlflow
import mlflow.sklearn
import joblib
from CancerPrediction import logger
from CancerPrediction.utils.common import save_json

class ModelEvaluation:
    def __init__(self, config: ModelEvaluationConfig):
        self.config = config

    def eval_metrics(self, y_true, y_pred):
        precision = precision_score(y_true, y_pred, average='weighted')
        recall = recall_score(y_true, y_pred, average='weighted')
        f1 = f1_score(y_true, y_pred, average='weighted')
        return precision, recall, f1

    def log_into_mlflow(self):
        test_data = pd.read_excel(self.config.test_data_path)
        model = joblib.load(self.config.model_path)

        # Seleccionar características importantes directamente de la configuración
        X_test = test_data[self.config.important_features]
        y_test = test_data[self.config.target_column]

        # Configurar MLflow
        os.environ["MLFLOW_TRACKING_URI"] = self.config.mlflow_uri
        os.environ["MLFLOW_TRACKING_USERNAME"] = self.config.mlflow_username
        os.environ["MLFLOW_TRACKING_PASSWORD"] = self.config.mlflow_password

        with mlflow.start_run():
            # Realizar predicciones y evaluar el modelo en el conjunto de prueba
            y_pred = model.predict(X_test)

            # Calcular métricas
            precision, recall, f1 = self.eval_metrics(y_test, y_pred)
            conf_matrix = confusion_matrix(y_test, y_pred)
            class_report = classification_report(y_test, y_pred, output_dict=True)

            # Guardar métricas localmente
            scores = {
                "precision": precision,
                "recall": recall,
                "f1_score": f1,
                "confusion_matrix": conf_matrix.tolist(),
                "classification_report": class_report
            }
            save_json(path=self.config.metric_file_name, data=scores)

            # Registrar métricas en MLflow
            mlflow.log_metric("precision", precision)
            mlflow.log_metric("recall", recall)
            mlflow.log_metric("f1_score", f1)
            mlflow.log_artifact(self.config.metric_file_name)

            # Registrar el modelo
            mlflow.sklearn.log_model(model, "model", registered_model_name="VotingClassifierModel")

try:
    config_manager = ConfigurationManager()
    model_evaluation_config = config_manager.get_model_evaluation_config()
    model_evaluation = ModelEvaluation(config=model_evaluation_config)
    model_evaluation.log_into_mlflow()
except Exception as e:
    logger.exception(e)
    raise e


[2024-06-27 13:29:18,835: INFO: common: YAML file: config\config.yaml loaded successfully]
[2024-06-27 13:29:18,840: INFO: common: YAML file: params.yaml loaded successfully]
[2024-06-27 13:29:18,844: INFO: common: YAML file: schema.yaml loaded successfully]
[2024-06-27 13:29:18,847: INFO: common: Created directory at: artifacts]
[2024-06-27 13:29:18,849: INFO: common: Created directory at: artifacts\model_evaluation]
[2024-06-27 13:29:20,022: INFO: common: JSON file saved at: artifacts\model_evaluation\metrics.json]


Registered model 'VotingClassifierModel' already exists. Creating a new version of this model...
2024/06/27 13:29:34 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: VotingClassifierModel, version 6
Created version '6' of model 'VotingClassifierModel'.


In [29]:
test_df = pd.read_excel(r"C:\Cancer-Prediction-\artifacts\data_transformation\test_df.xlsx")
train_df = pd.read_excel(r"C:\Cancer-Prediction-\artifacts\data_transformation\train_df.xlsx")

In [30]:
df = pd.concat([train_df, test_df], axis=0)

In [31]:
df

Unnamed: 0,AFP (pg/ml),Angiopoietin-2 (pg/ml),AXL (pg/ml),CA-125 (U/ml),CA 15-3 (U/ml),CA19-9 (U/ml),CD44 (ng/ml),CEA (pg/ml),CYFRA 21-1 (pg/ml),DKK1 (ng/ml),...,sHER2/sEGFR2/sErbB2 (pg/ml),sPECAM-1 (pg/ml),TGFa (pg/ml),Thrombospondin-2 (pg/ml),TIMP-1 (pg/ml),TIMP-2 (pg/ml),Omega score,AJCC Stage,Sex_1,Tumor type
0,-0.162730,-0.399967,0.161177,-0.155628,-0.208773,-0.126586,0.100087,-0.177265,-0.118556,-0.748740,...,0.613042,-0.739088,-0.190133,-0.501581,-0.709657,0.506555,-0.168524,0,1,1
1,-0.161972,-0.412345,-0.862979,-0.155856,-0.139035,-0.126858,-0.260427,-0.162648,-0.117902,-0.268988,...,-0.502477,-0.480916,-0.191864,-0.504778,-0.669592,-0.306917,-0.239250,1,1,4
2,-0.155496,-0.507819,-0.284533,-0.155194,-0.255232,0.053126,-0.630079,-0.184200,-0.115457,0.867268,...,-0.710243,-0.461172,-0.182823,-0.367130,-0.404068,-1.120939,-0.243808,1,1,1
3,-0.150629,-0.584907,-0.928337,-0.155787,-0.230770,-0.106049,0.047755,-0.181969,-0.011969,0.210765,...,-0.584788,-0.635555,-0.198597,-0.488231,-0.154741,1.505864,-0.042227,1,0,1
4,-0.147157,-0.464835,1.144186,-0.131803,0.317763,-0.085735,2.054673,-0.192859,-0.119066,-1.203243,...,0.827342,1.377542,-0.207830,-0.195666,-0.340628,1.733127,-0.215178,0,1,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,-0.162529,-0.168476,-0.876422,-0.155536,-0.239253,-0.126601,-0.726438,-0.186948,-0.065832,-0.092237,...,-0.172675,-0.547599,-0.189364,-0.360039,-0.860562,-0.525791,-0.224797,0,0,0
298,-0.142995,-0.562578,-0.431811,-0.106853,-0.020372,-0.032200,0.319386,-0.146195,0.002598,-0.672990,...,0.559587,0.196013,0.507743,-0.076363,-0.091691,2.174226,-0.267385,1,0,6
299,-0.148364,-0.353302,-0.605628,-0.155856,-0.237970,-0.126858,-0.399981,-0.201005,-0.117902,-1.051742,...,-0.210727,-0.548275,-0.191864,-0.504778,-0.467507,0.435235,-0.242274,0,0,0
300,-0.161157,-0.206974,0.054145,-0.156015,-0.186974,-0.126786,-0.694872,-0.208696,-0.119163,-0.874991,...,-0.232628,-0.567034,-0.204752,-0.378567,-1.113493,-0.971085,-0.240743,0,0,1


In [33]:
import pandas as pd
from ctgan import CTGAN
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix
import lightgbm as lgb
import xgboost as xgb
from imblearn.over_sampling import SMOTE
import numpy as np

important_features = [
    'sFas (pg/ml)', 'sHER2/sEGFR2/sErbB2 (pg/ml)', 'CA 15-3 (U/ml)', 'CA19-9 (U/ml)', 'CA-125 (U/ml)',
    'TIMP-2 (pg/ml)', 'TGFa (pg/ml)', 'Sex_1', 'Leptin (pg/ml)', 'IL-8 (pg/ml)', 'IL-6 (pg/ml)',
    'AFP (pg/ml)', 'GDF15 (ng/ml)', 'Prolactin (pg/ml)', 'HGF (pg/ml)', 'CD44 (ng/ml)', 'Midkine (pg/ml)',
    'Thrombospondin-2 (pg/ml)', 'TIMP-1 (pg/ml)', 'HE4 (pg/ml)'
]

# Separar características y etiquetas de los datos reales
X_real = df[important_features]
y_real = df['Tumor type']

# Identificar clases minoritarias
class_counts = y_real.value_counts()
minority_classes = class_counts[class_counts < class_counts.median()].index

# Separar datos de clases minoritarias
X_minority = X_real[y_real.isin(minority_classes)]
y_minority = y_real[y_real.isin(minority_classes)]

# Entrenar el modelo CTGAN solo con las clases minoritarias
model = CTGAN(epochs=300)
model.fit(X_minority)

# Generar datos sintéticos para las clases minoritarias
synthetic_data_minority = model.sample(len(X_minority))

# Asignar etiquetas correctas a los datos sintéticos generados
synthetic_data_minority['Tumor type'] = np.random.choice(minority_classes, len(synthetic_data_minority))

# Separar características y etiquetas de los datos sintéticos generados
X_synthetic = synthetic_data_minority[important_features]
y_synthetic = synthetic_data_minority['Tumor type']

# Combinar datos reales y datos sintéticos generados
X_combined = pd.concat([X_real, X_synthetic], axis=0)
y_combined = pd.concat([y_real, y_synthetic], axis=0)

# Aplicar SMOTE para sobremuestrear las clases minoritarias en el conjunto combinado
smote = SMOTE(random_state=42, k_neighbors=5)
X_resampled, y_resampled = smote.fit_resample(X_combined, y_combined)

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Definir los modelos individuales con regularización
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
gb_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
lgbm_clf = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1, num_leaves=31, random_state=42)
xgb_clf = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Definir el Voting Classifier
voting_clf = VotingClassifier(
    estimators=[
        ('rf', rf_clf),
        ('gb', gb_clf),
        ('lgbm', lgbm_clf),
        ('xgb', xgb_clf)
    ],
    voting='soft'  # 'soft' uses predicted probabilities
)

# Validación cruzada estratificada
skf = StratifiedKFold(n_splits=5)
cv_scores = cross_val_score(voting_clf, X_train, y_train, cv=skf, scoring='accuracy')
print(f'Cross-Validation Accuracy Scores: {cv_scores}')
print(f'Mean Cross-Validation Accuracy: {cv_scores.mean()}')

# Entrenar el Voting Classifier con todos los datos resampleados
voting_clf.fit(X_train, y_train)

# Realizar predicciones y evaluar el modelo en el conjunto de prueba
y_pred = voting_clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


[2024-06-27 11:48:03,548: INFO: null: Guidance: There are no missing values in column sFas (pg/ml). Extra column not created.]
[2024-06-27 11:48:03,619: INFO: null: Guidance: There are no missing values in column sHER2/sEGFR2/sErbB2 (pg/ml). Extra column not created.]
[2024-06-27 11:48:03,814: INFO: null: Guidance: There are no missing values in column CA 15-3 (U/ml). Extra column not created.]
[2024-06-27 11:48:03,854: INFO: null: Guidance: There are no missing values in column CA19-9 (U/ml). Extra column not created.]
[2024-06-27 11:48:03,882: INFO: null: Guidance: There are no missing values in column CA-125 (U/ml). Extra column not created.]
[2024-06-27 11:48:03,918: INFO: null: Guidance: There are no missing values in column TIMP-2 (pg/ml). Extra column not created.]
[2024-06-27 11:48:04,065: INFO: null: Guidance: There are no missing values in column TGFa (pg/ml). Extra column not created.]
[2024-06-27 11:48:04,065: INFO: null: Guidance: There are no missing values in column Sex_

In [41]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib
from imblearn.over_sampling import SMOTE
from ctgan import CTGAN
import lightgbm as lgb
import xgboost as xgb
import torch
from CancerPrediction.entity.config_entity import ModelTrainerConfig
from CancerPrediction import logger
import json

def set_seed(seed: int):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig, seed: int = 42):
        self.config = config
        self.params = config.params  # Obtener los parámetros desde el config
        self.seed = seed
        set_seed(self.seed)

    def train(self):
        # Cargar datos de entrenamiento y prueba
        train_data = pd.read_excel(self.config.train_data_path)
        test_data = pd.read_excel(self.config.test_data_path)

        # Combinar datos de entrenamiento y prueba
        df_combined = pd.concat([train_data, test_data], axis=0)

        # Seleccionar características importantes
        X_real = df_combined[self.config.important_features]
        y_real = df_combined[self.config.target_column]

        # Identificar clases minoritarias
        class_counts = y_real.value_counts()
        minority_classes = class_counts[class_counts < class_counts.median()].index

        # Separar datos de clases minoritarias
        X_minority = X_real[y_real.isin(minority_classes)]
        y_minority = y_real[y_real.isin(minority_classes)]

        # Entrenar el modelo CTGAN solo con las clases minoritarias
        ctgan_params = self.params['CTGAN']
        model = CTGAN(**ctgan_params)
        model.fit(X_minority)

        # Generar datos sintéticos para las clases minoritarias
        synthetic_data_minority = model.sample(len(X_minority))

        # Asignar etiquetas correctas a los datos sintéticos generados
        synthetic_data_minority[self.config.target_column] = np.random.choice(minority_classes, len(synthetic_data_minority))

        # Separar características y etiquetas de los datos sintéticos generados
        X_synthetic = synthetic_data_minority[self.config.important_features]
        y_synthetic = synthetic_data_minority[self.config.target_column]

        # Combinar datos reales y datos sintéticos generados
        X_combined = pd.concat([X_real, X_synthetic], axis=0)
        y_combined = pd.concat([y_real, y_synthetic], axis=0)

        # Aplicar SMOTE para sobremuestrear las clases minoritarias en el conjunto combinado
        smote = SMOTE(random_state=self.seed)
        X_resampled, y_resampled = smote.fit_resample(X_combined, y_combined)

        # Dividir en conjuntos de entrenamiento y prueba
        X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=self.seed, stratify=y_resampled)

        # Definir los modelos individuales con regularización
        rf_clf = RandomForestClassifier(random_state=self.seed, **self.params['RandomForest'])
        gb_clf = GradientBoostingClassifier(random_state=self.seed, **self.params['GradientBoosting'])
        lgbm_clf = lgb.LGBMClassifier(random_state=self.seed, **self.params['LightGBM'])
        xgb_clf = xgb.XGBClassifier(random_state=self.seed, **self.params['XGBoost'])

        # Definir el Voting Classifier
        voting_clf = VotingClassifier(
            estimators=[
                ('rf', rf_clf),
                ('gb', gb_clf),
                ('lgbm', lgbm_clf),
                ('xgb', xgb_clf)
            ],
            voting='soft'  # 'soft' uses predicted probabilities
        )

        # Entrenar el Voting Classifier con todos los datos resampleados
        voting_clf.fit(X_train, y_train)

        # Guardar el modelo entrenado
        joblib.dump(voting_clf, os.path.join(self.config.root_dir, self.config.model_name))

        # Evaluar el modelo
        y_pred = voting_clf.predict(X_test)
        logger.info(f"Classification Report:\n {classification_report(y_test, y_pred)}")
        logger.info(f"Confusion Matrix:\n {confusion_matrix(y_test, y_pred)}")




[2024-06-27 13:00:37,661: INFO: 2705523432: Training Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       149
           1       1.00      0.96      0.98       273
           2       0.88      0.91      0.89        32
           3       0.97      0.97      0.97        33
           4       0.96      0.96      0.96        69
           5       0.98      1.00      0.99        44
           6       0.97      1.00      0.98        57
           7       0.90      0.96      0.93        46

    accuracy                           0.97       703
   macro avg       0.95      0.97      0.96       703
weighted avg       0.97      0.97      0.97       703
]
[2024-06-27 13:00:37,661: INFO: 2705523432: Training Confusion Matrix:
[[145   0   1   0   2   0   1   0]
 [  4 262   2   0   1   1   0   3]
 [  1   0  29   1   0   0   0   1]
 [  0   0   0  32   0   0   1   0]
 [  1   1   0   0  66   0   0   1]
 [  0   0   0   0   0  44 

In [45]:
# Definir los parámetros ajustados para evitar el sobreajuste
params = {
    'CTGAN': {
        'epochs': 300,
        'batch_size': 500,
        'discriminator_steps': 1,
        'log_frequency': True,
        'verbose': True
    },
    'RandomForest': {
        'n_estimators': 100,
        'max_depth': 5  # Reducir la profundidad máxima para evitar sobreajuste
    },
    'GradientBoosting': {
        'n_estimators': 100,
        'learning_rate': 0.05,  # Reducir la tasa de aprendizaje para hacer el modelo más robusto
        'max_depth': 3
    },
    'LightGBM': {
        'n_estimators': 100,
        'learning_rate': 0.05,  # Reducir la tasa de aprendizaje
        'num_leaves': 31
    },
    'XGBoost': {
        'n_estimators': 100,
        'learning_rate': 0.05,  # Reducir la tasa de aprendizaje
        'max_depth': 3
    }
}

import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib
from imblearn.over_sampling import SMOTE
from ctgan import CTGAN
import lightgbm as lgb
import xgboost as xgb
import torch
import json

def set_seed(seed: int):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

class ModelTrainer:
    def __init__(self, train_data_path, test_data_path, important_features, target_column, params, model_save_path, seed: int = 42):
        self.train_data_path = train_data_path
        self.test_data_path = test_data_path
        self.important_features = important_features
        self.target_column = target_column
        self.params = params
        self.model_save_path = model_save_path
        self.seed = seed
        set_seed(self.seed)

    def train(self):
        # Cargar datos de entrenamiento y prueba
        train_data = pd.read_excel(self.train_data_path)
        test_data = pd.read_excel(self.test_data_path)

        # Combinar datos de entrenamiento y prueba
        df_combined = pd.concat([train_data, test_data], axis=0)

        # Seleccionar características importantes
        X_real = df_combined[self.important_features]
        y_real = df_combined[self.target_column]

        # Identificar clases minoritarias
        class_counts = y_real.value_counts()
        minority_classes = class_counts[class_counts < class_counts.median()].index

        # Separar datos de clases minoritarias
        X_minority = X_real[y_real.isin(minority_classes)]
        y_minority = y_real[y_real.isin(minority_classes)]

        # Entrenar el modelo CTGAN solo con las clases minoritarias
        ctgan_params = self.params['CTGAN']
        model = CTGAN(**ctgan_params)
        model.fit(X_minority)

        # Generar datos sintéticos para las clases minoritarias
        synthetic_data_minority = model.sample(len(X_minority))

        # Asignar etiquetas correctas a los datos sintéticos generados
        synthetic_data_minority[self.target_column] = np.random.choice(minority_classes, len(synthetic_data_minority))

        # Separar características y etiquetas de los datos sintéticos generados
        X_synthetic = synthetic_data_minority[self.important_features]
        y_synthetic = synthetic_data_minority[self.target_column]

        # Combinar datos reales y datos sintéticos generados
        X_combined = pd.concat([X_real, X_synthetic], axis=0)
        y_combined = pd.concat([y_real, y_synthetic], axis=0)

        # Aplicar SMOTE para sobremuestrear las clases minoritarias en el conjunto combinado
        smote = SMOTE(random_state=self.seed)
        X_resampled, y_resampled = smote.fit_resample(X_combined, y_combined)

        # Dividir en conjuntos de entrenamiento y prueba
        X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=self.seed, stratify=y_resampled)

        # Definir los modelos individuales con regularización
        rf_clf = RandomForestClassifier(random_state=self.seed, **self.params['RandomForest'])
        gb_clf = GradientBoostingClassifier(random_state=self.seed, **self.params['GradientBoosting'])
        lgbm_clf = lgb.LGBMClassifier(random_state=self.seed, **self.params['LightGBM'])
        xgb_clf = xgb.XGBClassifier(random_state=self.seed, **self.params['XGBoost'])

        # Definir el Voting Classifier
        voting_clf = VotingClassifier(
            estimators=[
                ('rf', rf_clf),
                ('gb', gb_clf),
                ('lgbm', lgbm_clf),
                ('xgb', xgb_clf)
            ],
            voting='soft'  # 'soft' uses predicted probabilities
        )

        # Entrenar el Voting Classifier con todos los datos resampleados
        voting_clf.fit(X_train, y_train)

        # Guardar el modelo entrenado
        joblib.dump(voting_clf, self.model_save_path)

        # Evaluar el modelo
        y_pred = voting_clf.predict(X_test)
        print(f"Classification Report:\n {classification_report(y_test, y_pred)}")
        print(f"Confusion Matrix:\n {confusion_matrix(y_test, y_pred)}")

# Definir las rutas de los datos y los parámetros
train_data_path = "artifacts/data_transformation/train_df.xlsx"
test_data_path = "artifacts/data_transformation/test_df.xlsx"
important_features = [
    'sFas (pg/ml)', 'sHER2/sEGFR2/sErbB2 (pg/ml)', 'CA 15-3 (U/ml)', 'CA19-9 (U/ml)', 'CA-125 (U/ml)',
    'TIMP-2 (pg/ml)', 'TGFa (pg/ml)', 'Sex_1', 'Leptin (pg/ml)', 'IL-8 (pg/ml)', 'IL-6 (pg/ml)',
    'AFP (pg/ml)', 'GDF15 (ng/ml)', 'Prolactin (pg/ml)', 'HGF (pg/ml)', 'CD44 (ng/ml)', 'Midkine (pg/ml)',
    'Thrombospondin-2 (pg/ml)', 'TIMP-1 (pg/ml)', 'HE4 (pg/ml)'
]
target_column = "Tumor type"
model_save_path = "artifacts/model_trainer/model.joblib"

# Entrenar el modelo con los nuevos parámetros
model_trainer = ModelTrainer(train_data_path, test_data_path, important_features, target_column, params, model_save_path)
model_trainer.train()


# Evaluar el sobreajuste
train_data = pd.read_excel(train_data_path)
test_data = pd.read_excel(test_data_path)
X_train = train_data[important_features]
y_train = train_data[target_column]
X_test = test_data[important_features]
y_test = test_data[target_column]

model = joblib.load(model_save_path)

# Predicciones en conjunto de entrenamiento
y_train_pred = model.predict(X_train)
train_report = classification_report(y_train, y_train_pred)
train_conf_matrix = confusion_matrix(y_train, y_train_pred)

# Predicciones en conjunto de prueba
y_test_pred = model.predict(X_test)
test_report = classification_report(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)

# Comparar métricas
print(f"Training Classification Report:\n{train_report}")
print(f"Training Confusion Matrix:\n{train_conf_matrix}")
print(f"Testing Classification Report:\n{test_report}")
print(f"Testing Confusion Matrix:\n{test_conf_matrix}")




[2024-06-27 13:08:10,288: INFO: null: Guidance: There are no missing values in column sFas (pg/ml). Extra column not created.]
[2024-06-27 13:08:10,327: INFO: null: Guidance: There are no missing values in column sHER2/sEGFR2/sErbB2 (pg/ml). Extra column not created.]
[2024-06-27 13:08:10,472: INFO: null: Guidance: There are no missing values in column CA 15-3 (U/ml). Extra column not created.]
[2024-06-27 13:08:10,500: INFO: null: Guidance: There are no missing values in column CA19-9 (U/ml). Extra column not created.]
[2024-06-27 13:08:10,522: INFO: null: Guidance: There are no missing values in column CA-125 (U/ml). Extra column not created.]
[2024-06-27 13:08:10,558: INFO: null: Guidance: There are no missing values in column TIMP-2 (pg/ml). Extra column not created.]
[2024-06-27 13:08:10,708: INFO: null: Guidance: There are no missing values in column TGFa (pg/ml). Extra column not created.]
[2024-06-27 13:08:10,724: INFO: null: Guidance: There are no missing values in column Sex_

Gen. (-2.24) | Discrim. (0.26): 100%|██████████| 300/300 [00:25<00:00, 11.75it/s] 


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000406 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4847
[LightGBM] [Info] Number of data points in the train set: 2483, number of used features: 20
[LightGBM] [Info] Start training from score -2.077430
[LightGBM] [Info] Start training from score -2.077430
[LightGBM] [Info] Start training from score -2.080650
[LightGBM] [Info] Start training from score -2.080650
[LightGBM] [Info] Start training from score -2.080650
[LightGBM] [Info] Start training from score -2.077430
[LightGBM] [Info] Start training from score -2.080650
[LightGBM] [Info] Start training from score -2.080650
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.87      0.85        77
           1       0.93      0.74      0.83        77
           2       0.89      0.90      0.89        78
           3       0.90      0.90      