In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD
from collections import Counter

# Biomarker columns used as model inputs (column names carry their units).
important_features = [
    "sFas (pg/ml)",
    "sHER2/sEGFR2/sErbB2 (pg/ml)",
    "CA 15-3 (U/ml)",
    "CA19-9 (U/ml)",
    "CA-125 (U/ml)",
    "TIMP-2 (pg/ml)",
    "TGFa (pg/ml)",
    "Leptin (pg/ml)",
    "IL-8 (pg/ml)",
    "IL-6 (pg/ml)",
    "AFP (pg/ml)",
    "GDF15 (ng/ml)",
    "Prolactin (pg/ml)",
    "HGF (pg/ml)",
    "CD44 (ng/ml)",
    "Midkine (pg/ml)",
    "Thrombospondin-2 (pg/ml)",
    "TIMP-1 (pg/ml)",
    "HE4 (pg/ml)",
]

# Load the combined workbook; keep its tumour-type column and the selected
# feature columns side by side.
df = pd.read_excel("C:\\Minio-MLflow-setup\\Minio-MLflow-set-up\\Cancer-prediction\\data\\df_combined.xlsx")
y = df["Tumor type"]
df1 = pd.DataFrame(df, columns=important_features)
dff = pd.concat([df1, y], axis="columns")

# Load the separate target-variable workbook and drop its "Unnamed: 0" column
# (presumably a saved index column -- confirm against the file).
# Despite the name, y_unlabeled is what supplies the "Tumor type" column of
# df_final: df1 only holds the feature columns.
y_unlabeled = pd.read_excel("variable-objetivo.xlsx").drop(columns="Unnamed: 0")
# Column-wise concat aligns on the row index.
# NOTE(review): assumes both workbooks list the same rows in the same order.
df_final = pd.concat([df1, y_unlabeled], axis="columns")

# Multi-class tumour labels; kept under the name `y` for later steps.
y = df_final["Tumor type"]

# Binary target: 0 for the tumour types listed in category_0, 1 for everything
# else (including missing labels, which are not members of the list).
category_0 = ["Stomach", "Ovary", "Esophagus", "Liver", "Pancreas", "Lung"]
in_category_0 = df_final["Tumor type"].isin(category_0)
df_final["mapped_target"] = (~in_category_0).astype("int64")

# Features are every column except the two target columns.
X = df_final.drop(columns=["mapped_target", "Tumor type"])
y_real = df_final["mapped_target"]

# Hold out 30% of the rows for testing; fixed seed for a reproducible split.
# NOTE(review): the split is not stratified on y_real.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_real,
    test_size=0.3,
    random_state=42,
)

# Keyword arguments forwarded to LogisticRegression.
params = dict(
    max_iter=3000,
    random_state=42,
    solver="lbfgs",
)

# Candidate models, keyed by the display name printed in the report.
modelos = {'Logistic Regression': LogisticRegression(**params)}


def _weighted_scores(y_true, y_pred):
    """Return (accuracy, F1, precision, recall) for one set of predictions.

    F1, precision and recall use ``average='weighted'``.
    """
    return (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average='weighted'),
        precision_score(y_true, y_pred, average='weighted'),
        recall_score(y_true, y_pred, average='weighted'),
    )


# Fit each model on the training split and report train/test metrics.
for modelo_nombre, modelo in modelos.items():
    # Single-step pipeline: the classifier is fitted on the raw features.
    pipeline = Pipeline(steps=[('classifier', modelo)])
    pipeline.fit(X_train, y_train)

    # Scores on the data the model was fitted on.
    y_train_pred = pipeline.predict(X_train)
    train_accuracy, train_f1, train_precision, train_recall = _weighted_scores(
        y_train, y_train_pred
    )

    # Scores on the held-out split.
    y_test_pred = pipeline.predict(X_test)
    test_accuracy, test_f1, test_precision, test_recall = _weighted_scores(
        y_test, y_test_pred
    )

    # One line per metric, training block first, then the test block.
    report_lines = (
        f"Modelo: {modelo_nombre}",
        f"Accuracy en entrenamiento: {train_accuracy}",
        f"F1-score en entrenamiento: {train_f1}",
        f"Precision en entrenamiento: {train_precision}",
        f"Recall en entrenamiento: {train_recall}",
        f"Accuracy en prueba: {test_accuracy}",
        f"F1-score en prueba: {test_f1}",
        f"Precision en prueba: {test_precision}",
        f"Recall en prueba: {test_recall}",
        "-------------------------------------------------------------",
    )
    for report_line in report_lines:
        print(report_line)

# Augment the minority classes with rows rebuilt from a truncated SVD.
minority_classes = ["Stomach", "Ovary", "Esophagus", "Liver", "Pancreas", "Lung"]

# Select the real rows that belong to the minority classes (mask built once so
# features and labels are guaranteed to cover the same rows).
minority_mask = y.isin(minority_classes)
X_minority = X[minority_mask]
y_minority = y[minority_mask]
df_minority = pd.concat([X_minority, y_minority], axis=1)

# Rank of the decomposition: at most one less than the number of features, and
# never more than the number of minority rows.
n_components = min(len(df_minority), len(important_features) - 1)
svd = TruncatedSVD(n_components=n_components, random_state=42)

# Project the minority rows onto the SVD components...
X_minority_svd = svd.fit_transform(X_minority)

# ...and map them back to feature space. The result has one row per minority
# row: its rank-n_components reconstruction.
# NOTE(review): these are approximations of existing rows rather than
# independently sampled points -- confirm this is the intended augmentation.
synthetic_data_minority = svd.inverse_transform(X_minority_svd)
synthetic_data_minority = pd.DataFrame(synthetic_data_minority, columns=important_features)

# Row i is the reconstruction of minority row i, so it keeps that row's true
# tumour type. Drawing labels at random would attach the wrong class to most
# reconstructed samples and make the class counts vary from run to run.
# to_numpy() assigns by position, not by index label.
synthetic_data_minority["Tumor type"] = y_minority.to_numpy()

# Split the synthetic frame back into features and target.
X_synthetic = synthetic_data_minority[important_features]
y_synthetic = synthetic_data_minority["Tumor type"]

# Stack real and synthetic rows. ignore_index gives the result a fresh unique
# index; otherwise the synthetic rows' 0..n-1 labels would collide with the
# labels of the real rows.
X_combined = pd.concat([X, X_synthetic], axis=0, ignore_index=True)
y_combined = pd.concat([y, y_synthetic], axis=0, ignore_index=True)

df_combined = pd.concat([X_combined, y_combined], axis=1)
# NOTE(review): the class counts computed here are not stored or printed.
df_combined["Tumor type"].value_counts().to_list()

# Features of the augmented dataset: every column except the label.
X = df_combined.drop(columns="Tumor type")

# Encode the tumour-type labels as integers; `le` keeps the fitted mapping.
le = LabelEncoder()
y = le.fit_transform(df_combined["Tumor type"])

# Show how many rows each encoded class has.
class_counts = Counter(y)
print(class_counts)
