In [1]:
#pip install imblearn
#pip install tqdm
#pip install sympy

In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.under_sampling import RandomUnderSampler, NearMiss, CondensedNearestNeighbour
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from ctgan import CTGAN 

# Define functions
def calculate_probability(df, column, value):
    return len(df[df[column] == value]) / len(df)

def show_stats_save_table(y_true, y_pred, set_name, model_name, table_results_df):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0) 
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    global_score = calculate_global_score(accuracy, precision, recall, f1)
    new_row = pd.DataFrame({
        'Model': [model_name],
        'Set': [set_name],
        'Accuracy': [accuracy],
        'Precision': [precision],
        'Recall': [recall],
        'F1-Score': [f1],
        'Global Score': [global_score]
    })
    return pd.concat([table_results_df, new_row], ignore_index=True)

def calculate_global_score(accuracy, precision, recall, f1):
    weights = {'accuracy': 0.25, 'precision': 0.25, 'recall': 0.25, 'f1': 0.25}
    global_score = (accuracy * weights['accuracy'] + precision * weights['precision'] + recall * weights['recall'] + f1 * weights['f1'])
    return round(global_score * 100, 2)

def train_and_evaluate_model(X_train, y_train, X_test, y_test, model, model_name, table_results_df):
    try:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        table_results_df = show_stats_save_table(y_train, model.predict(X_train), "Train", model_name, table_results_df)
        table_results_df = show_stats_save_table(y_test, y_pred, "Test", model_name, table_results_df)
    except Exception as e:
        print(f"Error training or evaluating the model {model_name}: {e}")
    return table_results_df

# Función para obtener columnas numéricas
def get_numeric_columns(df):
    return df.select_dtypes(include=[float, int]).columns.tolist()

# Función para obtener columnas categóricas
def get_categorical_columns(df):
    return df.select_dtypes(include=['object', 'category']).columns.tolist()

# Load data
file_path = 'D:/MÁSTER DATA SCIENCE/KSCHOOL/1.TFM/PARTTE 2 TFM/df_final.xlsx'  # Replace with your actual file path
df = pd.read_excel(file_path)

# Initial preprocessing 
df.fillna(0, inplace=True)
print(df.isnull().sum())

# Exploratory Data Analysis (EDA)
print("Unique values of 'Tumor type':", df['Tumor type'].unique())
print("\nMedians by tumor type:")
print(df.groupby('Tumor type').median(numeric_only=True))

# Probabilities of each tumor type
total_probabilities = {tumor_type: calculate_probability(df, 'Tumor type', tumor_type) for tumor_type in df['Tumor type'].unique()}
print("\nProbabilities of each tumor type:")
for tumor_type, prob in total_probabilities.items():
    print(f"- {tumor_type}: {prob:.2%}")

# Split into subsets based on 'Omega score' median
omega_score_threshold = df['Omega score'].median()
df_below_median = df[df['Omega score'] <= omega_score_threshold].copy()
df_above_median = df[df['Omega score'] > omega_score_threshold].copy()

# Calculate median for the entire DataFrame for each subset
median_below = df_below_median.median(numeric_only=True)
median_above = df_above_median.median(numeric_only=True)
print("\nMedians of the DataFrame for the subset below the 'Omega score' median:")
print(median_below)
print("\nMedians of the DataFrame for the subset above the 'Omega score' median:")
print(median_above)

# Save dataframes to Excel
df_below_median.to_excel('df_below_median.xlsx', index=False)
df_above_median.to_excel('df_above_median.xlsx', index=False)

# Separate features and labels for BOTH datasets
X_below = df_below_median.drop(columns=['Tumor type'])
y_below = df_below_median['Tumor type']

X_above = df_above_median.drop(columns=['Tumor type'])
y_above = df_above_median['Tumor type']

def preprocess_and_train(X, y, table_results_df, filename):
    # One-Hot Encoding
    encoder = OneHotEncoder(handle_unknown='ignore')
    categorical_features_initial = X.select_dtypes(include=['object']).columns
    X_encoded = pd.DataFrame(encoder.fit_transform(X[categorical_features_initial]).toarray(), columns=encoder.get_feature_names_out(categorical_features_initial))
    X = pd.concat([X.drop(categorical_features_initial, axis=1), X_encoded], axis=1)
    
    # Impute missing values before resampling
    numeric_imputer = SimpleImputer(strategy='mean')
    categorical_imputer = SimpleImputer(strategy='most_frequent')

    numeric_features = X.select_dtypes(include=np.number).columns.tolist()
    categorical_features_after_encoding = X.select_dtypes(exclude=np.number).columns.tolist()

    # Impute only if there are features of that type and missing values
    if numeric_features and X[numeric_features].isnull().values.any():
        X[numeric_features] = numeric_imputer.fit_transform(X[numeric_features])

    if categorical_features_after_encoding and X[categorical_features_after_encoding].isnull().values.any():
        X[categorical_features_after_encoding] = categorical_imputer.fit_transform(X[categorical_features_after_encoding])

    # Class balancing strategies
    estrategias_balanceo = {
        'No Balancing': None,
        'RandomUnderSampler': RandomUnderSampler(random_state=42),
        'CondensedNearestNeighbour': CondensedNearestNeighbour(random_state=42),
        'CTGAN': CTGAN(),
    }
    
    # Define models
    models = {
        'Logistic Regression': LogisticRegression(max_iter=5000),
        'Decision Tree': DecisionTreeClassifier(),
        'Random Forest': RandomForestClassifier(),
        'KNN': KNeighborsClassifier(),
        'AdaBoost': AdaBoostClassifier(algorithm='SAMME'),
        'Gradient Boosting': GradientBoostingClassifier()
    }
        
            # Training and evaluating models
    for strategy_name, strategy in estrategias_balanceo.items():
        print(f"\nBalancing strategy: {strategy_name}")
        if strategy is None:
            X_resampled, y_resampled = X.copy(), y.copy()
        else:
            if strategy_name == 'CTGAN':
                # CTGAN Resampling (Corrected and Simplified)
                X_ctgan = X.copy()
                X_ctgan["Tumor type"] = y

                # Fit CTGAN and sample new data points
                ctgan = CTGAN(epochs=10)
                ctgan.fit(X_ctgan, discrete_columns=['Tumor type'])
                minority_class_label = y.value_counts().idxmin()
                minority_class_size = y.value_counts()[minority_class_label]

                # Ensure we sample enough synthetic data points to balance classes
                majority_class_size = y.value_counts().max()
                samples_to_generate = majority_class_size - minority_class_size

                # Handle the case where the minority class is already the majority
                if samples_to_generate < 0:
                    samples_to_generate = 0

                samples = ctgan.sample(samples_to_generate)

                # Combine minority class samples with generated samples
                X_resampled = pd.concat([X_ctgan[y == minority_class_label], samples.drop(columns=['Tumor type'])], ignore_index=True)
                y_resampled = pd.concat([y[y == minority_class_label], samples['Tumor type']], ignore_index=True)

            else:
                X_resampled, y_resampled = strategy.fit_resample(X, y)

        X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
        
        # Adjusting Shapes If Necessary
        min_samples = min(X_resampled.shape[0], len(y_resampled))
        X_resampled = X_resampled.iloc[:min_samples]  
        y_resampled = y_resampled.iloc[:min_samples]
        
        X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

        # Standard Scaling
        scaler = StandardScaler()
        numeric_features = X_resampled.select_dtypes(include=np.number).columns.tolist()
        X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
        X_test[numeric_features] = scaler.transform(X_test[numeric_features])

        for model_name, model in models.items():
            print(f"\nModel: {model_name}")
            model_name_full = f"{model_name} ({strategy_name})"
            table_results_df = train_and_evaluate_model(X_train, y_train, X_test, y_test, model, model_name_full, table_results_df)

    # Save results to an Excel file
    table_results_df.to_excel(filename, index=False)

# Initialize results DataFrames
table_results_df_below = pd.DataFrame(columns=['Model', 'Set', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'Global Score'])
table_results_df_above = pd.DataFrame(columns=['Model', 'Set', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'Global Score'])

# Train models on both subsets
preprocess_and_train(X_below, y_below, table_results_df_below, 'table_results_df_below_median.xlsx')
preprocess_and_train(X_above, y_above, table_results_df_above, 'table_results_df_above_median.xlsx')

Unnamed: 0                     0
Tumor type                     0
AJCC Stage                     0
AFP (pg/ml)                    0
Angiopoietin-2 (pg/ml)         0
AXL (pg/ml)                    0
CA-125 (U/ml)                  0
CA 15-3 (U/ml)                 0
CA19-9 (U/ml)                  0
CD44 (ng/ml)                   0
CEA (pg/ml)                    0
CYFRA 21-1 (pg/ml)             0
DKK1 (ng/ml)                   0
Endoglin (pg/ml)               0
FGF2 (pg/ml)                   0
Follistatin (pg/ml)            0
Galectin-3 (ng/ml)             0
G-CSF (pg/ml)                  0
GDF15 (ng/ml)                  0
HE4 (pg/ml)                    0
HGF (pg/ml)                    0
IL-6 (pg/ml)                   0
IL-8 (pg/ml)                   0
Kallikrein-6 (pg/ml)           0
Leptin (pg/ml)                 0
Mesothelin (ng/ml)             0
Midkine (pg/ml)                0
Myeloperoxidase (ng/ml)        0
NSE (ng/ml)                    0
OPG (ng/ml)                    0
OPN (pg/ml

ValueError: Found input variables with inconsistent numbers of samples: [752, 503]