In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_validate
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, precision_score, f1_score, RocCurveDisplay, roc_auc_score
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Load and merge CSVs

In [27]:
# Function to merge CSV files
def merge_csvs(csv_list):
    # Load the first CSV
    merged_df = pd.read_csv(csv_list[0])
    
    # Merge the remaining CSVs
    for csv in csv_list[1:]:
        temp_df = pd.read_csv(csv)
        merged_df = merged_df.merge(temp_df, on="id")
    
    return merged_df

merged_df = merge_csvs(["annotations_ds.csv"])#, "cnn.csv"

# Train and compare models

### 1. Hyperparameter Tuning

We used the **RandomizedSearchCV** method, that allows for randomized sampling of hyperparameters to optimize the machine learning method.
The function accepts a classifier, a set of hyperparameter distributions to search over, the input data X and labels y, and a name for the model.

> **Preprocessing:** We created a **ColumnTransformer**, which applies **StandardScaler()** to scale the numeric features. This is crucial, especially for algorithms sensitive to feature scaling like XGBoost.

> **Pipeline Setup:** We created a pipeline that consists of the preprocessing step and the classifying step.

> **Cross-Validation:** We used **StratifiedKFold** cross-validation, which splits the data into 5 folds while ensuring that the distribution of classes remains balanced in each fold.

> **RandomizedSearchCV:** The next step is to perform hyperparameter tuning by randomly selecting hyperparameter combinations from the provided **param_distributions**.

> **Fitting the Model and Displaying Results:** After the tuning process and the fitting of each model, the function prints the best hyperparameters and the highest AUC score achieved during cross-validation.

In [28]:
# Function to perform RandomizedSearchCV
def tune_model(classifier, param_distributions, X, y, model_name):
    # Define the preprocessor (if needed)
    # Here, we"ll standardize the numeric features
    numeric_features = X.select_dtypes(include=["float64", "int64"]).columns
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), numeric_features)  # Standardize numeric features
        ]
    )

    # Create the pipeline
    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),  # Preprocessing step
        ("classifier", classifier)  # Classifier passed as parameter
    ])

    # Setup StratifiedKFold cross-validation
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Setup RandomizedSearchCV
    random_search = RandomizedSearchCV(
        pipeline,
        param_distributions,
        n_iter=50,  # Number of random combinations to try
        scoring="roc_auc",  # Use AUC as the scoring metric
        cv=kf,
        random_state=42,
        n_jobs=1,  # Use all available cores
        return_train_score=True,
        error_score='raise',
    )

    # Perform RandomizedSearchCV
    random_search.fit(X, y)

    # Display the best parameters and score
    print(f"{model_name} - Best parameters: {random_search.best_params_}")
    print(f"{model_name} - Best AUC score: {random_search.best_score_:.4f}")

    return random_search.best_estimator_, random_search.best_params_

###  XGBoost (xgb_param_distributions): 


- **n_estimators:** Number of boosting rounds or trees to grow, with values ranging from 50 to 300 in steps of 50.
- **max_depth:** Maximum depth of each tree, which controls the complexity of the model, sampled between 3 and 15.
- **learning_rate:** The step size at each iteration, ranging from 0.01 to 0.2 to control the contribution of each tree.
- **subsample:** Fraction of samples to be used in each boosting round, with values of 0.5, 0.7, and 1.0.
- **colsample_bytree:** Fraction of features used when building each tree, with values of 0.5, 0.7, and 1.0.
- **gamma:** Minimum loss reduction required to make a further partition in a leaf node, ranging from 0 to 5 in steps of 0.5.

In [29]:
# 0-> bom, 1-> mau
def indf_sum(y_train):
    y_train[y_train < 3] = 0
    y_train[y_train > 3] = 1
    return y_train

# Define features and target
merged_df = merged_df[merged_df["Malignancy"]!=3]

X = merged_df.drop(columns=["ID", "Scan_ID","Patient_ID", "Malignancy"])  # Exclude "id" and "class" from features
y = indf_sum(merged_df["Malignancy"])  # Target variable

In [30]:
# Parameter distributions for each classifier
xgb_param_distributions = {
    "classifier__n_estimators": np.arange(50, 300, 50),
    "classifier__max_depth": np.arange(3, 15),
    "classifier__learning_rate": [0.01, 0.05, 0.1, 0.2],
    "classifier__subsample": [0.5, 0.7, 1.0],
    "classifier__colsample_bytree": [0.5, 0.7, 1.0],
    "classifier__gamma": np.arange(0, 5, 0.5),
}

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train[y_train>3] = 1


XGBoost - Best parameters: {'classifier__subsample': 0.7, 'classifier__n_estimators': 200, 'classifier__max_depth': 10, 'classifier__learning_rate': 0.1, 'classifier__gamma': 2.0, 'classifier__colsample_bytree': 0.7}
XGBoost - Best AUC score: 0.9329
Random Forest - Best parameters: {'classifier__n_estimators': 150, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 2, 'classifier__max_features': 'sqrt', 'classifier__max_depth': 11}
Random Forest - Best AUC score: 0.9285


### Random Forest (rf_param_distributions):

- **n_estimators:** Number of trees in the forest, sampled between 50 and 300 in steps of 50.
- **max_depth:** Maximum depth of each tree, sampled between 3 and 15.
- **min_samples_split:** Minimum number of samples required to split an internal node, with options of 2, 5, and 10.
- **min_samples_leaf:** Minimum number of samples required to be at a leaf node, with values of 1, 2, and 4.
- **max_features:** The number of features to consider when looking for the best split, with options "auto" and "sqrt".

In [None]:
rf_param_distributions = {
    "classifier__n_estimators": np.arange(50, 300, 50),
    "classifier__max_depth": np.arange(3, 15),
    "classifier__min_samples_split": [2, 5, 10],
    "classifier__min_samples_leaf": [1, 2, 4],
    "classifier__max_features": ["log2", "sqrt"],
}

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

''' 
3 é irrelevante , remove
1ª-> 1+2 bom, 4+5 mau                 corta-se max(1+2, 4+5)-min(1+2,4+5) a max(1+2,4+5)
2ª-> 1*0.5+2*0.5 bom, 4*0.5+5*0.5 mau
3ª-> w1, w2, w4, w5 
4ª-> >=3 mau, else bom

  ver diferênças entre as diff formas


sendo 1, 5 os resultados mais seguros dar mais importância q o 2, 4
r => nº de nodulos malig/benig
'''

# Tune models
best_xgb_model, best_xgb_params = tune_model(XGBClassifier(eval_metric="mlogloss"), xgb_param_distributions, X_train, y_train, "XGBoost")
best_rf_model, best_rf_params = tune_model(RandomForestClassifier(random_state=42), rf_param_distributions, X_train, y_train, "Random Forest")
# TODO: SVM?

In [40]:
''' FAZER PLOT DE RESULTADOS
bACC
F1,... várias métricas(só ACC é proibido) 

para treino extra pontos para alterar pesos em vez de nº de exemplos
'''

models = [
    ("XgBoost", best_xgb_model),
    ("Random Forest", best_rf_model),
]

_, ax = plt.subplots()
for name, pipeline in models:
    RocCurveDisplay.from_estimator(pipeline, X_test, y_test, ax=ax, name=name)
plt.title("ROC Curve")
plt.show()

names = []
precisions = []
f1_scores = []
aucs = []
for name, pipeline in models:
    names.append(name)
    precisions.append(precision_score(y_test, pipeline.predict(X_test)))
    f1_scores.append(f1_score(y_test, best_rf_model.predict(X_test)))
    aucs.append(roc_auc_score(y_test, best_xgb_model.predict_proba(X_test)[:, 1]))


plt.bar(names, precisions)
plt.ylabel("scale")
plt.title("Precision")
plt.ylim(bottom=0.5)
plt.show()

plt.bar(names, f1_scores)
plt.ylabel("scale")
plt.title("F1 Score")
plt.ylim(bottom=0.5)
plt.show()

plt.bar(names, aucs)
plt.ylabel("scale")
plt.title("AUC Score")
plt.ylim(bottom=0.5)
plt.show()

AUC: 0.9192
Precision: 0.8513 ± 0.0384
F1-score: 0.8506 ± 0.0380
