# Radiomics & Clinical Data for CTO

In [None]:
# Install Baselines
!pip install catboost xgboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


# 🕮 Necessary Imports for the Notebook

The examples in this notebook require the following imports.
Make sure to run this cell before any other cell.




In [None]:
# Setup Imports
import pandas as pd
import numpy as np

from sklearn.datasets import load_breast_cancer, load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, r2_score

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.inspection import DecisionBoundaryDisplay

# Baseline Imports
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from catboost import CatBoostClassifier, CatBoostRegressor

In [None]:
# Upload dataset
# Calls the Colab upload helper and keeps whatever it returns in `uploaded`.
# NOTE(review): google.colab is presumably only available when the notebook
# runs inside Google Colab — confirm before running elsewhere.
from google.colab import files
uploaded = files.upload()

In [None]:
# Read the Excel workbook into the DataFrame used by every later cell.
# NOTE(review): 'Your_data.xlsx' looks like a placeholder — presumably it must
# be replaced with the name of the uploaded file; confirm.
df = pd.read_excel('Your_data.xlsx')

# EDA

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# 1. Basic data exploration
print("Shape of the dataset:", df.shape)

# 2. Descriptive statistics (computed before rows without an outcome are dropped)
print("\nDescriptive statistics:\n", df.describe())  # Summary statistics for numerical columns
df = df.dropna(subset=['FAIL'])  # keep only rows that have a value in the FAIL column
fail_counts = df['FAIL'].value_counts()
print("\nClass distribution (FAIL column):\n", fail_counts)  # Distribution of classes

# 3. Visualizations
# value_counts() orders classes by frequency, not by class value, so fixed
# tick labels could end up on the wrong bars. Sort by class value and derive
# the label/colour of each bar from the class it actually represents.
# NOTE(review): assumes FAIL is coded 0 = success, 1 = failure — confirm.
outcome_labels = {0: 'Success', 1: 'Fail'}
outcome_colors = {0: 'skyblue', 1: 'salmon'}
plot_counts = fail_counts.sort_index()
ax = plot_counts.plot(
    kind='bar',
    color=[outcome_colors.get(value, 'gray') for value in plot_counts.index],
)
# Unknown class values fall back to their own text, so the number of labels
# always matches the number of bars.
ax.set_xticklabels([outcome_labels.get(value, str(value)) for value in plot_counts.index])
plt.title('Operation outcome')
plt.xlabel('Class')
plt.ylabel('Count')
plt.xticks(rotation=0)  # Keep x-axis labels horizontal
plt.show()

In [None]:
# Check data format: try to turn every cell into a float and remember which
# columns contain values that cannot be converted.
unconvertible_columns = []

def convert_to_float(x, column_name):
    """Return ``x`` as a float, or ``x`` unchanged when it cannot be converted.

    Parameters
    ----------
    x : any
        A single cell value.
    column_name : hashable
        Name of the column the value came from. It is appended (once) to the
        module-level ``unconvertible_columns`` list when conversion fails.

    Returns
    -------
    float or the original value
        ``float(x)`` on success, otherwise ``x`` itself.
    """
    try:
        return float(x)
    except (ValueError, TypeError):
        # ValueError: text that is not a number.
        # TypeError: objects float() does not accept at all (e.g. None, dates);
        # without this the whole column sweep would abort on such a cell.
        if column_name not in unconvertible_columns:
            unconvertible_columns.append(column_name)
        return x

# Sweep every column through convert_to_float, replacing the column with the
# converted values. The column name is bound as a default argument so each
# callback reports failures against its own column.
for column in df.columns:
    converted = df[column].apply(
        lambda value, name=column: convert_to_float(value, name)
    )
    df[column] = converted

print("Columns with unconvertible values:", unconvertible_columns)

In [None]:
# Rows that contain at least one missing value (NaN)
rows_with_missing_data = df[df.isnull().any(axis=1)]
num_rows_with_missing_data = len(rows_with_missing_data)

# Rows that contain at least one string value.
# DataFrame.applymap is deprecated since pandas 2.1; mapping each column with
# Series.map performs the same element-wise check on every pandas version.
contains_string = df.apply(lambda col: col.map(lambda value: isinstance(value, str)))
rows_with_strings = df[contains_string.any(axis=1)]
num_rows_with_strings = len(rows_with_strings)

print(f"Number of rows with missing data: {num_rows_with_missing_data}")
print(f"Number of rows with strings: {num_rows_with_strings}")

In [None]:
# Remove every column that was flagged as holding non-numeric values.
# NOTE(review): rows with missing values are only counted above, not removed — confirm this is intended.
cleaned_df = df.drop(unconvertible_columns, axis=1)

Boruta Feature Selection

In [None]:
!pip install Boruta

Split into train and test sets before Boruta, so that feature selection only sees the training data

In [None]:
# Target is the 'FAIL' column; every other column is a predictor.
y = cleaned_df['FAIL']
X = cleaned_df.drop(columns=['FAIL'])

In [None]:
# Hold out 40% of the samples for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# class balance printed below differs noticeably between the two sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.4,
)

# Wrap both targets in pandas Series so value_counts() is available
y_train_series, y_test_series = pd.Series(y_train), pd.Series(y_test)

print(y_train_series.value_counts(), y_test_series.value_counts())

In [None]:
# Radiomics + clinical workflow: partition the training columns into two
# groups at the 'Strength_864' column.
# NOTE(review): presumably group 1 holds the radiomics features and group 2
# the clinical ones — confirm against the spreadsheet layout.
strength_index = X_train.columns.get_loc('Strength_864')

# Group 1: every column up to and including 'Strength_864'
X_train1_columns = X_train.columns[:strength_index + 1].tolist()
X_train1 = X_train.loc[:, X_train1_columns]
print(X_train1_columns)

# Group 2: every column whose name is not in group 1
group1_names = set(X_train1_columns)
X_train2_columns = []
for col in X_train.columns:
    if col not in group1_names:
        X_train2_columns.append(col)
X_train2 = X_train.loc[:, X_train2_columns]
print(X_train2.columns)

In [None]:
# Run Boruta separately on each feature group
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier


def _boruta_select(features, target):
    """Fit BorutaPy (wrapping a random forest) on one feature group.

    Returns the forest, the fitted selector, the transformed training values
    and the names of the columns the selector marked in ``support_``.
    """
    forest = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
    selector = BorutaPy(forest, n_estimators='auto', verbose=2, random_state=1)
    selector.fit(features.values, target)
    filtered = selector.transform(features.values)
    return forest, selector, filtered, features.columns[selector.support_]


# Group 1 (X_train1)
rf1, feat_selector1, X1_filtered, selected_features1 = _boruta_select(X_train1, y_train)

# Group 2 (X_train2)
rf2, feat_selector2, X2_filtered, selected_features2 = _boruta_select(X_train2, y_train)

For combined clinical and radiomics features

In [None]:
# Names of all features kept by Boruta: group 1 first, then group 2
all_selected_features = selected_features1.tolist() + selected_features2.tolist()

# Training matrix: selected columns of both groups placed side by side
X_train_selected = pd.concat(
    objs=[X_train1.loc[:, selected_features1], X_train2.loc[:, selected_features2]],
    axis=1,
)

# Test matrix: the same columns, in the same order
X_test_selected = X_test.loc[:, all_selected_features]

For radiomics vs clinical (this cell overwrites `X_train_selected` and `X_test_selected` from the combined cell above)

In [None]:
# Choose which Boruta-selected feature group feeds the models below.
# A single switch keeps the train and test selections in sync; previously the
# two had to be toggled separately by commenting/uncommenting lines.
# NOTE(review): 'radiomics' maps to group 1 (columns up to 'Strength_864') and
# 'clinical' to group 2 (the remaining columns) — presumed naming, confirm.
FEATURE_SET = 'radiomics'  # one of: 'radiomics', 'clinical'

feature_groups = {
    'radiomics': (X_train1, selected_features1),
    'clinical': (X_train2, selected_features2),
}
if FEATURE_SET not in feature_groups:
    raise ValueError(
        f"Unknown FEATURE_SET {FEATURE_SET!r}; expected one of {sorted(feature_groups)}"
    )

train_source, chosen_features = feature_groups[FEATURE_SET]
X_train_selected = train_source[chosen_features]
X_test_selected = X_test[chosen_features]

Preprocess dataset

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply
# that same fitted scaler to both the training and the test features.
scaler = StandardScaler()
scaler.fit(X_train_selected)

X_train_scaled = scaler.transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# Train models

In [None]:
# Fit each baseline on the scaled training set and keep the class
# probabilities it predicts for the scaled test set.
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train_scaled, y_train)
y_pred_xgb = xgb_model.predict_proba(X_test_scaled)

cat_model = CatBoostClassifier(random_state=42, verbose=0)
cat_model.fit(X_train_scaled, y_train)
y_pred_cat = cat_model.predict_proba(X_test_scaled)

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_scaled, y_train)
y_pred_rf = rf_model.predict_proba(X_test_scaled)

In [None]:
from sklearn.metrics import roc_auc_score, matthews_corrcoef, f1_score

# A sample is predicted as class 1 when its class-1 probability exceeds this value
DECISION_THRESHOLD = 0.4

# Predicted probabilities per model, in the order they are reported
predictions_by_model = {
    "XGBoost": y_pred_xgb,
    "RandomForest": y_pred_rf,
    "CatBoost": y_pred_cat,
}

# Collect one [name, AUC, MCC, F1] row per model
model_scores = []
for model_name, y_pred_proba in predictions_by_model.items():
    positive_proba = y_pred_proba[:, 1]
    y_pred = (positive_proba > DECISION_THRESHOLD).astype(int)

    # AUC uses the raw probabilities; MCC and F1 use the thresholded labels
    auc = roc_auc_score(y_test, positive_proba)
    mcc = matthews_corrcoef(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    model_scores.append([model_name, auc, mcc, f1])

# Score table
score_columns = ["Model", "AUC", "MCC", "F1-Score"]
scores_df = pd.DataFrame(model_scores, columns=score_columns)

# Horizontal bar chart of the F1-scores
ax = scores_df.plot(kind="barh", x="Model", y="F1-Score")
ax.set_xlim(0, 1)  # F1-score ranges from 0 to 1
ax.set_title("F1-Scores of Models")
plt.show()

# Print the full score table
print(scores_df)