In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error, make_scorer, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

from nimosef.utils.stats import drop_high_nan, process_columns_for_rf
from ukb_utils import columns_to_code, columns_to_drop, pretty_name_map

In [None]:
# ---------------------------------------------------------------------------
# Configuration: dataset location, experiments to compare, derived paths
# ---------------------------------------------------------------------------
# NOTE(review): absolute, machine-specific location -- adjust when running elsewhere.
data_path = '/media/jaume/DATA/Data/Urblauna_SFTP/UKB_Cardiac_BIDS'
split_filename = 'derivatives/nimosef_flip_logs/train_val_test_split.json'
number_patients = 1000

# Mean shape code: the two experiments being compared
baseline_experiment_name = 'experiment_20250317_201737'  # Baseline
new_model_experiment_name = 'experiment_20250317_201146'  # Motion

derivatives_path = os.path.join(data_path, 'derivatives')
# Every image/result/shape-code path below lives under this folder, so build it once.
baseline_logs_path = os.path.join(derivatives_path, 'nimosef_flip_logs', 'baseline')

# For the training
# NOTE(review): the folder names below hard-code "train" instead of using
# dataset_split -- confirm whether they are meant to follow this variable.
dataset_split = 'train'

res_factor_z = 1  # Same as original
img_folder_baseline = os.path.join(baseline_logs_path, f"imgs_train_{baseline_experiment_name}_res_factor_{res_factor_z}")
img_folder_new_model = os.path.join(baseline_logs_path, f"imgs_train_{new_model_experiment_name}_res_factor_{res_factor_z}")

# Output folder for the comparison results (created if it does not exist yet).
save_folder_results = os.path.join(baseline_logs_path, f"results_train_comparison_res_factor_{res_factor_z}")
os.makedirs(save_folder_results, exist_ok=True)

# Metadata file
metadata_filename = os.path.join(derivatives_path, 'metadata_participants_ALL.tsv')

# Per-experiment shape codes (parquet) and shape-code distances (csv).
path_to_baseline_code = os.path.join(baseline_logs_path, baseline_experiment_name, 'shape_code.parquet')
path_to_new_model_code = os.path.join(baseline_logs_path, new_model_experiment_name, 'shape_code.parquet')

path_to_baseline_distance = os.path.join(baseline_logs_path, baseline_experiment_name, 'shape_code_distances.csv')
path_to_new_model_distance = os.path.join(baseline_logs_path, new_model_experiment_name, 'shape_code_distances.csv')

# ---------------------------------------------------------------------------
# Load the tables; the first column of each csv/tsv file becomes the index.
# ---------------------------------------------------------------------------
df_shape_code_baseline = pd.read_parquet(path_to_baseline_code)
df_shape_code_new_model = pd.read_parquet(path_to_new_model_code)
df_shape_baseline_distance = pd.read_csv(path_to_baseline_distance, index_col=0)
df_shape_new_distance = pd.read_csv(path_to_new_model_distance, index_col=0)
df_metadata = pd.read_csv(metadata_filename, index_col=0, sep='\t')

In [None]:
# Label the index of both tables 'Subject' so that reset_index() exposes it as
# a column with a common name that the merge can key on.
df_metadata.index.name = 'Subject'
df_shape_baseline_distance.index.name = 'Subject'

# Join the participant metadata with the baseline shape-code distances on
# 'Subject' (merge defaults to an inner join).
df_merged = (
    df_metadata.reset_index().copy()
    .merge(df_shape_baseline_distance.reset_index().copy(), on='Subject')
)

# Subjects to drop (empty list: nobody is excluded)
subj_to_drop = []

In [None]:
# Prepare the merged metadata/distance table (df_merged, built in the previous
# cell) for the random-forest analysis. A copy is handed over, so df_merged is
# not modified by this call.
# NOTE(review): the exact column handling is defined by process_columns_for_rf
# in nimosef.utils.stats -- see that helper for what is dropped/encoded.
numeric_df = process_columns_for_rf(df_merged.copy(), columns_to_drop, columns_to_code, nan_threshold=1.)

# Exclude the subjects listed in subj_to_drop and renumber the remaining rows.
numeric_df = numeric_df[~numeric_df["Subject"].isin(subj_to_drop)].reset_index(drop=True)

# Inspect the resulting DataFrame: column names, dtypes and overall shape.
print("Processed DataFrame columns:")
print(numeric_df.columns)
print(numeric_df.dtypes)

print(numeric_df.shape)

In [None]:
# Regression target: the 'Distance' column of the processed table.
y = numeric_df['Distance'].values

# Predictors: every column except the subject identifier and the target.
# get_dummies one-hot encodes any remaining object/string/categorical columns.
X_df = pd.get_dummies(numeric_df.drop(columns=['Subject', 'Distance']))
X = X_df.values
feature_names = X_df.columns.tolist()

# Same encoding, but keeping 'Distance' (only 'Subject' is removed).
X_sp = pd.get_dummies(numeric_df.drop(columns=['Subject']))

In [None]:

# ----------------------------
# Experiment settings (single source of truth for folds, seed and forest)
# ----------------------------
n_splits = 5
random_seed = 42
rf_params = dict(n_estimators=501, random_state=random_seed, max_depth=20, max_samples=0.8)


def _rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# RMSE scorer. With greater_is_better=False scikit-learn negates the value, so
# scores produced through this scorer are negative RMSE (higher is better).
# It is defined here but not used further down in this cell.
rmse_scorer = make_scorer(_rmse, greater_is_better=False)

# ----------------------------
# K-Fold Cross-Validation Setup
# ----------------------------
cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed)

# Reference estimator with the same hyper-parameters as the per-fold models.
# It is not fitted in this cell.
rf = RandomForestRegressor(**rf_params)

# ----------------------------
# Per-fold training: feature importances, MAE and R²
# ----------------------------
all_importances = []
fold_maes = []
fold_r2s = []
for train_index, test_index in cv.split(X):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # Train a fresh model on each fold so no state leaks between folds.
    rf_fold = RandomForestRegressor(**rf_params)
    rf_fold.fit(X_train, y_train)
    all_importances.append(rf_fold.feature_importances_)

    # Score the fold on its held-out samples.
    y_pred_fold = rf_fold.predict(X_test)
    fold_maes.append(mean_absolute_error(y_test, y_pred_fold))
    fold_r2s.append(r2_score(y_test, y_pred_fold))

print("MAE for each fold:", fold_maes)
print("Mean MAE:", np.mean(fold_maes))
print("R² for each fold:", fold_r2s)
print("Mean R²:", np.mean(fold_r2s))

# ----------------------------
# Average the importances over folds and plot the top features
# ----------------------------
all_importances = np.array(all_importances)  # shape: (n_folds, n_features)
mean_importances = np.mean(all_importances, axis=0)

# Sort features by importance (descending).
sorted_indices = np.argsort(mean_importances)[::-1]
top_n = 10  # Maximum number of top features to display.
top_indices = sorted_indices[:top_n]
top_features = [feature_names[i] for i in top_indices]
top_importances = mean_importances[top_indices]

# Plot feature importances.
# NOTE(review): recent seaborn releases warn when `palette` is given without
# `hue`; revisit this call when upgrading seaborn.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_importances, y=top_features, palette="viridis", ax=ax)
ax.set_xlabel("Average Feature Importance")
# The title reports what is actually shown: the number of plotted features and
# the number of folds configured above.
ax.set_title(f"Top {len(top_features)} Important Features ({n_splits}-Fold CV)")
plt.show()

In [None]:
# One row label per evaluated fold. The labels are derived from the number of
# recorded fold scores rather than a fixed range, so the tables stay consistent
# if the number of cross-validation splits is changed.
fold_labels = [f"Fold_{i}" for i in range(1, len(fold_maes) + 1)]

# Per-fold feature importances: one row per fold, one column per feature.
df_importance = pd.DataFrame(data=all_importances, columns=feature_names, index=fold_labels)

# Per-fold scores, combined side by side into a single table.
df_mae = pd.DataFrame(data=fold_maes, columns=['MAE'], index=fold_labels)
df_r2 = pd.DataFrame(data=fold_r2s, columns=['R²'], index=fold_labels)
df_scores = pd.concat([df_mae, df_r2], axis=1)

# Persist both tables in the results folder.
df_scores.to_csv(os.path.join(save_folder_results, 'scores_baseline.csv'))
df_importance.to_csv(os.path.join(save_folder_results, 'importance_baseline.csv'))