In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nimosef.utils.stats import drop_high_nan, process_columns_for_rf
from ukb_utils import columns_to_code, columns_to_drop, pretty_name_map

In [None]:
# --- Dataset location and experiment identifiers -----------------------------
# NOTE: machine-specific absolute path; adjust it when running on another host.
data_path = '/media/jaume/DATA/Data/Urblauna_SFTP/UKB_Cardiac_BIDS'
split_filename = 'derivatives/nimosef_flip_logs/train_val_test_split.json'
number_patients = 1000

baseline_experiment_name = 'experiment_20250310_175059'  # Baseline v1
new_model_experiment_name = 'experiment_20250311_001426'  # Motion v1

# For the training
dataset_split = 'train'

res_factor_z = 1  # Same as original

# --- Derived paths -----------------------------------------------------------
derivatives_path = os.path.join(data_path, 'derivatives')

# Every experiment artefact referenced below lives under this folder, so the
# prefix is built once instead of being repeated in each os.path.join call.
experiments_root = os.path.join(derivatives_path, 'nimosef_flip_logs', 'baseline')

img_folder_baseline = os.path.join(experiments_root, f"imgs_train_{baseline_experiment_name}_res_factor_{res_factor_z}")
img_folder_new_model = os.path.join(experiments_root, f"imgs_train_{new_model_experiment_name}_res_factor_{res_factor_z}")

# Output folder for this comparison; created up front so later cells can
# write into it. (It used to be assigned twice with the same value.)
save_folder_results = os.path.join(experiments_root, f"results_train_comparison_res_factor_{res_factor_z}")
os.makedirs(save_folder_results, exist_ok=True)

# Metadata file
metadata_filename = os.path.join(derivatives_path, 'metadata_participants_ALL.tsv')

# Shape codes (parquet) and shape-code distances (csv) of each experiment.
path_to_baseline_code = os.path.join(experiments_root, baseline_experiment_name, 'shape_code.parquet')
path_to_new_model_code = os.path.join(experiments_root, new_model_experiment_name, 'shape_code.parquet')

path_to_baseline_distance = os.path.join(experiments_root, baseline_experiment_name, 'shape_code_distances.csv')
path_to_new_model_distance = os.path.join(experiments_root, new_model_experiment_name, 'shape_code_distances.csv')

# Optional input, currently disabled:
# path_to_flip = os.path.join(save_folder_results, 'flip_affine.csv')
# df_flip_affine = pd.read_csv(path_to_flip, index_col=0)

# --- Load tables -------------------------------------------------------------
# The csv/tsv files use their first column as the index.
df_shape_code_baseline = pd.read_parquet(path_to_baseline_code)
df_shape_code_new_model = pd.read_parquet(path_to_new_model_code)
df_shape_baseline_distance = pd.read_csv(path_to_baseline_distance, index_col=0)
df_shape_new_distance = pd.read_csv(path_to_new_model_distance, index_col=0)
df_metadata = pd.read_csv(metadata_filename, index_col=0, sep='\t')

In [None]:
# Give both tables an index called 'Subject' so that reset_index() turns it
# into a column with the same name on each side of the join.
for frame in (df_shape_new_distance, df_metadata):
    frame.index.name = 'Subject'

# (Disabled) the flip table would get the same treatment:
# df_flip_affine.index.name = 'Subject'

# Join participant metadata with the new model's shape-code distances,
# matching rows on the 'Subject' column.
df_merged = (
    df_metadata.reset_index().copy()
    .merge(df_shape_new_distance.reset_index().copy(), on='Subject')
)
# (Disabled) df_merged = pd.merge(df_merged, df_flip_affine.reset_index().copy(), on='Subject')
print(df_merged.head())

# Subjects to exclude from the analysis. Empty for now; a previously used list was
# ['sub-1076522', 'sub-1112328', 'sub-1019084', 'sub-1140112', 'sub-1134162'].
subj_to_drop = []

In [None]:
# Pre-process the merged table built in the previous cell (`df_merged`).
# A copy is handed to the helper, so `df_merged` itself is not passed by
# reference. The merge is not repeated here: it would rebuild exactly the same
# frame from the same inputs.
# NOTE(review): judging by its name, process_columns_for_rf prepares the columns
# for a random-forest model using `columns_to_drop` / `columns_to_code`; its
# exact semantics (including `nan_threshold`) are defined in nimosef.utils.stats.
numeric_df = process_columns_for_rf(df_merged.copy(), columns_to_drop, columns_to_code, nan_threshold=1)

# Filter out the subjects listed in `subj_to_drop` (defined in the previous
# cell) and renumber the rows from 0.
numeric_df = numeric_df[~numeric_df["Subject"].isin(subj_to_drop)]
numeric_df = numeric_df.reset_index(drop=True)

# Quick sanity check of the resulting table: columns, dtypes and shape.
print("Processed DataFrame columns:")
print(numeric_df.columns)
print(numeric_df.dtypes)

print(numeric_df.shape)

In [None]:
# Predictors: every column of `numeric_df` except the subject identifier and
# the target, passed through pd.get_dummies.
# (Alternative without get_dummies:
#  X_df = numeric_df.drop(columns=['Subject', 'Distance']).copy())
X_df = pd.get_dummies(numeric_df.drop(columns=['Subject', 'Distance']))
X = X_df.values

# Column labels of X, in the same order as its columns.
feature_names = list(X_df.columns)

# Target: the 'Distance' column.
y = numeric_df['Distance'].values

In [None]:
# Result files (scores and feature importances) stored for each model in the
# comparison folder.
scores_baseline = os.path.join(save_folder_results, 'scores_baseline.csv')
importance_baseline = os.path.join(save_folder_results, 'importance_baseline.csv')

scores_new_model = os.path.join(save_folder_results, 'scores_new.csv')
importance_new_model = os.path.join(save_folder_results, 'importance_new.csv')

# The first column of every file is used as the index.
df_scores_baseline = pd.read_csv(scores_baseline, index_col=0)
df_importance_baseline = pd.read_csv(importance_baseline, index_col=0)

df_scores_new = pd.read_csv(scores_new_model, index_col=0)
df_importance_new = pd.read_csv(importance_new_model, index_col=0)

# Name the index 'Fold' on all four tables. The baseline importance table was
# previously skipped, which left it inconsistent with the other three.
df_scores_baseline.index.name = 'Fold'
df_scores_new.index.name = 'Fold'
df_importance_baseline.index.name = 'Fold'
df_importance_new.index.name = 'Fold'

# Tag each importance table with its model so they can be told apart once
# they are combined.
df_importance_baseline['model'] = 'baseline'
df_importance_new['model'] = 'new'

# NOTE(review): positional relabelling - assumes each scores file holds exactly
# two columns, in the order (MAE, R2); confirm against the code that wrote them.
df_scores_baseline.columns = ['MAE', 'R2']
df_scores_new.columns = ['MAE', 'R2']

In [None]:
# pandas / matplotlib are already imported in the first cell of the notebook,
# so they are not re-imported here.
plt.rcParams.update({'font.size': 14})

# Combine the importance tables of both models; the 'model' column tells the
# rows apart.
df_importances = pd.concat([df_importance_baseline, df_importance_new])

# Average importance per feature across folds for each model.
# (Assumes that all columns except 'model' are feature importances.)
df_avg = df_importances.groupby('model').mean()

# Transpose so rows = features and columns = models, then order the features
# by the larger of their two average importances. The helper column only
# exists inside the chain, so no in-place mutation is needed.
df_avg_transposed = df_avg.transpose()
df_avg_transposed = (
    df_avg_transposed
    .assign(max_importance=df_avg_transposed[['baseline', 'new']].max(axis=1))
    .sort_values('max_importance', ascending=False)
    .drop(columns='max_importance')
)

# Keep only the top N features so the plot stays readable.
top_n = 20
# Relabel the model columns by name rather than by position, so the legend
# cannot be silently swapped if the column order ever changes.
df_top = df_avg_transposed.head(top_n).rename(columns={'baseline': 'Baseline', 'new': 'Ours'})

# Replace raw feature names with their display names where a mapping exists.
df_top_pretty = df_top.rename(index=pretty_name_map)

# ----------------------------
# Average R² of each model across folds
# ----------------------------
baseline_r2 = df_scores_baseline['R2'].mean()
new_r2 = df_scores_new['R2'].mean()

# ----------------------------
# Side-by-side bar chart with R² annotation
# ----------------------------
fig, ax = plt.subplots(figsize=(10, 6))
df_top_pretty.plot(kind='bar', ax=ax)
ax.set_title("Average Feature Importances Across Folds (5-Fold CV)")
ax.set_ylabel("Average Importance")
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Text box with the average R² values, anchored to the right edge of the axes
# (axes-fraction coordinates).
r2_text = f"Baseline R²: {baseline_r2:.2f}\nOurs R²: {new_r2:.2f}"
props = dict(boxstyle='round', facecolor='white', alpha=0.5)
ax.text(0.99, 0.7, r2_text, transform=ax.transAxes, fontsize=14,
        verticalalignment='top', horizontalalignment='right', bbox=props)

# Place the legend in the upper-right corner, above the R² box.
ax.legend(loc='upper right')

fig.tight_layout()

# Save BEFORE showing: depending on the backend, plt.show() may close or
# release the figure, so writing the files first is the safe order.
save_path = os.path.join(save_folder_results, 'feature_importances.png')
fig.savefig(save_path, dpi=300, bbox_inches='tight')

# NOTE(review): the R² box uses alpha=0.5; matplotlib's PostScript output is
# expected to render it opaque (with a warning) - check the .eps if that matters.
save_path = os.path.join(save_folder_results, 'feature_importances.eps')
fig.savefig(save_path, format='eps', dpi=300, bbox_inches='tight')

plt.show()