In [1]:
# Import 
# We will need the RBCPath type from the rbclib package to load data from the RBC.
from rbclib import RBCPath

# We'll also want to load some data directly from the filesystem.
from pathlib import Path

# We'll want to load/process some of the data using pandas and numpy.
import pandas as pd
import numpy as np

import time

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [2]:
# Import data 
# all_sub_brain_data = pd.read_parquet('all_subject_brainData.parquet')

cols_to_use = ['subject_id', 'StructName', 'SurfArea', 'GrayVol'] 
all_sub_brain_data = pd.read_parquet('all_subject_brainData.parquet', columns=cols_to_use)


all_sub_brain_data["subject_id"] = all_sub_brain_data["subject_id"].str.replace("sub-", "").astype(int)

In [3]:
# Participant meta-data is generally located in the BIDS repository for each
# study:
rbcdata_path = Path('/home/jovyan/shared/data/RBC')
train_filepath = rbcdata_path / 'train_participants.tsv'
test_filepath = rbcdata_path / 'test_participants.tsv'

# Load the PNC participants TSV files...
with train_filepath.open('r') as f:
    train_data = pd.read_csv(f, sep='\t')
with test_filepath.open('r') as f:
    test_data = pd.read_csv(f, sep='\t')

# We can also concatenate the two datasets into a single dataset of all
# study participants:
all_data = pd.concat([train_data, test_data])
all_data = all_data.rename(columns={'participant_id': 'subject_id'})

In [41]:
# --- Fusion avec p_factor ---
merged_df = all_sub_brain_data.merge(all_data[['subject_id', 'p_factor']], on='subject_id')

# --- Agrégation par sujet et région ---
aggregated = merged_df.groupby(['subject_id', 'StructName']).mean(numeric_only=True)

# --- Pivot : index=subject_id, colonnes multi-indexées (StructName, measure) ---
pivot = aggregated.unstack(level='StructName')

# --- Flatten les colonnes : (measure, StructName) → StructName.measure ---
pivot.columns = [f"{struct}.{measure}" for measure, struct in pivot.columns]

# --- Ajout du p_factor ---
pivot = pivot.reset_index()
pivot = pivot.merge(all_data[['subject_id', 'p_factor']], on='subject_id')
pivot = pivot.dropna(subset=['p_factor'])

In [42]:
pivot

Unnamed: 0,subject_id,17Networks_1.SurfArea,17Networks_10.SurfArea,17Networks_11.SurfArea,17Networks_12.SurfArea,17Networks_13.SurfArea,17Networks_14.SurfArea,17Networks_15.SurfArea,17Networks_16.SurfArea,17Networks_17.SurfArea,...,region00901.p_factor,rostralanteriorcingulate.p_factor,rostralmiddlefrontal.p_factor,superiorfrontal.p_factor,superiorparietal.p_factor,superiortemporal.p_factor,supramarginal.p_factor,temporalpole.p_factor,transversetemporal.p_factor,p_factor
0,1317462,7595.5,3657.0,2284.5,5454.0,6478.0,2831.0,1791.0,8694.5,7622.5,...,-1.065559,-1.065559,-1.065559,-1.065559,-1.065559,-1.065559,-1.065559,-1.065559,-1.065559,-1.065559
1,8454119,9920.5,4286.0,1916.5,6549.5,7965.0,4006.5,2276.0,9302.0,9517.0,...,,-1.209245,-1.209245,-1.209245,-1.209245,-1.209245,-1.209245,-1.209245,-1.209245,-1.209245
3,11574123,8253.0,3672.5,1908.0,5427.0,6431.5,3170.5,1882.0,7519.0,7522.5,...,-0.573095,-0.573095,-0.573095,-0.573095,-0.573095,-0.573095,-0.573095,-0.573095,-0.573095,-0.573095
4,16684978,6775.5,3075.0,1611.5,4520.5,5365.0,2562.0,1472.0,6546.5,6317.0,...,0.269375,0.269375,0.269375,0.269375,0.269375,0.269375,0.269375,0.269375,0.269375,0.269375
5,25354225,8901.5,3072.5,1687.5,5686.0,6452.0,3141.5,1930.0,7148.0,7519.0,...,-1.608375,-1.608375,-1.608375,-1.608375,-1.608375,-1.608375,-1.608375,-1.608375,-1.608375,-1.608375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1585,4285301913,7384.0,3136.5,1811.5,4713.0,5814.0,2397.0,1789.0,7204.0,6681.0,...,-0.762220,-0.762220,-0.762220,-0.762220,-0.762220,-0.762220,-0.762220,-0.762220,-0.762220,-0.762220
1586,4285804772,8567.5,4515.0,2151.0,6606.0,8403.5,3681.0,2252.5,10198.0,9567.5,...,0.032354,0.032354,0.032354,0.032354,0.032354,0.032354,0.032354,0.032354,0.032354,0.032354
1587,4286784089,7161.5,2994.0,1515.5,4576.5,5311.0,2525.0,1570.5,6534.5,6088.5,...,,-0.682395,-0.682395,-0.682395,-0.682395,-0.682395,-0.682395,-0.682395,-0.682395,-0.682395
1590,4290234295,8122.5,3701.5,2156.0,5658.0,6492.5,2922.5,1999.5,7985.0,7168.0,...,-1.608375,-1.608375,-1.608375,-1.608375,-1.608375,-1.608375,-1.608375,-1.608375,-1.608375,-1.608375


In [43]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.impute import SimpleImputer
# from sklearn.model_selection import train_test_split
# import pandas as pd

# Préparation des données pour le modèle
X = pivot.drop(columns=['subject_id', 'p_factor'])
y = pivot['p_factor']

# Imputation
X = SimpleImputer(strategy='mean').fit_transform(X)

# Split + modèle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [46]:
# Importance par feature
importances = model.feature_importances_
feature_names = pivot.drop(columns=['subject_id', 'p_factor']).columns

# Extraire le nom de la région (avant le point)
struct_names = [col.split('.')[0] for col in feature_names]
importance_df = pd.DataFrame({'StructName': struct_names, 'Importance': importances})

# Regrouper les importances par structure (somme des importances de ses paramètres)
importance_by_struct = importance_df.groupby('StructName')['Importance'].sum().sort_values(ascending=False)

# Affichage


In [51]:
roi2keep = importance_by_struct.head(100)
roi_list = roi2keep.index.tolist()



In [54]:
filtered_df = all_sub_brain_data[all_sub_brain_data['StructName'].isin(roi_list)]

In [61]:
# --- Fusion avec p_factor ---
merged_df2 = filtered_df.merge(all_data[['subject_id', 'p_factor']], on='subject_id')


In [63]:
train_df = merged_df2[merged_df2['p_factor'].notna()].copy()
test_df = merged_df2[merged_df2['p_factor'].isna()].copy()

In [66]:
# 1. Agrégation moyenne sur train_df
agg_train = train_df.groupby(['subject_id', 'StructName']).mean(numeric_only=True).reset_index()

# 2. Pivot train (colonnes multi-index)
pivot_train = agg_train.pivot(index='subject_id', columns='StructName')

# Flatten colonnes (param, roi)
pivot_train.columns = [f"{param}_{roi}" for param, roi in pivot_train.columns]

# Cible y_train
p_factor_train = train_df.groupby('subject_id')['p_factor'].first()

# Même opération sur test_df
agg_test = test_df.groupby(['subject_id', 'StructName']).mean(numeric_only=True).reset_index()
pivot_test = agg_test.pivot(index='subject_id', columns='StructName')
pivot_test.columns = [f"{param}_{roi}" for param, roi in pivot_test.columns]

# --- Gestion des colonnes manquantes dans test par rapport à train ---
pivot_test = pivot_test.reindex(columns=pivot_train.columns, fill_value=0)  # ou np.nan puis imputer

In [69]:
X_train = pivot_train.values
X_test = pivot_test.values

y_train = p_factor_train.loc[pivot_train.index].values  # ordre aligné

# --- Modèle ---
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# --- Prédictions ---
y_pred = model.predict(X_test)

# Résultat : DataFrame avec subject_id et prédiction
predictions_df = pd.DataFrame({
    'subject_id': pivot_test.index,
    'p_factor_pred': y_pred
})

In [70]:
predictions_df

Unnamed: 0,subject_id,p_factor_pred
0,11407866,-1.599839
1,27876862,-1.599839
2,29811890,-1.599716
3,32938469,-1.599839
4,37329051,-1.599839
...,...,...
527,4242587695,-1.599839
528,4264683889,-1.599839
529,4284483827,-1.599839
530,4288368085,-1.599839
