In [1]:
import os 
import shutil
import pandas as pd
from config_local import helpers

from sklearn.model_selection import train_test_split

# Project-local helper object; it is used here only to look up the data root.
hp = helpers()

# Root of the data tree. It is concatenated directly with relative paths below,
# so it must already end with a path separator.
_data_path = hp.get_data_path()

# Load the NYU phenotypic table and expose the "ScanDir ID" column under the
# shorter name "ID".
phenotypic = (
    pd.read_csv(_data_path + "NYU/NYU_phenotypic.csv")
    .rename(columns={"ScanDir ID": "ID"})
)

In [2]:
# Keep only rows that have a QC_Rest_1 entry and whose DX code is not 2.
# The explicit .copy() makes the filtered frame independent of the frame it was
# sliced from, so adding a column below does not trigger pandas'
# SettingWithCopyWarning.
phenotypic = phenotypic.loc[
    phenotypic["QC_Rest_1"].notna() & (phenotypic["DX"] != 2)
].copy()

# Binary target: True where DX == 0, False for every other remaining DX code.
# NOTE(review): presumably DX == 0 marks the control group - confirm against
# the dataset codebook.
phenotypic["y"] = phenotypic["DX"] == 0

In [3]:
# Hold out 20% of the rows as the test set. The split is shuffled with a fixed
# seed for reproducibility and stratified on the multi-class DX column.
df_train, df_test = train_test_split(
    phenotypic,
    test_size=0.2,
    random_state=815,
    shuffle=True,
    stratify=phenotypic["DX"],
)

In [4]:
# Sanity check of the stratified split: print the normalized DX class shares of
# the train and test subsets so they can be compared.
print(f"Train distribution:\n{df_train['DX'].value_counts(normalize=True)}\n")
print(f"Test distribution:\n{df_test['DX'].value_counts(normalize=True)}\n")

Train distribution:
0    0.456140
1    0.339181
3    0.204678
Name: DX, dtype: float64

Test distribution:
0    0.465116
1    0.348837
3    0.186047
Name: DX, dtype: float64



In [5]:
from sklearn.model_selection import StratifiedKFold

# Shuffled 5-fold stratified splitter; the fixed seed makes fold membership
# reproducible across runs. The labels to stratify on are not bound here -
# they are supplied each time `skf.split(X, y)` is called.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=815)

In [6]:
# Preview the DX class balance of the held-out part of every fold.
# NOTE(review): the folds are stratified on the binary label `y`, while the
# shares printed here are those of the multi-class `DX` column - confirm that
# `y` is the intended stratification target.
fold_features = df_train.drop(columns="y")
fold_labels = df_train["y"]

for fold_no, (fit_index, held_out_index) in enumerate(skf.split(fold_features, fold_labels)):
    print(f"Fold {fold_no}:")
    print(f"Test fold distribution:\n{df_train.iloc[held_out_index]['DX'].value_counts(normalize=True)}\n")

Fold 0:
Test fold distribution:
0    0.457143
3    0.285714
1    0.257143
Name: DX, dtype: float64

Fold 1:
Test fold distribution:
0    0.470588
1    0.411765
3    0.117647
Name: DX, dtype: float64

Fold 2:
Test fold distribution:
0    0.470588
1    0.323529
3    0.205882
Name: DX, dtype: float64

Fold 3:
Test fold distribution:
0    0.441176
1    0.323529
3    0.235294
Name: DX, dtype: float64

Fold 4:
Test fold distribution:
0    0.441176
1    0.382353
3    0.176471
Name: DX, dtype: float64



In [7]:
# Write every cross-validation fold to disk: for each fold, the images of the
# subjects in its held-out part are copied into data/train/fold<k>/ (1-based).
# NOTE(review): folds are stratified on the binary label `y`, while the printed
# distribution is that of `DX` - confirm that is the intended target.
for i, (train_index, test_index) in enumerate(skf.split(df_train.drop('y', axis=1), df_train.y)):
    print(f"Fold {i+1}:")
    print(f"Test fold distribution:\n{df_train.iloc[test_index].DX.value_counts(normalize=True)}\n")

    dest = _data_path + f"data/train/fold{i+1}/"
    print(dest)

    # Start from an empty directory so files from an earlier split cannot linger.
    if os.path.exists(dest):
        shutil.rmtree(dest)

    # makedirs (not mkdir) so the cell also runs when data/train/ does not
    # exist yet, e.g. on a fresh checkout of the data directory.
    os.makedirs(dest)

    for subject_id in df_train.iloc[test_index].ID.values:
        # Directory and file names use the ID left-padded with zeros to 7 characters.
        ID = str(subject_id).zfill(7)
        src = _data_path + f"NYU/{ID}/"
        file_names = [
            f"wmean_mrda{ID}_session_1_rest_1.nii.gz",
            f"mask_wmean_mrda{ID}_session_1_rest_1.nii.gz",
            f"sfnwmrda{ID}_session_1_rest_1.nii.gz",
        ]

        # Check all three files up front: a subject is either copied completely
        # or skipped with a message, never left half-copied by a mid-way error.
        missing = [name for name in file_names if not os.path.exists(src + name)]
        if missing:
            for name in missing:
                print(f"{src + name}\n does not exist!")
            continue

        for name in file_names:
            shutil.copyfile(src + name, dest + name)


Fold 1:
Test fold distribution:
0    0.457143
3    0.285714
1    0.257143
Name: DX, dtype: float64

/media/jan/TOSHIBA EXT/SMHDD_22/data/train/fold1/
Fold 2:
Test fold distribution:
0    0.470588
1    0.411765
3    0.117647
Name: DX, dtype: float64

/media/jan/TOSHIBA EXT/SMHDD_22/data/train/fold2/
Fold 3:
Test fold distribution:
0    0.470588
1    0.323529
3    0.205882
Name: DX, dtype: float64

/media/jan/TOSHIBA EXT/SMHDD_22/data/train/fold3/
Fold 4:
Test fold distribution:
0    0.441176
1    0.323529
3    0.235294
Name: DX, dtype: float64

/media/jan/TOSHIBA EXT/SMHDD_22/data/train/fold4/
Fold 5:
Test fold distribution:
0    0.441176
1    0.382353
3    0.176471
Name: DX, dtype: float64

/media/jan/TOSHIBA EXT/SMHDD_22/data/train/fold5/


In [8]:
# Copy the images of every held-out test subject into data/test/.
dest = _data_path + "data/test/"

# Delete the folder if it exists and create a new, empty one.
if os.path.exists(dest):
    shutil.rmtree(dest)

# makedirs (not mkdir) so the cell also runs when the parent data/ directory
# does not exist yet.
os.makedirs(dest)

# Copy test files.
for subject_id in df_test.ID.values:
    # Directory and file names use the ID left-padded with zeros to 7 characters.
    ID = str(subject_id).zfill(7)
    src = _data_path + f"NYU/{ID}/"
    file_names = [
        f"wmean_mrda{ID}_session_1_rest_1.nii.gz",
        f"mask_wmean_mrda{ID}_session_1_rest_1.nii.gz",
        f"sfnwmrda{ID}_session_1_rest_1.nii.gz",
    ]

    # Check all three files up front: a subject is either copied completely
    # or skipped with a message, never left half-copied by a mid-way error.
    missing = [name for name in file_names if not os.path.exists(src + name)]
    if missing:
        for name in missing:
            print(f"{src + name}\n does not exist!")
        continue

    for name in file_names:
        shutil.copyfile(src + name, dest + name)


<p style="text-align: center;"><i><b>Fin</b></i></p>