### Generate the post hoc dataset for analysis

In [1]:
from imagen_posthocloader import *

In [2]:
DATA_DIR = "/ritter/share/data/IMAGEN"
posthoc = IMAGEN_posthoc()

### 1. Load the [INSTRUMENT](https://imagen-europe.com/resources/imagen-dataset/documentation/) data

#### collect the selected instrument files from IMAGEN_RAW and store in posthoc

Please refer to <i>set_INSTRUMENT()</i> in <i>imagen_posthocloader.py</i>, and load the file and save it as <b> all_*.csv</b> 

In [3]:
# Instrument
NEO   = posthoc.set_INSTRUMENT('NEO')#, save=True)
SURPS = posthoc.set_INSTRUMENT('SURPS')#, save=True)
CTQ   = posthoc.set_INSTRUMENT('CTQ')#, save=True)
CTS   = posthoc.set_INSTRUMENT('CTS')#, save=True)
LEQ   = posthoc.set_INSTRUMENT('LEQ')#, save=True)
PBQ   = posthoc.set_INSTRUMENT('PBQ')#, save=True)
GEN   = posthoc.set_INSTRUMENT('GEN')#, save=True)
FTND  = posthoc.set_INSTRUMENT('FTND')#, save=True)

In [4]:
# general information of the instrument
# selected ROI
col_NEO = list(NEO.columns[2:].values)
col_SURPS = list(SURPS.columns[2:].values)
col_CTQ = list(CTQ.columns[2:].values)
col_CTS = list(CTS.columns[2:].values)
col_LEQ = list(LEQ.columns[2:].values)
col_PBQ = list(PBQ.columns[2:].values)
col_GEN = list(GEN.columns[2:].values)
col_FTND = list(FTND.columns[2:].values)
roi_list = col_NEO + col_SURPS + col_CTQ + col_LEQ + col_PBQ + col_GEN + col_FTND
print(roi_list)

['Openness mean', 'Conscientiousness mean', 'Extroversion mean', 'Agreeableness mean', 'Neuroticism mean', 'Anxiety Sensitivity mean', 'Hopelessness mean', 'Impulsivity mean', 'Sensation seeking mean', 'Emotional abuse sum', 'Physical abuse sum', 'Sexual abuse sum', 'Emotional neglect sum', 'Physical neglect sum', 'Denial sum', 'Family valence', 'Accident valence', 'Sexuality valence', 'Autonomy valence', 'Devience valence', 'Relocation valence', 'Distress valence', 'Noscale valence', 'Overall valence', 'Family mean frequency', 'Accident mean frequency', 'Sexuality mean frequency', 'Autonomy mean frequency', 'Devience mean frequency', 'Relocation mean frequency', 'Distress mean frequency', 'Noscale mean frequency', 'Overall mean frequency', 'pbq_03', 'pbq_03a', 'pbq_03b', 'pbq_03c', 'pbq_05', 'pbq_05a', 'pbq_05b', 'pbq_05c', 'pbq_06', 'pbq_06a', 'pbq_12', 'pbq_13', 'pbq_13a', 'pbq_13b', 'pbq_13g', 'Paternal_disorder', 'Maternal_disorder', 'Likelihood of nicotine dependence child']


### 2. Load the HDF5 data

#### collect the HDF5 files from h5files and save in posthoc

Please refer to <i>set_HDF5()</i> in <i>imagen_posthocloader.py</i>, and load the file and save it as <b> all_*.csv</b>

In [5]:
# Hdf5
BINGE = posthoc.set_HDF5('Binge')#, save=True)

In [6]:
# general information of the hdf5
BINGE.columns

Index(['ID', 'Session', 'y', 'Dataset', 'Sex', 'Site', 'Class'], dtype='object')

### 3. Load the RUN data

#### collect the RUN file from results and save in posthoc

Please refer to <i>set_RUN()</i> in <i>imagen_posthocloader.py</i>, and load the file and save it as <b> all_*.csv</b>

In [7]:
# RUN
RUN = posthoc.set_RUN('../../results/holdout_all-tp-clean_run.csv')#, save=True)

In [8]:
# general infromation of the hdf5
RUN.columns

Index(['i', 'o', 'io', 'technique', 'Session', 'Trial', 'path', 'n_samples',
       'n_samples_cc', 'i_is_conf', 'o_is_conf', 'Model', 'model_SVM-rbf__C',
       'model_SVM-rbf__gamma', 'runtime', 'model_SVM-lin__C',
       'model_GB__learning_rate', 'model_LR__C', 'train_score', 'valid_score',
       'test_score', 'roc_auc', 'holdout_score', 'holdout_roc_auc', 'dataset',
       'ID', 'true_label', 'prediction', 'TP prob', 'TN prob', 'FP prob',
       'FN prob', 'T prob', 'F prob', 'Prob', 'Predict TF', 'Model PN',
       'Label PN'],
      dtype='object')

### 4. Save the INSTRUMENT data

#### collect the instrument files from posthoc into one file

Please refer to <i>to_INSTRUMENT()</i> in <i>imagen_posthocloader.py</i>, and load the file

In [9]:
# collect the instrument file 
inst_list = [NEO, SURPS,CTQ, CTS, LEQ, PBQ, GEN,FTND]
# save the instrument file
INST = posthoc.to_INSTRUMENT(inst_list)#, save=True)

In [10]:
# general information of the instrument
# selected ROI
col_INST = list(INST.columns[2:].values)
print(col_INST)

['Openness mean', 'Conscientiousness mean', 'Extroversion mean', 'Agreeableness mean', 'Neuroticism mean', 'Anxiety Sensitivity mean', 'Hopelessness mean', 'Impulsivity mean', 'Sensation seeking mean', 'Emotional abuse sum', 'Physical abuse sum', 'Sexual abuse sum', 'Emotional neglect sum', 'Physical neglect sum', 'Denial sum', 'Assault mean', 'Injury mean', 'Negotiation mean', 'Psychological Aggression mean', 'Sexual Coercion mean', 'Family valence', 'Accident valence', 'Sexuality valence', 'Autonomy valence', 'Devience valence', 'Relocation valence', 'Distress valence', 'Noscale valence', 'Overall valence', 'Family mean frequency', 'Accident mean frequency', 'Sexuality mean frequency', 'Autonomy mean frequency', 'Devience mean frequency', 'Relocation mean frequency', 'Distress mean frequency', 'Noscale mean frequency', 'Overall mean frequency', 'pbq_03', 'pbq_03a', 'pbq_03b', 'pbq_03c', 'pbq_05', 'pbq_05a', 'pbq_05b', 'pbq_05c', 'pbq_06', 'pbq_06a', 'pbq_12', 'pbq_13', 'pbq_13a', 'pb

### 5. Read the INSTRUMENT data

#### read the instrument files from posthoc into one file

Please refer to <i>read_INSTRUMENT()</i> in <i>imagen_posthocloader.py</i>, and load the file

In [11]:
INST = posthoc.read_INSTRUMENT('IMAGEN_INSTRUMENT.csv')

In [12]:
# general information of the instrument
INST.columns

Index(['ID', 'Session', 'Openness mean', 'Conscientiousness mean',
       'Extroversion mean', 'Agreeableness mean', 'Neuroticism mean',
       'Anxiety Sensitivity mean', 'Hopelessness mean', 'Impulsivity mean',
       'Sensation seeking mean', 'Emotional abuse sum', 'Physical abuse sum',
       'Sexual abuse sum', 'Emotional neglect sum', 'Physical neglect sum',
       'Denial sum', 'Assault mean', 'Injury mean', 'Negotiation mean',
       'Psychological Aggression mean', 'Sexual Coercion mean',
       'Family valence', 'Accident valence', 'Sexuality valence',
       'Autonomy valence', 'Devience valence', 'Relocation valence',
       'Distress valence', 'Noscale valence', 'Overall valence',
       'Family mean frequency', 'Accident mean frequency',
       'Sexuality mean frequency', 'Autonomy mean frequency',
       'Devience mean frequency', 'Relocation mean frequency',
       'Distress mean frequency', 'Noscale mean frequency',
       'Overall mean frequency', 'pbq_03', 'pbq_03a',

### 6. Read the HDF5 data

#### collect the hdf5 files from posthoc into one file

Please refer to <i>to_HDF5()</i> in <i>imagen_posthocloader.py</i>, and load the file and save it as <b> all_*.csv</b>

In [13]:
HDF5 = posthoc.to_HDF5('all_Binge.csv')#, save=True)

#### read the HDF5 files from posthoc into one file

Please refer to <i>read_HDF5()</i> in <i>imagen_posthocloader.py</i>, and load the file</b>

In [14]:
HDF5 = posthoc.read_HDF5('IMAGEN_HDF5.csv')

In [15]:
# general information of the hdf5
HDF5.columns

Index(['ID', 'Session', 'y', 'Dataset', 'Sex', 'Site', 'Class'], dtype='object')

### 7. Read the RUN data

#### select the ROI of the RUN file from posthoc into one file

Please refer to <i>to_RUN()</i> in <i>imagen_posthocloader.py</i>, and load the file and save it as <b> all_*.csv</b>

In [16]:
COL = ['ID','Session','Trial','dataset','io','technique','Model',
       'TP prob','TN prob','FP prob','FN prob','T prob','F prob','Prob',
       'Predict TF','Model PN','Label PN','true_label','prediction']

In [17]:
RUN = posthoc.to_RUN('all_RUN.csv', COL)#, save = True)

In [18]:
# general information of the run
RUN.columns

Index(['ID', 'Session', 'Trial', 'dataset', 'io', 'technique', 'Model',
       'TP prob', 'TN prob', 'FP prob', 'FN prob', 'T prob', 'F prob', 'Prob',
       'Predict TF', 'Model PN', 'Label PN', 'true_label', 'prediction'],
      dtype='object')

#### read the RUN files from posthoc into one file

Please refer to <i>read_RUN()</i> in <i>imagen_posthocloader.py</i>, and load the file

In [19]:
RUN = posthoc.read_RUN('IMAGEN_RUN.csv')

In [20]:
# general information of the run
RUN.columns

Index(['ID', 'Session', 'Trial', 'dataset', 'io', 'technique', 'Model',
       'TP prob', 'TN prob', 'FP prob', 'FN prob', 'T prob', 'F prob', 'Prob',
       'Predict TF', 'Model PN', 'Label PN', 'true_label', 'prediction'],
      dtype='object')

### 8. Save the post hoc dataset

#### set the dataset for analysis of diagnosis (X:FU3 == y:FU3)

Please refer to <i>to_posthoc()</i> in <i>imagen_posthocloader.py</i>, and load the file and save it as <b> all_*.csv</b>

In [34]:
DATA = ['IMAGEN_HDF5.csv', 'IMAGEN_INSTRUMENT.csv', 'IMAGEN_RUN.csv']
FU3 = posthoc.to_posthoc(DATA)#, save=True)

In [35]:
# general information of the instrument
FU3.columns

Index(['ID', 'Session', 'y', 'Dataset', 'Sex', 'Site', 'Class', 'Trial',
       'dataset', 'io', 'technique', 'Model', 'TP prob', 'TN prob', 'FP prob',
       'FN prob', 'T prob', 'F prob', 'Prob', 'Predict TF', 'Model PN',
       'Label PN', 'true_label', 'prediction', 'Openness mean',
       'Conscientiousness mean', 'Extroversion mean', 'Agreeableness mean',
       'Neuroticism mean', 'Anxiety Sensitivity mean', 'Hopelessness mean',
       'Impulsivity mean', 'Sensation seeking mean', 'Emotional abuse sum',
       'Physical abuse sum', 'Sexual abuse sum', 'Emotional neglect sum',
       'Physical neglect sum', 'Denial sum', 'Assault mean', 'Injury mean',
       'Negotiation mean', 'Psychological Aggression mean',
       'Sexual Coercion mean', 'Family valence', 'Accident valence',
       'Sexuality valence', 'Autonomy valence', 'Devience valence',
       'Relocation valence', 'Distress valence', 'Noscale valence',
       'Overall valence', 'Family mean frequency', 'Accident mean fre

In [36]:
FU3.iloc[1000]

ID                                                           37058553
Session                                                           FU3
y                                                               Binge
Dataset                                                      Training
Sex                                                              Male
                                                      ...            
pbq_13b                                                           NaN
pbq_13g                                                           NaN
Paternal_disorder                          Major Depression recurrent
Maternal_disorder                                                 NaN
Likelihood of nicotine dependence child                less dependent
Name: 1000, Length: 80, dtype: object

#### set the dataset for analysis of prognosis (X:FU3 != y:FU3)

Please refer to <i>read_posthoc()</i> in <i>imagen_posthocloader.py</i>, and load the file

In [37]:
FU3 = posthoc.read_posthoc('IMAGEN_posthoc.csv')

In [38]:
FU3.groupby('Dataset').get_group('Holdout')

Unnamed: 0,ID,Session,y,Dataset,Sex,Site,Class,Trial,dataset,io,...,pbq_06,pbq_06a,pbq_12,pbq_13,pbq_13a,pbq_13b,pbq_13g,Paternal_disorder,Maternal_disorder,Likelihood of nicotine dependence child
2600,1163495,FU3,Binge,Holdout,Male,Mannheim,AAM,0,Holdout set,X-Binge,...,,,,,,,,,,less dependent
2601,1163495,FU3,Binge,Holdout,Male,Mannheim,AAM,1,Holdout set,X-Binge,...,,,,,,,,,,less dependent
2602,1163495,FU3,Binge,Holdout,Male,Mannheim,AAM,2,Holdout set,X-Binge,...,,,,,,,,,,less dependent
2603,1163495,FU3,Binge,Holdout,Male,Mannheim,AAM,3,Holdout set,X-Binge,...,,,,,,,,,,less dependent
2604,1163495,FU3,Binge,Holdout,Male,Mannheim,AAM,4,Holdout set,X-Binge,...,,,,,,,,,,less dependent
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5451,99875982,FU3,Binge,Holdout,Male,Hamburg,HC,2,Holdout set,X-Binge,...,,,,,,,,Major Depression single episode,,less dependent
5452,99875982,FU3,Binge,Holdout,Male,Hamburg,HC,3,Holdout set,X-Binge,...,,,,,,,,Major Depression single episode,,less dependent
5453,99875982,FU3,Binge,Holdout,Male,Hamburg,HC,4,Holdout set,X-Binge,...,,,,,,,,Major Depression single episode,,less dependent
5454,99875982,FU3,Binge,Holdout,Male,Hamburg,HC,5,Holdout set,X-Binge,...,,,,,,,,Major Depression single episode,,less dependent


In [39]:
FU3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5456 entries, 0 to 5455
Data columns (total 80 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   ID                                       5456 non-null   int64  
 1   Session                                  5456 non-null   object 
 2   y                                        5456 non-null   object 
 3   Dataset                                  5456 non-null   object 
 4   Sex                                      5456 non-null   object 
 5   Site                                     5456 non-null   object 
 6   Class                                    5456 non-null   object 
 7   Trial                                    5456 non-null   int64  
 8   dataset                                  5456 non-null   object 
 9   io                                       5456 non-null   object 
 10  technique                                5456 no

In [40]:
# general information of the instrument
print(list(FU3.columns))

['ID', 'Session', 'y', 'Dataset', 'Sex', 'Site', 'Class', 'Trial', 'dataset', 'io', 'technique', 'Model', 'TP prob', 'TN prob', 'FP prob', 'FN prob', 'T prob', 'F prob', 'Prob', 'Predict TF', 'Model PN', 'Label PN', 'true_label', 'prediction', 'Openness mean', 'Conscientiousness mean', 'Extroversion mean', 'Agreeableness mean', 'Neuroticism mean', 'Anxiety Sensitivity mean', 'Hopelessness mean', 'Impulsivity mean', 'Sensation seeking mean', 'Emotional abuse sum', 'Physical abuse sum', 'Sexual abuse sum', 'Emotional neglect sum', 'Physical neglect sum', 'Denial sum', 'Assault mean', 'Injury mean', 'Negotiation mean', 'Psychological Aggression mean', 'Sexual Coercion mean', 'Family valence', 'Accident valence', 'Sexuality valence', 'Autonomy valence', 'Devience valence', 'Relocation valence', 'Distress valence', 'Noscale valence', 'Overall valence', 'Family mean frequency', 'Accident mean frequency', 'Sexuality mean frequency', 'Autonomy mean frequency', 'Devience mean frequency', 'Reloc

In [41]:
print(FU3.iloc[1000])

ID                                                           37058553
Session                                                           FU3
y                                                               Binge
Dataset                                                      Training
Sex                                                              Male
                                                      ...            
pbq_13b                                                           NaN
pbq_13g                                                           NaN
Paternal_disorder                          Major Depression recurrent
Maternal_disorder                                                 NaN
Likelihood of nicotine dependence child                less dependent
Name: 1000, Length: 80, dtype: object
