# Notebook to qc the given demographic file and summarise info 

In [180]:
import os
import glob
import pandas as pd
import numpy as np
import json
import shutil
import matplotlib.pyplot as plt

In [181]:
# initialise 

site_code = 'H10'
site_folder = f'/home/mathilde/Documents/RDS/MELD_FE/DATA/MELD_{site_code}'

csv_file = os.path.join(site_folder,f'MELD_participants_infos_{site_code}.csv')
df_raw=pd.read_csv(csv_file)
df_raw.head()

df = df_raw.copy()

## 1. Check data are correct

In [184]:
columns = ['id', 'site', 'patient_control', 'sex',
       'radiology', 'radiology_report', 'field_strengths___1',
       'field_strengths___2', 'field_strengths___3', 'age_at_preop_t1_15t',
       'age_at_preop_t1_3t', 'age_at_preop_t1_7t', 'preop_t1_yr_15t',
       'preop_t1_yr_3t', 'preop_t1_yr_7t', 'postop_t1_yr',
       'postop_t1_yr_2___1', 'postop_t1_yr_2___2', 'postop_t1_yr_2___3',
       'control_headache', 'preop_t1', 'preop_t2', 'preop_flair', 'preop_dwi',
       'postop_t1', 'fields', 'lesion_mask', 'age_at_onset', 'gtcs',
       'drug_resistant', 'aeds', 'mri_negative', 'seeg', 'operated',
       'surgery_year', 'age_at_surgery', 'mri_negative_surgery', 'procedure',
       'procedure_other', 'histology', 'histology_other', 'seizure_free',
       'seizure_free_aura', 'engel_1yr', 'ilae_1yr', 'engel', 'ilae',
       'follow_up', 'aeds_post_op', 'participant_information_complete']

In [185]:
check_functions = {
    'id':                   (lambda x: check_id_MELD(x, 
                                                site_code=site_code)),
    'site':                 (lambda x: check_site_code(x, 
                                                site_code=site_code)),
    'patient_control':      (lambda x: check_in_categories(x, 
                                                categories=[1,2])),
    'sex':                  (lambda x: check_in_categories(x, 
                                                categories=[0,1])),
    'radiology':            (lambda x: check_in_categories(x, 
                                                categories=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,21,22,23])),
    # 'radiology_report'

    'field_strengths___1':  (lambda x: check_in_categories(x, categories=[1])),
    'field_strengths___2':  (lambda x: check_in_categories(x, categories=[1])),
    'field_strengths___3':  (lambda x: check_in_categories(x, categories=[1])),

    'age_at_preop_t1_15t':  (lambda x: check_age_years(x)),
    'age_at_preop_t1_3t':   (lambda x: check_age_years(x)), 
    'age_at_preop_t1_7t':   (lambda x: check_age_years(x)),

    'preop_t1_yr_15t':      (lambda x: check_year(x)),
    'preop_t1_yr_3t':       (lambda x: check_year(x)),
    'preop_t1_yr_7t':       (lambda x: check_year(x)), 
    'postop_t1_yr':         (lambda x: check_year(x)),
    'postop_t1_yr_2___1':   (lambda x: check_in_categories(x, categories=[1])),
    'postop_t1_yr_2___2':   (lambda x: check_in_categories(x, categories=[1])),
    'postop_t1_yr_2___3':   (lambda x: check_in_categories(x, categories=[1])),

    'control_headache':     (lambda x: check_in_categories(x, 
                                                categories=[1,2,555])),

    'preop_t1':             (lambda x: check_in_categories(x, categories=[0,1])), 
    'preop_t2':             (lambda x: check_in_categories(x, categories=[0,1])),
    'preop_flair':          (lambda x: check_in_categories(x, categories=[0,1])), 
    'preop_dwi':            (lambda x: check_in_categories(x, categories=[0,1])),
    'postop_t1':            (lambda x: check_in_categories(x, categories=[1,2,3])),
    'fields':               (lambda x: check_in_categories(x, categories=[0,1])),
    'lesion_mask':          (lambda x: check_in_categories(x, 
                                                categories=[0,1,555])),
    'age_at_onset':         (lambda x: check_age_years(x)),
    
    'gtcs':                 (lambda x: check_in_categories(x, 
                                                categories=[0,1,3])),
    'drug_resistant':       (lambda x: check_in_categories(x, 
                                                categories=[0,1,3])),
    # 'aeds': str,
    
    'mri_negative':         (lambda x: check_in_categories(x, 
                                                categories=[0,1,3])),
    'seeg':                 (lambda x: check_in_categories(x, 
                                                categories=[0,1,3])),
    'operated':             (lambda x: check_in_categories(x, 
                                                categories=[0,1,3])),
    'surgery_year':         (lambda x: check_year(x)),
    
    'age_at_surgery':       (lambda x: check_age_years(x)),

    'mri_negative_surgery': (lambda x: check_in_categories(x, 
                                                categories=[0,1,555])),
    'procedure':            (lambda x: check_in_categories(x, 
                                                categories=[1,2,3,4,555])),
                          
    # 'procedure_other',

    'histology':            (lambda x: check_in_categories(x, 
                                                categories=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,21,22,23])),
    # 'histology_other'
    
    'seizure_free':         (lambda x: check_in_categories(x, 
                                                categories=[1,2,555])),
    'seizure_free_aura':    (lambda x: check_in_categories(x, 
                                                categories=[1,2,555])),
    'engel_1yr':            (lambda x: check_in_categories(x, 
                                                categories=[1,2,3,4,555])),
    'ilae_1yr':             (lambda x: check_in_categories(x, 
                                                categories=[1,2,3,4,5,6,555])),
    'engel':                (lambda x: check_in_categories(x, 
                                                categories=[1,2,3,4,5,6,555])),
    'ilae' :                (lambda x: check_in_categories(x, 
                                                categories=[1,2,3,4,5,6,555])),
    'aeds_post_op':         (lambda x: check_in_categories(x, 
                                                categories=[1,2,3,555])),
}

In [189]:

def is_nan(x):
    return (x != x)

def check_id_MELD(value, site_code):
    parts = value.split("_")
    error = ""
    return_code = 1
    if len(parts)!=4:
        return_code = 0
        error = error + 'Error in MELD id structure;'
        return return_code, error
    if parts[0] != 'MELD':
        return_code = 0
        error + 'Error in first term of the id structure;'
    if parts[1]!=site_code:
        return_code = 0
        error = error + 'Wrong site code;'
    if not (parts[2] == 'C') and not (parts[2] == 'P'):
        return_code = 0
        error = error + 'Error in group, other than C or P;'
    return return_code, error

def check_site_code(value, site_code):
    error = ""
    return_code = 1
    if value!=site_code:
        return_code = 0
        error = error + 'Wrong site code;'
    return return_code, error

def check_in_categories(value, categories):
    valid_values = set(categories)
    error = ""
    return_code = 1
    if not is_nan(value):
        if not value in valid_values:
            return_code = 0
            error = error + f'Value {value} not in allowed categories;'
    return return_code, error 

def check_age_years(value, range=[0,80]):
    error = ""
    return_code = 1
    if not is_nan(value):
        if not (value>range[0]) or not (value<range[1]):
            return_code = 0
            error = error + f'Value {value} seems to be months instead of years;'
    return return_code, error 


def check_year(value, range=[2000,2023]):
    error = ""
    return_code = 1
    if not is_nan(value):
        if not (value>range[0]) or not (value<range[1]):
            return_code = 0
            error = error + f'Value {value} seems to be a wrong year;'
    return return_code, error 



In [210]:
# check values in column are correct
df_qc = pd.DataFrame()
for i, df_row in df.iterrows():
    values={}
    values['subject']=df_row['id']
    for column in columns:
        value = df_row[column]
        if column in check_functions:
            error_code, error = check_functions[column](value)
            values[column+'.passcheck']=error_code
            if error_code==0 :
                values[column+'.error']=error
    df_qc = pd.concat([df_qc, pd.DataFrame([values])])
df_qc = df_qc.reset_index()

In [212]:
df_qc.head()

Unnamed: 0,index,subject,id.passcheck,site.passcheck,patient_control.passcheck,sex.passcheck,radiology.passcheck,field_strengths___1.passcheck,field_strengths___2.passcheck,field_strengths___3.passcheck,...,procedure.passcheck,histology.passcheck,seizure_free.passcheck,seizure_free_aura.passcheck,engel_1yr.passcheck,ilae_1yr.passcheck,engel.passcheck,ilae.passcheck,aeds_post_op.passcheck,age_at_onset.error
0,0,MELD_H10_P_0001,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,
1,0,MELD_H10_P_0002,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,
2,0,MELD_H10_P_0003,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,
3,0,MELD_H10_P_0004,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,
4,0,MELD_H10_P_0005,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,


In [213]:
# for each columns print subject with error and the error
for column in columns:
    try:
        if (df_qc[column+'.passcheck']==0).any():
            print(f"Error found in column {column}")
            failed_row = df_qc.loc[df_qc[column+'.passcheck']==0].index
            print(f"subjects: {df_qc.loc[failed_row, 'subject'].values}")
            print(f"errors: {df_qc.loc[failed_row, column+'.error'].values}")
            print("\n")
    except:
        pass

Error found in column age_at_onset
subjects: ['MELD_H10_P_0051']
errors: ['Value 0.0 seems to be months instead of years;']




## 2. Check mandatory data are provided


In [214]:
# combine age preop t1
df['age_at_preop_t1'] = df['age_at_preop_t1_3t']
df['age_at_preop_t1'] = df['age_at_preop_t1'].fillna(df['age_at_preop_t1_7t'])
df['age_at_preop_t1'] = df['age_at_preop_t1'].fillna(df['age_at_preop_t1_15t'])

In [215]:
# check mandatory data are given and correct
df_mand = pd.DataFrame()

for i,df_row in df.iterrows():
    values = {}
    values['subject']=df_row['id']
    # check id, group and sex provided
    values['id'] = not is_nan(df_row['id'])
    values['patient_control'] = not is_nan(df_row['patient_control'])
    values['sex'] = not is_nan(df_row['sex'])
    # check preop age provided
    values['age_at_preop_t1'] = not is_nan(df_row['age_at_preop_t1'])
    
    ### only mandatory for patients
    if df_row['patient_control']==1:
        # check age of onset provided and smaller than preop
        values['age_at_onset'] = (not is_nan(df_row['id'])) and (df_row['age_at_onset']<df_row['age_at_preop_t1'])
        # check radiology or histology provided
        values['radiology_histology'] = (not is_nan(df_row['radiology'])) or (not is_nan(df_row['histology'])) 
    
    df_mand = pd.concat([df_mand,pd.DataFrame([values])])

In [216]:
df_mand.head()

Unnamed: 0,subject,id,patient_control,sex,age_at_preop_t1,age_at_onset,radiology_histology
0,MELD_H10_P_0001,True,True,True,True,True,True
0,MELD_H10_P_0002,True,True,True,True,True,True
0,MELD_H10_P_0003,True,True,True,True,True,True
0,MELD_H10_P_0004,True,True,True,True,True,True
0,MELD_H10_P_0005,True,True,True,True,True,True


## 3. Print free text 
