## Validating PRL task logs

This script validates raw task logs organized according to the BIDS scheme. Script features:
- checks proper folders and logs naming 
- looks for missing and extra files
- looks for missing data within logs and removes duplicates

---
**Last update**: 07.01.2020 

In [1]:
import os
import filecmp
import pandas as pd

# Project root on the local machine.
path_root = '/home/kmb/Desktop/Neuroscience/Projects/BONNA_decide_net/'

# Raw behavioral task logs live in a fixed location below the project root.
logs_relpath = 'data/main_fmri_study/sourcedata/behavioral/task_logs'
path_logs = os.path.join(path_root, logs_relpath)

!ls $path_logs

sub-m02  sub-m06  sub-m10  sub-m14  sub-m18  sub-m22  sub-m26  sub-m30
sub-m03  sub-m07  sub-m11  sub-m15  sub-m19  sub-m23  sub-m27  sub-m31
sub-m04  sub-m08  sub-m12  sub-m16  sub-m20  sub-m24  sub-m28  sub-m32
sub-m05  sub-m09  sub-m13  sub-m17  sub-m21  sub-m25  sub-m29  sub-m33


All logs for a single subject sit within `<path_logs>/sub-<subject_label>`.

In [2]:
# Check valid folder names and infer available subjects.
# Every entry in `path_logs` must start with 'sub-m' and be exactly seven
# characters long; the subject label is its last three characters.
subjects = []
# os.listdir returns entries in arbitrary order, so iterate in sorted order to
# make `subjects` (and every list derived from it) reproducible across runs.
for subfolder in sorted(os.listdir(path_logs)):
    if not subfolder.startswith('sub-m') or len(subfolder) != 7:
        raise NameError(f'wrong folder name: {subfolder}')
    subjects.append(subfolder[-3:])

print('--> No errors in folder naming.')
print(f'--> Subjects found: \n{subjects}')
print(f'--> Total number of subjects: {len(subjects)}')

--> No errors in folder naming.
--> Subjects found: 
['m02', 'm03', 'm04', 'm05', 'm06', 'm07', 'm08', 'm09', 'm10', 'm11', 'm12', 'm13', 'm14', 'm15', 'm16', 'm17', 'm18', 'm19', 'm20', 'm21', 'm22', 'm23', 'm24', 'm25', 'm26', 'm27', 'm28', 'm29', 'm30', 'm31', 'm32', 'm33']
--> Total number of subjects: 32


#### Missing files, extra files, spellings 
Use this code to find missing or extra files and resolve potential problems **manually** (because of the small sample size and the large variety of potential problems with files: spellings, procedure failures etc.). Rerun after fixing, and proceed to the next section.

In [3]:
# Number of files every subject folder is expected to contain.
# NOTE(review): presumably the two logs, their PsychoPy duplicates and
# auxiliary output files -- confirm against the acquisition procedure.
n_files_expected = 8

missing, extra = [], []

for subject in subjects:

    path_subfolder = os.path.join(path_logs, 'sub-' + subject)
    # List the folder once and reuse the result for all checks below
    files = os.listdir(path_subfolder)

    # look for missing files (reward log first, then punishment log)
    for condition in ('rew', 'pun'):
        log_name = f'{subject}_prl_DecideNet_{condition}.csv'
        if log_name not in files:
            missing.append(log_name)

    # look for extra files; any file count other than the expected one is
    # reported, so folders with too few files are listed here as well
    if len(files) != n_files_expected:
        extra.append([subject, len(files)])

print(f'--> Logs not found: \n{sorted(missing)}')
print(f'--> Total number of missing logs: {len(missing)}')
print(f'--> Extra files: \n{extra}')
print(f'--> Number of subs with extra files: {len(extra)}')

--> Logs not found: 
[]
--> Total number of missing logs: 0
--> Extra files: 
[]
--> Number of subs with extra files: 0


#### Missing data within files
Use this code to find out whether data is missing within logs (missing trials or missing columns). One can also look for variability in file size to detect potential problems. Finally, the code checks whether the duplicate log files generated by PsychoPy are identical to the original logs.

In [4]:
proper_shape = (110, 28) # log dataframe correct size
# wrong_shape:      [path, shape] for every log whose shape differs
# file_size:        (path, size in bytes) for every log
# wrong_duplicates: path of every log that differs from its `_2` duplicate
wrong_shape, file_size, wrong_duplicates = [], [], []

for subject in subjects:

    path_subfolder = os.path.join(path_logs, 'sub-' + subject)
    log_rew_path = f'{path_subfolder}/{subject}_prl_DecideNet_rew.csv'
    log_pun_path = f'{path_subfolder}/{subject}_prl_DecideNet_pun.csv'
    df_rew = pd.read_csv(log_rew_path)
    df_pun = pd.read_csv(log_pun_path)

    # Test log shape
    if df_pun.shape != proper_shape:
        wrong_shape.append([log_pun_path, df_pun.shape])
    if df_rew.shape != proper_shape:
        wrong_shape.append([log_rew_path, df_rew.shape])

    # Save file size (each path is paired with the size of that same file)
    file_size.append((log_rew_path, os.path.getsize(log_rew_path)))
    file_size.append((log_pun_path, os.path.getsize(log_pun_path)))

    # Look if duplicates are same
    log_rew_path2 = f'{path_subfolder}/{subject}_prl_DecideNet_rew_2.csv'
    log_pun_path2 = f'{path_subfolder}/{subject}_prl_DecideNet_pun_2.csv'

    # shallow=False forces a content comparison; the default shallow mode
    # accepts files as equal when only their os.stat() signatures match.
    if not filecmp.cmp(log_rew_path, log_rew_path2, shallow=False):
        wrong_duplicates.append(log_rew_path)
    if not filecmp.cmp(log_pun_path, log_pun_path2, shallow=False):
        wrong_duplicates.append(log_pun_path)

print(f'--> Logs with wrong shape: \n{wrong_shape}')
# Duplicate report is left disabled, as in the original cell:
# print(f'--> Logs with wrong duplicates: \n{wrong_duplicates}')

--> Logs with wrong shape: 
[]


#### Fix subject_id field and remove spurious columns
Apply only after manually resolving conflicts with file names! Note that this cell overwrites the log files in place.

In [None]:
# Columns kept in the cleaned logs, in output order. Defined once because the
# list does not depend on the subject.
keys = [
    'block', 'rwd', 'magn_left', 'magn_right', 'onset_iti_plan',
    'onset_isi_plan', 'onset_dec_plan', 'onset_out_plan', '.thisRepN',
    '.thisTrialN', '.thisN', '.thisIndex', 'onset_iti', 'onset_iti_glob',
    'onset_dec', 'onset_dec_glob', 'onset_isi', 'onset_isi_glob',
    'onset_out', 'onset_out_glob', 'acc_after_trial', 'won_bool',
    'won_magn', 'rt', 'response', 'subject_id', 'condition', 'group',
]

for subject in subjects:

    path_subfolder = os.path.join(path_logs, 'sub-' + subject)
    log_rew_path = f'{path_subfolder}/{subject}_prl_DecideNet_rew.csv'
    log_pun_path = f'{path_subfolder}/{subject}_prl_DecideNet_pun.csv'

    # Filter only useful columns and fix the subject_id field.
    # `.assign` returns a new frame with `subject_id` replaced in its existing
    # position, which avoids assigning into a column selection (chained
    # assignment, a source of SettingWithCopyWarning in pandas).
    df_rew = pd.read_csv(log_rew_path)[keys].assign(subject_id=subject)
    df_pun = pd.read_csv(log_pun_path)[keys].assign(subject_id=subject)

    # Save changes (overwrites the original logs in place); both logs are read
    # and filtered before either is written.
    df_rew.to_csv(log_rew_path, index=False)
    df_pun.to_csv(log_pun_path, index=False)