### Quality Assurance
This Jupyter Notebook includes all code snippets for processing individual datasets and generating various plots for quality assurance (QA). 

# Import Packages

In [2]:
import os
import re
import shutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
from statannot import add_stat_annotation
from scipy import stats

# Append Files

In [None]:
def append_files(path, file_pattern='*[mono|R]_features*', file_name='group_R_features.csv'):
    '''
    Append subject files (eye/behavior) and save as a group .csv file

    Every immediate sub-folder of ``path`` whose name contains a subject id
    (``s`` followed by digits) is searched for a file matching
    ``file_pattern``. The first match (in sorted order) is read, tagged with
    the subject id in a ``sub_id`` column and stacked into one dataframe.
    Columns whose name starts with 'Unnamed' are dropped, and a z-scored copy
    of 'zipf_fixdur_corr' and 'word_length_fixdur_corr' is inserted directly
    before each of those columns. The result is written to
    ``os.path.join(path, file_name)``.

    Parameters
    ----------
    path : string
        The relative path to the data folder. A trailing path separator is
        optional.
    file_pattern : string, optional
        The glob pattern for files being appended.
        The default is '*[mono|R]_features*' for eye feature files.
        Note that in glob syntax '[mono|R]' is a character class: it matches
        any ONE of the characters m, o, n, | or R directly before
        '_features', not the alternatives 'mono' / 'R'.
    file_name : string, optional
        The group file name to save.
        The default is 'group_R_features.csv'.

    Returns
    -------
    df_group : dataframe
        The group dataframe

    Raises
    ------
    ValueError
        If no subject file could be loaded from any sub-folder.
    KeyError
        If 'zipf_fixdur_corr' or 'word_length_fixdur_corr' is missing from
        the appended data.

    '''
    import warnings

    subject_id_pattern = re.compile(r's[0-9]+')

    # grab every sub folder under the input path; sorted by name so the row
    # order of the group file does not depend on the file system
    with os.scandir(path) as entries:
        sub_folders = sorted(
            (entry.name, entry.path) for entry in entries if entry.is_dir()
        )

    # collect the subject dataframes and concatenate once at the end
    frames = []
    for folder_name, folder_path in sub_folders:
        # extract subject id from the folder name only, so that ancestor
        # directories containing 's<digits>' cannot be mistaken for an id
        id_match = subject_id_pattern.search(folder_name)
        if id_match is None:
            # not a subject folder
            continue

        # locate the tracking / behavior feature file of this subject
        matches = sorted(glob(os.path.join(folder_path, file_pattern)))
        if not matches:
            warnings.warn(
                f'No file matching {file_pattern!r} in {folder_path!r}; '
                'subject skipped.'
            )
            continue

        # read in the subject csv; pandas' EmptyDataError / ParserError and
        # decoding errors are all ValueError subclasses
        try:
            df_ind = pd.read_csv(matches[0])
        except (OSError, ValueError) as err:
            warnings.warn(
                f'Could not read {matches[0]!r} ({err}); subject skipped.'
            )
            continue

        # add subject id column
        df_ind['sub_id'] = id_match.group(0)
        frames.append(df_ind)

    if not frames:
        raise ValueError(
            f'No subject files matching {file_pattern!r} were found in the '
            f'sub-folders of {path!r}.'
        )

    df_group = pd.concat(frames, ignore_index=True)

    # drop columns named 'Unnamed...' (pandas' name for header-less columns,
    # e.g. an index that was written to the csv)
    df_group = df_group.loc[:, ~df_group.columns.str.match('Unnamed')]

    # calculate z-score of the two correlation coefficient columns
    # zipf_fixdur_corr and word_length_fixdur_corr
    for col_name in ['zipf_fixdur_corr', 'word_length_fixdur_corr']:
        # position of the raw column; the z-scored copy goes right before it
        index = df_group.columns.get_loc(col_name)
        # extract the column values
        values = df_group[col_name].to_numpy(dtype=float)
        # compute the z-score, ignoring NaN entries (they stay NaN)
        z_col = stats.zscore(values, nan_policy='omit')
        # insert into the dataframe
        df_group.insert(index, f'zscored_{col_name}', z_col)

    # save and return the group dataframe (the row index is written too,
    # which is the pandas default)
    df_group.to_csv(os.path.join(path, file_name))
    return df_group

# build the group-level feature table from the individual subject folders
# and write it next to them inside the data folder
path = '../../../../Data/'
df = append_files(
    path,
    file_pattern='*[mono|R]_features_default*',
    file_name='group_R_features_default.csv',
)

# Load Dataset

In [5]:
# read the previously saved group feature table back from disk
file_path = '../../../../Data/group_R_features_default.csv'
df = pd.read_csv(filepath_or_buffer=file_path)