In [43]:
import re
from pathlib import Path

import pandas as pd


def find_matching_folders(directory: str) -> tuple[list[str], list[str]]:
    """
    Find all folders in the given directory that match a specific pattern.

    Parameters:
    directory (str): The directory to search.

    Returns:
    List[str]: A list of matching folders.
    """
    matching_folders = []
    pattern = r"[lL]\d+_\d+"

    for exp_folder in Path(directory).iterdir():
        if exp_folder.is_dir():
            result_folder = exp_folder / "results"
            if result_folder.is_dir():
                for result in result_folder.iterdir():
                    if re.match(pattern, result.name):
                        matching_folders.append(str(result))
                    elif ".DS_Store" not in result.name:
                        non_matching_folders.append(str(result))
    return matching_folders, non_matching_folders


def filter_folders(matching_folder: list[str], df_col: pd.Series) -> list[str]:
    """
    Filter a list of folders based on the values in a DataFrame column.

    Parameters:
    matching_folder (List[str]): The list of folders to filter.
    df_col (pd.Series): The DataFrame column to use for filtering.

    Returns:
    List[str]: The filtered list of folders.
    """
    filtered_folders = []
    for folder in matching_folder:
        for value in df_col:
            if str(value) in folder:
                filtered_folders.append(folder)
                break
    return filtered_folders

# Move excluded to excluded_results

In [44]:
df = pd.read_csv("trial_IA_ID_mismatch (2).csv")
df

Unnamed: 0,Folder,subject_id,full_path,is_in_metadata,mapping,batch-condition,batch,condition,list,principle_list
0,ose_3p_l1_l60_tower_st_mit_l1_latest_widthfix,l42_2070,/Users/shubi/Library/CloudStorage/OneDrive-Tec...,True,,3p,3.0,p,42.0,6
1,ose_3p_l1_l60_tower_st_mit_l1_latest_widthfix,l59_547,/Users/shubi/Library/CloudStorage/OneDrive-Tec...,True,,3p,3.0,2,4.0,"(146,)"
3,l57_501,2,3,2,4,"(146,)",,,,
4,l59_485,2,3,2,21,"(146,)",,,,
...,...,...,...,...,...,...,,,,
155,l40_458,3,9,2,58,"(160, 161, 159)",,,,
156,l36_394,3,9,2,62,"(0, 1, 2, 3, 4)",,,,
157,l36_400,3,9,2,62,"(160, 161, 159)",,,,
158,l56_438,3,9,2,63,"(160, 161, 159)",,,,
159,l56_464,3,9,2,63,"(0, 1, 2, 3, 4)",,,,


In [45]:
matching_folders_eye_data = find_matching_folders("/Users/shubi/Documents/eye_data")
filter_folders(matching_folders_eye_data, df)

[]

In [46]:
matching_folders_onedrive = find_matching_folders(
    "/Users/shubi/Data/OneStop Full Experiment Folders Backup 1932024 after fixes"
)
[f for f in filter_folders(matching_folders_onedrive, df) if "report" not in f]

[]

In [47]:
filtered_df = df[df["RECORDING_SESSION_LABEL"] == "l59_485"]
filtered_df

Unnamed: 0,RECORDING_SESSION_LABEL,batch,article_id,paragraph_id,trial,missing_IA_IDs
4,l59_485,2,3,2,21,"(146,)"
17,l59_485,2,3,2,64,"(146,)"


In [68]:
metadata_df = pd.read_csv("Metadata OneStopGaze L1 - metadata (4).csv")
metadata_df.Folder.value_counts()

ose_1n_l1_l60_tower_st                           57
ose_2n_l1_l60_tower_st                           52
ose_3n_l1_l60_tower_st_fixed                     51
ose_1p_l1_l60_tower_st                           50
ose_3p_l1_l60_tower_st_fixed                     49
ose_2p_l1_l60_tower_st                           44
ose_2p_l1_l60_tower_st_mit_l1_latest             12
ose_3p_l1_l60_tower_st_mit_l1_latest_widthfix     6
ose_2n_l1_l60_tower_st_mit_l1_latest_widthfix     5
ose_3p_l1_l60_tower_st                            5
ose_3n_l1_l60_tower_st                            4
ose_1p_l1_l60_tower_iafix                         4
ose_3n_l1_l60_tower_st_mit_l1_latest_widthfix     4
ose_1p_l1_l60_tower_st_mit_l1_latest              3
ose_1p_l1_l60_tower_st_mit_l1_latest_widthfix     3
ose_2n_l1_l60_tower_st_mit_l1_latest              2
ose_1n_l1_l60_tower_st_mit_l1_latest              2
ose_2p_l1_l60_tower_st_mit_l1_latest_widthfix     2
ose_2p_l1_l60_tower_st_mit_l2                     2
ose_2n_l1_l6

In [69]:
matching_folders_eye_data = find_matching_folders(
    "/Users/shubi/Data/OneStop Full Experiment Folders Backup 1932024 after fixes"
)

In [70]:
matches = [f for f in filter_folders(matching_folders_eye_data, metadata_df.Folder)]

In [71]:
df = pd.DataFrame(
    {
        "Folder": [path.split("/")[-3] for path in matches],
        "subject_id": [path.split("/")[-1] for path in matches],
    }
)
df

Unnamed: 0,Folder,subject_id
0,ose_3p_l1_l60_tower_st_mit_l1_latest_widthfix,l42_2070
1,ose_3p_l1_l60_tower_st_mit_l1_latest_widthfix,l59_547
2,ose_3p_l1_l60_tower_st_mit_l1_latest_widthfix,l39_542
3,ose_3p_l1_l60_tower_st_mit_l1_latest_widthfix,l56_522
4,ose_3p_l1_l60_tower_st_mit_l1_latest_widthfix,l49_529
...,...,...
355,ose_2n_l1_l60_tower_st,l16_218
356,ose_2n_l1_l60_tower_st,l30_254
357,ose_2n_l1_l60_tower_st,l15_210
358,ose_2n_l1_l60_tower_st,l17_219


In [75]:
df = pd.DataFrame(
    {
        "Folder": [path.split("/")[-3] for path in matches],
        "subject_id": [path.split("/")[-1] for path in matches],
    }
)
df = df[df["subject_id"].str.lower().isin(metadata_df["Filename"])]
print(len(df))

360


In [76]:
df.groupby("Folder").count().sort_values("subject_id", ascending=False).index

Index(['ose_1n_l1_l60_tower_st', 'ose_1p_l1_l60_tower_st',
       'ose_2n_l1_l60_tower_st', 'ose_2p_l1_l60_tower_st',
       'ose_3p_l1_l60_tower_st_fixed', 'ose_3n_l1_l60_tower_st_fixed',
       'ose_3n_l1_l60_tower_st_fixed_lacc',
       'ose_3p_l1_l60_tower_st_fixed_lacc',
       'ose_2p_l1_l60_tower_st_mit_l1_latest', 'ose_2n_l1_l60_tower_st_lacc',
       'ose_3p_l1_l60_tower_st_mit_l1_latest_widthfix',
       'ose_2n_l1_l60_tower_st_mit_l1_latest_widthfix',
       'ose_3p_l1_l60_tower_st', 'ose_3n_l1_l60_tower_st',
       'ose_3n_l1_l60_tower_st_mit_l1_latest_widthfix',
       'ose_1p_l1_l60_tower_iafix',
       'ose_1p_l1_l60_tower_st_mit_l1_latest_widthfix',
       'ose_1p_l1_l60_tower_st_mit_l1_latest',
       'ose_2n_l1_l60_tower_st_mit_l1_latest',
       'ose_1n_l1_l60_tower_st_mit_l1_latest',
       'ose_2p_l1_l60_tower_st_mit_l1_latest_widthfix',
       'ose_2p_l1_l60_tower_st_mit_l2',
       'ose_2n_l1_l60_tower_st_lacclab_corrected',
       'ose_3n_l1_l60_tower_st_mit_l1_

In [85]:
metadata_df = pd.read_csv("Metadata OneStopGaze L1 - metadata (4).csv")

df = pd.DataFrame(
    {
        "Folder": [path.split("/")[-3] for path in matching_folders_eye_data],
        "subject_id": [path.split("/")[-1] for path in matching_folders_eye_data],
        "full_path": matching_folders_eye_data,
    }
)
df["is_in_metadata"] = (
    df["subject_id"].str.lower().isin(metadata_df["Filename"].str.lower())
)

mappings = {
    "ose_1n_l1_l60_tower_st": "old_mit",
    "ose_1p_l1_l60_tower_st": "old_mit",
    "ose_2n_l1_l60_tower_st": "old_mit",
    "ose_2p_l1_l60_tower_st": "old_mit",
    "ose_3p_l1_l60_tower_st": "old_mit",
    "ose_3n_l1_l60_tower_st": "old_mit",
    "ose_3p_l1_l60_tower_st_fixed": "old_mit",
    "ose_3n_l1_l60_tower_st_fixed": "old_mit",
    "ose_1p_l1_l60_tower_iafix": "old_mit",
    "ose_3n_l1_l60_tower_st_fixed_lacc": "fixed_lacc",
    "ose_3p_l1_l60_tower_st_fixed_lacc": "fixed_lacc",
    "ose_2n_l1_l60_tower_st_lacclab_corrected": "corrected_lacc",
    "ose_2n_l1_l60_tower_st_lacc": "old_lacc",
    "ose_2p_l1_l60_tower_st_lacc": "old_lacc",
    "ose_2n_l1_l60_tower_st_mit_l1_latest": "new_mit",
    "ose_2p_l1_l60_tower_st_mit_l1_latest": "new_mit",
    "ose_1n_l1_l60_tower_st_mit_l1_latest": "new_mit",
    "ose_1p_l1_l60_tower_st_mit_l1_latest": "new_mit",
    "ose_3n_l1_l60_tower_st_mit_l1_latest": "new_mit",
    "ose_1n_l1_l60_tower_st_mit_l2_latest": "new_mit",
    "ose_3p_l1_l60_tower_st_mit_l1_latest_widthfix": "new_mit",
    "ose_3n_l1_l60_tower_st_mit_l1_latest_widthfix": "new_mit",
    "ose_1p_l1_l60_tower_st_mit_l1_latest_widthfix": "new_mit",
    "ose_2n_l1_l60_tower_st_mit_l1_latest_widthfix": "new_mit",
    "ose_2p_l1_l60_tower_st_mit_l1_latest_widthfix": "new_mit",
    "ose_2p_l1_l60_tower_st_mit_l2": "new_mit",
}
df["mapping"] = df.Folder.map(mappings)
print(df.mapping.value_counts())
print("total:", len(df))

old_mit           260
fixed_lacc         47
new_mit            43
old_lacc            9
corrected_lacc      1
Name: mapping, dtype: int64
total: 360


In [60]:
non_unique_subject_ids = df[df.duplicated(subset="subject_id", keep=False)]
non_unique_subject_ids.sort_values("Folder")

Unnamed: 0,Folder,subject_id,full_path,is_in_metadata,mapping,batch-condition,batch,condition,list,principle_list


In [5]:
non_unique_subject_ids.merge(
    metadata_df,
    left_on="subject_id",
    right_on="Filename",
    how="left",
    suffixes=("_folder", "_metadata"),
)[["subject_id", "Folder_folder", "Folder_metadata"]].sort_values("subject_id")

Unnamed: 0,subject_id,Folder_folder,Folder_metadata


In [6]:
# List of tuples with subject_id and Folder_folder values for the rows to update
rows_to_update = [
    ("l14_349", "ose_3n_l1_l60_tower_st_fixed"),
    ("l18_358", "ose_3n_l1_l60_tower_st_fixed"),
    ("l34_398", "ose_3p_l1_l60_tower_st_fixed_lacc"),
    ("l44_305", "ose_2n_l1_l60_tower_st"),
]

# Update the 'is_in_metadata' column for the specified rows
for subject_id, folder_folder in rows_to_update:
    df.loc[
        (df["subject_id"] == subject_id) & (df["Folder"] == folder_folder),
        "is_in_metadata",
    ] = False

In [7]:
metadata_df[~metadata_df.Filename.str.lower().isin(df.subject_id.str.lower())]

Unnamed: 0,Batch,Condition,Folder,List,ID,Filename,L1,Date,Experimenter,First Name,...,Start Time,End Time,Drift Above,Drift Below,Comprehension,Comprehension.1,Black Screen,Experiment Notes,Survey notes,Dataviewer Inspection Notes (Shubi)


In [8]:
df.groupby("Folder").count().sort_values("subject_id", ascending=False).rename(
    columns={"subject_id": "Participant count"}
)["Participant count"]

Folder
ose_1n_l1_l60_tower_st                           57
ose_1p_l1_l60_tower_st                           50
ose_2n_l1_l60_tower_st                           44
ose_2p_l1_l60_tower_st                           43
ose_3p_l1_l60_tower_st_fixed                     27
ose_3n_l1_l60_tower_st_fixed                     26
ose_3n_l1_l60_tower_st_fixed_lacc                25
ose_3p_l1_l60_tower_st_fixed_lacc                22
ose_2p_l1_l60_tower_st_mit_l1_latest             12
ose_2n_l1_l60_tower_st_lacc                       8
ose_3p_l1_l60_tower_st_mit_l1_latest_widthfix     6
ose_2n_l1_l60_tower_st_mit_l1_latest_widthfix     5
ose_3p_l1_l60_tower_st                            5
ose_3n_l1_l60_tower_st                            4
ose_3n_l1_l60_tower_st_mit_l1_latest_widthfix     4
ose_1p_l1_l60_tower_iafix                         4
ose_1p_l1_l60_tower_st_mit_l1_latest_widthfix     3
ose_1p_l1_l60_tower_st_mit_l1_latest              3
ose_2n_l1_l60_tower_st_mit_l1_latest              2
ose_1

In [9]:
df.mapping.value_counts()

old_mit           260
fixed_lacc         47
new_mit            20
old_lacc            9
corrected_lacc      1
Name: mapping, dtype: int64

In [11]:
import shutil

for index, row in df.query("is_in_metadata == False").iterrows():
    folder_path = row["full_path"]
    new_folder_path = folder_path.replace("results", "excluded_results")
    print(folder_path, new_folder_path)
    shutil.move(folder_path, new_folder_path)

In [16]:
for folder in non_matching:
    # assert "results" should be in the path only once
    assert folder.count("results") == 1
    new_folder_path = folder.replace("results", "excluded_results")
    print(folder, new_folder_path)
    shutil.move(folder, new_folder_path)

In [17]:
def find_matching_folders2(directory: str) -> tuple[list[str], list[str]]:
    """
    Find all folders in the given directory that match a specific pattern.

    Parameters:
    directory (str): The directory to search.

    Returns:
    List[str]: A list of matching folders.
    """
    matching_folders = []
    pattern = r"[Ll]\d+_\d+"  # e.g. L1_1 or l1_1
    non_matching_folders = []
    for exp_folder in Path(directory).iterdir():
        if exp_folder.is_dir():
            result_folder = exp_folder / "runtime" / "dataviewer"
            if result_folder.is_dir():
                for result in result_folder.iterdir():
                    if re.match(pattern, result.name):
                        matching_folders.append(str(result))
                    elif ".DS_Store" not in result.name:
                        non_matching_folders.append(str(result))
    return matching_folders, non_matching_folders


matching_folders_eye_data, non_matching = find_matching_folders2(
    "/Users/shubi/Library/CloudStorage/OneDrive-Technion/In-lab Experiments/OneStopGaze L1 English/Full Experiment Folders"
)
matching_folders_eye_data += non_matching
df = pd.DataFrame(
    {
        "Folder": [path.split("/")[-3] for path in matching_folders_eye_data],
        "subject_id": [path.split("/")[-1] for path in matching_folders_eye_data],
        "full_path": matching_folders_eye_data,
    }
)
df["is_in_metadata"] = (
    df["subject_id"].str.lower().isin(metadata_df["Filename"].str.lower())
)
df

Unnamed: 0,Folder,subject_id,full_path,is_in_metadata,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,ose_3p_l1_l60_tower_st_mit_l1_latest_widthfix,l42_2070,new_mit,3p,3.0,p,42.0,6.0
6,ose_2n_l1_l60_tower_st_lacclab_corrected,l59_485,corrected_lacc,2n,2.0,n,59.0,5.0
7,ose_3p_l1_l60_tower_st_fixed_lacc,l57_439,fixed_lacc,3p,3.0,p,57.0,3.0
8,ose_3p_l1_l60_tower_st_fixed_lacc,l53_431,fixed_lacc,3p,3.0,p,53.0,5.0
9,ose_3p_l1_l60_tower_st_fixed_lacc,l45_452,fixed_lacc,3p,3.0,p,45.0,3.0
...,...,...,...,...,,,,
355,ose_2n_l1_l60_tower_st,l16_218,old_mit,2n,2.0,n,16.0,4.0
356,ose_2n_l1_l60_tower_st,l30_254,old_mit,2n,2.0,n,30.0,6.0
357,ose_2n_l1_l60_tower_st,l15_210,old_mit,2n,2.0,n,15.0,3.0
358,ose_2n_l1_l60_tower_st,l17_219,old_mit,2n,2.0,n,17.0,5.0


In [61]:
for index, row in df.query("is_in_metadata == False").iterrows():
    folder_path = row["full_path"]
    new_folder_path = folder_path.replace("dataviewer", "excluded_dataviewer")
    print(folder_path, new_folder_path)
    shutil.move(folder_path, new_folder_path)

/Users/shubi/Library/CloudStorage/OneDrive-Technion/In-lab Experiments/OneStopGaze L1 English/Full Experiment Folders/ose_1n_l1_l60_tower_st/runtime/dataviewer/l39_124 /Users/shubi/Library/CloudStorage/OneDrive-Technion/In-lab Experiments/OneStopGaze L1 English/Full Experiment Folders/ose_1n_l1_l60_tower_st/runtime/excluded_dataviewer/l39_124
/Users/shubi/Library/CloudStorage/OneDrive-Technion/In-lab Experiments/OneStopGaze L1 English/Full Experiment Folders/ose_1n_l1_l60_tower_st/runtime/dataviewer/l40_126 /Users/shubi/Library/CloudStorage/OneDrive-Technion/In-lab Experiments/OneStopGaze L1 English/Full Experiment Folders/ose_1n_l1_l60_tower_st/runtime/excluded_dataviewer/l40_126
/Users/shubi/Library/CloudStorage/OneDrive-Technion/In-lab Experiments/OneStopGaze L1 English/Full Experiment Folders/ose_1n_l1_l60_tower_st/runtime/dataviewer/l41_128 /Users/shubi/Library/CloudStorage/OneDrive-Technion/In-lab Experiments/OneStopGaze L1 English/Full Experiment Folders/ose_1n_l1_l60_tower_st/r