In [1]:
import os
from datetime import datetime
from pathlib import Path

import pandas as pd

# Root of the experiment data tree. The location is machine-specific, so it can be
# overridden with the ONESTOP_BASE_PATH environment variable; the original
# hard-coded location stays the default when the variable is not set.
base_path = Path(
    os.environ.get(
        "ONESTOP_BASE_PATH",
        "/Users/shubi/Library/CloudStorage/OneDrive-Technion/In-lab Experiments/OneStopGaze L1 English",
    )
)
# Parent of the per-experiment folders that the processing cell iterates over.
exp_folders_path = base_path / "Full Experiment Folders"


def get_missing_words(p_words: list[str], p_ias: pd.DataFrame) -> list[str]:
    """Return the trailing paragraph words that have no row in ``p_ias``.

    Words and AOI rows are matched purely by position, so every word whose
    index is ``>= len(p_ias)`` is considered missing.

    Args:
        p_words: The words of the paragraph, in reading order.
        p_ias: The AOI rows of the paragraph (one row per word).

    Returns:
        The words past the last AOI row, in reading order. Empty when
        ``p_ias`` has at least as many rows as there are words.

    Raises:
        ValueError: If both inputs are non-empty and their lengths differ by
            more than 10, i.e. the mismatch is too large to be a few words
            cut off at the end of the paragraph.
    """
    n_words, n_ias = len(p_words), len(p_ias)

    # The size check only applies when at least one word lines up with an AOI
    # row; with an empty ``p_ias`` every word is simply reported as missing.
    if n_words and n_ias and abs(n_ias - n_words) > 10:
        raise ValueError(
            f"p_ias has {n_ias} AOIs but the trial has {n_words} words; "
            "the difference is larger than 10"
        )

    return list(p_words[n_ias:])


def add_missing_rows_to_ias(
    p_ias: pd.DataFrame,
    missing_words: list[str],
    char_width: int = 19,
    margin: int = 10,
    line_left: int = 358,
    line_height: int = 114,
) -> pd.DataFrame:
    """Append one AOI row per missing word to the end of ``p_ias``.

    The first missing word opens a new text line: it starts at ``line_left``,
    its top is the bottom of the current last row and it is ``line_height``
    pixels tall. Every following word is placed directly to the right of the
    row added before it, on that same line.

    Args:
        p_ias: AOI rows of the paragraph. Must be non-empty and contain the
            columns ``group``, ``type``, ``ID``, ``left``, ``top``, ``right``,
            ``bottom`` and ``label``.
        missing_words: Words to append, in reading order.
        char_width: Width of one character in pixels.
        margin: Horizontal padding in pixels, applied on both sides of a word.
        line_left: x coordinate at which a new text line starts.
        line_height: Height in pixels of the newly opened text line.

    Returns:
        A new frame with the extra rows appended and the index reset; the
        input frame is not modified. ``p_ias`` itself is returned when
        ``missing_words`` is empty.
    """
    for indx, word in enumerate(missing_words):
        word_len_in_px = (len(word) * char_width) + (margin * 2) - 1

        # Geometry of the current last row (on later iterations this is the
        # row appended by the previous iteration).
        last_top = p_ias["top"].iloc[-1]
        last_right = p_ias["right"].iloc[-1]
        last_bottom = p_ias["bottom"].iloc[-1]

        if indx == 0:
            # First missing word: start a new line below the last row.
            box = {
                "left": line_left,
                "top": last_bottom,
                "right": line_left + word_len_in_px,
                "bottom": last_bottom + line_height,
            }
        else:
            # Later words continue on the line opened by the first one.
            box = {
                "left": last_right,
                "top": last_top,
                "right": last_right + word_len_in_px,
                "bottom": last_bottom,
            }

        new_row = pd.Series(
            {
                "group": None,
                "type": None,
                "ID": p_ias["ID"].iloc[-1] + 1,
                "label": word,
                **box,
            }
        )
        # Inherit group/type from the last row that has them filled in.
        new_row[["group", "type"]] = p_ias[["group", "type"]].ffill(axis=0).iloc[-1]
        p_ias = pd.concat([p_ias, new_row.to_frame().T], ignore_index=True)
    return p_ias


def load_ias(ias_path: Path) -> pd.DataFrame:
    """Read a tab-separated, header-less ``.ias`` file into a frame.

    The first field of every line holds two space-separated values; they are
    split (on the first space only) into the ``group`` and ``type`` columns.

    Args:
        ias_path: Location of the ``.ias`` file.

    Returns:
        A frame with the columns ``group``, ``type``, ``ID``, ``left``,
        ``top``, ``right``, ``bottom`` and ``label``, in that order.
    """
    file_columns = ["group", "ID", "left", "top", "right", "bottom", "label"]
    table = pd.read_csv(ias_path, sep="\t", names=file_columns)

    # "group type" -> two separate columns
    group_and_type = table["group"].str.split(" ", n=1, expand=True)
    table[["group", "type"]] = group_and_type

    output_columns = ["group", "type", *file_columns[1:]]
    return table[output_columns]  # type: ignore


def get_paragarph_ias(ias_data: pd.DataFrame, subject_id: str) -> pd.DataFrame:
    """
    Return the rows of the largest group in the given IAS data.

    When the data holds more than one group, the last 3 groups (in order of
    appearance) are ignored, as they are question, question+answers, feedback.
    Data with a single group is only accepted for subject "l34_277".

    Args:
        ias_data (pd.DataFrame): The input IAS data (needs a "group" column).
        subject_id (str): Subject the data belongs to; only used to validate
            the single-group case.

    Returns:
        pd.DataFrame: The rows belonging to the group with the most rows.

    Raises:
        AssertionError: If there is exactly one group and the subject is not
            "l34_277".
    """
    distinct_groups = ias_data["group"].unique().tolist()

    if len(distinct_groups) == 1:
        assert subject_id == "l34_277"  # IA_40.ias (trial 38) has only one group.
        candidates = ias_data
    else:
        # Drop the three trailing groups before looking for the largest one.
        kept_groups = distinct_groups[:-3]
        candidates = ias_data[ias_data["group"].isin(kept_groups)].copy()  # type: ignore

    group_sizes = candidates.groupby("group").size()
    biggest = group_sizes.idxmax()
    return candidates[candidates["group"] == biggest]  # type: ignore


def load_dat(dat_file_path: Path, names=None) -> pd.DataFrame:
    """Read a tab-separated ``.dat`` file.

    Args:
        dat_file_path: Location of the file.
        names: Optional column names. When given (and non-empty) the file is
            read as header-less with these names; otherwise the first line of
            the file is used as the header.

    Returns:
        The file contents as a frame.
    """
    read_options = {"sep": "\t"}
    if names:
        read_options["names"] = names
    return pd.read_csv(dat_file_path, **read_options)


def get_trial_index(ias_path: Path, trial_report: pd.DataFrame, subject_id) -> int:
    """Map an ``.ias`` file to the ``trial`` value recorded in the trial report.

    The index embedded in the file name (the part between the first ``_`` and
    the first following ``.``, e.g. ``"40"`` for ``IA_40.ias``) is compared,
    as a string, with the ``Trial_Index_`` column of the rows that belong to
    ``subject_id``.

    Args:
        ias_path: Path of the ``.ias`` file.
        trial_report: Trial report with the columns ``RECORDING_SESSION_LABEL``,
            ``Trial_Index_`` and ``trial``.
        subject_id: Value to match against ``RECORDING_SESSION_LABEL``.

    Returns:
        The ``trial`` value of the single matching row, as an int.

    Raises:
        ValueError: If the number of matching rows is not exactly one.
    """
    file_index = ias_path.name.split("_")[1].split(".")[0]

    is_subject = trial_report["RECORDING_SESSION_LABEL"] == subject_id
    is_file_index = trial_report["Trial_Index_"] == file_index
    matching_trials = trial_report.loc[is_subject & is_file_index, "trial"]

    # .item() insists on exactly one match
    return matching_trials.astype(int).item()  # type: ignore


def get_parag_words(
    subject_dat, ias_path, trial_report: pd.DataFrame, subject_id
) -> list[str]:
    """Return the whitespace-separated words of the paragraph shown in a trial.

    Args:
        subject_dat: Per-subject data source with the columns ``trial`` and
            ``$paragraph``.
        ias_path: Path of the ``.ias`` file identifying the trial.
        trial_report: Trial report used to resolve the trial number.
        subject_id: Subject whose trial is looked up.

    Returns:
        The paragraph text split on whitespace.

    Raises:
        ValueError: If, after dropping duplicate rows, the trial does not map
            to exactly one paragraph.
    """
    wanted_trial = get_trial_index(ias_path, trial_report, subject_id)
    trial_rows = subject_dat[subject_dat["trial"] == wanted_trial].drop_duplicates()
    paragraph_text = trial_rows["$paragraph"].item()
    return paragraph_text.split()


def get_dat_file_path(exp_folders_path: Path, folder_name: str) -> Path:
    """Locate the experiment-level BLOCKTRIAL data source of a folder.

    Several file-name variants are tried inside
    ``<exp_folders_path>/<folder_name>/datasets``; the first one that exists
    wins.

    Args:
        exp_folders_path: Directory holding all experiment folders.
        folder_name: Name of the experiment folder.

    Returns:
        Path of the first existing candidate.

    Raises:
        FileNotFoundError: If none of the candidates exists.
    """
    datasets_dir = exp_folders_path / folder_name / "datasets"
    # e.g. the folder ..._l1_latest_widthfix stores its data source as
    # TRIAL_DataSource_..._l1_latest_BLOCKTRIAL.dat
    candidate_names = (
        "TRIAL_DataSource_onestop_BLOCKTRIAL.dat",
        f"TRIAL_DataSource_{folder_name}_BLOCKTRIAL.dat",
        f"TRIAL_DataSource_{folder_name.removesuffix('_widthfix')}_BLOCKTRIAL.dat",
        f"TRIAL_DataSource_{folder_name.removesuffix('_l2_latest')}_BLOCKTRIAL.dat",
    )

    existing = next(
        (datasets_dir / name for name in candidate_names if (datasets_dir / name).exists()),
        None,
    )
    if existing is None:
        raise FileNotFoundError(f"Could not find the dat file for {folder_name}")
    return existing


def get_subject_dat_file_path(exp_folders_path, folder_name, subject_id):
    """Locate the per-subject ``actual_...BLOCKTRIAL.dat`` file.

    Several file-name variants are tried inside
    ``<exp_folders_path>/<folder_name>/results/<subject_id>``; the first one
    that exists wins.

    Args:
        exp_folders_path: Directory holding all experiment folders.
        folder_name: Name of the experiment folder.
        subject_id: Name of the subject's results folder.

    Returns:
        Path of the first existing candidate.

    Raises:
        FileNotFoundError: If none of the candidates exists.
    """
    subject_dir = exp_folders_path / folder_name / "results" / subject_id
    # e.g. actual_TRIAL_DataSource_ose_1p_l1_l60_tower_st_mit_l1_latest_BLOCKTRIAL
    candidate_names = (
        "actual_TRIAL_DataSource_onestop_BLOCKTRIAL.dat",
        f"actual_TRIAL_DataSource_{folder_name}_BLOCKTRIAL.dat",
        f"actual_TRIAL_DataSource_{folder_name.removesuffix('_widthfix')}_BLOCKTRIAL.dat",
        f"actual_TRIAL_DataSource_{folder_name.removesuffix('_l2_latest')}_BLOCKTRIAL.dat",
    )

    existing = next(
        (subject_dir / name for name in candidate_names if (subject_dir / name).exists()),
        None,
    )
    if existing is None:
        raise FileNotFoundError(f"Could not find the dat file for {subject_id}")
    return existing


def get_aoi_folder_path(exp_folders_path, folder_name, subject_id):
    """Locate the ``aoi`` folder of a subject.

    The folder is looked for under ``results/<subject_id>`` first and under
    ``runtime/dataviewer/<subject_id>`` second.

    Args:
        exp_folders_path: Directory holding all experiment folders.
        folder_name: Name of the experiment folder.
        subject_id: Name of the subject's folder.

    Returns:
        Path of the first existing ``aoi`` folder.

    Raises:
        FileNotFoundError: If neither location exists.
    """
    experiment_dir = exp_folders_path / folder_name
    parents_in_priority_order = (
        experiment_dir / "results" / subject_id,
        experiment_dir / "runtime" / "dataviewer" / subject_id,
    )

    existing = next(
        (parent / "aoi" for parent in parents_in_priority_order if (parent / "aoi").exists()),
        None,
    )
    if existing is None:
        raise FileNotFoundError(f"Could not find the aoi folder for {subject_id}")
    return existing


def rebuild_ias_data(ias_data, p_ias) -> pd.DataFrame:
    """Splice the (possibly edited) paragraph rows back into the full IAS table.

    All rows of ``ias_data`` between the first and the last row of the
    paragraph's group are replaced by ``p_ias``. ``group`` and ``type`` are
    then merged back into a single space-separated ``group`` column, which is
    the layout of the original file.

    Args:
        ias_data: Full IAS table with separate ``group`` and ``type`` columns.
        p_ias: Rows of a single group that replace that group in ``ias_data``.

    Returns:
        A new frame without the ``type`` column and with a fresh index. The
        inputs are not modified.

    Raises:
        ValueError: If ``p_ias`` does not hold exactly one distinct group value.
    """
    group_value = p_ias.group.unique().item()

    # Index labels of the rows that currently make up the paragraph's group
    group_index = ias_data.index[ias_data["group"] == group_value]

    before = ias_data[ias_data.index < group_index.min()]
    after = ias_data[ias_data.index > group_index.max()]

    rebuilt = pd.concat([before, p_ias, after], ignore_index=True)

    # Original ias file has tabs as separators except for group and type which are separated by a space.
    merged = rebuilt["group"] + " " + rebuilt["type"]
    # A row without a type would turn into NaN above and be written out as an
    # empty field; keep its bare group value instead.
    rebuilt["group"] = merged.where(rebuilt["type"].notna(), rebuilt["group"])

    return rebuilt.drop(columns=["type"])


def get_sorted_ias_paths(
    exp_folders_path: Path,
    folder_name: str,
    subject_id: str,
    trial_report: pd.DataFrame,
) -> list[Path]:
    """List a subject's ``.ias`` files ordered by their trial number.

    Args:
        exp_folders_path: Directory holding all experiment folders.
        folder_name: Name of the experiment folder.
        subject_id: Subject whose ``aoi`` folder is scanned.
        trial_report: Trial report used to resolve each file's trial number.

    Returns:
        The ``*.ias`` paths of the subject's ``aoi`` folder, sorted in
        ascending order of the trial number resolved for each file.
    """

    def trial_number(path: Path) -> int:
        return get_trial_index(path, trial_report=trial_report, subject_id=subject_id)

    aoi_dir = get_aoi_folder_path(exp_folders_path, folder_name, subject_id)
    return sorted(aoi_dir.glob("*.ias"), key=trial_number)


def load_subject_dat(
    exp_folders_path: Path, folder_name: str, subject_id: str
) -> pd.DataFrame:
    """Load a subject's data source, labelled with the experiment's column names.

    The experiment-level data source is read only for its header; the
    per-subject file is then read as header-less using those column names.

    Args:
        exp_folders_path: Directory holding all experiment folders.
        folder_name: Name of the experiment folder.
        subject_id: Subject whose data source is loaded.

    Returns:
        The subject's data source.
    """
    # Column names come from the experiment-level file
    header_source = load_dat(get_dat_file_path(exp_folders_path, folder_name))
    column_names = header_source.columns.to_list()

    subject_file = get_subject_dat_file_path(exp_folders_path, folder_name, subject_id)
    return load_dat(subject_file, names=column_names)


def handle_long_words_going_down_line(
    p_ias: pd.DataFrame, folder_name, subject_id, ias_path, level: str
) -> tuple[pd.DataFrame, bool]:
    """Relabel hyphenated words that appear as two identical consecutive rows.

    A row whose label equals the label of the next row is treated as a word
    broken over two rows. If the label is one of the known hyphenated words,
    the row and the row after it are relabelled with the word's two fragments;
    otherwise an error line is printed and the labels are left alone.

    Args:
        p_ias: AOI rows of the paragraph, with a ``label`` column and an
            integer index. The labels are updated in place.
        folder_name: Experiment folder name (used in the error line only).
        subject_id: Subject id (used in the error line only).
        ias_path: Path of the ``.ias`` file (used in the error line only).
        level: Key selecting the split of "three-square-meter"
            ("Adv" or "Ele").

    Returns:
        ``p_ias`` and a flag that is True when any pair of identical
        consecutive labels was found, whether or not it could be relabelled.
    """
    # Known words and the two fragments they are split into; a dict value
    # means the split depends on `level`.
    word_parts = {
        "south-east": ["south-", "east"],
        "credit-card": ["credit-", "card"],
        "hunter-gatherer": ["hunter-", "gatherer"],
        "100sq-meter": ["100sq-", "meter"],
        "brand-new": ["brand-", "new"],
        "deep-fried": ["deep-", "fried"],
        "e-bicycles": ["e-", "bicycles"],
        "film-editing": ["film-", "editing"],
        "open-minded,": ["open-", "minded,"],
        "three-square-meter": {
            "Adv": ["three-square-", "meter"],
            "Ele": ["three-", "square-meter"],
        },
        "French-Canadian": ["French-", "Canadian"],
        "honey-flavored": ["honey-", "flavored"],
        "top-level": ["top-", "level"],
        "film-makers.": ["film-", "makers."],
        "post-genocide": ["post-", "genocide"],
        "10-year-olds": ["10-year-", "olds"],
        "100-seat": ["100-", "seat"],
        "51-year-old": ["51-year-", "old"],
        "el-Haite": ["el-", "Haite"],
        "6.30am;": ["6.30", "am;"],
        "al-Mamun.": ["al-", "Mamun."],
        "Seven-year-old": ["Seven-year-", "old"],
    }

    # True on every row whose label is repeated by the row right after it
    labels = p_ias["label"]
    duplicates = labels.shift(-1) == labels
    found_duplicates = bool(duplicates.any())

    # Walk over a snapshot of the flagged labels, so relabelling one pair does
    # not change what is read for the next one.
    flagged_labels = p_ias.loc[duplicates, "label"]
    for idx, word_label in flagged_labels.items():
        assert isinstance(idx, int)
        if word_label not in word_parts:
            print(
                f"Error: {folder_name}, {subject_id},{ias_path.stem}, {ias_path.name} has duplicate words not in word_parts: {word_label}"
            )
            continue

        fragments = word_parts[word_label]
        if word_label == "three-square-meter":
            fragments = fragments[level]

        first_fragment, second_fragment = fragments[0], fragments[1]
        p_ias.at[idx, "label"] = first_fragment
        p_ias.at[idx + 1, "label"] = second_fragment

    return p_ias, found_duplicates

In [2]:
# The trial reports of the "n" and "p" report sets live in separate TSV files;
# both are loaded and stacked into a single table with a fresh index.
trial_report_n = pd.read_csv(
    base_path.joinpath("Reports", "n_reports", "Output", "n_trial_report.tsv"),
    sep="\t",
)
trial_report_p = pd.read_csv(
    base_path.joinpath("Reports", "p_reports", "Output", "p_trial_report.tsv"),
    sep="\t",
)
trial_report = pd.concat((trial_report_n, trial_report_p), ignore_index=True)

In [3]:
# When False, every processed .ias file is overwritten in place with the rebuilt
# table. Set to True to compute the fixes without touching any file.
dry_run = False

exp_folders_paths = exp_folders_path.glob("*")
fixed_trials = []
all_trials_p_ias = []
trial_report_subjects, non_trial_report_subjects = 0, 0
mismatched_words = []
# Built once instead of converting the column to a list for every subject.
known_subjects = set(trial_report["RECORDING_SESSION_LABEL"])
for full_folder_name in exp_folders_paths:
    folder_fixed_trials = []
    folder_name = full_folder_name.name

    for subject in (exp_folders_path / folder_name / "results").glob("[lL]*"):
        subject_id = subject.name

        if subject_id not in known_subjects:
            print(f"Skipping {subject_id} as it is not in the trial report")
            non_trial_report_subjects += 1
            # A subject without a trial report entry is treated as fatal.
            raise Exception(f"{subject_id} is not in the trial report")
        else:
            print(f"Processing {subject_id}")
            trial_report_subjects += 1

        sub_trial_rep = trial_report[
            trial_report["RECORDING_SESSION_LABEL"] == subject_id
        ][["batch", "article_id", "paragraph_id", "trial", "level"]]

        subject_dat = load_subject_dat(exp_folders_path, folder_name, subject_id)

        ias_paths = get_sorted_ias_paths(
            exp_folders_path, folder_name, subject_id, trial_report
        )

        skipped_ias = []
        for ias_path in ias_paths:
            # print(ias_path.name)
            ias_data = load_ias(ias_path)
            # NOTE(review): files with fewer than 30 rows are skipped - presumably
            # they do not hold a paragraph; confirm the threshold.
            if len(ias_data) < 30:
                skipped_ias.append(ias_path.stem)
                continue

            ias_data["ID"] = (
                ias_data.groupby("group").cumcount() + 1
            )  # for each group start the ID from 1

            # Resolve the trial number once and look it up in the subject's own
            # rows (the mask and the frame it filters now share the same index).
            trial_index = get_trial_index(
                ias_path, trial_report=trial_report, subject_id=subject_id
            )
            batch, article_id, paragraph_id, level = (
                sub_trial_rep.loc[
                    sub_trial_rep["trial"] == str(trial_index),
                    ["batch", "article_id", "paragraph_id", "level"],
                ]
                .iloc[0]
                .to_list()
            )

            try:
                p_ias = get_paragarph_ias(ias_data, subject_id=subject_id)
            except Exception as e:
                print(f"Error: {e}")
                raise

            p_ias, found_duplicates = handle_long_words_going_down_line(
                p_ias, folder_name, subject_id, ias_path, level=level
            )
            if found_duplicates:
                folder_fixed_trials.append(
                    (
                        folder_name,
                        subject_id,
                        ias_path.stem,
                        batch,
                        article_id,
                        paragraph_id,
                        level,
                        "duplicates (hyphen at end of line) fixed",
                    )
                )

            p_words = get_parag_words(
                subject_dat=subject_dat,
                ias_path=ias_path,
                trial_report=trial_report,
                subject_id=subject_id,
            )

            # Words at the end of the paragraph that have no AOI row
            missing_words = get_missing_words(
                p_words=p_words,
                p_ias=p_ias,
            )
            if missing_words:
                p_ias = add_missing_rows_to_ias(
                    p_ias=p_ias, missing_words=missing_words
                )

                folder_fixed_trials.append(
                    (
                        folder_name,
                        subject_id,
                        ias_path.stem,
                        batch,
                        article_id,
                        paragraph_id,
                        level,
                        "11th row added",
                    )
                )

            ias_data = rebuild_ias_data(ias_data, p_ias)

            if not dry_run:
                # Overwrites the source file in place
                ias_data.to_csv(ias_path, sep="\t", index=False, header=False)

            # NOTE: the analysis cells further down read `all_trials_p_ias_df`,
            # which is only available when this block and the one at the end of
            # the cell are re-enabled.
            # all_trials_p_ias.append(
            #     (
            #         subject_id,
            #         batch,
            #         article_id,
            #         paragraph_id,
            #         level,
            #         # p_ias.drop(columns=["group"]).reset_index(drop=True).to_string(),
            #         p_ias.to_string(
            #             index=False,
            #             columns=["left", "right", "label"],
            #             col_space={"left": 10, "right": 10},
            #         ),
            #     )
            # )

    fixed_trials.extend(folder_fixed_trials)
    if folder_fixed_trials:
        print(f"{folder_name}; Fixed {len(folder_fixed_trials)} trials.")

if fixed_trials:
    date_str = datetime.now().strftime(
        "%Y%m%d"
    )  # get current date as a string in the format YYYYMMDD
    pd.DataFrame(
        fixed_trials,
        columns=[
            "folder",
            "subject",
            "trial",
            "batch",
            "article_id",
            "paragraph_id",
            "level",
            "reason",
        ],
    ).to_csv(f"fixed_trials_{date_str}.csv", index=False)

    print(f"Fixed trials saved to fixed_trials_{date_str}.csv")

# all_trials_p_ias_df = pd.DataFrame(
#     all_trials_p_ias,
#     columns=["subject_id", "batch", "article_id", "paragraph_id", "level", "p_ias"],
# )

Processing l42_2070
Processing l59_547
Processing l39_542
Processing l56_522
Processing l49_529
Processing l9_536
ose_3p_l1_l60_tower_st_mit_l1_latest_widthfix; Fixed 26 trials.
Processing l59_485
ose_2n_l1_l60_tower_st_lacclab_corrected; Fixed 2 trials.
Processing l57_439
Processing l53_431
Processing l45_452
Processing l46_453
Processing l34_447
Processing l44_412
Processing l40_405
Processing l41_451
Processing l58_441
Processing l37_395
Processing l36_394
Processing l54_461
Processing l55_435
Processing l48_422
Processing l47_457
Processing l50_460
Processing l60_446
Processing l35_450
Processing l51_427
Processing l52_428
Processing l38_396
Processing l43_411
ose_3p_l1_l60_tower_st_fixed_lacc; Fixed 87 trials.
Processing l11_525
Processing l36_524
ose_2p_l1_l60_tower_st_mit_l1_latest_widthfix; Fixed 5 trials.
Processing l21_102
Processing l22_103
Processing l39_125
Processing l60_190
Processing l50_145
Processing l16_96
Processing l40_127
Processing l8_268
Processing l4_80
Process

In [91]:
# Sanity checks on the processing loop: all 360 expected subjects were processed
# and none of the result folders belonged to a subject missing from the report.
assert (
    trial_report_subjects == 360
), f"expected 360 subjects from the trial report, processed {trial_report_subjects}"
assert (
    non_trial_report_subjects == 0
), f"{non_trial_report_subjects} subject(s) were not in the trial report"

In [98]:
# Number of distinct p_ias renderings per (batch, article, paragraph, level),
# most varied first.
# NOTE: `all_trials_p_ias_df` is only defined when the commented-out blocks in the
# processing cell are re-enabled.
(
    all_trials_p_ias_df[["p_ias", "batch", "article_id", "paragraph_id", "level"]]
    .drop_duplicates()
    .groupby(["batch", "article_id", "paragraph_id", "level"])
    .size()
    .reset_index(name="count")
    .sort_values(by="count", ascending=False)
)  # ['count'].value_counts()

Unnamed: 0,batch,article_id,paragraph_id,level,count
120,2,1,5,Adv,3
326,3,9,3,Adv,3
206,2,8,6,Adv,3
204,2,8,5,Adv,3
202,2,8,4,Adv,2
...,...,...,...,...,...
87,1,7,4,Ele,1
207,2,8,6,Ele,1
91,1,8,2,Ele,1
98,1,9,1,Adv,1


In [None]:
# Collect the distinct p_ias renderings of every paragraph into a list ...
df = (
    all_trials_p_ias_df[["p_ias", "batch", "article_id", "paragraph_id", "level"]]
    .drop_duplicates()
    .groupby(["batch", "article_id", "paragraph_id", "level"])
    .agg(list)
)
# ... and spread each list over three columns (this requires the longest list
# to have exactly three entries; shorter lists are padded with NaN).
df[["col1", "col2", "col3"]] = df["p_ias"].apply(pd.Series)
df = df.drop(columns=["p_ias"])
df

In [163]:
import numpy as np
from Levenshtein import distance


def compute_distance(x, ind1, ind2):
    """Return the Levenshtein distance between two string entries of ``x``.

    Args:
        x: Any container indexable by ``ind1`` and ``ind2`` (e.g. a row).
        ind1: Key of the first entry.
        ind2: Key of the second entry.

    Returns:
        The distance as an int, or NaN when either entry is not a string.
    """
    both_are_text = isinstance(x[ind1], str) and isinstance(x[ind2], str)
    if not both_are_text:
        return np.nan
    return int(distance(x[ind1], x[ind2]))

In [171]:
# Distance between the first two renderings of every paragraph, smallest first;
# paragraphs with fewer than two renderings (NaN distance) are dropped.
(
    df.apply(lambda row: compute_distance(row, "col1", "col2"), axis=1)
    .sort_values()
    .reset_index()
    .dropna()
)

Unnamed: 0,batch,article_id,paragraph_id,level,0
0,3,9,1,Ele,1.0
1,3,9,1,Adv,2.0
2,3,9,2,Ele,9.0
3,1,3,5,Ele,9.0
4,2,3,4,Ele,13.0
...,...,...,...,...,...
230,2,7,2,Adv,756.0
231,2,2,2,Adv,798.0
232,3,6,6,Adv,801.0
233,3,9,2,Adv,804.0


In [175]:
# Distinct renderings of one specific paragraph (batch 2, article 8, paragraph 5, Adv)
x = (
    all_trials_p_ias_df.query(
        "level == 'Adv' and batch =='2' and article_id =='8' and paragraph_id=='5'"
    )[["p_ias", "batch", "article_id", "paragraph_id", "level"]]
    .drop_duplicates()
)

In [177]:
x

Unnamed: 0,p_ias,batch,article_id,paragraph_id,level
429,left right label\n 358 ...,2,8,5,Adv
8586,left right label\n 358 ...,2,8,5,Adv
17765,left right label\n 358 ...,2,8,5,Adv


In [178]:
# Write every rendering to its own text file, named after the row's index label.
for row_label, rendering in x.p_ias.items():
    print(row_label)
    with open(f"p_ias_{row_label}.txt", "w") as file:
        file.write(rendering)

429
8586
17765


In [50]:
# Number of subjects with at least one fixed trial, per experiment folder.
# NOTE(review): the processing cell writes a date-stamped `fixed_trials_<date>.csv`,
# while this reads `fixed_trials.csv` - confirm this is the intended file.
fixed_trials_df = pd.read_csv("fixed_trials.csv")
(
    fixed_trials_df.drop_duplicates(subset=["subject"])
    .reset_index(drop=True)
    .groupby("folder")
    .size()
)

folder
ose_1n_l1_l60_tower_st_mit_l1_latest         1
ose_2n_l1_l60_tower_st_lacc                  3
ose_2n_l1_l60_tower_st_lacclab_corrected     1
ose_2n_l1_l60_tower_st_mit_l1_latest         1
ose_2p_l1_l60_tower_st_mit_l1_latest         7
ose_3n_l1_l60_tower_st_fixed_lacc           24
ose_3n_l1_l60_tower_st_mit_l1_latest         1
ose_3p_l1_l60_tower_st_fixed_lacc           22
dtype: int64