## Match/merge/split eeg-label-texts

In [1]:
import numpy as np
import pandas as pd

# Load the EEG sample table, then the label table, from their pickled
# DataFrames; each table's shape and columns are printed right after loading.
eeg_table_path = './data/tmp/zuco_eeg_128ch_1280len.df'
df = pd.read_pickle(eeg_table_path)
print(df.shape, df.columns)

label_table_path = './data/tmp/zuco_label_8variants.df'
label_table = pd.read_pickle(label_table_path)
print(label_table.shape, label_table.columns)

(22335, 6) Index(['eeg', 'mask', 'text', 'dataset', 'task', 'subject'], dtype='object')
(1888, 17) Index(['raw text', 'dataset', 'task', 'control', 'raw label', 'input text',
       'text uid', 'sentiment label', 'relation label',
       'lexical simplification (v0)', 'lexical simplification (v1)',
       'semantic clarity (v0)', 'semantic clarity (v1)',
       'syntax simplification (v0)', 'syntax simplification (v1)',
       'naive rewritten', 'naive simplified'],
      dtype='object')


### Check text match
- make sure all texts (after revision) obtained from the original `.mat` files can be retrieved from the label table (from the `.csv` files).

In [2]:
# Substring -> replacement table consumed by `revise_typo`, which walks the
# entries in insertion order and rewrites every occurrence of a key found in
# the text.
# Matching is plain substring replacement, so a short key also rewrites longer
# words that contain it (e.g. "Jose" inside "Joseph").
# NOTE(review): verify that no text relies on such a longer word being kept.
# Keys containing U+FFFD (the replacement character) are presumably characters
# that were mis-decoded in the source texts -- TODO confirm.
typobook = {"emp11111ty":   "empty",
            "film.1":       "film.",
            "–":            "-",
            "’s":           "'s",
            "�s":           "'s",
            "`s":           "'s",
            "Maria":        "Marić",
            "1Universidad": "Universidad",
            "1902—19":      "1902 - 19",
            "Wuerttemberg": "Württemberg",
            "long -time":   "long-time",
            "Jose":         "José",
            "Bucher":       "Bôcher",
            "1839 ? May":   "1839 - May",
            "G�n�ration":  "Generation",
            # NOTE(review): unlike the neighbouring entries, this one removes a
            # diacritic; presumably the label table spells it "Bragana" -- confirm.
            "Bragança":     "Bragana",
            "1837?October": "1837 - October",
            "nVera-Ellen":  "Vera-Ellen",
            "write Ethics": "wrote Ethics",
            "Adams-Onis":   "Adams-Onís",
            "(40 km?)":     "(40 km²)",
            "(40 km˝)":     "(40 km²)",
            " (IPA: /?g?nz?b?g/) ": " ",
            '""Canes""':    '"Canes"',

            "111Senator":   "Senator",
            "Creteil":      "Créteil",
            "Zoonomia":     "Zoönomia",
            "1902�19":     "1902 - 19",
            "nee Darwin":   "née Darwin",
            "Ruthy":        "Réthy",
            "Eidgenoessische":  "Eidgenössische",
            "40 km�":       "40 km²",
            "King Leopold":  "King Léopold",
            }

def revise_typo(text, book=None):
    """Return `text` with every substring listed in a typo table replaced.

    Args:
        text: The string to correct.
        book: Optional mapping of wrong substring -> replacement. When omitted
            (or None), the module-level `typobook` is used.

    Returns:
        The corrected string. Entries are applied one after another in the
        mapping's iteration order, so a later entry also sees the output of
        earlier replacements.
    """
    if book is None:
        book = typobook
    for src, tgt in book.items():
        # str.replace leaves the text unchanged when `src` is absent, so no
        # separate membership test (a second scan of the text) is needed.
        text = text.replace(src, tgt)
    return text

# def match_text(text_in_mat, raw_text_in_table, input_text_in_table):
#     if (text_in_mat != raw_text_in_table) and (text_in_mat != input_text_in_table):
#         text_in_mat_revised = revise_typo(text_in_mat)
#         if (text_in_mat_revised != raw_text_in_table) and (text_in_mat_revised != input_text_in_table):
#             return False
#     return True

# Correct the EEG-side sentences, then count the rows whose corrected sentence
# is not among the label table's input texts (printed count).
revised_texts = df['text'].apply(revise_typo)
df['revised text'] = revised_texts
input_texts = label_table['input text'].tolist()
matched = df['revised text'].isin(input_texts)
unmatched_rows = df.loc[~matched]
print(len(unmatched_rows))


0


### Assign `label id` according to the `index` of matched row in the label table

In [3]:
# Assign each EEG sample the index ("label id") of its row in `label_table`.
# Samples are handled per (dataset, task, subject), in their stored order, and
# are matched only against the label rows of the same dataset and task:
#   1. by position, when the i-th sample carries the text of the i-th label row;
#   2. otherwise by text lookup, with a task-specific tie-break when the text
#      occurs twice among those label rows.
new_groups = []
for (d, t, s), group in df.groupby(['dataset', 'task', 'subject']):
    sub_label_table = label_table[(label_table['dataset'] == d) & (label_table['task'] == t)]
    n_label_rows = sub_label_table.shape[0]
    src_texts = group['revised text'].tolist()
    label_ids = []
    left_label = None  # 'raw label' resolved for the previous sample of this group
    # Texts already resolved through the task2 duplicate branch. The set has to
    # live outside the per-sample loop: re-creating it for every sample left it
    # always empty, so a repeated sentence could never reach the second label row.
    # NOTE(review): texts matched by position are not recorded here -- confirm
    # that this is intended.
    task2_cache = set()

    for i, src_text in enumerate(src_texts):
        # Positional candidate; skipped (instead of raising IndexError) when
        # the group holds more samples than there are label rows.
        ideal_row = sub_label_table.iloc[i] if i < n_label_rows else None
        if ideal_row is not None and src_text == ideal_row['input text']:
            label_id = ideal_row.name
            control_label = ideal_row['raw label']
        else:
            matched_rows = sub_label_table[sub_label_table['input text'] == src_text]
            n_matched = matched_rows.shape[0]
            assert n_matched > 0, f"no label row for sample {i} of {(d, t, s)}"
            if n_matched == 1:
                label_id = matched_rows.index.values.item()
                control_label = matched_rows['raw label'].values.item()
            elif n_matched == 2:
                matched_labels = matched_rows['raw label'].values.tolist()
                if t == 'task3':
                    # Same sentence under two raw labels: pick the row whose
                    # label agrees with a neighbouring sample. This assumes the
                    # samples are grouped by relation type like the label table.
                    assert i != 0, f"{i}"  # the first sample has no left label
                    # The index labels of the two candidates must differ by
                    # more than 1 (the two rows are not consecutive).
                    assert matched_rows.iloc[1].name - matched_rows.iloc[0].name > 1
                    if left_label in matched_labels:
                        control_label = left_label
                    else:
                        # Fall back to the label of the next sample; if that
                        # sample is ambiguous too, use the one after it.
                        right_matched_rows = sub_label_table[sub_label_table['input text'] == src_texts[i + 1]]
                        if right_matched_rows.shape[0] > 1:
                            right_matched_rows = sub_label_table[sub_label_table['input text'] == src_texts[i + 2]]
                        right_label = right_matched_rows['raw label'].values.item()
                        assert right_label in matched_labels
                        control_label = right_label
                    the_matched_row = matched_rows[matched_rows['raw label'] == control_label]
                    label_id = the_matched_row.index.values.item()
                elif t == 'task2':
                    # First occurrence of the text -> first label row,
                    # a repeated occurrence -> second label row.
                    if src_text in task2_cache:
                        label_id = matched_rows.iloc[1].name
                    else:
                        label_id = matched_rows.iloc[0].name
                    task2_cache.add(src_text)
                    control_label = None
                else:
                    raise ValueError(f'{t}')
            else:
                raise ValueError(f'{n_matched}')
        left_label = control_label
        label_ids.append(label_id)

    # `assign` returns a new frame rather than writing into the groupby slice.
    new_groups.append(group.assign(**{'label id': label_ids}))
df = pd.concat(new_groups)
print(df.columns)

Index(['eeg', 'mask', 'text', 'dataset', 'task', 'subject', 'revised text',
       'label id'],
      dtype='object')


### Merge eeg and labels according to the `label id`

In [4]:
# Keep only the EEG-side columns, then attach the label-table row whose index
# equals each sample's 'label id' (left join, so every sample is kept).
eeg_columns = ['eeg', 'mask', 'subject', 'label id']
df = df.reindex(columns=eeg_columns)
df_merged = df.merge(
    label_table,
    how='left',
    left_on='label id',
    right_index=True,
)
print(df_merged.shape, df_merged.columns)

(22335, 21) Index(['eeg', 'mask', 'subject', 'label id', 'raw text', 'dataset', 'task',
       'control', 'raw label', 'input text', 'text uid', 'sentiment label',
       'relation label', 'lexical simplification (v0)',
       'lexical simplification (v1)', 'semantic clarity (v0)',
       'semantic clarity (v1)', 'syntax simplification (v0)',
       'syntax simplification (v1)', 'naive rewritten', 'naive simplified'],
      dtype='object')


## Strict split on unique texts

As the current `label table` shows, there is more than one form of text overlap. These overlaps are intentional design choices of the ZuCo dataset (please refer to [Section 3.2](https://arxiv.org/pdf/1912.00903) of the ZuCo 2.0 paper). In addition to the inter-subject overlap, there are 4 overlapping conditions:
  1. between task2 and task3 (with same corpus of Wiki, offering comparison between reading paradigms, i.e., NR vs. TSR);
  2. between zuco1 and zuco2 (to discuss the effect of experimental setting);
  3. within task3 (same sentences annotated with different labels);
  4. within zuco2-task2 (**unknown reason**).  


Therefore, to ensure the generation will not benefit from the data leakage, we split samples by the `text uid`. 
- Due to the block design, adjacent samples have the same relation label in task3. We therefore adopt **random sampling** instead of *ordinal chunking* or *interval sampling* to: (1) increase the diversity of relation types; and (2) exclude potential bias caused by the 'temporal adaptability of stimuli'. We set a fixed random seed to ensure reproducibility.
- We first collect all duplicated texts into the training set, then conduct stratified random sampling across `dataset` and `task`. 
- Samples with duplicate text but different relation labels are also included in the training set, because they share the same `text uid`.

In [5]:
# Text uids that occur in more than one row of the label table, i.e. texts that
# are duplicated independently of the subjects.
uid_counts = label_table.value_counts(['text uid'])
duplicated_uids = uid_counts[uid_counts>1]
duplicated_uids = [tp[0] for tp in duplicated_uids.index.tolist()]
duplicated_uid_set = set(duplicated_uids)  # constant-time membership tests

# Split every (dataset, task) group into train/val/test by text uid, so that
# all samples sharing a text uid land in the same phase.
new_groups = []
for name, group in df_merged.groupby(['dataset', 'task']):
    text_uids = group['text uid'].values
    text_uid_set = list(set(text_uids))
    dup_uids = [uid for uid in text_uid_set if uid in duplicated_uid_set]
    uniq_uids = [uid for uid in text_uid_set if uid not in duplicated_uid_set]
    n = len(text_uid_set)
    a = len(uniq_uids)
    if a:
        # k is the share of the unique uids that goes to train, derived from
        # [k*a+n-a:(a-k*a)/2 = 8:1]; the duplicated uids all go to train.
        k = 0.8-0.2*(n-a)/a
        # With many duplicated uids k turns negative, which would hand
        # np.split a negative (from-the-end) split index.
        k = max(k, 0.0)
        rng = np.random.default_rng(seed=42)  # fixed seed: reproducible split
        uniq_uids_shuffled = rng.permutation(uniq_uids)
        train, val, test = np.split(uniq_uids_shuffled, [int(k*a), int((a+k*a)/2)])
        train_uids = train.tolist() + dup_uids
        val_uids = val.tolist()
        test_uids = test.tolist()
    else:
        # Every uid of this group is duplicated: nothing to sample (and the
        # formula for k would divide by zero), so the whole group is train.
        train_uids, val_uids, test_uids = list(dup_uids), [], []

    # One dict lookup per sample instead of scanning the uid lists for every
    # row. 'train' is written last so it takes precedence, matching a
    # train -> val -> test check order.
    phase_by_uid = {uid: 'test' for uid in test_uids}
    phase_by_uid.update((uid, 'val') for uid in val_uids)
    phase_by_uid.update((uid, 'train') for uid in train_uids)
    phases = [phase_by_uid[uid] for uid in text_uids]

    # `assign` returns a new frame rather than writing into the groupby slice.
    new_groups.append(group.assign(phase=phases))
df_merged = pd.concat(new_groups)
print(df_merged.columns)
print(df_merged.value_counts(['phase']))

Index(['eeg', 'mask', 'subject', 'label id', 'raw text', 'dataset', 'task',
       'control', 'raw label', 'input text', 'text uid', 'sentiment label',
       'relation label', 'lexical simplification (v0)',
       'lexical simplification (v1)', 'semantic clarity (v0)',
       'semantic clarity (v1)', 'syntax simplification (v0)',
       'syntax simplification (v1)', 'naive rewritten', 'naive simplified',
       'phase'],
      dtype='object')
phase
train    17908
test      2227
val       2200
Name: count, dtype: int64


In [6]:
# Show the dtype of every column of the merged table.
print(df_merged.dtypes)

eeg                            object
mask                           object
subject                        object
label id                        int64
raw text                       object
dataset                        object
task                           object
control                          bool
raw label                      object
input text                     object
text uid                        int64
sentiment label                object
relation label                 object
lexical simplification (v0)    object
lexical simplification (v1)    object
semantic clarity (v0)          object
semantic clarity (v1)          object
syntax simplification (v0)     object
syntax simplification (v1)     object
naive rewritten                object
naive simplified               object
phase                          object
dtype: object


In [7]:
# Persist the merged table, including the 'phase' column, as a pickle.
df_merged.to_pickle('./data/tmp/zuco_eeg_label_8variants.df')

In [8]:
# Per (phase, task): number of samples and number of distinct text uids.
for name, group in df_merged.groupby(['phase', 'task']):
    n_samples = group.shape[0]
    n_sentences = group['text uid'].nunique()
    print(f"group: {name}, num of samples: {n_samples}, num of unique sentences: {n_sentences}")

group: ('test', 'task1'), num of samples: 417, num of unique sentences: 40
group: ('test', 'task2'), num of samples: 827, num of unique sentences: 65
group: ('test', 'task3'), num of samples: 983, num of unique sentences: 77
group: ('train', 'task1'), num of samples: 3316, num of unique sentences: 320
group: ('train', 'task2'), num of samples: 6231, num of unique sentences: 461
group: ('train', 'task3'), num of samples: 8361, num of unique sentences: 524
group: ('val', 'task1'), num of samples: 406, num of unique sentences: 40
group: ('val', 'task2'), num of samples: 824, num of unique sentences: 64
group: ('val', 'task3'), num of samples: 970, num of unique sentences: 76


In [9]:
# Per (phase, subject): number of samples and number of distinct text uids.
for name, group in df_merged.groupby(['phase', 'subject']):
    n_samples = group.shape[0]
    n_sentences = group['text uid'].nunique()
    print(f"group: {name}, num of samples: {n_samples}, num of unique sentences: {n_sentences}")

group: ('test', 'YAC'), num of samples: 44, num of unique sentences: 44
group: ('test', 'YAG'), num of samples: 56, num of unique sentences: 56
group: ('test', 'YAK'), num of samples: 62, num of unique sentences: 62
group: ('test', 'YDG'), num of samples: 73, num of unique sentences: 73
group: ('test', 'YDR'), num of samples: 72, num of unique sentences: 72
group: ('test', 'YFR'), num of samples: 34, num of unique sentences: 34
group: ('test', 'YFS'), num of samples: 71, num of unique sentences: 71
group: ('test', 'YHS'), num of samples: 73, num of unique sentences: 73
group: ('test', 'YIS'), num of samples: 72, num of unique sentences: 72
group: ('test', 'YLS'), num of samples: 63, num of unique sentences: 63
group: ('test', 'YMD'), num of samples: 63, num of unique sentences: 63
group: ('test', 'YMS'), num of samples: 65, num of unique sentences: 65
group: ('test', 'YRH'), num of samples: 55, num of unique sentences: 55
group: ('test', 'YRK'), num of samples: 51, num of unique senten

In [10]:
# Per (phase, subject, task): number of samples and number of distinct text uids.
for name, group in df_merged.groupby(['phase', 'subject', 'task']):
    n_samples = group.shape[0]
    n_sentences = group['text uid'].nunique()
    print(f"group: {name}, num of samples: {n_samples}, num of unique sentences: {n_sentences}")

group: ('test', 'YAC', 'task2'), num of samples: 21, num of unique sentences: 21
group: ('test', 'YAC', 'task3'), num of samples: 23, num of unique sentences: 23
group: ('test', 'YAG', 'task2'), num of samples: 29, num of unique sentences: 29
group: ('test', 'YAG', 'task3'), num of samples: 27, num of unique sentences: 27
group: ('test', 'YAK', 'task2'), num of samples: 27, num of unique sentences: 27
group: ('test', 'YAK', 'task3'), num of samples: 35, num of unique sentences: 35
group: ('test', 'YDG', 'task2'), num of samples: 35, num of unique sentences: 35
group: ('test', 'YDG', 'task3'), num of samples: 38, num of unique sentences: 38
group: ('test', 'YDR', 'task2'), num of samples: 35, num of unique sentences: 35
group: ('test', 'YDR', 'task3'), num of samples: 37, num of unique sentences: 37
group: ('test', 'YFR', 'task2'), num of samples: 22, num of unique sentences: 22
group: ('test', 'YFR', 'task3'), num of samples: 12, num of unique sentences: 12
group: ('test', 'YFS', 'task