IMPORTANT: The ordering of samples in this version of the data, which was [provided by the organizers](https://github.com/rewire-online/edos) after the competition, doesn't match the ordering used during the competition.

In [1]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Root folder of the EDOS data; raw inputs are read from `raw/` below it and
# every derived CSV is written to `processed/`.
DATASET_FOLDER = Path('../edos_data')

# Create the output folder up front (no error if it is already there).
(DATASET_FOLDER / 'processed').mkdir(exist_ok=True)

In [None]:
# Load the labelled, aggregated EDOS file and preview the first rows.
labelled_csv = DATASET_FOLDER / 'raw' / 'edos_labelled_aggregated.csv'
df = pd.read_csv(labelled_csv)
df.head()

In [4]:
# Export the dev and test splits. Task A gets every row of the split;
# the task B and task C files both receive only the rows labelled 'sexist'.
processed_dir = DATASET_FOLDER / 'processed'

for part in ('dev', 'test'):
    part_df = df.loc[df['split'] == part]
    part_df.to_csv(processed_dir / f'{part}_task_a_entries.csv', index=False)

    sexist_df = part_df.loc[part_df['label_sexist'] == 'sexist']
    sexist_df.to_csv(processed_dir / f'{part}_task_b_entries.csv', index=False)
    sexist_df.to_csv(processed_dir / f'{part}_task_c_entries.csv', index=False)

In [5]:
# From here on `df` holds only the rows of the 'train' split.
df = df[df['split'] == 'train']

In [6]:
# Number of training rows per `label_category` value, shown as a plain dict.
category_counts = df['label_category'].value_counts()
category_counts.to_dict()

{'none': 10602,
 '2. derogation': 1590,
 '3. animosity': 1165,
 '4. prejudiced discussions': 333,
 '1. threats, plans to harm and incitement': 310}

In [7]:
# Keep the organizers' identifier under `orig_id` and add the current row index
# as a new leading `id` column.
df = df.rename(columns={'rewire_id': 'orig_id'})
df.insert(loc=0, column='id', value=df.index)

In [8]:
# Shuffled 80/20 train/validation split of the labelled training data,
# seeded so that the split is reproducible.
df_train, df_val = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)
len(df_train), len(df_val)

(11200, 2800)

In [9]:
# Write the labelled train and validation splits to the processed folder.
processed_dir = DATASET_FOLDER / 'processed'
labelled_outputs = [
    ('edos_2023_train.csv', df_train),
    ('edos_2023_val.csv', df_val),
]
for file_name, split_df in labelled_outputs:
    split_df.to_csv(processed_dir / file_name, index=False)

In [None]:
# Load both unlabelled corpora and give each a leading `id` column taken from
# its row index, then preview the Reddit frame.
raw_dir = DATASET_FOLDER / 'raw'
df_raw_reddit = pd.read_csv(raw_dir / 'reddit_1M_unlabelled.csv')
df_raw_gab = pd.read_csv(raw_dir / 'gab_1M_unlabelled.csv')

for unlabelled_df in (df_raw_reddit, df_raw_gab):
    unlabelled_df.insert(0, 'id', unlabelled_df.index)

df_raw_reddit.head()

In [12]:
# Distinct texts of the validation split, used to filter the unlabelled corpora.
edos_val_texts = {text for text in df_val['text']}
len(edos_val_texts)

2800

In [13]:
# Drop from both unlabelled corpora every row whose text also occurs in the
# EDOS validation split. Only validation texts are filtered in this cell.
reddit_in_val = df_raw_reddit['text'].isin(edos_val_texts)
gab_in_val = df_raw_gab['text'].isin(edos_val_texts)

df_raw_reddit = df_raw_reddit.loc[~reddit_in_val]
df_raw_gab = df_raw_gab.loc[~gab_in_val]
len(df_raw_reddit), len(df_raw_gab)

(998626, 998576)

In [14]:
# Persist the filtered unlabelled corpora. Each frame is written under the file
# name of its own source, so that `reddit_1M_unlabelled.csv` holds the Reddit rows
# and `gab_1M_unlabelled.csv` holds the Gab rows.
df_raw_reddit.to_csv(DATASET_FOLDER / 'processed' / 'reddit_1M_unlabelled.csv', index=False)
df_raw_gab.to_csv(DATASET_FOLDER / 'processed' / 'gab_1M_unlabelled.csv', index=False)

In [15]:
# Shuffled, seeded 80/20 split of the filtered Reddit corpus.
df_raw_reddit_train, df_raw_reddit_val = train_test_split(
    df_raw_reddit,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)
len(df_raw_reddit_train), len(df_raw_reddit_val)

(798900, 199726)

In [16]:
# Shuffled, seeded 80/20 split of the filtered Gab corpus.
df_raw_gab_train, df_raw_gab_val = train_test_split(
    df_raw_gab,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)
len(df_raw_gab_train), len(df_raw_gab_val)

(798860, 199716)

In [40]:
# Write the train/validation splits of both unlabelled corpora, in the order
# reddit-train, reddit-val, gab-train, gab-val.
processed_dir = DATASET_FOLDER / 'processed'
unlabelled_outputs = {
    '1M_unlabelled_reddit_train.csv': df_raw_reddit_train,
    '1M_unlabelled_reddit_val.csv': df_raw_reddit_val,
    '1M_unlabelled_gab_train.csv': df_raw_gab_train,
    '1M_unlabelled_gab_val.csv': df_raw_gab_val,
}
for file_name, split_df in unlabelled_outputs.items():
    split_df.to_csv(processed_dir / file_name, index=False)

---

In [17]:
# For each label column, print the per-split counts followed by the per-split
# proportions, to compare the full training data with its train/val split.
labelled_splits = {'all': df, 'train': df_train, 'val': df_val}

for col in ('label_sexist', 'label_category', 'label_vector'):
    t = pd.DataFrame()

    # The first assignment ('all') fixes the row index; the other splits are aligned to it.
    for split_name, frame in labelled_splits.items():
        t[split_name] = frame[col].value_counts()

    column_totals = t.sum(axis=0)
    print(t)
    print(t / column_totals)

              all  train   val
not sexist  10602   8477  2125
sexist       3398   2723   675
                 all     train       val
not sexist  0.757286  0.756875  0.758929
sexist      0.242714  0.243125  0.241071
                                            all  train   val
none                                      10602   8477  2125
2. derogation                              1590   1287   303
3. animosity                               1165    923   242
4. prejudiced discussions                   333    263    70
1. threats, plans to harm and incitement    310    250    60
                                               all     train       val
none                                      0.757286  0.756875  0.758929
2. derogation                             0.113571  0.114911  0.108214
3. animosity                              0.083214  0.082411  0.086429
4. prejudiced discussions                 0.023786  0.023482  0.025000
1. threats, plans to harm and incitement  0.022143  0.022321  0