In [2]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Root folder of the EDOS data, given relative to the notebook's working directory.
DATASET_FOLDER = Path('..') / 'edos_data'
# Display the absolute form so the reader can check where files are read/written.
DATASET_FOLDER.absolute()

In [None]:
# Load the labelled training data (all tasks) from the raw data folder.
train_all_tasks_csv = DATASET_FOLDER / 'raw' / 'train_all_tasks.csv'
df = pd.read_csv(train_all_tasks_csv)
df.head()

In [16]:
# Number of rows per `label_category` value, shown as a plain dict.
label_category_counts = df['label_category'].value_counts()
label_category_counts.to_dict()

{'none': 10602,
 '2. derogation': 1590,
 '3. animosity': 1165,
 '4. prejudiced discussions': 333,
 '1. threats, plans to harm and incitement': 310}

In [17]:
# Keep the dataset's original identifier under `orig_id` and expose the
# frame's current index values as a leading `id` column.
df = df.rename(columns={'rewire_id': 'orig_id'})
# DataFrame.insert raises ValueError when the column already exists, and `df`
# is rebound in place, so guard the insert to keep this cell safe to re-run.
if 'id' not in df.columns:
    df.insert(0, 'id', df.index)

In [18]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# shuffled split reproducible across runs.
df_train, df_val = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
(len(df_train), len(df_val))

(11200, 2800)

In [19]:
# Write both labelled splits to the processed folder, without the pandas index.
edos_processed_dir = DATASET_FOLDER / 'processed'
for edos_split, edos_file_name in [
    (df_train, 'edos_2023_train.csv'),
    (df_val, 'edos_2023_val.csv'),
]:
    edos_split.to_csv(edos_processed_dir / edos_file_name, index=False)

In [None]:
# Load the two unlabelled corpora and give each a leading `id` column that
# mirrors its index as read from disk.
raw_dir = DATASET_FOLDER / 'raw'

df_raw_reddit = pd.read_csv(raw_dir / 'reddit_1M_unlabelled.csv')
df_raw_reddit.insert(0, 'id', df_raw_reddit.index)

df_raw_gab = pd.read_csv(raw_dir / 'gab_1M_unlabelled.csv')
df_raw_gab.insert(0, 'id', df_raw_gab.index)

df_raw_reddit.head()

In [26]:
# Distinct texts of the labelled validation split.
edos_val_texts = {text for text in df_val['text']}
len(edos_val_texts)

2800

In [35]:
# Drop every unlabelled row whose text also appears in the EDOS validation set
# (exact string match), so the validation texts do not leak into these corpora.
reddit_in_val = df_raw_reddit['text'].isin(edos_val_texts)
gab_in_val = df_raw_gab['text'].isin(edos_val_texts)

df_raw_reddit = df_raw_reddit.loc[~reddit_in_val]
df_raw_gab = df_raw_gab.loc[~gab_in_val]

(len(df_raw_reddit), len(df_raw_gab))

(998603, 998596)

In [39]:
# Persist the filtered corpora, each under the file name of its own source.
# NOTE: an earlier revision wrote the reddit frame to the gab file and vice
# versa; regenerate any processed files produced by that revision.
df_raw_reddit.to_csv(DATASET_FOLDER / 'processed' / 'reddit_1M_unlabelled.csv', index=False)
df_raw_gab.to_csv(DATASET_FOLDER / 'processed' / 'gab_1M_unlabelled.csv', index=False)

In [36]:
# 80/20 shuffled split of the filtered reddit corpus with a fixed seed.
df_raw_reddit_train, df_raw_reddit_val = train_test_split(
    df_raw_reddit,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
(len(df_raw_reddit_train), len(df_raw_reddit_val))

(798882, 199721)

In [37]:
# 80/20 shuffled split of the filtered gab corpus with a fixed seed.
df_raw_gab_train, df_raw_gab_val = train_test_split(
    df_raw_gab,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
(len(df_raw_gab_train), len(df_raw_gab_val))

(798876, 199720)

In [40]:
# Write the four unlabelled splits to the processed folder, without the pandas
# index. Dict order fixes the write order: reddit train/val, then gab train/val.
unlabelled_splits = {
    '1M_unlabelled_reddit_train.csv': df_raw_reddit_train,
    '1M_unlabelled_reddit_val.csv': df_raw_reddit_val,
    '1M_unlabelled_gab_train.csv': df_raw_gab_train,
    '1M_unlabelled_gab_val.csv': df_raw_gab_val,
}
for unlabelled_file_name, unlabelled_split in unlabelled_splits.items():
    unlabelled_split.to_csv(DATASET_FOLDER / 'processed' / unlabelled_file_name, index=False)

---

In [34]:
# For each label column, print the absolute value counts per split followed by
# the same table with every split's column normalised to fractions of its total.
label_columns = ['label_sexist', 'label_category', 'label_vector']
labelled_splits = [('all', df), ('train', df_train), ('val', df_val)]

for col in label_columns:
    counts = pd.DataFrame()

    # Assign column by column so later splits are aligned to the rows
    # introduced by the first assignment.
    for split_name, split_frame in labelled_splits:
        counts[split_name] = split_frame[col].value_counts()

    print(counts)
    print(counts / counts.sum())

              all  train   val
not sexist  10602   8506  2096
sexist       3398   2694   704
                 all     train       val
not sexist  0.757286  0.759464  0.748571
sexist      0.242714  0.240536  0.251429
                                            all  train   val
none                                      10602   8506  2096
2. derogation                              1590   1288   302
3. animosity                               1165    896   269
4. prejudiced discussions                   333    260    73
1. threats, plans to harm and incitement    310    250    60
                                               all     train       val
none                                      0.757286  0.759464  0.748571
2. derogation                             0.113571  0.115000  0.107857
3. animosity                              0.083214  0.080000  0.096071
4. prejudiced discussions                 0.023786  0.023214  0.026071
1. threats, plans to harm and incitement  0.022143  0.022321  0