In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Hate speech datasets

* olid + offenseval
* hateval
* haspeede2018 + 2020
* measuring (Measuring Hate Speech)
* unintended (Jigsaw Unintended Bias, CivilComments)

In [2]:
HS_DIRNAME = os.path.join('..', 'multitask_data', 'raw', 'hate_speech')
FORMATTED_DIR = os.path.join('..', 'multitask_data', 'formatted', 'hate_speech')
os.makedirs(FORMATTED_DIR, exist_ok=True)

## OffensEval

In [25]:
OLID_DIRNAME = os.path.join(HS_DIRNAME, 'OLIDv1.0')

In [38]:
olid_train_df = pd.read_csv(os.path.join(OLID_DIRNAME, 'olid-training-v1.0.tsv'), sep='\t')

In [39]:
# OLID test set file for level A; the gold labels are read from separate per-level files.
olid_test_df = pd.read_csv(os.path.join(OLID_DIRNAME, 'testset-levela.tsv'), sep='\t')

# Attach the labels of every level (a, b, c) as a new `subtask_<level>` column.
for label in ['a', 'b', 'c']:
    # Only one column name is supplied; presumably each label file has two columns
    # (tweet id, label), so that the id column becomes the index — TODO confirm.
    labels_df = pd.read_csv(
        os.path.join(OLID_DIRNAME, f'labels-level{label}.csv'), 
        names=[f'subtask_{label}'], header=None
    )
    # `on='id'` matches the `id` column of the test frame against the index of `labels_df`;
    # the left join keeps every test row, with NaN where no label is found.
    olid_test_df = olid_test_df.join(labels_df, on='id', how='left')

In [40]:
offenseval = pd.concat((olid_train_df, olid_test_df), ignore_index=True)

In [41]:
# Merge subtasks B and C into one column: rows labelled 'TIN' in subtask B take their
# subtask C value instead (the assignment aligns on the row index); other rows are kept.
offenseval.loc[offenseval['subtask_b'] == 'TIN', 'subtask_b'] = offenseval.subtask_c
# Subtask C is now redundant.
offenseval.drop(('subtask_c'), axis=1, inplace=True)
# Rename to the column names used by the formatted datasets (`orig_id`, `text`, ...).
offenseval.rename(columns={
    'id': 'orig_id', 
    'tweet': 'text',
    'subtask_a': 'is_offense',
    'subtask_b': 'target'
}, inplace=True)

In [42]:
# Encode the offense label as an integer, matching the raw label case-insensitively.
for raw_label, encoded in (('off', 1), ('not', 0)):
    has_raw_label = offenseval.is_offense.str.lower() == raw_label
    offenseval.loc[has_raw_label, 'is_offense'] = encoded

In [43]:
offenseval['dataset'] = 'olid'
offenseval['language'] = 'en'

In [44]:
offenseval.shape

(14100, 6)

In [None]:
offenseval

In [47]:
offenseval.to_csv(os.path.join(FORMATTED_DIR, 'olid_and_offenseval.csv'))

## HatEval

In [60]:
HATEVAL_DIR = os.path.join(HS_DIRNAME, 'hateval2019')

In [61]:
# Read every language/split file and tag its rows with the dataset name and language.
hatevals = [
    pd.read_csv(
        os.path.join(HATEVAL_DIR, f'hateval2019_{language}_{part}.csv')
    ).assign(dataset='hateval', language=language)
    for language in ['en', 'es']
    for part in ['train', 'dev', 'test']
]

# Single frame with a fresh integer index.
hateval = pd.concat(hatevals, ignore_index=True)

In [62]:
hateval.drop(['AG'], axis=1, inplace=True)

In [63]:
# Recode the numeric `TR` flag into a target label for rows with HS == 1:
# TR == 0 -> 'GRP', TR == 1 -> 'IND'. Rows with HS == 0 get NaN (no target).
hateval.loc[(hateval.HS == 1) & (hateval.TR == 0), 'TR'] = 'GRP'
hateval.loc[(hateval.HS == 1) & (hateval.TR == 1), 'TR'] = 'IND' 
hateval.loc[hateval.HS == 0, 'TR'] = np.nan

In [64]:
hateval.rename(columns={
    'id': 'orig_id', 
    'HS': 'is_hate',
    'TR': 'target'
}, inplace=True)

In [65]:
hateval.shape

(19600, 6)

In [None]:
hateval

In [30]:
hateval.to_csv(os.path.join(FORMATTED_DIR, 'hateval.csv'))

## HaSpeeDe

### 2018

In [29]:
# Taking only FB from 2018, Twitter was included in 2020
HASPEEDE2018_DIR = os.path.join(HS_DIRNAME, 'HaSpeeDe_2018', 'FB-folder')

In [30]:
# The FB files carry no header row, so the column names are supplied explicitly.
haspeede2018s = [
    pd.read_csv(
        os.path.join(HASPEEDE2018_DIR, f'FB-{part}', f'haspeede_FB-{part}.tsv'),
        sep='\t', header=None, names=['id', 'text', 'is_hate']
    )
    for part in ['train', 'reference']
]

# Train and reference parts in one frame with a fresh integer index.
haspeede2018 = pd.concat(haspeede2018s, ignore_index=True)

In [31]:
haspeede2018['dataset'] = 'haspeede2018'
haspeede2018['language'] = 'it'
haspeede2018.rename(columns={'id': 'orig_id'}, inplace=True) 

In [32]:
haspeede2018.shape

(4000, 5)

In [None]:
haspeede2018

### 2020

In [33]:
HASPEEDE2020_DIR = os.path.join(HS_DIRNAME, 'HaSpeeDe_2020')

In [34]:
# The dev file comes first; it is read with its own header row.
haspeede2020_parts = [
    pd.read_csv(
        os.path.join(HASPEEDE2020_DIR, f'haspeede2_dev', f'haspeede2_dev_taskAB.tsv'),
        sep='\t'
    )
]

# The reference files are read without a header and borrow the dev column names.
for source in ['news', 'tweets']:
    haspeede2020_parts.append(
        pd.read_csv(
            os.path.join(HASPEEDE2020_DIR, 'haspeede2_reference', f'haspeede2_reference_taskAB-{source}.tsv'),
            sep='\t', header=None, names=haspeede2020_parts[0].columns
        )
    )

# Dev, news and tweets rows in that order, with a fresh integer index.
haspeede2020 = pd.concat(haspeede2020_parts, ignore_index=True)

In [35]:
# The stereotype annotation is not used in the formatted data.
haspeede2020.drop(['stereotype'], axis=1, inplace=True)
# NB: the text column is addressed as 'text ' (with a trailing space) — presumably that
# is how it is spelled in the dev file header; confirm before "fixing" the key.
haspeede2020.rename(columns={'id': 'orig_id', 'hs': 'is_hate', 'text ': 'text'}, inplace=True) 
haspeede2020['dataset'] = 'haspeede2020'
haspeede2020['language'] = 'it'

In [36]:
haspeede2020.shape

(8600, 5)

In [None]:
haspeede2020

### Concatenation

In [37]:
haspeede = pd.concat((haspeede2018, haspeede2020), ignore_index=True)

In [38]:
haspeede.shape

(12600, 5)

In [43]:
haspeede.to_csv(os.path.join(FORMATTED_DIR, 'haspeede.csv'))

## Measuring hate speech

In [3]:
HATE_THRESHOLD = 0.5

In [4]:
measuring_df_raw = pd.read_csv(os.path.join(HS_DIRNAME, 'measuring_hate_speech', 'data.csv'))

In [5]:
unique_comments = measuring_df_raw.groupby('comment_id', sort=False)

In [6]:
# Per-comment aggregates: text and score come from the first row of each comment,
# the target flags are averaged over all rows of the comment.
new_cols = [
    unique_comments[col].first()
    for col in ('text', 'hate_speech_score')
]
new_cols += [
    unique_comments[col].mean()
    for col in (
        'target_race', 'target_religion', 'target_origin', 'target_gender',
        'target_sexuality', 'target_age', 'target_disability',
    )
]

In [7]:
measuring = pd.concat(new_cols, axis=1)

In [8]:
measuring.hate_speech_score = np.where(measuring.hate_speech_score > HATE_THRESHOLD, 1, 0)

In [9]:
measuring.reset_index(inplace=True)

In [10]:
# Unify the id/label names and strip the `target_` prefix from the group columns.
prefix = 'target_'
cols_to_rename = {'hate_speech_score': 'is_hate', 'comment_id': 'orig_id'}
for col in measuring.columns:
    if col.startswith(prefix):
        cols_to_rename[col] = col[len(prefix):]

measuring.rename(columns=cols_to_rename, inplace=True)

In [11]:
measuring['dataset'] = 'measuring'
measuring['language'] = 'en'

In [12]:
measuring.drop(['origin', 'age'], axis=1, inplace=True)  # no annotation for this in `unintended`

In [50]:
measuring.shape

(39565, 10)

In [None]:
measuring

In [56]:
measuring.to_csv(os.path.join(FORMATTED_DIR, 'measuring.csv'))

## Jigsaw unintended bias (CivilComments)

In [14]:
HATE_THRESHOLD = 0.5

In [15]:
UNINTENDED_PATH = os.path.join(HS_DIRNAME, 'Jigsaw_unintended_bias_civil_comments')

In [16]:
# ~700 Mb of RAM

# Read the CSV in chunks of 5000 rows instead of loading it in one go.
unintended_reader = pd.read_csv(
    os.path.join(UNINTENDED_PATH, 'all_data.csv'),
    chunksize=5000
)

# Keep only rows with a non-zero identity annotator count; chunks without any are skipped.
annotated_chunks = (
    chunk.loc[chunk.identity_annotator_count != 0]
    for chunk in unintended_reader
)
relevant_rows = [rel_chunk for rel_chunk in annotated_chunks if rel_chunk.shape[0]]

unintended_df = pd.concat(relevant_rows, ignore_index=True)

In [17]:
TARGET_COLS = {
    'gender': ['male', 'female', 'transgender', 'other_gender'],
    'sexuality': ['heterosexual', 'homosexual_gay_or_lesbian', 'bisexual', 'other_sexual_orientation'],
    'religion': ['christian', 'jewish', 'muslim', 'hindu', 'buddhist', 'atheist', 'other_religion'],
    'race': ['black', 'white', 'asian', 'latino', 'other_race_or_ethnicity'],
    'disability': [
        'physical_disability', 'intellectual_or_learning_disability', 
        'psychiatric_or_mental_illness', 'other_disability'
    ]
}

COLS_TO_RENAME = {
    'id': 'orig_id',
    'comment_text': 'text',
    'toxicity': 'is_toxicity'
}

In [18]:
# Columns that survive: every raw identity column plus the ones that get renamed.
cols_to_keep = []
for identity_cols in TARGET_COLS.values():
    cols_to_keep.extend(identity_cols)
cols_to_keep.extend(COLS_TO_RENAME.keys())

# Everything else is removed from the frame in place.
cols_to_drop = [col for col in unintended_df.columns if col not in cols_to_keep]
unintended_df.drop(columns=cols_to_drop, inplace=True)

In [19]:
# Collapse each family of identity columns into one column holding the row-wise
# maximum, then remove the raw columns of that family.
for res_colname, raw_cols in TARGET_COLS.items():
    unintended_df[res_colname] = unintended_df[raw_cols].max(axis=1)
    unintended_df.drop(columns=raw_cols, inplace=True)

In [20]:
# Binarise the toxicity score by rounding to the nearest integer; the column stays
# float (0.0 / 1.0), unlike the integer labels of the other datasets.
# NOTE(review): HATE_THRESHOLD is defined for this section but not used here — confirm
# that rounding is meant to stand in for that threshold.
unintended_df.toxicity = np.round(unintended_df.toxicity)
# Rename to the shared column names (`toxicity` becomes `is_toxicity`).
unintended_df.rename(columns=COLS_TO_RENAME, inplace=True)

In [21]:
unintended_df['dataset'] = 'unintended'
unintended_df['language'] = 'en'

In [60]:
unintended_df.shape

(448000, 10)

In [None]:
unintended_df

In [171]:
unintended_df.to_csv(os.path.join(FORMATTED_DIR, 'unintended_bias.csv'))

# Sexism datasets

* evalita 2018
* evalita 2020
* call_me_sexist
* exist
* online_misogyny
* edos

In [67]:
SEXISM_DIRNAME = os.path.join('..', 'multitask_data', 'raw', 'misogyny')
FORMATTED_DIR = os.path.join('..', 'multitask_data', 'formatted', 'misogyny')
os.makedirs(FORMATTED_DIR, exist_ok=True)

## AMI@EVALITA

In [68]:
EVALITA_DIR = os.path.join(SEXISM_DIRNAME, 'AMI_EVALITA')

### 2018

In [69]:
# One frame per language/split file, each tagged with its language.
evalita2018_dfs = [
    pd.read_csv(
        os.path.join(EVALITA_DIR, 'AMI2018_ELG', f'{lang}_{part}_anon.tsv'), sep='\t'
    ).assign(language=lang)
    for lang in ['en', 'it']
    for part in ['training', 'testing_labeled']
]

# All four files together, with a fresh integer index.
evalita2018 = pd.concat(evalita2018_dfs, ignore_index=True)

In [70]:
evalita2018['dataset'] = 'evalita2018'
evalita2018.rename(
    columns={'id': 'orig_id', 'misogynous': 'is_sexism', 'misogyny_category': 'sexism_category'},
    inplace=True
)

In [71]:
# Recode the target labels in accordance with the other HS datasets:
# '0' -> NaN, 'active' -> 'IND', 'passive' / 'pass' -> 'GRP'.
evalita2018.loc[evalita2018.target == '0', 'target'] = np.nan
evalita2018.loc[evalita2018.target == 'active', 'target'] = 'IND'
evalita2018.loc[evalita2018.target.isin(['passive', 'pass']), 'target'] = 'GRP'

In [72]:
evalita2018.loc[evalita2018.sexism_category == '0', 'sexism_category'] = 0

In [73]:
evalita2018.shape

(10000, 7)

In [None]:
evalita2018

In [76]:
evalita2018.to_csv(os.path.join(FORMATTED_DIR, 'evalita2018.csv'))

### 2020

In [71]:
# taking only test because train is from 2018

evalita2020 = pd.read_csv(
    os.path.join(EVALITA_DIR, 'AMI2020', 'testset', 'AMI2020_test_raw_gold_anon.tsv'),
    sep='\t'
)

In [72]:
evalita2020.rename(columns={'id': 'orig_id', 'misogynous': 'is_sexism'}, inplace=True)
evalita2020.drop(['aggressiveness'], axis=1, inplace=True)
evalita2020['dataset'] = 'evalita2020'
evalita2020['language'] = 'it'

In [73]:
evalita2020.shape

(1000, 5)

In [None]:
evalita2020

In [81]:
evalita2020.to_csv(os.path.join(FORMATTED_DIR, 'evalita2020.csv'))

## Call me sexist

In [75]:
CONTENT_CATEGORIES = {
    1: 'behavioral expectations',
    2: 'stereotypes and comparisons',
    3: 'endorsement of inequality',
    4: 'denying inequality and rejecting feminism',
    5: 0,
    6: 0
}

In [76]:
call_me_sexist_data = pd.read_csv(os.path.join(SEXISM_DIRNAME, 'call_me_sexist', 'call_me_sexist_data.csv'))
call_me_sexist_data = call_me_sexist_data.loc[call_me_sexist_data.of_id == -1]  # removing adversarial examples
call_me_sexist_data = call_me_sexist_data.drop(['of_id', 'toxicity', 'dataset'], axis=1)

In [77]:
call_me_sexist_ann = pd.read_csv(os.path.join(SEXISM_DIRNAME, 'call_me_sexist', 'call_me_sexist_annotations.csv'))

# not in the final dataset
dropped_ids = call_me_sexist_ann.loc[~call_me_sexist_ann.id.isin(call_me_sexist_data.id)].index
call_me_sexist_ann.drop(dropped_ids, axis=0, inplace=True)

In [78]:
grouped = call_me_sexist_ann.groupby(['id', 'content']).count()

In [79]:
# for retrieving maximum everywhere: 
# nice when there are <4 workers
# not nice when the workers disagree, e.g. 2 votes for two options

# flat_df = pd.DataFrame.from_records(grouped.index)
# flat_df['worker'] = grouped.worker.tolist()
# max_content = flat_df.loc[flat_df.groupby(0).worker.idxmax()]

In [80]:
# for absolute maximum, no disagreement, 
# but sents with fewer annotators are lost
max_content = grouped.loc[grouped.worker >= 3]

In [81]:
# messages with no absolute maximum or with few annotators
len(set(grouped.index.get_level_values('id')) - set(max_content.index.get_level_values('id')))

745

In [82]:
max_content_ids = max_content.index.get_level_values('id')
max_content_vals = max_content.index.get_level_values('content').tolist()

# Majority label per message id. The label is looked up by id instead of assigning the
# list positionally: positional assignment is only correct while the rows of
# `call_me_sexist_data` happen to be in the same (sorted-by-id) order as the grouped
# annotations, and it raises if the two lengths differ.
category_by_id = pd.Series(max_content_vals, index=max_content_ids)
# An id could in principle keep several labels (e.g. 3 + 3 votes from six workers);
# the first one is kept so that the lookup stays unambiguous.
category_by_id = category_by_id[~category_by_id.index.duplicated(keep='first')]

# NB: if absolute max is used, there is no annotation for some sexist messages
call_me_sexist_data['category'] = 0
has_majority = call_me_sexist_data.id.isin(category_by_id.index)
call_me_sexist_data.loc[has_majority, 'category'] = (
    call_me_sexist_data.loc[has_majority, 'id'].map(category_by_id)
)

In [83]:
# Replace the numeric content codes with the values of CONTENT_CATEGORIES.
# NB: 5s ("not sure") are also disregarded from subclassification: codes 5 and 6 both
# map to 0, the same value that rows without a majority label already carry.
for number, label in CONTENT_CATEGORIES.items():
    call_me_sexist_data.loc[call_me_sexist_data.category == number, 'category'] = label

In [84]:
call_me_sexist_data.rename(
    columns={'id': 'orig_id', 'sexist': 'is_sexism', 'category': 'sexism_category'},
    inplace=True
)
call_me_sexist_data.is_sexism = call_me_sexist_data.is_sexism.astype(int)
call_me_sexist_data['dataset'] = 'call_me_sexist'
call_me_sexist_data['language'] = 'en'

In [85]:
# sexist messages with and without subcategory annotation
with_sub = call_me_sexist_data.loc[
    (call_me_sexist_data.sexism_category != 0) & (call_me_sexist_data.is_sexism)
]
without_sub = call_me_sexist_data.loc[
    (call_me_sexist_data.sexism_category == 0) & (call_me_sexist_data.is_sexism)
]

with_sub.shape[0], without_sub.shape[0]

(1241, 523)

In [86]:
call_me_sexist_data.shape

(11339, 6)

In [None]:
call_me_sexist_data

In [94]:
call_me_sexist_data.to_csv(os.path.join(FORMATTED_DIR, 'call_me_sexist.csv'))

## EXIST

In [89]:
train = pd.read_csv(os.path.join(SEXISM_DIRNAME, 'EXIST2021', 'training', 'EXIST2021_training.tsv'), sep='\t')
test = pd.read_csv(os.path.join(SEXISM_DIRNAME, 'EXIST2021', 'test', 'EXIST2021_test_labeled.tsv'), sep='\t')
exist = pd.concat([train, test], ignore_index=True)

In [90]:
exist.drop(['source', 'test_case'], axis=1, inplace=True)
exist.rename(columns={'id': 'orig_id', 'task1': 'is_sexism', 'task2': 'sexism_category'}, inplace=True)
exist['dataset'] = 'exist'

In [91]:
# Encode the binary label: 'sexist' -> 1, 'non-sexist' -> 0.
exist.loc[exist.is_sexism == 'sexist', 'is_sexism'] = 1
exist.loc[exist.is_sexism == 'non-sexist', 'is_sexism'] = 0
# Relies on the recoding above: rows now labelled 0 get 0 as their category as well.
exist.loc[exist.is_sexism == 0, 'sexism_category'] = 0

In [93]:
exist.shape

(11345, 6)

In [None]:
exist

In [99]:
exist.to_csv(os.path.join(FORMATTED_DIR, 'exist.csv'))

## Annotating online misogyny

In [128]:
# NB: encoding problems from authors' side, apostrophes (U+2019) are lost in some texts

online_misogyny = pd.read_csv(
    os.path.join(SEXISM_DIRNAME, 'Guest_et_al_online_misogyny', 'online-misogyny-eacl2021.csv')
)

In [129]:
# Metadata columns that are not part of the formatted dataset.
unused_columns = [
    'link_id', 'parent_id', 'entry_utc', 'subreddit', 'author', 'image', 'label_date',
    'week', 'group', 'sheet_order', 'level_3', 'strength', 'highlight', 'split'
]

# Posts from the `Romania` subreddit are in Romanian, so they are left out.
online_misogyny = (
    online_misogyny
    .loc[online_misogyny.subreddit != 'Romania']
    .drop(unused_columns, axis=1)
)

In [130]:
def get_inconsistent(online_misogyny, level):
    """Return the entry ids whose rows disagree on the `level_<level>` annotation.

    Args:
        online_misogyny: DataFrame with an `entry_id` column and a `level_<level>` column.
        level: suffix of the annotation column to check (e.g. 1 or 2).

    Returns:
        List of `entry_id` values that have more than one distinct value in that
        column (NaN counts as a value of its own), in sorted `entry_id` order.
    """
    n_unique_per_id = (
        online_misogyny
        .groupby('entry_id')[f'level_{level}']
        .nunique(dropna=False)
    )
    # Select with a boolean mask rather than with integer positions: `Series[...]` with
    # integers is a label lookup when the index itself holds integers, so positional
    # indexing fails (or picks the wrong rows) for numeric entry ids.
    return n_unique_per_id[n_unique_per_id > 1].index.tolist()

In [131]:
inconsistent_1 = get_inconsistent(online_misogyny, level=1)
inconsistent_2 = get_inconsistent(online_misogyny, level=2)

In [132]:
# dropping examples with inconsistent `is_sexism` annotation.
online_misogyny.drop(
    online_misogyny.loc[online_misogyny.entry_id.isin(inconsistent_1)].index,
    axis=0, inplace=True
)

In [133]:
# Encode level 1 as 1 ('Misogynistic') / 0 ('Nonmisogynistic').
online_misogyny.loc[online_misogyny.level_1 == 'Misogynistic', 'level_1'] = 1
online_misogyny.loc[online_misogyny.level_1 == 'Nonmisogynistic', 'level_1'] = 0
# 0 in level 2 stands for "no subcategory".
online_misogyny.loc[online_misogyny.level_2 == 'None_of_the_categories', 'level_2'] = 0
online_misogyny.loc[online_misogyny.level_1 == 0, 'level_2'] = 0  # disregard subclasses of non-misogyny

In [134]:
# disregarding examples with inconsistent subclassification for task B
online_misogyny.loc[online_misogyny.entry_id.isin(inconsistent_2), 'level_2'] = 0

In [135]:
online_misogyny.drop_duplicates(subset='entry_id', inplace=True)

In [137]:
online_misogyny.dropna(subset=['body'], axis=0, inplace=True)

In [138]:
# sexist messages without subcategory annotation
online_misogyny.loc[(online_misogyny.level_2 == 0) & (online_misogyny.level_1)].shape

(67, 4)

In [139]:
online_misogyny.rename(
    columns={'entry_id': 'orig_id', 'body': 'text', 'level_1': 'is_sexism', 'level_2': 'sexism_category'},
    inplace=True
)
online_misogyny['dataset'] = 'online_misogyny'
online_misogyny['language'] = 'en'

In [140]:
online_misogyny.shape

(6355, 6)

In [None]:
online_misogyny

In [141]:
online_misogyny.to_csv(os.path.join(FORMATTED_DIR, 'online_misogyny.csv'))

## EDOS

In [142]:
EDOS_DIR = os.path.join('..', 'edos_data', 'processed')

In [143]:
def process_edos_part(part_name):
    """Format one EDOS split, save it to `FORMATTED_DIR` and return it.

    Reads `edos_2023_<part_name>.csv` from `EDOS_DIR`, renames the columns, recodes
    the labels ('not sexist' and 'none' become 0, 'sexist' becomes 1), adds the
    `dataset` and `language` columns, prints the shape of the result and writes it
    to `edos_<part_name>.csv`.
    """
    source_path = os.path.join(EDOS_DIR, f'edos_2023_{part_name}.csv')
    target_path = os.path.join(FORMATTED_DIR, f'edos_{part_name}.csv')

    column_names = {
        'rewire_id': 'orig_id',
        'label_sexist': 'is_sexism',
        'label_category': 'sexism_category',
        'label_vector': 'sexism_subcategory'
    }
    # (column, raw label, encoded value), applied in this order
    recodings = [
        ('is_sexism', 'not sexist', 0),
        ('is_sexism', 'sexist', 1),
        ('sexism_category', 'none', 0),
        ('sexism_subcategory', 'none', 0),
    ]

    edos_part = pd.read_csv(source_path)
    edos_part.rename(columns=column_names, inplace=True)

    for column, raw_label, encoded in recodings:
        edos_part.loc[edos_part[column] == raw_label, column] = encoded

    edos_part['dataset'] = 'edos'
    edos_part['language'] = 'en'

    print(edos_part.shape)

    edos_part.to_csv(target_path)
    return edos_part

In [144]:
edos_train = process_edos_part('train')
edos_val = process_edos_part('val')

(11200, 7)
(2800, 7)


In [None]:
edos_train

# Task datasets

In [75]:
# Datasets that feed each task. The list-valued entries are merged by `compile_df`;
# 'sexism_categories' is a dict (label -> dataset) because every dataset there is
# processed and saved on its own, under its label.
DATASETS_BY_TASK = {
    'binary_hate': [hateval, haspeede, measuring],
    'binary_offense': [offenseval],
    'binary_toxicity': [unintended_df],
    'target': [offenseval, hateval, evalita2018.loc[evalita2018.language == 'it']],
                                    # only the Italian part is used: en should be in HatEval
    'target_groups': [measuring, unintended_df],
    'binary_sexism': [evalita2018, evalita2020, call_me_sexist_data, exist, online_misogyny],
    'sexism_categories': {
        'evalita2018': evalita2018, 
        'call_me_sexist': call_me_sexist_data, 
        'exist': exist, 
        'online_misogyny': online_misogyny,
    }
}

In [49]:
GENERAL_REL_COLUMNS = ['dataset', 'language', 'orig_id', 'text']

In [50]:
def compile_df(task, rel_target_columns, max_n=None):
    """Concatenate the datasets of `task`, restricted to the relevant columns.

    Args:
        task: key of `DATASETS_BY_TASK` whose value is iterated as a collection of frames.
        rel_target_columns: list of label columns kept besides `GENERAL_REL_COLUMNS`.
        max_n: optional cap on the number of rows per dataset; a dataset with more
            rows is downsampled to `max_n` rows with `random_state=42`.

    Returns:
        One DataFrame with a fresh integer index.
    """
    rel_columns = GENERAL_REL_COLUMNS + rel_target_columns

    def capped(dataset):
        # Sampling only happens when a cap is given and the dataset exceeds it.
        if max_n is None or dataset.shape[0] <= max_n:
            return dataset
        return dataset.sample(n=max_n, random_state=42)

    return pd.concat(
        [capped(dataset)[rel_columns] for dataset in DATASETS_BY_TASK[task]],
        ignore_index=True
    )

In [76]:
# Output directories for the per-task train/val files.
HS_TASK_DIRNAME = os.path.join('..', 'multitask_data', 'processed', 'hate_speech')
SEXISM_TASK_DIRNAME = os.path.join('..', 'multitask_data', 'processed', 'misogyny')

# Create them up front, as is done for FORMATTED_DIR: `to_csv` does not create missing
# parent directories, so on a fresh checkout the first save would fail otherwise.
for task_dirname in (HS_TASK_DIRNAME, SEXISM_TASK_DIRNAME):
    os.makedirs(task_dirname, exist_ok=True)

## Binary hate

In [159]:
rel_target_columns = ['is_hate']
binary_hate = compile_df('binary_hate', rel_target_columns, max_n=20000)

In [160]:
binary_hate.shape

(52200, 5)

In [None]:
binary_hate

In [161]:
binary_hate = binary_hate.sample(frac=1, random_state=42)
binary_hate.reset_index(inplace=True, drop=True)

In [162]:
binary_hate.is_hate.value_counts()

0    33080
1    19120
Name: is_hate, dtype: int64

In [163]:
binary_hate.dataset.value_counts()

measuring       20000
hateval         19600
haspeede2020     8600
haspeede2018     4000
Name: dataset, dtype: int64

In [164]:
binary_hate.language.value_counts()

en    33000
it    12600
es     6600
Name: language, dtype: int64

In [165]:
# `binary_hate` was already shuffled with a fixed seed, so the split is sequential
# (`shuffle=False`): with test_size=0.2 the leading rows form the train part and the
# trailing rows the validation part.
bh_train, bh_val = train_test_split(binary_hate, test_size=0.2, random_state=42, shuffle=False)
bh_train.to_csv(os.path.join(HS_TASK_DIRNAME, 'hate_binary_train.csv'))
bh_val.to_csv(os.path.join(HS_TASK_DIRNAME, 'hate_binary_val.csv'))

## Binary offense

In [52]:
rel_target_columns = ['is_offense']
binary_offense = compile_df('binary_offense', rel_target_columns)

In [53]:
binary_offense.shape

(14100, 5)

In [None]:
binary_offense

In [55]:
binary_offense = binary_offense.sample(frac=1, random_state=42)
binary_offense.reset_index(inplace=True, drop=True)

In [56]:
binary_offense.is_offense.value_counts()

0    9460
1    4640
Name: is_offense, dtype: int64

In [57]:
binary_offense.dataset.value_counts()

olid    14100
Name: dataset, dtype: int64

In [58]:
binary_offense.language.value_counts()

en    14100
Name: language, dtype: int64

In [59]:
bo_train, bo_val = train_test_split(binary_offense, test_size=0.2, random_state=42, shuffle=False)
bo_train.to_csv(os.path.join(HS_TASK_DIRNAME, 'offense_binary_train.csv'))
bo_val.to_csv(os.path.join(HS_TASK_DIRNAME, 'offense_binary_val.csv'))

## Binary toxicity

In [188]:
rel_target_columns = ['is_toxicity']
binary_toxicity = compile_df('binary_toxicity', rel_target_columns, max_n=40000)

In [189]:
binary_toxicity.shape

(40000, 5)

In [191]:
binary_toxicity

In [192]:
binary_toxicity = binary_toxicity.sample(frac=1, random_state=42)
binary_toxicity.reset_index(inplace=True, drop=True)

In [194]:
binary_toxicity.is_toxicity.value_counts()

0.0    36853
1.0     3147
Name: is_toxicity, dtype: int64

In [195]:
binary_toxicity.dataset.value_counts()

unintended    40000
Name: dataset, dtype: int64

In [196]:
binary_toxicity.language.value_counts()

en    40000
Name: language, dtype: int64

In [198]:
bt_train, bt_val = train_test_split(binary_toxicity, test_size=0.2, random_state=42, shuffle=False)
bt_train.to_csv(os.path.join(HS_TASK_DIRNAME, 'toxicity_binary_train.csv'))
bt_val.to_csv(os.path.join(HS_TASK_DIRNAME, 'toxicity_binary_val.csv'))

## Target

In [77]:
rel_target_columns = ['target']
target_df = compile_df('target', rel_target_columns)

In [78]:
# rm non-hateful messages, for which target is not annotated
target_df.dropna(axis=0, inplace=True)

In [79]:
target_df.shape

(15176, 5)

In [None]:
target_df

In [80]:
target_df = target_df.sample(frac=1, random_state=42)
target_df.reset_index(inplace=True, drop=True)

In [81]:
target_df.target.value_counts()

IND    8449
GRP    5746
UNT     551
OTH     430
Name: target, dtype: int64

In [82]:
target_df.dataset.value_counts()

hateval        8209
olid           4640
evalita2018    2327
Name: dataset, dtype: int64

In [83]:
target_df.language.value_counts()

en    10110
es     2739
it     2327
Name: language, dtype: int64

In [84]:
ht_train, ht_val = train_test_split(target_df, test_size=0.2, random_state=42, shuffle=False)
ht_train.to_csv(os.path.join(HS_TASK_DIRNAME, 'hate_target_train.csv'))
ht_val.to_csv(os.path.join(HS_TASK_DIRNAME, 'hate_target_val.csv'))

## Target groups

In [115]:
rel_target_columns = ['gender'] #, 'sexuality', 'disability', 'religion', 'race']
target_groups = compile_df('target_groups', rel_target_columns, max_n=40000)

In [116]:
target_groups.shape

(79565, 5)

In [None]:
target_groups

In [118]:
target_groups = target_groups.sample(frac=1, random_state=42)
target_groups.reset_index(inplace=True, drop=True)

In [119]:
# n messages without any annotated target
any_target = np.sum(target_groups[rel_target_columns], axis=1)
np.where(any_target == 0)[0].shape

(50707,)

In [120]:
target_groups.dataset.value_counts()

unintended    40000
measuring     39565
Name: dataset, dtype: int64

In [122]:
target_groups.language.value_counts()

en    79565
Name: language, dtype: int64

In [123]:
htg_train, htg_val = train_test_split(target_groups, test_size=0.2, random_state=42, shuffle=False)
htg_train.to_csv(os.path.join(HS_TASK_DIRNAME, 'hate_target_groups_train.csv'))
htg_val.to_csv(os.path.join(HS_TASK_DIRNAME, 'hate_target_groups_val.csv'))

## Binary sexism

In [150]:
rel_target_columns = ['is_sexism']
binary_sexism = compile_df('binary_sexism', rel_target_columns)

In [152]:
binary_sexism.shape

(40039, 5)

In [None]:
binary_sexism

In [153]:
binary_sexism = binary_sexism.sample(frac=1, random_state=42)
binary_sexism.reset_index(inplace=True, drop=True)

In [154]:
binary_sexism.is_sexism.value_counts()

0    27020
1    13019
Name: is_sexism, dtype: int64

In [155]:
binary_sexism.dataset.value_counts()

exist              11345
call_me_sexist     11339
evalita2018        10000
online_misogyny     6355
evalita2020         1000
Name: dataset, dtype: int64

In [156]:
binary_sexism.language.value_counts()

en    28338
it     6000
es     5701
Name: language, dtype: int64

In [157]:
# `binary_sexism` was already shuffled with a fixed seed, hence the sequential split.
bs_train, bs_val = train_test_split(binary_sexism, test_size=0.2, random_state=42, shuffle=False)
# NOTE(review): the sexism files are written to HS_TASK_DIRNAME (the hate speech
# directory), not to SEXISM_TASK_DIRNAME — confirm this is what the readers of these
# files expect before changing it.
bs_train.to_csv(os.path.join(HS_TASK_DIRNAME, 'sexism_binary_train.csv'))
bs_val.to_csv(os.path.join(HS_TASK_DIRNAME, 'sexism_binary_val.csv'))

### EDOS

In [133]:
# EDOS is saved per existing part (train / val) rather than merged and re-split.
# NOTE(review): like the other binary sexism files, these go to HS_TASK_DIRNAME rather
# than SEXISM_TASK_DIRNAME — confirm that this is intended.
edos_train_binary = edos_train[GENERAL_REL_COLUMNS + ['is_sexism']]
edos_train_binary.to_csv(os.path.join(HS_TASK_DIRNAME, 'edos_binary_train.csv'))

edos_val_binary = edos_val[GENERAL_REL_COLUMNS + ['is_sexism']]
edos_val_binary.to_csv(os.path.join(HS_TASK_DIRNAME, 'edos_binary_val.csv'))

# Total number of rows and label counts over both parts.
print(edos_train_binary.shape[0] + edos_val_binary.shape[0])
edos_train_binary.is_sexism.value_counts() + edos_val_binary.is_sexism.value_counts()

14000


0    10602
1     3398
Name: is_sexism, dtype: int64

## Sexism subclassification

In [134]:
def process_sexism_subclassification(label, dataset, task='B', to_split=True, to_shuffle=True):
    """Save the rows of `dataset` that carry a sexism subcategory and report statistics.

    Rows whose `sexism_category` equals 0 are left out; only `GENERAL_REL_COLUMNS`
    and `sexism_category` are kept.

    Args:
        label: dataset label used in the output file names.
        dataset: DataFrame with the general columns and a `sexism_category` column.
        task: task letter used in the output file names.
        to_split: write an 80/20 sequential train/val pair if True, one file otherwise.
        to_shuffle: shuffle the rows (seed 42) and reset the index before saving.

    Returns:
        Dict with the `shape` of the kept rows and the value counts of
        `sexism_category` (`category_counts`) and `language` (`language_counts`).
    """
    sexism_category = 'sexism_category'
    has_category = dataset[sexism_category] != 0
    categories = dataset.loc[has_category, GENERAL_REL_COLUMNS + [sexism_category]]

    if to_shuffle:
        categories = categories.sample(frac=1, random_state=42).reset_index(drop=True)

    if not to_split:
        categories.to_csv(os.path.join(SEXISM_TASK_DIRNAME, f'task{task}_{label}.csv'))
    else:
        # Sequential split: any shuffling has already happened above.
        train, val = train_test_split(categories, test_size=0.2, random_state=42, shuffle=False)
        for part, path in (
            (train, os.path.join(SEXISM_TASK_DIRNAME, f'task{task}_{label}_train.csv')),
            (val, os.path.join(SEXISM_TASK_DIRNAME, f'task{task}_{label}_val.csv')),
        ):
            part.to_csv(path)

    stats = {}
    stats['shape'] = categories.shape
    stats['category_counts'] = categories.sexism_category.value_counts()
    stats['language_counts'] = categories.language.value_counts()
    return stats

In [135]:
for label, dataset in DATASETS_BY_TASK['sexism_categories'].items():
    stats = process_sexism_subclassification(label, dataset)

    print(label)
    print(stats['shape'], '\n')
    print(stats['category_counts'], '\n')
    print(stats['language_counts'])
    print('=' * 20, '\n')

evalita2018
(4582, 5) 

discredit            1893
stereotype           1162
sexual_harassment     994
dominance             404
derailing             129
Name: sexism_category, dtype: int64 

it    2337
en    2245
Name: language, dtype: int64

call_me_sexist
(1241, 5) 

stereotypes and comparisons                  644
behavioral expectations                      441
endorsement of inequality                    100
denying inequality and rejecting feminism     56
Name: sexism_category, dtype: int64 

en    1241
Name: language, dtype: int64

exist
(5658, 5) 

ideological-inequality          1487
stereotyping-dominance          1273
misogyny-non-sexual-violence    1157
sexual-violence                  917
objectification                  824
Name: sexism_category, dtype: int64 

es    2864
en    2794
Name: language, dtype: int64

online_misogyny
(448, 5) 

Derogation                      228
Misogynistic_pejorative         145
Treatment                        60
Misogynistic_personal_atta

### EDOS

In [136]:
# EDOS for task B

# EDOS already comes as separate train and val parts, so each part is saved as is:
# no shuffling and no further splitting.
train_stats = process_sexism_subclassification('edos_train', edos_train, to_split=False, to_shuffle=False)
val_stats = process_sexism_subclassification('edos_val', edos_val, to_split=False, to_shuffle=False)

print(train_stats['shape'][0] + val_stats['shape'][0])
# `add(..., fill_value=0)` instead of `+`: a category missing from one part would
# otherwise show up as NaN rather than with the count from the other part.
train_stats['category_counts'].add(val_stats['category_counts'], fill_value=0)

3398


2. derogation                               1590
3. animosity                                1165
4. prejudiced discussions                    333
1. threats, plans to harm and incitement     310
Name: sexism_category, dtype: int64

In [137]:
# EDOS for task C

# Task C uses the fine-grained labels: the coarse `sexism_category` column is dropped
# and `sexism_subcategory` takes over its name, so the same helper can be reused.
to_rename = {'sexism_subcategory': 'sexism_category'}
edos_train_for_c = edos_train.drop(['sexism_category'], axis=1).rename(columns=to_rename)
edos_val_for_c = edos_val.drop(['sexism_category'], axis=1).rename(columns=to_rename)

train_stats = process_sexism_subclassification(
    'edos_train', edos_train_for_c, task='C', to_split=False, to_shuffle=False
)
val_stats = process_sexism_subclassification(
    'edos_val', edos_val_for_c, task='C', to_split=False, to_shuffle=False
)

print(train_stats['shape'][0] + val_stats['shape'][0])
# `add(..., fill_value=0)` instead of `+`: a rare subcategory that is absent from one
# part would otherwise show up as NaN rather than with the count from the other part.
train_stats['category_counts'].add(val_stats['category_counts'], fill_value=0)

3398


1.1 threats of harm                                                 56
1.2 incitement and encouragement of harm                           254
2.1 descriptive attacks                                            717
2.2 aggressive and emotive attacks                                 673
2.3 dehumanising attacks & overt sexual objectification            200
3.1 casual use of gendered slurs, profanities, and insults         637
3.2 immutable gender differences and gender stereotypes            417
3.3 backhanded gendered compliments                                 64
3.4 condescending explanations or unwelcome advice                  47
4.1 supporting mistreatment of individual women                     75
4.2 supporting systemic discrimination against women as a group    258
Name: sexism_category, dtype: int64