In [1]:
import pandas as pd
from sklearn.utils import resample
# Notebook-level seed value.
# NOTE(review): none of the cells visible in this notebook read `random_state`
# (the sampling cell uses its own seed of 1) — confirm whether that is intended.
random_state = 42

In [2]:
# Absolute location of the JigsawUBiTC `all_data.csv` file on the scratch volume.
path = '/srv/scratch0/jgoldz/CL-UZH-EDOS-2023/data/JigsawUBiTC/all_data.csv'

# Read the whole CSV into memory; every later cell works on this frame.
df = pd.read_csv(filepath_or_buffer=path)

In [3]:
# Show the column names of the freshly loaded frame.
df.columns

Index(['id', 'comment_text', 'split', 'created_date', 'publication_id',
       'parent_id', 'article_id', 'rating', 'funny', 'wow', 'sad', 'likes',
       'disagree', 'toxicity', 'severe_toxicity', 'obscene', 'sexual_explicit',
       'identity_attack', 'insult', 'threat', 'male', 'female', 'transgender',
       'other_gender', 'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual',
       'other_sexual_orientation', 'christian', 'jewish', 'muslim', 'hindu',
       'buddhist', 'atheist', 'other_religion', 'black', 'white', 'asian',
       'latino', 'other_race_or_ethnicity', 'physical_disability',
       'intellectual_or_learning_disability', 'psychiatric_or_mental_illness',
       'other_disability', 'identity_annotator_count',
       'toxicity_annotator_count'],
      dtype='object')

In [4]:
# Score columns that are turned into `<name>_binary` labels further down.
# The order is significant (it fixes the order of the per-attribute frames),
# so entries are only regrouped visually by what the column names suggest.
attributes = [
    # reactions
    'funny', 'wow', 'sad', 'likes', 'disagree',
    # toxicity and its subtypes
    'toxicity', 'severe_toxicity', 'obscene', 'sexual_explicit',
    'identity_attack', 'insult', 'threat',
    # gender
    'male', 'female', 'transgender', 'other_gender',
    # sexual orientation
    'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual',
    'other_sexual_orientation',
    # religion
    'christian', 'jewish', 'muslim', 'hindu', 'buddhist', 'atheist',
    'other_religion',
    # race / ethnicity
    'black', 'white', 'asian', 'latino', 'other_race_or_ethnicity',
    # disability
    'physical_disability', 'intellectual_or_learning_disability',
    'psychiatric_or_mental_illness', 'other_disability',
]

In [5]:
# A score at or above this value counts as a positive (1) label.
threshold = 0.7

# Add one 0/1 column named `<att>_binary` per attribute.
# The comparison is done on the whole column at once instead of calling a
# Python lambda per row; the resulting values are the same (missing scores
# compare as False and therefore become 0, exactly as before).
for att in attributes:
    df[f"{att}_binary"] = (df[att] >= threshold).astype(int)

In [6]:
# Show the columns again to confirm the `*_binary` columns were added.
df.columns

Index(['id', 'comment_text', 'split', 'created_date', 'publication_id',
       'parent_id', 'article_id', 'rating', 'funny', 'wow', 'sad', 'likes',
       'disagree', 'toxicity', 'severe_toxicity', 'obscene', 'sexual_explicit',
       'identity_attack', 'insult', 'threat', 'male', 'female', 'transgender',
       'other_gender', 'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual',
       'other_sexual_orientation', 'christian', 'jewish', 'muslim', 'hindu',
       'buddhist', 'atheist', 'other_religion', 'black', 'white', 'asian',
       'latino', 'other_race_or_ethnicity', 'physical_disability',
       'intellectual_or_learning_disability', 'psychiatric_or_mental_illness',
       'other_disability', 'identity_annotator_count',
       'toxicity_annotator_count', 'funny_binary', 'wow_binary', 'sad_binary',
       'likes_binary', 'disagree_binary', 'toxicity_binary',
       'severe_toxicity_binary', 'obscene_binary', 'sexual_explicit_binary',
       'identity_attack_binary', 'insult_bi

In [7]:
# Build one balanced frame per attribute: up to `max_num` positive rows plus
# an equally sized random draw of negative rows. Attributes without any
# positive row are skipped. Each resulting frame keeps only the columns
# `id`, `comment_text`, `<att>_binary` and `<att>`.
max_num = 65021  # maximum number of examples per value in [0, 1]
sample_seed = 1  # seed for every draw below; kept at 1 (not the notebook-level `random_state`) so the drawn rows do not change
list_of_df_atts = []
for att in attributes:
    binary_col = f'{att}_binary'

    # Positive side: all rows labelled 1, randomly capped at `max_num`.
    df_att_positive = df[df[binary_col] == 1]
    if len(df_att_positive) == 0:
        print(f'{att}: 0 examples over threshold {threshold}. Skipping attribute.')
        continue
    if len(df_att_positive) >= max_num:
        df_att_positive = df_att_positive.sample(max_num, random_state=sample_seed)
    print(f'{att} positive: {len(df_att_positive)}')

    # Negative side: as many rows labelled 0 as there are positives.
    # `DataFrame.sample` raises when asked for more rows than exist, so that
    # case is checked explicitly instead of being caught by a bare `except`,
    # which would also have hidden unrelated errors (e.g. a missing column).
    negative_pool = df[df[binary_col] == 0]
    n_negative = min(len(df_att_positive), len(negative_pool))
    if n_negative < len(df_att_positive):
        print(
            f'{att}: only {n_negative} negatives for {len(df_att_positive)} positives; '
            'keeping all negatives (result is unbalanced).'
        )
        df_att_negative = negative_pool
    else:
        df_att_negative = negative_pool.sample(n_negative, random_state=sample_seed)
    print(f'{att} negative: {len(df_att_negative)}')

    df_att = pd.concat([df_att_positive, df_att_negative])
    print(f'{att}: {len(df_att)}')
    # Column selection yields a new frame, so no explicit copies are needed above.
    df_att = df_att[['id', 'comment_text', binary_col, att]]
    list_of_df_atts.append(df_att)

funny positive: 65021
funny negative: 65021
funny: 130042
wow positive: 65021
wow negative: 65021
wow: 130042
sad positive: 65021
sad negative: 65021
sad: 130042
likes positive: 65021
likes negative: 65021
likes: 130042
disagree positive: 65021
disagree negative: 65021
disagree: 130042
toxicity positive: 65021
toxicity negative: 65021
toxicity: 130042
severe_toxicity positive: 1
severe_toxicity negative: 1
severe_toxicity: 2
obscene positive: 4095
obscene negative: 4095
obscene: 8190
sexual_explicit positive: 1288
sexual_explicit negative: 1288
sexual_explicit: 2576
identity_attack positive: 3134
identity_attack negative: 3134
identity_attack: 6268
insult positive: 46294
insult negative: 46294
insult: 92588
threat positive: 1457
threat negative: 1457
threat: 2914
male positive: 39277
male negative: 39277
male: 78554
female positive: 52040
female negative: 52040
female: 104080
transgender positive: 2173
transgender negative: 2173
transgender: 4346
other_gender positive: 1
other_gender n

In [8]:
# Sanity check: list the columns and the row count of every per-attribute frame.
for df_att in list_of_df_atts:
    n_rows = len(df_att)
    print(df_att.columns, n_rows)

Index(['id', 'comment_text', 'funny_binary', 'funny'], dtype='object') 130042
Index(['id', 'comment_text', 'wow_binary', 'wow'], dtype='object') 130042
Index(['id', 'comment_text', 'sad_binary', 'sad'], dtype='object') 130042
Index(['id', 'comment_text', 'likes_binary', 'likes'], dtype='object') 130042
Index(['id', 'comment_text', 'disagree_binary', 'disagree'], dtype='object') 130042
Index(['id', 'comment_text', 'toxicity_binary', 'toxicity'], dtype='object') 130042
Index(['id', 'comment_text', 'severe_toxicity_binary', 'severe_toxicity'], dtype='object') 2
Index(['id', 'comment_text', 'obscene_binary', 'obscene'], dtype='object') 8190
Index(['id', 'comment_text', 'sexual_explicit_binary', 'sexual_explicit'], dtype='object') 2576
Index(['id', 'comment_text', 'identity_attack_binary', 'identity_attack'], dtype='object') 6268
Index(['id', 'comment_text', 'insult_binary', 'insult'], dtype='object') 92588
Index(['id', 'comment_text', 'threat_binary', 'threat'], dtype='object') 2914
Index(

In [12]:
# Bring every per-attribute frame into one shared schema:
#   id, comment_text, label_value, label_type
# `label_value` is the former `<att>_binary` column, `label_type` holds that
# column's name (including the `_binary` suffix), and the raw score column
# `<att>` is removed.
unified_dfs = []
for df_att in list_of_df_atts:
    # First column whose name ends in `_binary`; None if there is none.
    att = next((name for name in df_att.columns if name.endswith('_binary')), None)
    # Name of the raw score column: the binary name without its last `_` part.
    raw_col = att.rsplit('_', 1)[0]
    final_df = (
        df_att
        .assign(label_type=att)
        .rename(columns={att: 'label_value'})
        .drop(columns=[raw_col])
    )
    unified_dfs.append(final_df)

In [13]:
# Spot-check the unified schema on the second frame.
unified_dfs[1].columns

Index(['id', 'comment_text', 'label_value', 'label_type'], dtype='object')

In [14]:
# Stack all unified per-attribute frames into one long frame.
# NOTE(review): `pd.concat` is called without `ignore_index`, so each row keeps
# the index label it had in its source frame; presumably the resulting index
# is not unique when a comment was drawn for several attributes — confirm
# whether downstream code relies on it.
df_final = pd.concat(unified_dfs)

In [15]:
# Persist the stacked frame as CSV; the frame's index is written as the first column
# because `to_csv` is called with its default settings.
out_path = '/srv/scratch0/jgoldz/CL-UZH-EDOS-2023/data/JigsawUBiTC/balanced_extract_all_atts.csv'
df_final.to_csv(out_path)