In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Show every column, and the full contents of each cell, when displaying DataFrames.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# NOTE(review): not referenced in the visible cells; Config.seed carries the same value.
global_seed = 2022

In [2]:
# !pip install datasets
import datasets

  from .autonotebook import tqdm as notebook_tqdm


# Config

In [3]:
from dataclasses import dataclass

@dataclass
class Config:
    """Settings shared by the preprocessing cells of this notebook.

    Every attribute is annotated so that ``@dataclass`` registers it as a real
    field: the defaults remain readable as ``Config.<name>`` and an overridden
    copy can be built with e.g. ``Config(max_seq_length=64)``.
    """

    # checkpoint name handed to the tokenizer
    model_name: str = 'bert-base-uncased'
    # name handed to `datasets.load_dataset`
    dataset_name: str = 'social_bias_frames'
    text_column: str = 'post'
    # if id column is string, replace that with an integer index during preprocessing
    raw_id_column: str = 'HITId'
    id_column: str = 'index'
    need_to_split: bool = False

    # target in raw dataset. However, it will be renamed to `labels` here to facilitate training setup
    raw_target_column: str = 'offensiveYN'
    target_column: str = 'labels'
    # test and validation data will each be 50% of this amount
    test_size: float = 0.3
    max_seq_length: int = 128
    seed: int = 2022

# Dataset

https://huggingface.co/datasets/social_bias_frames

In [4]:
# Load the dataset named in Config; the bare last line displays the resulting splits.
dataset = datasets.load_dataset(Config.dataset_name)
dataset

Using custom data configuration default
Reusing dataset social_bias_frames (C:\Users\khair\.cache\huggingface\datasets\social_bias_frames\default\0.0.0\79706db13a32c7f9614b997cc4326cbda14e6d3968892a3f0e76c4a970e7e510)
100%|██████████| 3/3 [00:00<00:00, 42.86it/s]


DatasetDict({
    test: Dataset({
        features: ['whoTarget', 'intentYN', 'sexYN', 'sexReason', 'offensiveYN', 'annotatorGender', 'annotatorMinority', 'sexPhrase', 'speakerMinorityYN', 'WorkerId', 'HITId', 'annotatorPolitics', 'annotatorRace', 'annotatorAge', 'post', 'targetMinority', 'targetCategory', 'targetStereotype', 'dataSource'],
        num_rows: 17501
    })
    validation: Dataset({
        features: ['whoTarget', 'intentYN', 'sexYN', 'sexReason', 'offensiveYN', 'annotatorGender', 'annotatorMinority', 'sexPhrase', 'speakerMinorityYN', 'WorkerId', 'HITId', 'annotatorPolitics', 'annotatorRace', 'annotatorAge', 'post', 'targetMinority', 'targetCategory', 'targetStereotype', 'dataSource'],
        num_rows: 16738
    })
    train: Dataset({
        features: ['whoTarget', 'intentYN', 'sexYN', 'sexReason', 'offensiveYN', 'annotatorGender', 'annotatorMinority', 'sexPhrase', 'speakerMinorityYN', 'WorkerId', 'HITId', 'annotatorPolitics', 'annotatorRace', 'annotatorAge', 'post', '

In [5]:
# Materialise each split as a pandas DataFrame for row-level processing.
train, test, validation = (
    dataset[split].to_pandas() for split in ('train', 'test', 'validation')
)
train.head()

Unnamed: 0,whoTarget,intentYN,sexYN,sexReason,offensiveYN,annotatorGender,annotatorMinority,sexPhrase,speakerMinorityYN,WorkerId,HITId,annotatorPolitics,annotatorRace,annotatorAge,post,targetMinority,targetCategory,targetStereotype,dataSource
0,0.0,0.66,0.0,,1.0,woman,,,,-8935932304856669427,363A7XIFV4G2799C5V96YERJA9AVAM,liberal,white,45.0,"RT @_LexC__: I'm convinced that some of y'all bitches get pregnant purposely because ""birth control &amp; plan b pills"" are effective &#128533;&#128056;&#9749;&#65039;",,,,t/davidson
1,0.0,0.66,0.0,,0.5,man,,,,6347880360297734464,363A7XIFV4G2799C5V96YERJA9AVAM,mod-liberal,white,35.0,"RT @_LexC__: I'm convinced that some of y'all bitches get pregnant purposely because ""birth control &amp; plan b pills"" are effective &#128533;&#128056;&#9749;&#65039;",,,,t/davidson
2,0.0,0.33,0.0,,0.5,man,,,,-7452610791699819066,363A7XIFV4G2799C5V96YERJA9AVAM,liberal,asian,23.0,"RT @_LexC__: I'm convinced that some of y'all bitches get pregnant purposely because ""birth control &amp; plan b pills"" are effective &#128533;&#128056;&#9749;&#65039;",,,,t/davidson
3,1.0,1.0,0.0,,1.0,man,,,0.0,-500114755446676507,3JTPR5MTZS6RLS3JBV4IOU0G2X35K5,liberal,white,25.0,RT @iBeZo: Stupid fucking nigger LeBron. You flopping stupid jungle bunny monkey faggot.,black folks,race,all stupid,t/davidson
4,1.0,1.0,0.0,,1.0,man,,,0.0,-500114755446676507,3JTPR5MTZS6RLS3JBV4IOU0G2X35K5,liberal,white,25.0,RT @iBeZo: Stupid fucking nigger LeBron. You flopping stupid jungle bunny monkey faggot.,black folks,race,are not people but apes.,t/davidson


In [6]:
# Frequency of each annotated target category in the training split.
# The unlabeled bucket in the output is presumably the empty string (no category annotated) — verify.
train['targetCategory'].value_counts()

            70406
race        14987
gender      10872
culture      8843
victim       2680
disabled     2350
social       1730
body         1032
Name: targetCategory, dtype: int64

In [7]:
# Raw label distribution before binarization: the output shows three score levels plus an unlabeled bucket.
train[Config.raw_target_column].value_counts()

1.0    54218
0.0    46673
0.5     9992
        2017
Name: offensiveYN, dtype: int64

## Aggregation

In [9]:
# Column handles used by the aggregation step (raw names, before any later renaming).
text_column, target_column, id_column = (
    Config.text_column,
    Config.raw_target_column,
    Config.raw_id_column,
)

# Parallel lists: identities[i] is only searched for in rows whose
# targetCategory equals categories[i].
identities = ['male', 'female', 'white', 'black']
categories = ['gender', 'gender', 'race', 'race']

features = [text_column, target_column, *identities, 'targetMinority', 'targetCategory']
selected_columns = [id_column, *features]

In [7]:
class SocialBiasProcessor:
    """Collapse per-annotation rows into one row per post with binary identity flags.

    ``process`` reads the notebook-level names ``identities``, ``categories``,
    ``text_column``, ``target_column`` and ``id_column`` at call time, so it
    must be run while those names still refer to the raw column names.
    """

    _NEGATIONS = ('not', 'non')
    _SEPARATORS = ('', ' ', '-')

    def __init__(self) -> None:
        pass

    @staticmethod
    def _normalize(word) -> str:
        """Lower-case and strip ``word``; non-string values (e.g. NaN) become ''."""
        return word.lower().strip() if isinstance(word, str) else ''

    @staticmethod
    def _is_negated(word: str, term: str) -> bool:
        """Return True if ``term`` appears prefixed by not/non (e.g. 'non-white')."""
        return any(
            neg + sep + term in word
            for neg in SocialBiasProcessor._NEGATIONS
            for sep in SocialBiasProcessor._SEPARATORS
        )

    @staticmethod
    def _to_binary_label(value) -> int:
        """Map a raw score to 1 when it is >= 0.5, else 0; blank/missing scores give 0."""
        if isinstance(value, str):
            value = value.strip()
            if not value:
                return 0
        elif pd.isna(value):
            return 0
        return 1 if float(value) >= 0.5 else 0

    @staticmethod
    def is_male(word: str) -> bool:
        """Return True if the free-text target group ``word`` refers to men."""
        word = SocialBiasProcessor._normalize(word)
        if 'trans' in word:
            return False

        # These stems must start the text or follow a space, so that
        # 'female' / 'women' / 'woman' do not match.
        if any(word.startswith(w) or ' ' + w in word for w in ('male', 'men', 'man')):
            return True
        return any(w in word for w in ('father', 'boy', 'incel'))

    @staticmethod
    def is_female(word: str) -> bool:
        """Return True if the free-text target group ``word`` refers to women."""
        word = SocialBiasProcessor._normalize(word)
        if 'trans' in word:
            return False
        return any(
            w in word
            for w in ('female', 'women', 'woman', 'mother', 'lesbian', 'girl')
        )

    @staticmethod
    def is_white(word: str) -> bool:
        """Return True if ``word`` mentions 'white' without a not/non prefix."""
        word = SocialBiasProcessor._normalize(word)
        if SocialBiasProcessor._is_negated(word, 'white'):
            return False
        return 'white' in word

    @staticmethod
    def is_black(word: str) -> bool:
        """Return True if ``word`` mentions 'black' or 'dark' without a not/non prefix."""
        word = SocialBiasProcessor._normalize(word)
        # A negated mention ("non-black", "not dark") is NOT a match; this
        # mirrors is_white (the previous code returned True here).
        if any(SocialBiasProcessor._is_negated(word, term) for term in ('black', 'dark')):
            return False
        return ('black' in word) or ('dark' in word)

    @staticmethod
    def process(df: pd.DataFrame) -> pd.DataFrame:
        """Aggregate annotation rows into one row per ``id_column`` value.

        Steps:
          1. For each identity, flag rows whose ``targetCategory`` matches the
             paired category and whose ``targetMinority`` text satisfies the
             identity predicate; all other rows get False.
          2. Cast the text column to ``str`` and binarize the target column.
          3. Per id, set an identity to 1 when at least half of its rows
             were flagged.
          4. Keep the first row per id for text/target and join the flags.

        The input frame is not modified; a new frame is returned.

        NOTE(review): the target is taken from the first row of each id
        (``drop_duplicates`` default) rather than aggregated like the
        identity flags — confirm that this is intended.
        """
        df = df.copy()

        # create binary columns for target identity groups from annotation
        predicates = [
            SocialBiasProcessor.is_male,
            SocialBiasProcessor.is_female,
            SocialBiasProcessor.is_white,
            SocialBiasProcessor.is_black,
        ]
        for category, identity, predicate in zip(categories, identities, predicates):
            in_category = df['targetCategory'] == category
            flags = df.loc[in_category, 'targetMinority'].apply(predicate)
            # Rows outside the category are explicitly False; the explicit bool
            # cast avoids relying on fillna's object-dtype downcasting.
            df[identity] = flags.reindex(df.index, fill_value=False).astype(bool)

        # ensure text is in string format
        df[text_column] = df[text_column].astype(str)
        # convert target label to binary
        df[target_column] = df[target_column].apply(SocialBiasProcessor._to_binary_label)

        # https://stackoverflow.com/questions/15222754/groupby-pandas-dataframe-and-select-most-common-value
        grouped = (
            (df.groupby(id_column)[identities].mean() >= 0.5)
            .astype(int)
            .reset_index()
        )

        df_unique = df.drop_duplicates(subset=id_column)[[id_column, text_column, target_column]]
        return df_unique.merge(grouped, on=id_column, how='inner').reset_index(drop=True)

In [28]:
# NOTE(review): this cell carries execution count 28, i.e. it was run out of order.
# The empty result shown suggests `offensiveYN` no longer held the string '0.0' when it ran — confirm or remove.
test[(test['offensiveYN']=='0.0')]['targetCategory'].value_counts()

Series([], Name: targetCategory, dtype: int64)

In [10]:
# Run the aggregation over every split, rebinding each name to its processed frame.
train, test, validation = (
    SocialBiasProcessor.process(frame) for frame in (train, test, validation)
)

## Add numeric id column
The torch format has problems with string id columns, but the data must be converted to torch format for the experiments. This cell therefore adds an integer `index` column, which is used as the id column from here on.

In [11]:
def add_numeric_id(df: pd.DataFrame, id_col: str) -> pd.DataFrame:
    """Return a copy of ``df`` with a 0..n-1 integer column ``id_col`` placed first.

    The column name is passed in explicitly instead of relying on the default
    name produced by a bare ``reset_index()``. Any existing ``id_col`` column
    is replaced, so re-running the cell yields the same frame instead of
    accumulating extra ``level_0``/``index`` columns.
    """
    out = df.drop(columns=[id_col], errors='ignore').reset_index(drop=True)
    # After reset_index(drop=True) the index is 0..n-1; reuse it as the id values.
    out.insert(0, id_col, out.index)
    return out


train = add_numeric_id(train, Config.id_column)
test = add_numeric_id(test, Config.id_column)
validation = add_numeric_id(validation, Config.id_column)
id_column = Config.id_column

## Rename target column
Target column should be renamed to `labels` for pytorch training

In [12]:
# Rebind every split to a frame in which the raw target column is called 'labels'.
train, test, validation = (
    frame.rename(columns={target_column: 'labels'})
    for frame in (train, test, validation)
)

print(f'Target column has been changed from {target_column} to labels')
# From here on the notebook addresses the target by its new name.
target_column = 'labels'

Target column has been changed from offensiveYN to labels


## Dump dataframe format for future evaluation

In [13]:
# Write each processed split to its own CSV file, without the pandas index.
for frame, csv_path in (
    (train, 'train.csv'),
    (test, 'test.csv'),
    (validation, 'validation.csv'),
):
    frame.to_csv(csv_path, index=False)

## Drop unnecessary columns

In [14]:
# Keep only the id, text and target columns in every split.
final_columns = [id_column, text_column, target_column]
train, test, validation = (
    frame[final_columns] for frame in (train, test, validation)
)

# Length distribution

In [15]:
# Number of single-space-separated tokens per training text, summarised with describe().
lengths = pd.DataFrame(
    [len(text.split(' ')) for text in train[text_column].values]
)
lengths.describe()

Unnamed: 0,0
count,35933.0
mean,20.107728
std,12.060741
min,1.0
25%,13.0
50%,18.0
75%,24.0
max,276.0


## Value counts

In [16]:
def value_count(df, value):
    """Print how often each distinct entry of ``df[value]`` occurs, with its percentage share.

    The printed table has the columns ``Value``, ``Count`` and ``Count(%)``;
    nothing is returned.
    """
    summary = (
        df[value]
        .value_counts()
        .reset_index()
        .set_axis(['Value', 'Count'], axis=1)
        .assign(**{'Count(%)': lambda t: t['Count'] * 100 / t['Count'].sum()})
    )
    print(summary, '\n')

In [17]:
# Print the label balance of each split under its own heading.
for heading, frame in (
    ('Train dataset', train),
    ('Validation dataset', validation),
    ('Test dataset', test),
):
    print(heading)
    value_count(frame, target_column)

Train dataset
   Value  Count   Count(%)
0      1  19605  54.559875
1      0  16328  45.440125 

Validation dataset
   Value  Count   Count(%)
0      1   2673  57.115385
1      0   2007  42.884615 

Test dataset
   Value  Count   Count(%)
0      1   2795  59.404888
1      0   1910  40.595112 



## Convert to dataset

In [18]:
# Wrap each pandas split in a `datasets.Dataset`.
train_dataset, val_dataset, test_dataset = (
    datasets.Dataset.from_pandas(frame) for frame in (train, validation, test)
)

# Tokenize

https://huggingface.co/docs/transformers/main_classes/tokenizer

In [19]:
from transformers import AutoTokenizer

# Tokenizer for the checkpoint named in Config, with lower-casing requested explicitly.
tokenizer = AutoTokenizer.from_pretrained(Config.model_name, do_lower_case=True)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [20]:
def tokenize_function(examples):
    """Tokenize the text column of a batch, padded/truncated to ``Config.max_seq_length``."""
    texts = list(examples[text_column])
    return tokenizer(
        texts,
        padding="max_length",
        max_length=Config.max_seq_length,
        truncation=True,
    )

In [21]:
# Apply the tokenizer batch-wise to every split.
train_tokenized, val_tokenized, test_tokenized = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

# Display the resulting column names of all three splits.
train_tokenized.column_names, val_tokenized.column_names, test_tokenized.column_names

  0%|          | 0/36 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

(['index', 'post', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
 ['index', 'post', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
 ['index', 'post', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'])

In [22]:
# https://huggingface.co/docs/datasets/access
# The raw text column is a string column, which causes errors during the
# training phase, so it is removed before switching to torch output.
train_tokenized = train_tokenized.remove_columns([text_column])
val_tokenized = val_tokenized.remove_columns([text_column])
test_tokenized = test_tokenized.remove_columns([text_column])

for tokenized_split in (train_tokenized, val_tokenized, test_tokenized):
    tokenized_split.set_format("torch")

## Dump tokenized data

In [23]:
import pickle

# Serialise each tokenized split to its own pickle file; the `with` block closes the file.
for tokenized_split, pkl_path in (
    (train_tokenized, 'train.pkl'),
    (val_tokenized, 'validation.pkl'),
    (test_tokenized, 'test.pkl'),
):
    with open(pkl_path, 'wb') as output:
        pickle.dump(tokenized_split, output, pickle.HIGHEST_PROTOCOL)