In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

import os

In [None]:
!pip install datasets

import datasets

# Config

In [3]:
from dataclasses import dataclass

@dataclass
class Config:
    """Central settings for this preprocessing notebook.

    Every attribute carries a type annotation so that ``@dataclass`` registers
    it as a real field (un-annotated class attributes are ignored by
    ``dataclass``). The defaults are still readable directly from the class,
    e.g. ``Config.seed``, and can be overridden per instance, e.g.
    ``Config(seed=7)``.
    """

    # Checkpoint name passed to AutoTokenizer.from_pretrained
    model_name: str = 'bert-base-uncased'
    # Dataset identifier passed to datasets.load_dataset
    dataset_name: str = 'ucberkeley-dlab/measuring-hate-speech'
    text_column: str = 'text'
    # if the id column is a string, replace it with an integer index during preprocessing
    id_column: str = 'comment_id'

    # Target column in the raw dataset. It is copied into `labels` to make the training setup easier
    raw_target_column: str = 'hatespeech'
    target_column: str = 'labels'

    # Whether the data needs to be split into train / validation / test sets
    need_to_split: bool = True
    test_size: float = 0.15
    validation_size: float = 0.15
    # Length every tokenized example is padded / truncated to
    max_seq_length: int = 128
    # Random state used for the train/validation/test split
    seed: int = 2022

# Dataset

https://huggingface.co/datasets/ucberkeley-dlab/measuring-hate-speech

In [4]:
# Load the dataset with its default configuration. The second positional
# parameter of `load_dataset` is a configuration name, not a text encoding,
# so no 'utf-8' argument is passed here.
dataset = datasets.load_dataset(Config.dataset_name)
# Convert the 'train' split into a DataFrame for the pandas preprocessing below.
df = dataset['train'].to_pandas()
df.head(1)

Downloading and preparing dataset parquet/ucberkeley-dlab--measuring-hate-speech to /root/.cache/huggingface/datasets/ucberkeley-dlab___parquet/ucberkeley-dlab--measuring-hate-speech-f91f636a830ad73c/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/14.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/ucberkeley-dlab___parquet/ucberkeley-dlab--measuring-hate-speech-f91f636a830ad73c/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,comment_id,annotator_id,platform,sentiment,respect,insult,humiliate,status,dehumanize,violence,genocide,attack_defend,hatespeech,hate_speech_score,text,infitms,outfitms,annotator_severity,std_err,annotator_infitms,annotator_outfitms,hypothesis,target_race_asian,target_race_black,target_race_latinx,target_race_middle_eastern,target_race_native_american,target_race_pacific_islander,target_race_white,target_race_other,target_race,target_religion_atheist,target_religion_buddhist,target_religion_christian,target_religion_hindu,target_religion_jewish,target_religion_mormon,target_religion_muslim,target_religion_other,target_religion,target_origin_immigrant,target_origin_migrant_worker,target_origin_specific_country,target_origin_undocumented,target_origin_other,target_origin,target_gender_men,target_gender_non_binary,target_gender_transgender_men,target_gender_transgender_unspecified,target_gender_transgender_women,target_gender_women,target_gender_other,target_gender,target_sexuality_bisexual,target_sexuality_gay,target_sexuality_lesbian,target_sexuality_straight,target_sexuality_other,target_sexuality,target_age_children,target_age_teenagers,target_age_young_adults,target_age_middle_aged,target_age_seniors,target_age_other,target_age,target_disability_physical,target_disability_cognitive,target_disability_neurological,target_disability_visually_impaired,target_disability_hearing_impaired,target_disability_unspecific,target_disability_other,target_disability,annotator_gender,annotator_trans,annotator_educ,annotator_income,annotator_ideology,annotator_gender_men,annotator_gender_women,annotator_gender_non_binary,annotator_gender_prefer_not_to_say,annotator_gender_self_describe,annotator_transgender,annotator_cisgender,annotator_transgender_prefer_not_to_say,annotator_education_some_high_school,annotator_education_high_school_grad,annotator_education_some_college,annotator_education_college_grad_aa,annotator_education_college_grad_ba,annotator_education_professi
onal_degree,annotator_education_masters,annotator_education_phd,annotator_income_<10k,annotator_income_10k-50k,annotator_income_50k-100k,annotator_income_100k-200k,annotator_income_>200k,annotator_ideology_extremeley_conservative,annotator_ideology_conservative,annotator_ideology_slightly_conservative,annotator_ideology_neutral,annotator_ideology_slightly_liberal,annotator_ideology_liberal,annotator_ideology_extremeley_liberal,annotator_ideology_no_opinion,annotator_race_asian,annotator_race_black,annotator_race_latinx,annotator_race_middle_eastern,annotator_race_native_american,annotator_race_pacific_islander,annotator_race_white,annotator_race_other,annotator_age,annotator_religion_atheist,annotator_religion_buddhist,annotator_religion_christian,annotator_religion_hindu,annotator_religion_jewish,annotator_religion_mormon,annotator_religion_muslim,annotator_religion_nothing,annotator_religion_other,annotator_sexuality_bisexual,annotator_sexuality_gay,annotator_sexuality_straight,annotator_sexuality_other
0,47777,10873,3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,-3.9,"Yes indeed. She sort of reminds me of the elder lady that played the part in the movie ""Titanic"" who was telling her story!!! And I wouldn't have wanted to cover who I really am!! I would be proud!!!! WE should be proud of our race no matter what it is!!",0.81,1.88,0.36,0.34,1.35,1.23,-1.130178,True,True,True,True,True,True,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,male,no,college_grad_ba,<10k,neutral,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,25.0,False,False,True,False,False,False,False,False,False,False,False,True,False


The `hatespeech` column takes three values:
* 0 for comments that are not hate speech
* 1 when the annotator found it unclear
* 2 for hate speech

This value can differ among annotators for the same `comment_id`, whereas the calculated `hate_speech_score` is the same for every annotation of a comment.

To keep the experiment simple, we turn the task into binary classification: annotations rated 1 (unclear) are removed, and the remaining values are mapped as 0 → 0 and 2 → 1.

In [5]:
text_column = Config.text_column
target_column = Config.target_column

# Drop annotations rated 1 ("not clear"). Taking an explicit copy of the kept
# rows means the column assignments below never write into a filtered slice,
# which is what makes pandas emit SettingWithCopyWarning.
df = df[df[Config.raw_target_column].astype(int) != 1].copy()

# Make sure all text values are strings
df[text_column] = df[text_column].astype(str)

# Binary target: raw 0 -> 0, raw 2 -> 1
df[target_column] = df[Config.raw_target_column].astype(int).map({0: 0, 2: 1})

## Drop unnecessary columns and duplicates

In [6]:
id_column = Config.id_column
identities = ['target_gender_men', 'target_gender_women', 'target_race_white', 'target_race_black']

# Per comment, the mean of each identity flag over all of its annotations.
# https://stackoverflow.com/questions/15222754/groupby-pandas-dataframe-and-select-most-common-value
grouped = df.groupby([id_column])[identities].mean().reset_index()
# Threshold the means: 1 when the mean is at least 0.5, otherwise 0.
grouped[identities] = (grouped[identities] >= 0.5).astype('int64')

# Keep one row per comment id, then attach the thresholded identity flags.
# NOTE(review): drop_duplicates keeps the first row per id, so text and label
# come from that first annotation only, not from a vote - confirm this is intended.
kept_columns = [id_column, text_column, target_column]
dataset_unique = df.drop_duplicates(subset=id_column)[kept_columns]
df = dataset_unique.merge(grouped, on=id_column, how='inner').reset_index(drop=True)

In [7]:
# Sanity check after de-duplication: dimensions first, then a preview.
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head()

(38564, 7)


Unnamed: 0,comment_id,text,labels,target_gender_men,target_gender_women,target_race_white,target_race_black
0,47777,"Yes indeed. She sort of reminds me of the elder lady that played the part in the movie ""Titanic"" who was telling her story!!! And I wouldn't have wanted to cover who I really am!! I would be proud!!!! WE should be proud of our race no matter what it is!!",0,0,0,0,0
1,39773,The trans women reading this tweet right now is beautiful,0,0,0,0,0
2,47101,"Question: These 4 broads who criticize America, what country did they flee to get here? And now they want to make OUR America like THEIR former HELL HOLE. I don't think so!!!!!!!!!! Let them explain their GRATITUDE for letting them in OUR country.",1,0,1,0,0
3,43625,It is about time for all illegals to go back to their country of origin and keep our freeway open and prevent heavy traffic.,0,0,0,0,0
4,12538,For starters bend over the one in pink and kick that ass and pussy to get a taste until she's begging for a dick inside her.,1,0,1,0,0


# Length distribution

In [8]:
# Number of space-separated pieces in each comment (a rough word count,
# not the tokenizer's token count).
lengths = pd.DataFrame(
    [len(comment.split(' ')) for comment in df[text_column].values]
)
lengths.describe()

Unnamed: 0,0
count,38564.0
mean,25.286822
std,19.819185
min,1.0
25%,11.0
50%,19.0
75%,35.0
max,128.0


# Train-test split

In [9]:
# Class balance of the binary target before splitting.
label_counts = df[target_column].value_counts()
label_counts

0    27665
1    10899
Name: labels, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

# Config.test_size and Config.validation_size are both fractions of the WHOLE
# dataset, so the first split has to hold out their sum. Holding out only
# Config.test_size and halving it would leave validation and test with half
# of their configured share each.
holdout_size = Config.test_size + Config.validation_size

x_train, x_holdout, y_train, y_holdout = train_test_split(
    df.drop(columns=target_column),
    df[target_column],
    test_size=holdout_size,
    random_state=Config.seed,
)

# Divide the held-out rows between validation and test in proportion to the
# configured sizes (0.5 when both sizes are equal).
x_val, x_test, y_val, y_test = train_test_split(
    x_holdout,
    y_holdout,
    test_size=Config.test_size / holdout_size,
    random_state=Config.seed,
)

## Value counts

In [11]:
def value_count(df, value):
    """Print how often each distinct entry of column ``value`` occurs in ``df``.

    The printed table has one row per distinct entry and three columns:
    ``Value``, ``Count`` and ``Count(%)`` (the count as a percentage of all
    counted rows). Nothing is returned.
    """
    summary = (
        df[value]
        .value_counts()
        .reset_index()
        .set_axis(['Value', 'Count'], axis=1)
    )
    total = summary['Count'].sum()
    summary['Count(%)'] = summary['Count'] * 100 / total
    print(summary, '\n')

In [12]:
# Put the target column back next to its features in every split.
for features, labels in ((x_train, y_train), (x_val, y_val), (x_test, y_test)):
    features[target_column] = labels

# Fresh 0..n-1 indices for each split.
train, validation, test = (
    features.reset_index(drop=True) for features in (x_train, x_val, x_test)
)

In [13]:
# Label distribution of the full data and of each split, in the same order.
for title, frame in (
    ('Total dataset', df),
    ('Train dataset', train),
    ('Validation dataset', validation),
    ('Test dataset', test),
):
    print(title)
    value_count(frame, target_column)

Total dataset
   Value  Count  Count(%)
0      0  27665  71.73789
1      1  10899  28.26211 

Train dataset
   Value  Count   Count(%)
0      0  19345  71.664073
1      1   7649  28.335927 

Validation dataset
   Value  Count   Count(%)
0      0   4141  71.581677
1      1   1644  28.418323 

Test dataset
   Value  Count   Count(%)
0      0   4179  72.238548
1      1   1606  27.761452 



## Dump dataframe

In [14]:
# Write each split to CSV without the text column and without the index.
for csv_name, frame in (
    ('train.csv', train),
    ('test.csv', test),
    ('validation.csv', validation),
):
    frame.drop(columns=text_column).to_csv(csv_name, index=False)

## Drop unnecessary columns

In [15]:
# Only the id, the text and the label are kept from here on.
final_columns = [id_column, text_column, target_column]
train, test, validation = (
    frame[final_columns] for frame in (train, test, validation)
)

## Convert to dataset

In [16]:
# Wrap each pandas split in a datasets.Dataset.
train_dataset, val_dataset, test_dataset = (
    datasets.Dataset.from_pandas(frame) for frame in (train, validation, test)
)

# Tokenize

https://huggingface.co/docs/transformers/main_classes/tokenizer

In [17]:
from transformers import AutoTokenizer

# Tokenizer for the checkpoint named in Config.model_name, with lower-casing requested.
tokenizer_options = {'do_lower_case': True}
tokenizer = AutoTokenizer.from_pretrained(Config.model_name, **tokenizer_options)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [18]:
def tokenize_function(examples):
    """Tokenize the text column of a batch of examples.

    ``examples`` is indexed by ``text_column`` and the resulting texts are
    passed to the module-level ``tokenizer``. Every sequence is truncated and
    padded to ``Config.max_seq_length``. The tokenizer's output is returned
    unchanged.
    """
    texts = list(examples[text_column])
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=Config.max_seq_length,
    )

In [19]:
# Run the tokenizer over every split in batched mode.
train_tokenized, val_tokenized, test_tokenized = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

# Show the columns of each tokenized split.
print(train_tokenized.column_names, val_tokenized.column_names, test_tokenized.column_names)

  0%|          | 0/27 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

['comment_id', 'text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'] ['comment_id', 'text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'] ['comment_id', 'text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask']


In [20]:
# https://huggingface.co/docs/datasets/access
# The raw text column is a string column; it is removed because string
# columns cause errors during the training phase.

def _without_text_as_torch(split):
    """Return ``split`` without the text column, formatted as torch."""
    trimmed = split.remove_columns([text_column])
    trimmed.set_format("torch")
    return trimmed

train_tokenized = _without_text_as_torch(train_tokenized)
val_tokenized = _without_text_as_torch(val_tokenized)
test_tokenized = _without_text_as_torch(test_tokenized)

## Dump tokenized data

In [21]:
import pickle

# Serialize each tokenized split to its own pickle file. The `with` block
# closes the file on exit, so no explicit close() call is needed.
# NOTE: only unpickle these files from a trusted source - pickle.load can
# execute arbitrary code.
for pickle_name, tokenized_split in (
    ('train.pkl', train_tokenized),
    ('validation.pkl', val_tokenized),
    ('test.pkl', test_tokenized),
):
    with open(pickle_name, 'wb') as output:
        pickle.dump(tokenized_split, output, pickle.HIGHEST_PROTOCOL)