In [1]:
from datasets import load_dataset
import pandas as pd
from collections import Counter
import re
import numpy as np
import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set the maximum number of columns to display
pd.set_option("display.max_columns", None)

# Set the maximum width of columns
pd.set_option("display.max_colwidth", None)

# Set the width of the entire display
pd.set_option("display.width", 1000)


### loading datasets

In [3]:
train_dataset = load_dataset("Hate-speech-CNERG/hatexplain", split="train", trust_remote_code=True)
validation_dataset = load_dataset("Hate-speech-CNERG/hatexplain", split="validation", trust_remote_code=True)
test_dataset = load_dataset("Hate-speech-CNERG/hatexplain", split="test", trust_remote_code=True)


In [4]:
train_dataset.shape

(15383, 4)

In [5]:
validation_dataset.shape

(1922, 4)

In [6]:
test_dataset.shape

(1924, 4)

In [7]:
vote_mapping = {
    0: 'normal',
    1: 'offensive',
    2: 'hate speech' 
}

def calculate_majority_vote(df, vote_mapping):
    def get_majority_vote(annotators):
        labels = annotators["label"]
        vote_counts = Counter(labels)
        majority_vote = max(vote_counts, key=vote_counts.get)
        return vote_mapping[majority_vote]

    df["majority_vote"] = df["annotators"].apply(get_majority_vote)
    return df


In [8]:

def preprocess_tokens(array):
    raw_string = ' '.join(array)
    # removing tags, leading, and trailing black spaces
    pattern = r'<[^>]+>'
    clean_string = re.sub(pattern, "", raw_string).strip()
    return clean_string

In [9]:
def process_relevant_tokens(row):
    # if the message contains relevant tokens for the review, extract and process it
    if row['rationales'].size > 0:
        try:
            # 1. Compute the intersection of relevant tokens
            rationales = np.vstack(row["rationales"])
            intersection_mask = np.all(rationales == 1, axis=0)
            # 2. Apply the mask to post_tokens
            filtered_tokens = row["post_tokens"][intersection_mask]
            # 3. Preprocess the resulting token array
            preprocessed_text = preprocess_tokens(filtered_tokens)

            return preprocessed_text
        except:
            # return the original rationales in case it is not possible to process them
            print("Error processing rationales for message: ", row['id'])
            return rationales
    # if the message is normal, return None
    return None


In [10]:
def process_dataframe(frame):
    vote_mapping = {0: "normal", 1: "offensive", 2: "hate speech"}
    # extracts the most common label among the annotators
    # preprocess the tokens that were deemed relevant by the annotators
    frame["relevant_pieces"] = frame.apply(process_relevant_tokens, axis=1)
    frame = calculate_majority_vote(frame, vote_mapping)
    # preprocesses the tokens
    frame["clean_message"] = frame["post_tokens"].apply(preprocess_tokens)
    
    ftr = frame["majority_vote"].isin(["normal", "hate speech"])
    return frame[ftr][['id', "clean_message", "majority_vote", "relevant_pieces"]]

In [11]:
# converting datasets to pandas dataframes
train_dataset = train_dataset.to_pandas()
validation_dataset = validation_dataset.to_pandas()
test_dataset = test_dataset.to_pandas()

In [12]:
# removing row with inconsistent annotation format
train_dataset = train_dataset[train_dataset["id"] != "24439295_gab"]

In [13]:
train_dataset = process_dataframe(train_dataset)
validation_dataset = process_dataframe(validation_dataset)
test_dataset = process_dataframe(test_dataset)

In [14]:
train_dataset.head()

Unnamed: 0,id,clean_message,majority_vote,relevant_pieces
0,23107796_gab,u really think i would not have been raped by feral hindu or muslim back in india or bangladesh and a neo nazi would rape me as well just to see me cry,hate speech,raped neo nazi
1,9995600_gab,the uk has threatened to return radioactive waste to the eu if an agreement cannot be reached can not we keep the radioactive waste and send back all the paki migrants instead,hate speech,paki
2,1227920812235051008_twitter,if english is not imposition then hindi is also not imposition shut up chutiya retards stophindiimposition,hate speech,retards
3,1204931715778543624_twitter,no liberal congratulated hindu refugees post cab because they hate hindus,hate speech,hate hindus
4,1179102559241244672_twitter,he said bro even your texts sound redneck whatever the fuck that means 😂 🤷 🏻 ‍ ♂️,hate speech,redneck


In [15]:
validation_dataset.head()

Unnamed: 0,id,clean_message,majority_vote,relevant_pieces
18,15473637_gab,democrats from minnesota leftist muzzie shithole are the dumbest anti americans there are klobuchar one of the reps is a dumb bitch who has a moronic position on the tax bill fuck this stupid woman,normal,muzzie
20,1144975366546759681_twitter,new to nsfw twitter • • us • soft hard dom • single • lesbian • she her lt rt for mutual,hate speech,hard dom • single • lesbian
21,21930231_gab,glad to hear do not be a sheltered cuck for any side lefties are gay as fuck but gayness is contagious to the right as well but i think you already grasp that,normal,cuck gay
25,11432298_gab,concentrate on white genocide jews elites corrupt politicians etc and leave christianity alone,hate speech,genocide jews
26,22286917_gab,merkel needs to be pack raped by a gang of muzzos then ass fucked by african niggers i hope she gets aids and dies in pain,normal,pack raped by a gang of muzzos hope she gets aids


In [16]:
test_dataset.head()

Unnamed: 0,id,clean_message,majority_vote,relevant_pieces
2,1252707503441313794_twitter,why are you repeating yourself are you a little retarded,hate speech,retarded
3,1103385226921762816_twitter,well she ’ muslim so of course she hates the jews jew hatred is one of the defining feature of islam,normal,muslim she hates the jews jew hatred islam
4,1169443635869487105_twitter,lol not me i don ’ t deal with bms not outside children ’ ss i leave that ghetto mess for everybody else 😂 😂 😂,hate speech,ghetto
5,17372469_gab,an these nigger biches look like godzilla nasty,normal,nigger
6,1168407844095483904_twitter,yeah humnh what how do i use this app i ’ m alittle retarded 👹,hate speech,retarded


In [17]:
# since in this assignment we will not be training a model but rather using LLMs to perform the inference for us,
# I will concatenate the three splits into a full dataset and save it.
final = pd.concat(
 [train_dataset, validation_dataset, test_dataset]   
)

In [18]:
final.shape

(11414, 4)

In [19]:
final.to_csv('../data/hateXplain_clean.csv', index=False)