##### This notebook contains the preprocessing steps required for the Stormfront dataset.
##### They are necessary for performing experiment3.
##### The final outputs of this notebook are `stormfront_train.csv` and `stormfront_test.csv`.

In [1]:
from datasets import load_dataset
import pandas as pd
# Load the dataset by name
dataset = load_dataset("odegiber/hate_speech18")

# Keep a handle on the "train" split, which the rest of the notebook works with
train_data = dataset["train"]

# Report how many examples the split holds
print(f"Number of training examples: {len(train_data)}")

# Show the first record under its own heading (heading and record on separate lines)
print("\nSample from training set:", train_data[0], sep="\n")


  from .autonotebook import tqdm as notebook_tqdm


Number of training examples: 10944

Sample from training set:
{'text': 'As of March 13th , 2014 , the booklet had been downloaded over 18,300 times and counting .', 'user_id': 572066, 'subforum_id': 1346, 'num_contexts': 0, 'label': 0}


In [2]:
# Convert the "train" split to a pandas DataFrame.
# The frame is rebuilt from `dataset` rather than from `train_data` itself so that
# this cell can be re-run safely: once `train_data` is a DataFrame it no longer
# has a `to_pandas` method, and a self-referential conversion would fail.
train_data = dataset["train"].to_pandas()

In [3]:
# Inspect the row at position 1 (the second row) to check the columns of the DataFrame
train_data.iloc[1]

text            In order to help increase the booklets downloa...
user_id                                                    572066
subforum_id                                                  1346
num_contexts                                                    0
label                                                           0
Name: 1, dtype: object

In [4]:
# Flag every row whose `text` repeats the `text` of an earlier row, then count the flags
duplicate_mask = train_data.duplicated(subset="text")
duplicate_mask.sum()

184

In [5]:
# Keep only the first occurrence of each distinct message text
train_data = train_data.drop_duplicates(
    subset="text",
    keep="first",
)

In [6]:
# Draw 500 messages per class with a fixed seed so the sample is reproducible:
# label 0 -> normal messages, label 1 -> hate messages
is_normal = train_data['label'].eq(0)
is_hate = train_data['label'].eq(1)

normal_messages = train_data.loc[is_normal].sample(n=500, random_state=42)
hate_messages = train_data.loc[is_hate].sample(n=500, random_state=42)

# Stack both samples (normal first, then hate) under a fresh 0..N-1 index
final = pd.concat([normal_messages, hate_messages], ignore_index=True)

In [7]:
# Few-shot pool for the experiment3 notebook: the first 450 sampled rows of each class
stormfront_train = pd.concat(
    [normal_messages.iloc[:450], hate_messages.iloc[:450]],
    ignore_index=True,
)

# Evaluation set for the experiment3 notebook: the sampled rows from position 450 onward
stormfront_test = pd.concat(
    [normal_messages.iloc[450:], hate_messages.iloc[450:]],
    ignore_index=True,
)


In [8]:
# Class balance of the train set: 900 messages expected, split evenly between
# normal (0) and hate speech (1)
stormfront_train["label"].value_counts()

label
0    450
1    450
Name: count, dtype: int64

In [9]:
# Class balance of the test set: 100 messages expected, split evenly between
# normal (0) and hate speech (1)
stormfront_test["label"].value_counts()

label
0    50
1    50
Name: count, dtype: int64

In [10]:
# Write both splits to CSV without the DataFrame index column
for split_frame, csv_path in (
    (stormfront_train, '../data/stormfront_train.csv'),
    (stormfront_test, '../data/stormfront_test.csv'),
):
    split_frame.to_csv(csv_path, index=False)