In [None]:
pip install transformers datasets

In [None]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle

Loading the dataset, either:
- directly from a CSV file, or
- using Hugging Face's `datasets` library (https://huggingface.co/docs/datasets/en/index)

In [None]:
# Option 1: read the data from a local CSV file.
dataset = pd.read_csv(filepath_or_buffer='hateEn.csv')

# Option 2: load it with the Hugging Face `datasets` library instead (uncomment to use).
# dataset = load_dataset("ruanchaves/hatebr")
# dataset = pd.DataFrame(dataset['train'])  # only if the dataset has a 'train' split

Initial inspection and possible modifications

In [None]:
# Show column names, size and data type information for the dataset
dataset.info()

In [None]:
dataset.head()  # Preview the first rows to see what the dataset looks like

In [None]:
# Drop an unnecessary column (replace the placeholder with a real column name).
# Reassigning the result instead of using inplace=True keeps the statement
# chainable and avoids mutating a frame that other variables may still reference.
dataset = dataset.drop(columns=['your-column-name'])

In [None]:
# Standardise the column names so the notebook can rely on 'text' and 'label'.
dataset = dataset.rename({'tweets': 'text'}, axis='columns')

In [None]:
dataset['label'].value_counts()  # Number of rows in each class

In [None]:
# Concatenate two datasets row-wise. ignore_index=True gives the result a fresh
# 0..n-1 index; without it the original index labels of both frames are kept,
# so the combined frame ends up with duplicated labels.
combined_sets = pd.concat([dataset1, dataset2], ignore_index=True)

In [None]:
# Discard every row that contains at least one missing (NULL/NaN) value.
dataset = dataset.dropna(axis=0, how='any')

In [None]:
# Remove duplicate tweets (same 'text'), keeping only the last occurrence of each.
dataset = dataset.drop_duplicates(subset='text', keep='last')

In [None]:
# Save the modified dataset for later use. index=False stops pandas from writing
# the row index as an extra unnamed column, which would otherwise come back as a
# spurious column when the file is read again with pd.read_csv.
dataset.to_csv('your-dataset-name-to-be-saved.csv', index=False)

In [None]:
# Shuffle the rows to mitigate order bias. A fixed random_state makes the
# shuffle reproducible across kernel restarts.
shuffled_dataset = shuffle(dataset, random_state=42)

In [None]:
# Select rows on a condition, e.g. the texts of entries that are not hate (label == 0).
# The result goes into a new variable: writing the filtered column back into
# dataset['text'] would replace the text of every non-matching row with NaN.
non_hate_texts = dataset.loc[dataset['label'] == 0, 'text']

You can plot the class distribution with the `countplot` function of the seaborn (`sns`) library.

In [None]:
# Bar chart of the number of rows per class, drawn from the loaded `dataset` frame.
sns.countplot(x="label", data=dataset)

If dataset has labels like 'hate' 'nothate' 'True' 'False' etc., it is better to map those labels into binary values.

In [None]:
# Encode the label column as binary: 1 for 'hate', 0 for every other value.
dataset['label'] = dataset['label'].map(lambda raw_label: int(raw_label == 'hate'))

If you want to explore the most frequent words:

In [None]:
from collections import Counter

# Tally whitespace-separated tokens across every entry of the text column.
cnt = Counter(
    token
    for entry in dataset["text"].values
    for token in entry.split()
)

# The ten most frequent tokens with their counts.
cnt.most_common(10)

Or create a word cloud:

In [None]:
import wordcloud
from wordcloud import WordCloud

# Merge every entry of the text column into one space-separated string.
words = ' '.join(dataset['text'])
wordCloud = WordCloud(
    width=500,
    height=300,
    random_state=21,
    max_font_size=110,
).generate(words)

# Render the generated cloud as an image with the axes hidden.
plt.figure(figsize=(10, 8))
plt.imshow(wordCloud, interpolation="bilinear")
plt.axis('off')
plt.show()

Possible modifications to the text column

In [None]:
# Lowercasing is a common preprocessing step; apply it to the notebook's `dataset` frame.
dataset['text'] = dataset['text'].str.lower()

In [None]:
# Compiled once at definition time instead of on every call.
# NOTE: the ranges are deliberately narrow. A single wide range such as
# U+24C2-U+1F251 also spans CJK, Hangul, kana and many other scripts, so it
# would silently delete ordinary non-emoji text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicator symbols)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(string):
    """Return ``string`` with emoji characters removed.

    Args:
        string: The text to clean.

    Returns:
        A copy of ``string`` in which every run of characters matched by
        ``_EMOJI_PATTERN`` has been deleted. Characters outside the listed
        emoji ranges (including CJK and other non-Latin scripts) are kept.
    """
    return _EMOJI_PATTERN.sub('', string)
# Strip emoji from every entry of the text column.
dataset["text"] = dataset["text"].apply(remove_emoji)

In [None]:
def remove_urls(text):
    """Return ``text`` with links removed.

    A link is anything that starts with ``http://``, ``https://`` or ``www.``
    and runs up to the next whitespace character. Each match is replaced by
    the empty string; surrounding whitespace is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)
# Strip URLs from every entry of the text column.
dataset["text"] = dataset["text"].apply(remove_urls)

In [None]:
import string

# Every character listed here is deleted by remove_punctuation.
PUNCT_TO_REMOVE = string.punctuation


def remove_punctuation(text):
    """Return ``text`` with every character found in ``PUNCT_TO_REMOVE`` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = dict.fromkeys(map(ord, PUNCT_TO_REMOVE))
    return text.translate(deletion_table)

# Strip punctuation from every entry of the text column.
dataset["text"] = dataset["text"].apply(remove_punctuation)

You might also want to remove stop words if you are using another model architecture in training

In [None]:
import nltk
# Request the NLTK 'stopwords' resource before the corpus is read below.
nltk.download('stopwords')

from nltk.corpus import stopwords

# English stop words, held in a set so membership checks in remove_stopwords are cheap.
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return ``text`` without the words contained in ``STOPWORDS``.

    ``text`` is converted with ``str()`` and split on whitespace; the
    remaining words are joined back with single spaces. The comparison
    against ``STOPWORDS`` is exact, i.e. case-sensitive.
    """
    kept_words = (token for token in str(text).split() if token not in STOPWORDS)
    return " ".join(kept_words)

# Strip stop words from every entry of the text column.
dataset["text"] = dataset["text"].apply(remove_stopwords)