We used the implementation of Hector Andres Mejia Vallejo:
https://medium.com/analytics-vidhya/distil-roberta-for-hate-speech-classification-and-a-conceptual-review-about-transformers-c283bd8ff827
The model was run on Google Colab.

In [16]:
# Imports
from pathlib import Path
import datetime
import time
import re
import html
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, load_dataset


In [17]:
# Create the dataset
def create_dataset(df):
    """Build a Hugging Face ``Dataset`` of lyrics and explicit-content labels.

    Args:
        df: DataFrame with a ``lyrics`` column (cast to ``str``) and an
            ``explicit`` column (cast to ``int``). The frame is only read;
            no columns are added to it.

    Returns:
        A ``Dataset`` with two columns: ``lyric`` (str) and ``label`` (int).

    Raises:
        KeyError: If ``lyrics`` or ``explicit`` is missing from ``df``.
    """
    # Cast into fresh Series instead of assigning new columns on ``df``, so
    # the caller's DataFrame is left exactly as it was passed in.
    lyrics_list = df['lyrics'].astype(str).tolist()
    labels_list = df['explicit'].astype(int).tolist()

    # NOTE(review): astype(str) turns a missing lyric into the literal string
    # "nan", and astype(int) fails on a missing ``explicit`` value -- confirm
    # the input CSV has no missing values in these two columns.
    return Dataset.from_dict({"lyric": lyrics_list, "label": labels_list})

In [18]:
#Text_encoding function
def Text_encoding(ds, tokenizer_name="distilroberta-base", test_size=0.2, seed=None):
    """Split the dataset, tokenize both splits and save the tokenizer.

    Args:
        ds: ``Dataset`` with a ``lyric`` text column.
        tokenizer_name: Name or path given to ``AutoTokenizer.from_pretrained``.
        test_size: Size of the held-out split, forwarded to
            ``Dataset.train_test_split``.
        seed: Optional seed forwarded to ``Dataset.train_test_split``. With
            the default ``None`` the split is unseeded, as before; pass an
            int to get the same split on every run.

    Returns:
        A tuple ``(tok_train_ds, tok_test_ds)`` of the tokenized splits.
    """
    # Data splitting; pick the splits by key instead of relying on the
    # ordering of ``.values()``.
    splits = ds.train_test_split(test_size=test_size, seed=seed)
    train_ds, test_ds = splits["train"], splits["test"]

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def tokenize_batch(batch):
        # Pad every example to the tokenizer's maximum length and truncate
        # longer lyrics.
        return tokenizer(batch['lyric'], padding="max_length", truncation=True)

    # Tokenize training and testing data in batches
    tok_train_ds = train_ds.map(tokenize_batch, batched=True)
    tok_test_ds = test_ds.map(tokenize_batch, batched=True)

    # Save the tokenizer into the directory model_prediction() saves the model to.
    # NOTE(review): __file__ is not defined inside a notebook cell -- confirm
    # this is executed as a script, or adjust the path for notebook use.
    data_path = Path(__file__).resolve().parents[1] / 'data' / 'final_data' / 'trained_model_explicity'
    tokenizer.save_pretrained(data_path)

    return tok_train_ds, tok_test_ds

In [19]:
# Model prediction function
def model_prediction(tok_train_ds, tok_test_ds, model_name="distilroberta-base", num_labels=2, epochs=5):
    """Fine-tune a sequence classifier, evaluate it and save the model.

    Args:
        tok_train_ds: Tokenized training dataset.
        tok_test_ds: Tokenized dataset used for evaluation.
        model_name: Name or path given to
            ``AutoModelForSequenceClassification.from_pretrained``.
        num_labels: Number of target classes. Also drives the metric
            averaging and the confusion-matrix layout.
        epochs: Number of training epochs.

    Returns:
        The metrics dict returned by ``trainer.evaluate()``.
    """
    # Load the model
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    # Arguments for training
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=epochs,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )

    # Derive the metric configuration from num_labels so the function also
    # works for more than two classes; 'binary' averaging is only valid for
    # two-class targets, so other class counts use macro averaging.
    class_ids = list(range(num_labels))
    class_names = [f'Class {i}' for i in class_ids]
    average = 'binary' if num_labels == 2 else 'macro'

    # Plot Confusion Matrix
    def plot_confusion_matrix(y_true, y_pred):
        # Passing ``labels`` keeps the matrix num_labels x num_labels even if
        # a class never occurs, so it always lines up with the tick labels.
        cm = confusion_matrix(y_true, y_pred, labels=class_ids)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title('Confusion Matrix')
        plt.show()

    # Calculate metrics
    def compute_metrics(pred):
        labels = pred.label_ids
        # The predicted class is the index of the largest logit.
        preds = pred.predictions.argmax(-1)

        # Confusion matrix (shown every time the Trainer computes metrics)
        plot_confusion_matrix(labels, preds)

        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average=average)
        acc = accuracy_score(labels, preds)
        return {
            'accuracy': acc,
            'f1': f1,
            'precision': precision,
            'recall': recall
        }

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tok_train_ds,
        eval_dataset=tok_test_ds,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    eval_results = trainer.evaluate()

    # Save the model.
    # NOTE(review): __file__ is not defined inside a notebook cell -- confirm
    # this is executed as a script, or adjust the path for notebook use.
    data_path = Path(__file__).resolve().parents[1] / 'data' / 'final_data' / 'trained_model_explicity'
    model.save_pretrained(data_path)

    return eval_results

In [None]:
# Function to run all functions defined above
def main():
    """Run the full pipeline: load the CSV, build the dataset, tokenize, train.

    Reads ``global_17-24_with_polarity_and_spotify.csv`` from the
    ``data/final_data`` directory one level above this file's directory,
    prints the created dataset and finally prints the evaluation metrics.
    """
    # Load datafile.
    # NOTE(review): __file__ is not defined inside a notebook cell -- confirm
    # this is executed as a script, or adjust the path for notebook use.
    data_path = Path(__file__).resolve().parents[1] / 'data' / 'final_data' / 'global_17-24_with_polarity_and_spotify.csv'
    df = pd.read_csv(data_path)

    # Create preprocessed dataset
    ds = create_dataset(df)
    print(ds)

    # Text encoding
    tok_train_ds, tok_test_ds = Text_encoding(ds)

    # Train the model
    eval_results = model_prediction(tok_train_ds, tok_test_ds)

    # Print the results
    print(eval_results)


# Only start training when executed directly (script or notebook cell), not
# when this module is imported from elsewhere.
if __name__ == "__main__":
    main()