In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import os

In [None]:
def load_dehatebert():
    """Load the pretrained dehatebert English tokenizer and classifier.

    Both objects are created with ``from_pretrained`` from the same
    checkpoint name, tokenizer first and model second.

    Returns:
        tuple: ``(tokenizer, model)`` for the
        ``Hate-speech-CNERG/dehatebert-mono-english`` checkpoint.
    """
    checkpoint = "Hate-speech-CNERG/dehatebert-mono-english"
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModelForSequenceClassification.from_pretrained(checkpoint),
    )

In [None]:
def chunk_text(text, max_tokens=500):
    """Split ``text`` into consecutive chunks of at most ``max_tokens`` words.

    Words are the whitespace-delimited pieces produced by ``str.split()``, so
    runs of spaces/newlines in the input become single spaces in the output.

    NOTE: despite the parameter name, the limit is counted in *words*, not in
    model tokens. A subword tokenizer may produce more tokens than words for
    a chunk, so callers that also truncate by token count can still lose the
    tail of a chunk.

    Args:
        text (str): The text to split.
        max_tokens (int): Maximum number of words per chunk. Must be positive.

    Returns:
        list[str]: The chunks in order; an empty list when ``text`` contains
        no words.

    Raises:
        ValueError: If ``max_tokens`` is zero or negative. (A negative step
            would otherwise silently yield no chunks and discard the text.)
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be a positive integer, got {max_tokens!r}")

    words = text.split()
    return [" ".join(words[i:i + max_tokens]) for i in range(0, len(words), max_tokens)]

In [None]:
def analyze_text_dehatebert(text, tokenizer, model):
    """Score ``text`` with a sequence-classification model, chunk by chunk.

    The text is split with ``chunk_text``; each chunk is tokenized (truncated
    to 512 tokens), run through ``model`` without gradient tracking, and the
    softmax probability in column 1 of the logits is recorded. The result is
    the unweighted mean of the per-chunk values, so a short final chunk
    counts as much as a full one.

    Args:
        text (str): The text to score.
        tokenizer: Callable tokenizer returning PyTorch tensors
            (called with ``return_tensors="pt"``).
        model: A ``torch.nn.Module`` whose output exposes ``.logits``.

    Returns:
        float: Mean class-1 probability over all chunks, or ``nan`` when
        ``text`` is empty or whitespace-only (there is nothing to score).
    """
    # Guard the empty case explicitly: otherwise np.mean([]) would emit a
    # RuntimeWarning and return nan implicitly.
    if not text or not text.strip():
        return float("nan")

    # Run on whatever device the model's weights live on so the function
    # also works when the caller has moved the model to a GPU.
    device = next(model.parameters()).device

    scores = []
    for chunk in chunk_text(text):
        inputs = tokenizer(chunk, padding=True, truncation=True, max_length=512, return_tensors="pt")
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        # .cpu() is required before .numpy() when the logits are on a GPU.
        probs = torch.nn.functional.softmax(outputs.logits, dim=1).cpu().numpy()
        # Column 1 is taken as the "hate" class — presumably matches the
        # checkpoint's label mapping; verify against model.config.id2label.
        scores.append(probs[:, 1].mean())

    return float(np.mean(scores))

In [None]:
def analyze_single_file(file_path, tokenizer=None, model=None):
    """Read a UTF-8 text file, score it, print the score and return it.

    Args:
        file_path (str): Path of the text file to analyse.
        tokenizer: Optional pre-loaded tokenizer.
        model: Optional pre-loaded model. When either ``tokenizer`` or
            ``model`` is omitted, both are loaded via ``load_dehatebert()``;
            pass both to reuse one loaded model across several files.

    Returns:
        The score returned by ``analyze_text_dehatebert`` for the file's text.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Read the file first so a bad path fails before the expensive model load.
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    if tokenizer is None or model is None:
        tokenizer, model = load_dehatebert()

    hate_score = analyze_text_dehatebert(text, tokenizer, model)
    print(f"Hate Speech Probability for {file_path}: {hate_score:.4f}")
    return hate_score

In [None]:
# Change the process working directory so the relative file name below is
# resolved inside it. This affects every later relative path in the kernel.
# NOTE(review): the path looks like a placeholder ("*link to file*") — replace
# it with a real directory before running this cell.
os.chdir("C:\\Users\\*link to file*")
# Entry point: score a single text file and print its hate-speech probability.
if __name__ == "__main__":
    file_path = "Race_2020s.txt"  # relative to the working directory set above
    analyze_single_file(file_path)