In [3]:
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TextClassificationPipeline
from datasets import load_dataset
import pandas as pd
import os
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import re


def predict_hate_label(model_path, file_name, dataset, device, BATCH_SIZE=32):
    """Classify every text in ``dataset`` and write one TSV row per example.

    The model and tokenizer are both loaded from ``model_path`` and wrapped in
    a ``TextClassificationPipeline``. ``dataset['text']`` is fed to the
    pipeline in slices of ``BATCH_SIZE`` rows and the results are written to
    ``'../tmp/' + file_name``.

    Each output line holds four tab-separated fields:
    the example's position in ``dataset``, the predicted label, the label taken
    from ``dataset['label']``, and a score that equals the pipeline score when
    the predicted label is ``'HATE'`` and ``1 - score`` otherwise.

    Args:
        model_path: Value passed to ``from_pretrained`` for model and tokenizer.
        file_name: Name of the output file created under ``../tmp/``.
        dataset: Object whose ``'text'`` and ``'label'`` columns support
            slicing followed by ``.tolist()`` (presumably a pandas DataFrame
            -- confirm against the caller).
        device: Device handed to both ``model.to()`` and the pipeline.
        BATCH_SIZE: Number of texts sent to the pipeline per call.
    """
    model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=device)

    total_rows = len(dataset)

    # The context manager guarantees the file is flushed and closed even when
    # the pipeline raises part-way through; the original never closed it.
    with open('../tmp/' + file_name, "w") as fout:
        # Stepping the range by BATCH_SIZE yields the start of every batch,
        # including the final partial one, without manual ceil arithmetic.
        for start_idx in tqdm(range(0, total_rows, BATCH_SIZE), desc="Processing batches"):
            end_idx = start_idx + BATCH_SIZE

            batched_texts = dataset['text'][start_idx:end_idx].tolist()
            batched_labels = dataset['label'][start_idx:end_idx].tolist()

            batched_predictions = pipe(batched_texts)

            for i, (pred, actual_label) in enumerate(zip(batched_predictions, batched_labels)):
                hate = pred["label"]
                # The pipeline score is the confidence of the predicted label;
                # flip it so the written value always refers to 'HATE'.
                toxic_score = pred["score"] if hate == 'HATE' else 1 - pred["score"]
                fout.write(f"{start_idx + i}\t{hate}\t{actual_label}\t{toxic_score}\n")

def calculate_matching(df, target_label, comparison_value):
    """Count correctly classified rows among those whose actual label is ``target_label``.

    Column ``2`` of ``df`` is read as the actual label and column ``1`` as the
    predicted label.

    * For ``target_label == 'HATE'`` a row counts as matching when its
      prediction equals ``comparison_value``.
    * For ``target_label == 'NOT-HATE'`` a row counts as matching when its
      prediction differs from ``comparison_value``.

    Args:
        df: DataFrame with predictions in column ``1`` and actual labels in
            column ``2``.
        target_label: Either ``'HATE'`` or ``'NOT-HATE'``.
        comparison_value: Predicted-label value to compare against.

    Returns:
        A ``(matching, total)`` tuple where ``total`` is the number of rows
        whose actual label is ``target_label``.

    Raises:
        ValueError: If ``target_label`` is neither ``'HATE'`` nor ``'NOT-HATE'``.
    """
    # Reject unknown labels up front; previously this fell through both
    # branches and failed with an UnboundLocalError on the return statement.
    if target_label not in ('HATE', 'NOT-HATE'):
        raise ValueError(
            f"target_label must be 'HATE' or 'NOT-HATE', got {target_label!r}"
        )

    subset_df = df[df[2] == target_label]
    if target_label == 'HATE':
        matching = (subset_df[1] == comparison_value).sum()
    else:
        matching = (subset_df[1] != comparison_value).sum()
    return matching, len(subset_df)


# Tab-separated prediction file with no header row; calculate_matching reads
# column 1 as the predicted label and column 2 as the actual label.
results_path = '/data/jzheng36/Fine-Tune-Llama2-/tmp/hatemoderate.csv_cardiffnlp-twitter-roberta-base-hate-latest_lr=5e-06_epoch=3_hatemoderate.csv'
df = pd.read_csv(results_path, sep='\t', header=None)

# Per-class counts: a HATE row is correct when predicted 'HATE', a NOT-HATE
# row is correct when predicted anything other than 'HATE'.
matching_hate, total_hate = calculate_matching(df, 'HATE', 'HATE')
matching_nothate, total_nothate = calculate_matching(df, 'NOT-HATE', 'HATE')

# Each percentage is computed immediately before it is reported.
hate_pct = matching_hate / total_hate * 100
print(f"{hate_pct:.2f}% of accuracy of HATE cases.")

nothate_pct = matching_nothate / total_nothate * 100
print(f"{nothate_pct:.2f}% of accuracy of NOT-HATE cases.")

overall_pct = (matching_hate + matching_nothate) / len(df) * 100
print(f"{overall_pct:.2f}% of accuracy of all cases.")

print("\n")



53.26% of accuracy of HATE cases.
67.87% of accuracy of NOT-HATE cases.
59.93% of accuracy of all cases.


