In [None]:
# Authenticate this notebook session with the Hugging Face Hub.
# A later cell pushes the fine-tuned model to the Hub, which needs credentials.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
%%capture

!pip install transformers
!pip install datasets
!pip install evaluate

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
import os

In [None]:
# Load the train and test splits from their JSON-lines files.
data_files = {
    "train": "/content/train.jsonl",
    "test": "/content/test.jsonl",
}
dataset = load_dataset("json", data_files=data_files)


In [None]:
# Show the loaded dataset object as the cell output (quick sanity check of
# what was read from the two files).
dataset

In [None]:
# Hub id of the pretrained checkpoint; reused below when the classification
# model is instantiated, so tokenizer and model come from the same checkpoint.
checkpoint = "google-bert/bert-base-uncased"

# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Class names as they appear in the `label_text` column, mapped to integer ids.
mapDict = {"No hate speech": 0, "Hate speech": 1}


def transform_labels(label):
    """Convert a batch's `label_text` strings into integer class ids.

    Args:
        label: A batch mapping with a `label_text` entry holding an iterable
            of class-name strings.

    Returns:
        A dict with a single `label` key whose value is the list of integer
        ids, in the same order as the input names.

    Raises:
        KeyError: If a class name is not present in `mapDict`.
    """
    return {"label": [mapDict[name] for name in label["label_text"]]}


def tokenize_function(example):
    """Tokenize the `text` field of a batch with the notebook's tokenizer.

    Padding is deliberately NOT applied here. This function is run through a
    batched `Dataset.map`, so `padding=True` would pad every example to the
    longest sequence of its whole map batch. The `DataCollatorWithPadding`
    created later in the notebook already pads each training/eval batch to
    its own longest sequence, which keeps batches as short as possible.

    Args:
        example: A batch mapping with a `text` entry (the raw input strings).

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length and left unpadded.
    """
    return tokenizer(example["text"], truncation=True)


In [None]:
# Tokenize the text, then attach integer labels; both passes run in batched mode.
tokenized_datasets = (
    dataset
    .map(tokenize_function, batched=True)
    .map(transform_labels, batched=True)
)

# Collator handed to the Trainer below; it pads the examples of each batch.
data_collector = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers import TrainingArguments

# Directory that receives the checkpoints written during training.
output_dir = "./best-hate-speech-test"

training_args = TrainingArguments(
    # Output locations and experiment-tracker reporting (none).
    output_dir=output_dir,
    logging_dir="./logs",
    report_to="none",
    # Optimisation settings.
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    # Evaluate and checkpoint on the same 200-step cadence, keep at most two
    # checkpoints, and reload the one with the best accuracy when training ends.
    eval_strategy="steps",
    eval_steps=200,
    save_steps=200,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

In [None]:
from transformers import Trainer, AutoModelForSequenceClassification

# Derive the classification head from the label mapping instead of hard-coding
# the class count, and store the class names in the model config so that
# inference (e.g. the text-classification pipeline at the end of the notebook)
# reports "Hate speech" / "No hate speech" rather than LABEL_0 / LABEL_1.
id2label = {index: name for name, index in mapDict.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(mapDict),
    id2label=id2label,
    label2id=dict(mapDict),
)

# Opt out of Weights & Biases logging via environment variables.
# The variable transformers' W&B integration looks at is `WANDB_DISABLED`
# (with a trailing "D"); the previous spelling `WANDB_DISABLE` had no effect.
# `report_to="none"` in the TrainingArguments already disables reporting, so
# these are a second line of defence only.
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "offline"

In [None]:
import numpy as np
import evaluate

# Accuracy metric object shared by every evaluation pass.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    Args:
        eval_pred: A pair that unpacks into (logits, reference labels).

    Returns:
        Whatever `metric.compute` returns for the arg-max class predictions
        compared against the reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

In [None]:
import inspect

# Recent transformers releases accept the tokenizer under the keyword
# `processing_class` and deprecate the older `tokenizer` keyword. Pick
# whichever name the installed Trainer actually exposes so this cell works
# on both sides of that rename.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# Wire together the model, training configuration, tokenized splits,
# padding collator and accuracy callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collector,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

In [None]:
# Run fine-tuning with the Trainer configured above; the call's return value
# is shown as the cell output.
trainer.train()

In [None]:
# Evaluate on the eval dataset given to the Trainer (the "test" split) and
# show the resulting metrics as the cell output.
trainer.evaluate()

In [None]:
# Persist the trained model locally. No path is passed, so the Trainer picks
# the location itself (presumably the configured `output_dir` - confirm).
trainer.save_model()

In [None]:
# Publish the fine-tuned model and its tokenizer to the Hub repository that
# the inference cell below loads from.
# `Trainer.push_to_hub` takes a commit message as its first positional
# argument, not a repository id, so passing the repo name there would not
# select the target repo. The model/tokenizer `push_to_hub` methods take the
# repo id directly.
HUB_REPO_ID = "juliagualdi/modelhate"
trainer.model.push_to_hub(HUB_REPO_ID)
tokenizer.push_to_hub(HUB_REPO_ID)

In [None]:
from transformers import pipeline

# Build a text-classification pipeline from the model repository on the Hub.
classifier = pipeline("text-classification", model="juliagualdi/modelhate")

In [None]:
# Smoke-test the published classifier on a clearly positive sentence; the
# pipeline's prediction is displayed as the cell output.
sample_text = "That’s awesome! I had a really positive experience there — everything was well organized, the atmosphere was great, and I would absolutely recommend it to others."
classifier(sample_text)