In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV read later and the model
# directory written at the end of the notebook both live under
# /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
import pandas as pd

# Load the training CSV from the mounted Drive folder.
df = pd.read_csv("/content/drive/MyDrive/toxic_data/train.csv")

# Preview the first rows. Leaving the frame as the cell's last expression lets
# the notebook render it as a rich table; print() only emits the plain-text
# repr, which is harder to read for frames with long text columns.
df.head()

In [None]:
# Columns that are summed row-wise to derive the single binary target.
label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# A row gets label 1 when the sum over the label columns is positive,
# and 0 otherwise.
flag_total = df[label_cols].sum(axis=1)
df["label"] = flag_total.gt(0).astype(int)

# Keep only the text and the binary target for the downstream steps.
df_binary = df.loc[:, ["comment_text", "label"]]

In [None]:
from datasets import Dataset

# Convert the pandas frame into a Dataset and split it in one step: 20% of the
# rows go to the "test" split, and the fixed seed keeps the split repeatable.
dataset = Dataset.from_pandas(df_binary).train_test_split(test_size=0.2, seed=42)

In [None]:
from transformers import RobertaTokenizer

# Load the pretrained tokenizer for the "roberta-base" checkpoint; it is used
# by tokenize() and later handed to the Trainer and saved with the model.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def tokenize(batch):
    """Tokenize the ``comment_text`` entries of a batch.

    The module-level ``tokenizer`` is called with truncation enabled and
    ``padding="max_length"`` at ``max_length=128``.

    Args:
        batch: Mapping with a ``"comment_text"`` entry holding the texts to
            encode (the function is applied with ``batched=True``).

    Returns:
        Whatever the tokenizer call returns for the given texts.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(batch["comment_text"], **encode_options)

# Apply tokenize() to the splits in batches, then drop the raw text column,
# which is no longer needed once the encoded fields exist.
tokenized_dataset = (
    dataset
    .map(tokenize, batched=True)
    .remove_columns(["comment_text"])
)

In [None]:
from transformers import RobertaForSequenceClassification
from peft import get_peft_model, LoraConfig, TaskType

# LoRA adapter settings for a sequence-classification task: rank 8, alpha 16,
# dropout 0.1, attached to the modules named "query" and "value".
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules=["query", "value"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
)

# Load roberta-base with a two-label classification head, then wrap it with
# the LoRA configuration above.
base_model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
model = get_peft_model(base_model, peft_config)

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration: evaluate and checkpoint once per epoch, three epochs,
# batches of 16 per device, learning rate 2e-5, and no external reporting
# integration.
training_options = {
    "output_dir": "./results",
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 16,
    "num_train_epochs": 3,
    "report_to": "none",
}
training_args = TrainingArguments(**training_options)

import inspect

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword; older releases only accept
# `tokenizer`. Pick whichever keyword the installed Trainer exposes so this
# cell runs without a deprecation warning today and keeps working once the
# old keyword is removed.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# Train on the "train" split and evaluate on the held-out "test" split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    **{_tokenizer_kwarg: tokenizer},
)

In [None]:
# Run fine-tuning with the Trainer built in the previous cell. The call is
# left as the cell's last expression so its return value is shown as output.
trainer.train()

In [None]:
# Persist the trained model and the tokenizer side by side in one Drive folder.
save_dir = "/content/drive/MyDrive/models/lora-toxic-roberta"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)