In [None]:
from google.colab import drive
# Attach Google Drive at the Colab mount point used by the data and model paths below.
mount_point = "/content/drive"
drive.mount(mount_point)

Mounted at /content/drive


In [None]:
import pandas as pd

# Read the training CSV from the mounted Drive folder and preview the first rows.
train_csv_path = "/content/drive/MyDrive/toxic_data/train.csv"
df = pd.read_csv(train_csv_path)
print(df.head())

                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  


In [None]:
# Per-category flag columns that are collapsed into a single binary target.
label_cols = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# A row gets label 1 when the sum of its flags is positive, otherwise 0.
flag_total = df[label_cols].sum(axis=1)
df["label"] = flag_total.gt(0).astype(int)

# Keep only the text and the binary target for the downstream dataset.
binary_columns = ["comment_text", "label"]
df_binary = df[binary_columns]

In [None]:
from datasets import Dataset

# Wrap the two-column frame as a Hugging Face Dataset, then hold out 20% as the
# "test" split; the fixed seed keeps the split reproducible.
full_dataset = Dataset.from_pandas(df_binary)
dataset = full_dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
from transformers import RobertaTokenizer

# Tokenizer for the same pretrained checkpoint the classifier is loaded from.
tokenizer_checkpoint = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(tokenizer_checkpoint)

def tokenize(batch, max_length=128):
    """Tokenize the ``comment_text`` field of a batch with the module-level tokenizer.

    Args:
        batch: Mapping with a ``"comment_text"`` entry holding the texts to
            encode (this is what ``Dataset.map(..., batched=True)`` passes in).
        max_length: Maximum number of tokens kept per text; longer inputs are
            truncated. Defaults to 128.

    Returns:
        Whatever the tokenizer call returns for the batch.
    """
    # No static padding here: padding every example to ``max_length`` at map
    # time stores and processes padding tokens for every short comment. The
    # Trainer is given the tokenizer, so its default collator pads each batch
    # to that batch's longest sequence instead.
    return tokenizer(batch["comment_text"], truncation=True, max_length=max_length)

# Tokenize both splits in batches, then drop the raw text column so only the
# tokenizer outputs and the label remain.
tokenized_dataset = (
    dataset
    .map(tokenize, batched=True)
    .remove_columns(["comment_text"])
)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map:   0%|          | 0/127656 [00:00<?, ? examples/s]

Map:   0%|          | 0/31915 [00:00<?, ? examples/s]

In [None]:
from transformers import RobertaForSequenceClassification
from peft import get_peft_model, LoraConfig, TaskType

# Pretrained roberta-base encoder with a two-class sequence-classification head.
base_model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

# LoRA adapter settings: rank 8, alpha 16, dropout 0.1, applied to the modules
# named "query" and "value", configured for sequence classification.
lora_settings = dict(
    task_type=TaskType.SEQ_CLS,
    target_modules=["query", "value"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
)
peft_config = LoraConfig(**lora_settings)

# Wrap the base model so training goes through the PEFT model.
model = get_peft_model(base_model, peft_config)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration: evaluate and checkpoint once per epoch, three
# epochs at lr 2e-5 with a per-device train batch size of 16, and no external
# experiment logger.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    report_to="none",
    # The PEFT wrapper hides the base model's forward arguments, so Trainer
    # cannot discover the label argument on its own and would fall back to an
    # empty list ("No label_names provided ..." warning). Name it explicitly.
    label_names=["labels"],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is the
    # replacement and still supplies the default padding collator.
    processing_class=tokenizer,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Run fine-tuning; the result is left as the cell's last expression so the
# training summary is displayed.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss
1,0.107,0.110215
2,0.1037,0.105768
3,0.1024,0.105525


TrainOutput(global_step=23937, training_loss=0.11374100938787356, metrics={'train_runtime': 1770.3542, 'train_samples_per_second': 216.323, 'train_steps_per_second': 13.521, 'total_flos': 2.545167494438093e+16, 'train_loss': 0.11374100938787356, 'epoch': 3.0})

In [None]:
# Write the PEFT model and the tokenizer files into the same Drive folder.
save_dir = "/content/drive/MyDrive/models/lora-toxic-roberta"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('/content/drive/MyDrive/models/lora-toxic-roberta/tokenizer_config.json',
 '/content/drive/MyDrive/models/lora-toxic-roberta/special_tokens_map.json',
 '/content/drive/MyDrive/models/lora-toxic-roberta/vocab.json',
 '/content/drive/MyDrive/models/lora-toxic-roberta/merges.txt',
 '/content/drive/MyDrive/models/lora-toxic-roberta/added_tokens.json')