# **Step 1: Install the dependencies and import the necessary libraries**

In [3]:
pip install torch transformers datasets pandas tqdm scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [4]:
import torch
print("CUDA Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))
    print("GPU Count:", torch.cuda.device_count())


CUDA Available: True
GPU Name: Tesla T4
GPU Count: 2


In [5]:
!pip install datasets transformers torch evaluate emoji contractions textblob nltk


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (13 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [

In [6]:
import torch
import re
import emoji
import contractions
import numpy as np
from datasets import load_dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
import nltk

# Download necessary NLTK files
nltk.download("stopwords")
nltk.download("wordnet")


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# **Step 2: Load Jigsaw Toxicity Dataset**

In [7]:
# Load Jigsaw dataset from Hugging Face
dataset = load_dataset("thesofakillers/jigsaw-toxic-comment-classification-challenge")  # Alternative to missing "jigsaw_toxic_comment_classification"


README.md:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/68.8M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/60.4M [00:00<?, ?B/s]

test_labels.csv:   0%|          | 0.00/4.98M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/159571 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/306328 [00:00<?, ? examples/s]

In [8]:
# ✅ Use only 5000 samples for training and 1500 for testing
train_data = dataset["train"].select(range(5000))
test_data = dataset["test"].select(range(1500))  # Next 1500 samples for testing

# **Step 3: Define Label Mapping (Safe, Neutral, Harmful)**

In [9]:
def map_labels(example):
    labels = [
        int(example.get("identity_hate", 0) or 0),
        int(example.get("insult", 0) or 0),
        int(example.get("obscene", 0) or 0),  # Included in Harmful
        int(example.get("severe_toxic", 0) or 0),
        int(example.get("threat", 0) or 0),
        int(example.get("toxic", 0) or 0),
    ]

    if sum(labels) == 0:
        return {"label": 0}  # Safe
    elif labels[3] == 1 or labels[4] == 1 or labels[2] == 1:  # severe_toxic, threat, obscene
        return {"label": 2}  # Harmful
    else:
        return {"label": 1}  # Neutral

# Apply mapping
train_data = train_data.map(map_labels)
test_data = test_data.map(map_labels)


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

# **Step 4: Define Preprocessing Functions python**


In [10]:
import re
import contractions
import emoji

# Function to clean text (handle None values)
def clean_text(text):
    if text is None:
        return ""  # Replace None with empty string

    text = text.lower()  # Convert to lowercase
    text = re.sub(r"\s+", " ", text)  # Remove extra spaces
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    return text.strip()

# Function to fix repeated characters
def remove_repeated_chars(text):
    return re.sub(r"(.)\1{2,}", r"\1\1", text)  # Keeps max 2 repetitions

# Function to convert emojis to text
def convert_emojis(text):
    return emoji.demojize(text, delimiters=(" ", " "))  # Converts emojis to words

# Function to apply all preprocessing steps
def preprocess_text(example):
    example["comment_text"] = clean_text(example.get("comment_text", ""))  # Handle None safely
    example["comment_text"] = remove_repeated_chars(example["comment_text"])
    example["comment_text"] = contractions.fix(example["comment_text"])
    example["comment_text"] = convert_emojis(example["comment_text"])
    return example

# Apply preprocessing
train_data = train_data.map(preprocess_text)
test_data = test_data.map(preprocess_text)


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

# **Step 5: Tokenization Using RoBERTa**

In [11]:
# Load RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Tokenization function
def tokenize_function(example):
    return tokenizer(example["comment_text"], truncation=True, padding="max_length", max_length=512)

# Apply tokenization
train_data = train_data.map(tokenize_function, batched=True)
test_data = test_data.map(tokenize_function, batched=True)

# Set format for PyTorch
#dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_data.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

# **Step 6: Define Model & Training Arguments**

In [12]:
# Load pre-trained RoBERTa model for classification
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=3)

# Define training arguments
training_args = TrainingArguments(
    output_dir="/kaggle/working/results",  # ✅ Save in the working directory
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="/kaggle/working/logs",  # ✅ Save logs properly
    logging_steps=100,
    load_best_model_at_end=True,
    report_to="none",
    metric_for_best_model="accuracy",
)


# Data collator (handles dynamic padding)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# **Step 7: Train the Model**

In [13]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding

# Define data collator (since tokenizer is deprecated in Trainer)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,  # Ensure dataset is split into train/test
    eval_dataset=test_data,
    tokenizer=tokenizer,  # Still useful for padding/truncation
    data_collator=data_collator,  # Handles batching and padding
    compute_metrics=compute_metrics  # Function to calculate accuracy, F1-score, etc.
)


  trainer = Trainer(


In [14]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3426,0.405422,0.778667,0.875562
2,0.255,0.430276,0.762,0.864926
3,0.1514,0.551095,0.806667,0.892989




TrainOutput(global_step=939, training_loss=0.263761249213173, metrics={'train_runtime': 925.4328, 'train_samples_per_second': 16.209, 'train_steps_per_second': 1.015, 'total_flos': 3946701265920000.0, 'train_loss': 0.263761249213173, 'epoch': 3.0})

# **Step 8: Save the model**

In [15]:
model.save_pretrained("/kaggle/working/roberta")
tokenizer.save_pretrained("/kaggle/working/roberta")


('/kaggle/working/roberta/tokenizer_config.json',
 '/kaggle/working/roberta/special_tokens_map.json',
 '/kaggle/working/roberta/vocab.json',
 '/kaggle/working/roberta/merges.txt',
 '/kaggle/working/roberta/added_tokens.json')

In [16]:
results = trainer.evaluate()
print(results)




{'eval_loss': 0.5510950684547424, 'eval_accuracy': 0.8066666666666666, 'eval_f1': 0.8929889298892989, 'eval_runtime': 25.7481, 'eval_samples_per_second': 58.257, 'eval_steps_per_second': 3.651, 'epoch': 3.0}


In [20]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch

# Load tokenizer and fine-tuned model
model_path = "/kaggle/working/roberta"  # Update if saved elsewhere
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained(model_path)
model.eval()  # Set to evaluation mode


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [40]:
def classify_comment(comment):
    inputs = tokenizer(comment, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    
    label_map = {0: "Safe", 1: "Neutral", 2: "Harmful"}  # Update if using more classes
    return label_map[predicted_class]

# Example usage
test_comment = "You're so stupid, just delete your channel already. No one wants to see this garbage."
print("Prediction:", classify_comment(test_comment))


Prediction: Neutral
