In [1]:
!pip install datasets
from datasets import load_dataset, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
import torch
import string
import re
import nltk
import pandas as pd
import json
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import random
# Load the dataset from the Hugging Face Hub. The cell output further down
# shows that it provides a `train` and a `dev` split.
dataset = load_dataset("Jinyan1/COLING_2025_MGT_en")
# NOTE: `Dataset` is already imported in the first cell; this re-import is
# redundant but harmless.
from datasets import Dataset

def sample_fixed_balanced_dataset(dataset, total_samples=300000, target_label_col="label", seed=42):
    """Draw a reproducible, class-balanced subset of ``dataset['train']``.

    Every distinct label contributes the same number of rows
    (``total_samples // number_of_labels``), sampled without replacement.

    Args:
        dataset: Mapping with a ``'train'`` entry that yields dict-like examples.
        total_samples: Target size of the subset. The actual size is
            ``(total_samples // n_labels) * n_labels``.
        target_label_col: Name of the column holding the class label; values
            are converted with ``int()`` before grouping.
        seed: Seed for both the per-class sampling and the final shuffle.
            Pass ``None`` to get a different subset on every call (the
            previous behaviour of the sampling step).

    Returns:
        A new ``Dataset`` built from the sampled examples, shuffled.

    Raises:
        ValueError: If the train split is empty, or if any label has fewer
            examples than the per-class quota.
    """
    train_data = dataset['train']

    # Group samples by label
    label_to_samples = {}
    for example in train_data:
        label_to_samples.setdefault(int(example[target_label_col]), []).append(example)

    # Without this guard an empty split would surface as a ZeroDivisionError below.
    if not label_to_samples:
        raise ValueError("The train split is empty; there is nothing to sample.")

    # Determine number of samples per class
    samples_per_class = total_samples // len(label_to_samples)

    # A private RNG keeps the draw reproducible without touching (or depending
    # on) the global `random` state. This matters when training is resumed from
    # a checkpoint and the subset has to be the same one as before.
    rng = random.Random(seed)

    # Sample from each class
    balanced_data = []
    for label, samples in label_to_samples.items():
        if len(samples) < samples_per_class:
            raise ValueError(f"Not enough examples for label {label}. Reduce total_samples.")
        balanced_data.extend(rng.sample(samples, samples_per_class))

    # Shuffle so the classes are interleaved, and return as a new Dataset
    return Dataset.from_list(balanced_data).shuffle(seed=seed)

# Draw a class-balanced subset of the train split using the function's
# defaults (300,000 rows in total, divided evenly across the labels).
train_data = sample_fixed_balanced_dataset(dataset=dataset)

# Compiled once when the cell runs rather than on every call.
# A mention is '@' plus any run of non-whitespace, terminated by whitespace OR
# by the end of the text (the end-of-text case was previously missed).
_MENTIONS_PATTERN = re.compile(r'@\S*(?:\s|$)')
_LINKS_PATTERN = re.compile(r'https?:\/\/[^\s\n\r]+')
_MULTI_SPACES_PATTERN = re.compile(r'\s+')


def preprocess_text(text):
    """Clean text by removing mentions, links, non-ASCII characters, and extra spaces.

    Steps, in order:
      1. Replace ``@mention`` tokens (including one at the very end of the
         text) with a space.
      2. Replace ``http://`` / ``https://`` URLs with a space.
      3. Drop every character whose code point is 128 or above.
      4. Collapse whitespace runs to a single space and strip both ends.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string (possibly empty).
    """
    text = _MENTIONS_PATTERN.sub(' ', text)
    text = _LINKS_PATTERN.sub(' ', text)
    text = ''.join(char for char in text if ord(char) < 128)  # Keep ASCII only
    text = _MULTI_SPACES_PATTERN.sub(' ', text).strip()
    return text

def preprocess_dataset(dataset):
    """Return ``dataset`` with its ``text`` column run through ``preprocess_text``."""
    def _clean_example(example):
        # Only the "text" field is returned, so `map` overwrites that column.
        cleaned = preprocess_text(example["text"])
        return {"text": cleaned}

    return dataset.map(_clean_example)

# Clean the text column of the balanced training subset and of the dev split.
# No row limit is applied here: the `Map` progress output of this cell shows
# 300000 training rows and 261758 dev rows being processed.
train_data = preprocess_dataset(train_data)
dev_data = preprocess_dataset(dataset['dev'])

# Load the RoBERTa tokenizer; `tokenize_function` below reads it as a global.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the notebook-level ``tokenizer``.

    Sequences are truncated and padded to a fixed length of 512 tokens.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    batch_texts = examples["text"]
    return tokenizer(batch_texts, **encode_options)

# Tokenize both splits. `batched=True` means `tokenize_function` receives a
# batch of examples per call (its `examples["text"]` is a list of strings).
train_data = train_data.map(tokenize_function, batched=True)
dev_data = dev_data.map(tokenize_function, batched=True)

# Expose only the listed columns, as PyTorch tensors, when the datasets are indexed.
train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
dev_data.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

README.md:   0%|          | 0.00/588 [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/287M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/286M [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/246M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/610767 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/261758 [00:00<?, ? examples/s]

Map:   0%|          | 0/300000 [00:00<?, ? examples/s]

Map:   0%|          | 0/261758 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



Map:   0%|          | 0/300000 [00:00<?, ? examples/s]

Map:   0%|          | 0/261758 [00:00<?, ? examples/s]

In [3]:
# Display the loaded DatasetDict (split names, feature columns, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'source', 'sub_source', 'lang', 'model', 'label', 'text'],
        num_rows: 610767
    })
    dev: Dataset({
        features: ['id', 'source', 'sub_source', 'lang', 'model', 'label', 'text'],
        num_rows: 261758
    })
})

In [None]:
import torch.nn as nn
from transformers import RobertaModel, RobertaPreTrainedModel
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments


class CustomRobertaForSequenceClassification(RobertaPreTrainedModel):
    """RoBERTa encoder followed by a small MLP classification head.

    The head consumes the encoder's pooled output and is laid out as
    ``Linear(hidden_size, 256) -> ReLU -> Dropout(0.3) -> Linear(256, num_labels)``.
    Attribute names and the order of the head layers determine the parameter
    names stored in checkpoints (e.g. ``classifier.0.weight`` and
    ``classifier.3.weight``), so they must not be renamed or reordered.
    """

    def __init__(self, config):
        """Build the encoder and the classification head from ``config``."""
        super().__init__(config)
        self.num_labels = config.num_labels
        self.roberta = RobertaModel(config)
        # Regularises the pooled representation before it enters the head.
        self.dropout = nn.Dropout(0.3)
        head_layers = [
            nn.Linear(config.hidden_size, 256),  # hidden projection
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, config.num_labels),  # per-class logits
        ]
        self.classifier = nn.Sequential(*head_layers)
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Run the encoder and head; compute cross-entropy loss if ``labels`` is given.

        Returns:
            A dict with keys ``"loss"`` (``None`` when ``labels`` is ``None``)
            and ``"logits"``.
        """
        encoder_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        features = self.dropout(encoder_out.pooler_output)
        logits = self.classifier(features)

        # Inference path: nothing to compare against, so no loss is produced.
        if labels is None:
            return {"loss": None, "logits": logits}

        criterion = nn.CrossEntropyLoss()
        return {"loss": criterion(logits, labels), "logits": logits}


# Build a 2-label config and load the pre-trained "roberta-base" weights into
# the custom class. Per the load warning printed below this cell, the
# classifier head and the pooler are not in that checkpoint and start from a
# fresh initialisation.
from transformers import RobertaConfig
config = RobertaConfig.from_pretrained("roberta-base", num_labels=2)
model = CustomRobertaForSequenceClassification.from_pretrained("roberta-base", config=config)

# Define training arguments (3 epochs, batch size 8 per device for both
# training and evaluation, evaluation once per epoch, outputs written to the
# Kaggle working directory).
training_args = TrainingArguments(
    output_dir="/kaggle/working/",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="none",
    save_total_limit=2,
)

# Initialize Trainer. No `compute_metrics` is supplied; the results table
# printed below this cell contains only training and validation loss.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=dev_data,
)

# Continue training from a previously saved checkpoint directory.
# NOTE(review): resuming presumably expects `train_data` to match what the
# checkpoint was trained on -- confirm that the subset is drawn reproducibly.
trainer.train(resume_from_checkpoint="/kaggle/input/18500v2/transformers/default/1/checkpoint-18500")

# Evaluate on dev set
dev_results = trainer.evaluate()
print("Dev Results:", dev_results)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of CustomRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.0.bias', 'classifier.0.weight', 'classifier.3.bias', 'classifier.3.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
  checkpoint_rng_state = torch.load(rng_file)
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,0.1398,0.259685


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


In [3]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score
import torch
import json
from transformers import RobertaTokenizer
from torch.utils.data import DataLoader, TensorDataset
import re
from transformers import DistilBertTokenizer, DistilBertConfig, Trainer, TrainingArguments

# Load test data from JSONL
def load_test_data_from_jsonl(jsonl_file, tokenizer, max_length=512):
    """Read a labelled JSON Lines file and return it as a tokenized ``TensorDataset``.

    Each non-blank line must be a JSON object with a ``"text"`` and a
    ``"label"`` field. Texts are cleaned with ``preprocess_text`` before
    tokenization.

    Args:
        jsonl_file: Path to the ``.jsonl`` file.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, padding="max_length",
            max_length=max_length, return_tensors="pt")``; its result must
            provide ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_length: Length every sequence is truncated/padded to.

    Returns:
        ``TensorDataset(input_ids, attention_mask, labels)``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"text"`` or ``"label"``.
    """
    texts = []
    labels = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(jsonl_file, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. extra newlines at the end of the file)
                # are not records; json.loads("") would raise.
                continue
            item = json.loads(line)
            texts.append(preprocess_text(item["text"]))
            labels.append(item["label"])

    tokenized_data = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt"
    )
    # Named so that it does not shadow the notebook-level `dataset` variable.
    tensor_dataset = TensorDataset(
        tokenized_data["input_ids"],
        tokenized_data["attention_mask"],
        torch.tensor(labels)
    )
    return tensor_dataset

# Load tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Path to test JSONL file
test_jsonl_path = "/kaggle/input/nlp-roberta-test/test_set_en_with_label (1).jsonl"
test_dataset = load_test_data_from_jsonl(test_jsonl_path, tokenizer)


# Load the fine-tuned model from a saved checkpoint directory.
# NOTE: this cell relies on `CustomRobertaForSequenceClassification` and
# `preprocess_text` having been defined by earlier cells in the same kernel.
from transformers import RobertaConfig

config = RobertaConfig.from_pretrained("/kaggle/input/20000v2/transformers/default/1/checkpoint-20000")
model = CustomRobertaForSequenceClassification.from_pretrained("/kaggle/input/20000v2/transformers/default/1/checkpoint-20000", config=config)

# Move the model to the GPU when one is available and switch to eval mode
# (disables dropout).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Create DataLoader
test_loader = DataLoader(test_dataset, batch_size=8)

# Predictions and gold labels accumulated over all batches.
all_preds = []
all_labels = []

# Inference only: gradients are not needed.
with torch.no_grad():
    for batch in test_loader:
        # Tuple order matches the TensorDataset built in load_test_data_from_jsonl.
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Labels are not passed to the model, so the returned "loss" is None.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs["logits"]
        # Predicted class = index of the largest logit.
        preds = torch.argmax(logits, dim=-1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate metrics. precision_score / recall_score are called with their
# defaults, which for binary labels score the positive class (label 1).
precision = precision_score(all_labels, all_preds)
recall = recall_score(all_labels, all_preds)
accuracy = accuracy_score(all_labels, all_preds)
macro_f1 = f1_score(all_labels, all_preds, average="macro")
micro_f1 = f1_score(all_labels, all_preds, average="micro")
# Rows are true labels and columns are predicted labels.
conf_matrix = confusion_matrix(all_labels, all_preds)

# Print results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Macro F1 Score: {macro_f1:.4f}")
print(f"Micro F1 Score: {micro_f1:.4f}")
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.7229
Precision: 0.6703
Recall: 0.9410
Macro F1 Score: 0.7000
Micro F1 Score: 0.7229
Confusion Matrix:
[[16503 18172]
 [ 2315 36951]]
