In [None]:
!pip install datasets
from datasets import load_dataset, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
import torch
import string
import re
import nltk
import pandas as pd
import json
nltk.download('stopwords')
from nltk.corpus import stopwords

In [2]:
# Load training and validation datasets
dataset = load_dataset("Jinyan1/COLING_2025_MGT_en")

# Define preprocessing function
def preprocess_data(data):
    stop_words = set(stopwords.words('english'))
    text = ' '.join(data)
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Apply preprocessing to datasets
def preprocess_examples(example):
    example['text'] = preprocess_data([example['text']])
    return example

train_data = dataset['train'].map(preprocess_examples)
dev_data = dataset['dev'].map(preprocess_examples)

# Load RoBERTa tokenizer and define tokenization
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

train_data = train_data.map(tokenize_function, batched=True)
dev_data = dev_data.map(tokenize_function, batched=True)

# Set format for PyTorch
train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
dev_data.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])



Map:   0%|          | 0/610767 [00:00<?, ? examples/s]



Map:   0%|          | 0/610767 [00:00<?, ? examples/s]

Map:   0%|          | 0/261758 [00:00<?, ? examples/s]

In [3]:
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments

# Initialize RoBERTa model for classification
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

# Define training arguments with checkpoint management
training_args = TrainingArguments(
    output_dir="/kaggle/working/",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="none",            # Save checkpoint every 10,000 steps
    save_total_limit=2            # Keep only the last 2 checkpoints, delete older ones automatically
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=dev_data,
)

# Train the model, resuming from the specified checkpoint if needed
trainer.train(resume_from_checkpoint="/kaggle/input/205000/transformers/default/1/checkpoint-205000")

# Evaluate on dev set
dev_results = trainer.evaluate()
print("Dev Results:", dev_results)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
	per_device_train_batch_size: 8 (from args) != 4 (from trainer_state.json)
  checkpoint_rng_state = torch.load(rng_file)
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
3,0.0854,0.258575


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(devic

Dev Results: {'eval_loss': 0.25857505202293396, 'eval_runtime': 4134.5944, 'eval_samples_per_second': 63.309, 'eval_steps_per_second': 3.957, 'epoch': 3.0}


In [1]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score
import torch
import json
from transformers import RobertaTokenizer
from torch.utils.data import DataLoader, TensorDataset
import re
from transformers import DistilBertTokenizer, DistilBertConfig, Trainer, TrainingArguments

# Load test data from JSONL
def load_test_data_from_jsonl(jsonl_file, tokenizer, max_length=512):
    texts = []
    labels = []
    with open(jsonl_file, "r") as f:
        for line in f:
            item = json.loads(line.strip())
            texts.append(preprocess_data(item["text"]))
            labels.append(item["label"])
    
    tokenized_data = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt"
    )

    dataset = TensorDataset(
        tokenized_data["input_ids"], 
        tokenized_data["attention_mask"], 
        torch.tensor(labels)
    )
    return dataset

# Path to test JSONL file
test_jsonl_path = "/kaggle/input/nlp-roberta-test/test_set_en_with_label (1).jsonl"
test_dataset = load_test_data_from_jsonl(test_jsonl_path, tokenizer)


# Load the model
from transformers import RobertaConfig

config = RobertaConfig.from_pretrained("/kaggle/input/229038/transformers/default/1/checkpoint-229038")
model = RobertaForSequenceClassification.from_pretrained("/kaggle/input/229038/transformers/default/1/checkpoint-229038", config=config)

# Evaluate the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Create DataLoader
test_loader = DataLoader(test_dataset, batch_size=8)

all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs["logits"]
        preds = torch.argmax(logits, dim=-1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate metrics
precision = precision_score(all_labels, all_preds)
recall = recall_score(all_labels, all_preds)
accuracy = accuracy_score(all_labels, all_preds)
macro_f1 = f1_score(all_labels, all_preds, average="macro")
micro_f1 = f1_score(all_labels, all_preds, average="micro")
conf_matrix = confusion_matrix(all_labels, all_preds)

# Print results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Macro F1 Score: {macro_f1:.4f}")
print(f"Micro F1 Score: {micro_f1:.4f}")
print("Confusion Matrix:")
print(conf_matrix)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



Accuracy: 0.6457
Precision: 0.6023
Recall: 0.9799
Macro F1 Score: 0.5801
Micro F1 Score: 0.6457
Confusion Matrix:
[[ 9266 25409]
 [  791 38475]]
