In [2]:
!pip install datasets
import os
import re
import torch
import numpy as np
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset, Dataset
from torch import nn
from sklearn.metrics import classification_report
from tqdm import tqdm
from datasets import load_dataset, Dataset



In [3]:
# ---------------------------------
# Preprocessing Functions
# ---------------------------------
# Patterns are compiled once at import time instead of on every call.
# A mention is '@' plus any run of non-whitespace characters; it no longer
# needs trailing whitespace, so a mention at the very end of the text is
# stripped as well.
_MENTION_PATTERN = re.compile(r'@\S*')
_LINK_PATTERN = re.compile(r'https?:\/\/[^\s\n\r]+')
_MULTI_SPACE_PATTERN = re.compile(r'\s+')


def preprocess_text(text):
    """Clean text by removing mentions, links, non-ASCII characters, and extra spaces.

    Steps, in order:
      1. Replace every ``@...`` token (up to the next whitespace or the end of
         the string) with a space.
      2. Replace every ``http://`` / ``https://`` URL with a space.
      3. Drop every character whose code point is >= 128.
      4. Collapse whitespace runs to a single space and strip both ends.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string (possibly empty).
    """
    text = _MENTION_PATTERN.sub(' ', text)
    text = _LINK_PATTERN.sub(' ', text)
    # Keep ASCII only; accented letters, emoji, etc. are dropped, not transliterated.
    text = ''.join(char for char in text if ord(char) < 128)
    text = _MULTI_SPACE_PATTERN.sub(' ', text).strip()
    return text

def preprocess_dataset(dataset):
    """Run ``preprocess_text`` over the "text" field of every example.

    The work is delegated to ``dataset.map``; whatever that call returns is
    returned unchanged.
    """
    def _clean_example(example):
        # Only the "text" key is produced; the mapper decides how it is merged.
        cleaned = preprocess_text(example["text"])
        return {"text": cleaned}

    return dataset.map(_clean_example)


In [4]:
# ---------------------------------
# Dataset Preparation Function
# ---------------------------------
def prepare_dataloader(dataset, tokenizer, max_length=512, batch_size=8, shuffle=False):
    """Tokenize ``dataset`` and wrap the result in a PyTorch DataLoader.

    Args:
        dataset: Object exposing ``map(fn, batched=True)`` and ``column_names``;
            its examples must carry a "text" field.
        tokenizer: Callable invoked on a batch of texts with ``padding``,
            ``max_length`` and ``truncation`` keyword arguments.
        max_length: Length every sequence is padded/truncated to.
        batch_size: Number of examples per batch.
        shuffle: Whether the DataLoader shuffles the examples.

    Returns:
        A ``DataLoader`` over the tokenized dataset, restricted to
        "input_ids", "attention_mask" and, when present, "label".
    """

    def _encode(batch):
        # Fixed-length encoding: pad short texts, cut long ones.
        return tokenizer(
            batch["text"],
            padding="max_length",
            max_length=max_length,
            truncation=True,
        )

    encoded = dataset.map(_encode, batched=True)

    # Labels are optional, so unlabeled data can go through the same path.
    has_labels = "label" in dataset.column_names
    torch_columns = ["input_ids", "attention_mask"] + (["label"] if has_labels else [])
    encoded.set_format(type="torch", columns=torch_columns)

    return DataLoader(encoded, batch_size=batch_size, shuffle=shuffle)


In [5]:
# ---------------------------------
# Custom Model Definition
# ---------------------------------
class TextClassifier(nn.Module):
    """Binary classifier: a base encoder followed by a two-layer head.

    The head is Linear(hidden_size, 32) -> ReLU -> Linear(32, 1) -> Sigmoid,
    so the forward pass returns probabilities in [0, 1] with shape (batch, 1).

    Args:
        base_model: Encoder module. Called with ``input_ids`` and
            ``attention_mask`` keyword arguments; element ``[0]`` of its
            output must be indexable as ``[:, 0]`` and yield a
            (batch, hidden_size) tensor.
        hidden_size: Width of the encoder output fed into the head. When
            ``None`` (default) it is read from ``base_model.config.hidden_size``
            and falls back to 768 if the encoder exposes no such attribute.
    """
    def __init__(self, base_model, hidden_size=None):
        super().__init__()
        if hidden_size is None:
            # Infer the width from the encoder instead of hard-coding 768,
            # so encoders of other sizes work without edits.
            config = getattr(base_model, "config", None)
            hidden_size = getattr(config, "hidden_size", 768)
        self.base_model = base_model
        self.fc1 = nn.Linear(hidden_size, 32)
        self.fc2 = nn.Linear(32, 1)
        self.activation = nn.ReLU()
        self.output = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return sigmoid probabilities of shape (batch, 1)."""
        # First output element, position 0 along the second axis
        # (presumably the first-token hidden state; confirm for the encoder used).
        base_output = self.base_model(input_ids=input_ids, attention_mask=attention_mask)[0][:, 0]
        x = self.fc1(base_output)
        x = self.activation(x)
        x = self.fc2(x)
        return self.output(x)


In [6]:
# ---------------------------------
# Training Function
# ---------------------------------
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader``: train if ``optimizer`` is given, else evaluate.

    Returns:
        (summed per-batch loss, number of predictions matching the labels at
        a 0.5 threshold).
    """
    training = optimizer is not None
    model.train(training)
    total_loss, total_correct = 0.0, 0
    # Only the training pass shows a progress bar.
    batches = tqdm(loader) if training else loader
    with torch.set_grad_enabled(training):
        for batch in batches:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            if training:
                optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Model output is (batch, 1); reshape the labels to match.
            loss = criterion(outputs, labels.float().unsqueeze(1))
            if training:
                loss.backward()
                optimizer.step()

            total_loss += loss.item()
            total_correct += ((outputs >= 0.5).int() == labels.unsqueeze(1)).sum().item()
    return total_loss, total_correct


def _save_checkpoint(state, path):
    """Write ``state`` to ``path`` via a temp file so an interrupted save cannot corrupt it."""
    tmp_path = path + ".tmp"
    torch.save(state, tmp_path)
    os.replace(tmp_path, path)


def train_model(model, train_loader, val_loader, learning_rate, num_epochs, checkpoint_dir, patience=3):
    """Train the model with checkpointing, resume support and early stopping.

    Each epoch runs one training pass and one validation pass, prints the
    metrics, then writes ``checkpoint_epoch_<n>.pt`` and
    ``latest_checkpoint.pt`` into ``checkpoint_dir``. Only the two most recent
    per-epoch checkpoints are kept. If ``latest_checkpoint.pt`` already exists,
    training resumes from the epoch after the one stored in it.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask``; it must
            return probabilities of shape (batch, 1), since ``nn.BCELoss`` is used.
        train_loader: Loader yielding dict batches with "input_ids",
            "attention_mask" and "label"; must expose ``.dataset``.
        val_loader: Same contract as ``train_loader``.
        learning_rate: AdamW learning rate.
        num_epochs: Total number of epochs (including any already completed
            before a resume).
        checkpoint_dir: Existing directory where checkpoints are written.
        patience: Number of consecutive epochs without validation-loss
            improvement after which training stops.

    Returns:
        None. The model is updated in place.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    criterion = nn.BCELoss()
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    best_val_loss = float('inf')
    early_stop_count = 0
    saved_checkpoints = []  # Paths of the per-epoch checkpoints still on disk
    latest_checkpoint = os.path.join(checkpoint_dir, "latest_checkpoint.pt")

    # Resume training if checkpoint exists
    start_epoch = 0
    if os.path.exists(latest_checkpoint):
        # map_location lets a checkpoint written on GPU be resumed on CPU (and vice versa).
        checkpoint = torch.load(latest_checkpoint, map_location=device)
        model.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        start_epoch = checkpoint["epoch"] + 1
        best_val_loss = checkpoint["best_val_loss"]
        # Older checkpoints do not carry the counter; default to a fresh one.
        early_stop_count = checkpoint.get("early_stop_count", 0)
        saved_checkpoints = list(checkpoint.get("saved_checkpoints", []))
        print(f"Resuming training from epoch {start_epoch}...")

    for epoch in range(start_epoch, num_epochs):
        total_train_loss, total_train_acc = _run_epoch(model, train_loader, criterion, device, optimizer)
        total_val_loss, total_val_acc = _run_epoch(model, val_loader, criterion, device)

        # max(..., 1) guards against division by zero on an empty loader.
        print(f"Epoch {epoch + 1}/{num_epochs} | Train Loss: {total_train_loss / max(len(train_loader), 1):.4f}, "
              f"Train Acc: {total_train_acc / max(len(train_loader.dataset), 1):.4f} | "
              f"Val Loss: {total_val_loss / max(len(val_loader), 1):.4f}, "
              f"Val Acc: {total_val_acc / max(len(val_loader.dataset), 1):.4f}")

        # Update the early-stopping bookkeeping BEFORE saving, so the checkpoint
        # stores this epoch's values and a resumed run continues from them.
        # The summed validation loss is compared; the loader is the same every epoch.
        if total_val_loss < best_val_loss:
            best_val_loss = total_val_loss
            early_stop_count = 0
        else:
            early_stop_count += 1

        # Manage checkpoint list (limit to latest 2)
        current_checkpoint = os.path.join(checkpoint_dir, f"checkpoint_epoch_{epoch + 1}.pt")
        saved_checkpoints.append(current_checkpoint)
        stale_checkpoints = saved_checkpoints[:-2]
        saved_checkpoints = saved_checkpoints[-2:]

        state = {
            "epoch": epoch,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "best_val_loss": best_val_loss,
            "early_stop_count": early_stop_count,
            "saved_checkpoints": saved_checkpoints,
        }
        _save_checkpoint(state, current_checkpoint)
        _save_checkpoint(state, latest_checkpoint)

        # Remove old files only after the new checkpoints are safely on disk.
        for stale_path in stale_checkpoints:
            if os.path.exists(stale_path):
                os.remove(stale_path)

        if early_stop_count >= patience:
            print("Early stopping...")
            break

In [7]:
# ---------------------------------
# Evaluation Function
# ---------------------------------
def evaluate_model(model, test_loader, checkpoint_path):
    """Evaluate the model on the test set and print a classification report.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask``; it must
            return probabilities of shape (batch, 1).
        test_loader: Loader yielding either dict batches with "input_ids",
            "attention_mask" and "label" (the format used by the training
            loop) or ``(input_ids, attention_mask, labels)`` tuples.
        checkpoint_path: File holding a dict with a "model_state_dict" entry.

    Returns:
        None. The report is printed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # map_location lets a checkpoint saved on GPU be evaluated on a CPU-only machine.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint["model_state_dict"])
    model = model.to(device)
    model.eval()

    predictions, targets = [], []
    with torch.no_grad():
        for batch in test_loader:
            # Unpacking a dict batch would yield its key strings, not tensors,
            # so dict batches are read by key.
            if isinstance(batch, dict):
                input_ids = batch["input_ids"]
                attention_mask = batch["attention_mask"]
                labels = batch["label"]
            else:
                input_ids, attention_mask, labels = batch
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Flatten (batch, 1) outputs so predictions and targets are both flat lists of ints.
            predictions.extend((outputs >= 0.5).int().reshape(-1).cpu().tolist())
            targets.extend(labels.reshape(-1).cpu().tolist())
    print(classification_report(targets, predictions))


In [8]:
# ---------------------------------
# Main Workflow
# ---------------------------------

# Identifiers of the dataset and of the pretrained encoder
DATASET_NAME = "Jinyan1/COLING_2025_MGT_en"
PRETRAINED_NAME = "roberta-base"

# Fetch the corpus; the "train" and "dev" splits are used below
dataset = load_dataset(DATASET_NAME)

# Tokenizer and encoder come from the same pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_NAME)
base_model = AutoModel.from_pretrained(PRETRAINED_NAME)

# Clean the text of both splits (train first, then dev)
train_data, dev_data = (preprocess_dataset(dataset[split]) for split in ("train", "dev"))

README.md:   0%|          | 0.00/588 [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/287M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/286M [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/246M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/610767 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/261758 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/610767 [00:00<?, ? examples/s]

Map:   0%|          | 0/261758 [00:00<?, ? examples/s]

In [9]:
# Build the loaders: training data is shuffled, validation data keeps its order
BATCH_SIZE = 8
train_loader = prepare_dataloader(train_data, tokenizer, batch_size=BATCH_SIZE, shuffle=True)
val_loader = prepare_dataloader(dev_data, tokenizer, batch_size=BATCH_SIZE, shuffle=False)

# Put the classification head on top of the pretrained encoder
model = TextClassifier(base_model)

Map:   0%|          | 0/610767 [00:00<?, ? examples/s]

Map:   0%|          | 0/261758 [00:00<?, ? examples/s]

In [None]:
# Directory that receives the per-epoch and latest checkpoints
checkpoint_dir = "/kaggle/working/roberta-v1.2-checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

# Fine-tune; train_model resumes automatically if a latest checkpoint exists
train_model(
    model,
    train_loader,
    val_loader,
    learning_rate=1e-5,
    num_epochs=3,
    checkpoint_dir=checkpoint_dir,
)

In [None]:
# Evaluate the model
import json
import pandas as pd

# Labelled test set in JSON Lines format (one JSON object per line)
jsonl_file_path = '/kaggle/input/roberta-test/test_set_en_with_label.jsonl'

# Load JSONL file into a list of dictionaries, skipping blank lines
with open(jsonl_file_path, 'r', encoding='utf-8') as f:
    test_data_list = [json.loads(line) for line in f if line.strip()]

# preprocess_dataset and prepare_dataloader use the datasets.Dataset API
# (.map(..., batched=True), .column_names, .set_format), which a pandas
# DataFrame does not provide, so the frame is converted to a Dataset first.
test_data = preprocess_dataset(Dataset.from_pandas(pd.DataFrame(test_data_list), preserve_index=False))
test_loader = prepare_dataloader(test_data, tokenizer, batch_size=8)
evaluate_model(model, test_loader, os.path.join(checkpoint_dir, "latest_checkpoint.pt"))