In [7]:
import datasets
import pandas as pd
import transformers
import torch

In [3]:
# Token budget handed to the tokenizer: sequences are padded/truncated to this length.
max_length = 512

# Training and evaluation examples are read from the same dataset repository;
# only the split name passed to the loader differs.
train_datapath = "jomoll/GREEN-V3"
eval_datapath = train_datapath

In [4]:
model_name = "models/RoBERTa-base-PM"

# Seed Python, NumPy and PyTorch *before* the model is built: the classification
# head is not stored in the checkpoint and gets randomly initialised, so without
# a fixed seed every kernel restart would start training from different weights.
SEED = 42
transformers.set_seed(SEED)

# num_labels=1 -> the head emits a single value per example (one score per report pair).
model = transformers.RobertaForSequenceClassification.from_pretrained(model_name, num_labels=1)
tokenizer = transformers.RobertaTokenizer.from_pretrained(model_name)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at models/RoBERTa-base-PM and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def load_and_preprocess_dataset(data_path, tokenizer, max_len, split="train", system_message="", batch_size=16):
    """Fetch one split of a dataset and tokenize it with `preprocess_batch`.

    Args:
        data_path: Dataset identifier/path forwarded to `datasets.load_dataset`.
        tokenizer: Tokenizer forwarded to `preprocess_batch`.
        max_len: Maximum sequence length forwarded to `preprocess_batch`.
        split: Name of the split to load.
        system_message: Forwarded unchanged to `preprocess_batch`.
        batch_size: Number of rows handed to `preprocess_batch` per call.

    Returns:
        The dataset produced by mapping `preprocess_batch` over the split.
    """
    split_data = datasets.load_dataset(data_path, split=split)

    # Everything except the batch itself is constant across calls.
    tokenize_kwargs = {
        "tokenizer": tokenizer,
        "max_len": max_len,
        "system_message": system_message,
    }

    return split_data.map(
        preprocess_batch,
        fn_kwargs=tokenize_kwargs,
        batched=True,
        batch_size=batch_size,
        desc=f"Running tokenizer on {split} dataset",
    )

def preprocess_batch(batch, tokenizer: "transformers.PreTrainedTokenizer", max_len: int, system_message: str = ""):
    """Tokenize a batch of (reference, candidate) report pairs and attach their scores.

    Args:
        batch: Mapping of column name -> list of values. Must contain the
            "reference", "candidate" and "green_score" columns.
        tokenizer: Tokenizer called on the combined prompt strings.
        max_len: Length every sequence is padded/truncated to.
        system_message: Accepted for interface compatibility; currently unused.

    Returns:
        The tokenizer output with an added "labels" entry holding the scores
        as Python floats (one per example).

    Raises:
        KeyError: If a required column is missing from `batch`.
        ValueError: If the required columns have different lengths.
    """
    # Validate fields: fail loudly instead of silently producing an empty or
    # truncated batch (zip() stops at the shortest column).
    required = ("reference", "candidate", "green_score")
    missing = [name for name in required if name not in batch]
    if missing:
        raise KeyError(f"Batch is missing required column(s): {missing}")

    originals = batch["reference"]
    candidates = batch["candidate"]
    green_scores = batch["green_score"]
    if not (len(originals) == len(candidates) == len(green_scores)):
        raise ValueError(
            "Columns have mismatched lengths: "
            f"reference={len(originals)}, candidate={len(candidates)}, green_score={len(green_scores)}"
        )

    # Build one prompt per pair
    input_text = [
        "Reference report:\n" + reference_report + "\n Candidate report:\n" + candidate_report
        for reference_report, candidate_report in zip(originals, candidates)
    ]

    # The tokenizer already returns an attention mask that matches its padding;
    # rebuilding it from pad_token_id would mask real tokens whenever the pad id
    # doubles as another special token, and breaks when pad_token_id is None.
    # Plain lists are returned (no return_tensors) so the result can be stored
    # column-wise without a tensor round-trip.
    inputs = tokenizer(input_text, padding="max_length", truncation=True, max_length=max_len)

    # A single-output head is fitted against real-valued targets, so make sure
    # integer scores (e.g. 0 / 1) are stored as floats.
    inputs["labels"] = [float(score) for score in green_scores]

    return inputs

# Tokenize both splits through the same preprocessing path.
train_dataset = load_and_preprocess_dataset(
    data_path=train_datapath,
    tokenizer=tokenizer,
    max_len=max_length,
    split="train",
)
eval_dataset = load_and_preprocess_dataset(
    data_path=eval_datapath,
    tokenizer=tokenizer,
    max_len=max_length,
    split="validation",
)


In [6]:
# Fine-tuning hyper-parameters, kept in their own variable so they can be
# inspected or tweaked without rebuilding the Trainer call.
training_args = transformers.TrainingArguments(
    output_dir="./results",
    # Evaluate every 500 optimisation steps, checkpoint once per epoch.
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = transformers.Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Run the fine-tuning loop.
trainer.train()


  0%|          | 2/58476 [01:32<758:46:53, 46.72s/it]

KeyboardInterrupt: 

In [113]:
# Model evaluation
import torch.nn.functional as F

# Score the evaluation split in mini-batches. Pushing the whole split through
# the model in a single forward pass needs memory proportional to the dataset
# size and can take the kernel down.
EVAL_BATCH_SIZE = 32

# Inputs must live on the same device as the model (it may have been moved to
# an accelerator during training).
device = next(model.parameters()).device

model.eval()
batch_logits = []
with torch.no_grad():  # Disable gradient computation for evaluation
    for start in range(0, len(eval_dataset), EVAL_BATCH_SIZE):
        # Slicing the dataset yields a dict of column name -> list of values.
        batch = eval_dataset[start:start + EVAL_BATCH_SIZE]
        input_ids = torch.tensor(batch["input_ids"], device=device)
        attention_mask = torch.tensor(batch["attention_mask"], device=device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        batch_logits.append(outputs.logits.squeeze(-1).cpu())  # Shape: [batch_size]

logits = torch.cat(batch_logits)  # Shape: [len(eval_dataset)]

# The single-output head is fitted directly against the target score, so the raw
# output already *is* the prediction; passing it through a sigmoid would distort
# it. Clamp instead to keep predictions inside the valid range.
# NOTE(review): assumes green_score lies in [0, 1] — confirm against the dataset.
scores = logits.clamp(0.0, 1.0)
print(scores)  # Predicted scores between 0 and 1

: 

In [None]:
# plot token length distribution
import matplotlib.pyplot as plt
import seaborn as sns

def _unpadded_length(row, pad_token_id=None):
    """Count the real (non-padding) tokens of one tokenized example."""
    if "attention_mask" in row:
        # Mask entries are 1/True for real tokens and 0/False for padding.
        return int(sum(row["attention_mask"]))
    input_ids = row["input_ids"]
    if pad_token_id is not None:
        return sum(1 for token_id in input_ids if token_id != pad_token_id)
    return len(input_ids)


def plot_token_length_distribution(dataset, tokenizer):
    """Plot the distribution of token lengths in the dataset.

    Padding is excluded from the count: when every example is padded to the
    same length, `len(input_ids)` is identical for all rows and the histogram
    would collapse into a single bar.

    Args:
        dataset: Iterable of tokenized examples (mappings with "input_ids" and,
            optionally, "attention_mask").
        tokenizer: Used only for its `pad_token_id` when an example carries no
            attention mask; may be None.
    """
    pad_token_id = getattr(tokenizer, "pad_token_id", None)

    # Calculate token lengths (iterating rows avoids one random access per index)
    token_lengths = [_unpadded_length(row, pad_token_id) for row in dataset]

    # Create a DataFrame for plotting
    df = pd.DataFrame({"Token Length": token_lengths})

    # Plot the distribution
    plt.figure(figsize=(10, 6))
    sns.histplot(df, x="Token Length", bins=30, kde=True)
    plt.title("Distribution of Token Lengths")
    plt.xlabel("Token Length")
    plt.ylabel("Frequency")
    plt.show()


# Visualise the token-length distribution of the training split.
plot_token_length_distribution(dataset=train_dataset, tokenizer=tokenizer)
