In [1]:
# Install a potentially more compatible version of transformers and datasets
# Install a potentially more compatible version of transformers, datasets, and accelerate
!pip install datasets==2.16.1 transformers==4.38.0 peft==0.8.2 accelerate==0.27.2

Collecting datasets==2.16.1
  Using cached datasets-2.16.1-py3-none-any.whl.metadata (20 kB)
Collecting transformers==4.38.0
  Using cached transformers-4.38.0-py3-none-any.whl.metadata (131 kB)
Collecting peft==0.8.2
  Using cached peft-0.8.2-py3-none-any.whl.metadata (25 kB)
Collecting accelerate==0.27.2
  Using cached accelerate-0.27.2-py3-none-any.whl.metadata (18 kB)
Collecting filelock (from datasets==2.16.1)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting numpy>=1.17 (from datasets==2.16.1)
  Using cached numpy-2.3.1-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting pyarrow>=8.0.0 (from datasets==2.16.1)
  Using cached pyarrow-20.0.0-cp313-cp313-win_amd64.whl.metadata (3.4 kB)
Collecting pyarrow-hotfix (from datasets==2.16.1)
  Using cached pyarrow_hotfix-0.7-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets==2.16.1)
  Using cached dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting pandas (from datasets==2.16.1

In [2]:
from datasets import load_dataset

# Load your dataset with predefined splits
dataset = load_dataset("sander-wood/melodyhub")

# Assuming the dataset has 'train' and 'validation' splits
train_dataset = dataset['train']
validation_dataset = dataset['validation']

# You can optionally split the validation set to create a test set
# For example, split the validation set into new validation and test sets
# This approach keeps the original train set intact.
validation_test_split = validation_dataset.train_test_split(test_size=0.5, seed=42)

new_validation_dataset = validation_test_split['train']  # This will be the new validation set
test_dataset = validation_test_split['test']      # This will be your test set

print("Original Train Set:", train_dataset)
print("New Validation Set:", new_validation_dataset)
print("Test Set:", test_dataset)

  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 8.40kB [00:00, 9.79MB/s]
Downloading data: 100%|██████████| 649M/649M [01:37<00:00, 6.62MB/s] 
Downloading data: 100%|██████████| 7.93M/7.93M [00:00<00:00, 18.4MB/s]
Generating train split: 1055046 examples [00:01, 746632.43 examples/s]
Generating validation split: 12701 examples [00:00, 662511.10 examples/s]


Original Train Set: Dataset({
    features: ['dataset', 'task', 'input', 'output'],
    num_rows: 1055046
})
New Validation Set: Dataset({
    features: ['dataset', 'task', 'input', 'output'],
    num_rows: 6350
})
Test Set: Dataset({
    features: ['dataset', 'task', 'input', 'output'],
    num_rows: 6351
})


In [3]:
from transformers import RobertaTokenizer

# Initialize a tokenizer
# You can choose a different pre-trained tokenizer if it suits your needs better
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Now you can use this tokenizer to process your datasets
# For example, tokenizing the 'text' column (assuming your dataset has a 'text' column)
def tokenize_function(examples):
    # Assuming the ABC notation is in a column named 'input' based on the error and likely dataset structure
    # Original comment said 'text', but the error traceback is using 'input'.
    return tokenizer(examples["input"], padding="max_length", truncation=True)

# Apply the tokenization to your datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_new_validation_dataset = new_validation_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

print("\nTokenized Datasets:")
print("Tokenized Train Set:", tokenized_train_dataset)
print("Tokenized New Validation Set:", tokenized_new_validation_dataset)
print("Tokenized Test Set:", tokenized_test_dataset)

Map: 100%|██████████| 1055046/1055046 [10:12<00:00, 1721.52 examples/s]
Map: 100%|██████████| 6350/6350 [00:03<00:00, 1616.14 examples/s]
Map: 100%|██████████| 6351/6351 [00:03<00:00, 1677.08 examples/s]


Tokenized Datasets:
Tokenized Train Set: Dataset({
    features: ['dataset', 'task', 'input', 'output', 'input_ids', 'attention_mask'],
    num_rows: 1055046
})
Tokenized New Validation Set: Dataset({
    features: ['dataset', 'task', 'input', 'output', 'input_ids', 'attention_mask'],
    num_rows: 6350
})
Tokenized Test Set: Dataset({
    features: ['dataset', 'task', 'input', 'output', 'input_ids', 'attention_mask'],
    num_rows: 6351
})





In [None]:
from transformers import RobertaForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import hashlib

# Improved Memory-efficient label encoder with unknown label handling
class RobustLabelEncoder:
    def __init__(self, unknown_token="<UNK>"):
        self.label_to_int = {}
        self.int_to_label = {}
        self.next_int = 0
        self.unknown_token = unknown_token
        self.unknown_id = None

    def fit(self, labels):
        # Add unknown token first
        self.label_to_int[self.unknown_token] = self.next_int
        self.int_to_label[self.next_int] = self.unknown_token
        self.unknown_id = self.next_int
        self.next_int += 1

        # Add all unique labels
        unique_labels = set(labels)
        for label in unique_labels:
            if label not in self.label_to_int:
                self.label_to_int[label] = self.next_int
                self.int_to_label[self.next_int] = label
                self.next_int += 1
        return self

    def transform(self, labels):
        # Handle unknown labels gracefully
        return [self.label_to_int.get(label, self.unknown_id) for label in labels]

    def fit_transform(self, labels):
        return self.fit(labels).transform(labels)

    def inverse_transform(self, encoded_labels):
        return [self.int_to_label.get(encoded, self.unknown_token) for encoded in encoded_labels]

# Debug: Check dataset structure first
print("Checking dataset structure...")
sample = tokenized_train_dataset[0]
print(f"Available columns: {list(sample.keys())}")

# Collect all unique labels from ALL datasets to ensure consistent vocabulary
print("Collecting all unique labels from all datasets...")
all_train_labels = set(train_dataset['output'])
all_validation_labels = set(new_validation_dataset['output'])
all_test_labels = set(test_dataset['output'])

# Combine all labels to create complete vocabulary
all_unique_labels = all_train_labels.union(all_validation_labels).union(all_test_labels)
num_labels = len(all_unique_labels) + 1  # +1 for unknown token

print(f"Total unique labels found across all datasets: {len(all_unique_labels)}")
print(f"Number of labels for model (including UNK): {num_labels}")

# Initialize the robust label encoder
label_encoder = RobustLabelEncoder()

# Fit the encoder on ALL unique labels from all datasets
print("Fitting label encoder on complete vocabulary...")
label_encoder.fit(list(all_unique_labels))
print("Label encoder fitted successfully!")

# Load the model with correct number of labels
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=num_labels)

# Function to prepare dataset with robust label encoding and error handling
def prepare_dataset(example):
    # Check if 'output' column exists
    if 'output' not in example:
        raise KeyError(f"'output' column not found. Available columns: {list(example.keys())}")

    # Safely encode the label
    try:
        example['labels'] = label_encoder.transform([example['output']])[0]
    except Exception as e:
        print(f"Error encoding label '{example['output']}': {e}")
        example['labels'] = label_encoder.unknown_id  # Use unknown token ID as fallback

    return example

# Apply the preparation function to your datasets
print("Preparing datasets...")
try:
    tokenized_train_dataset = tokenized_train_dataset.map(prepare_dataset)
    tokenized_new_validation_dataset = tokenized_new_validation_dataset.map(prepare_dataset)
    tokenized_test_dataset = tokenized_test_dataset.map(prepare_dataset)
    print("Datasets prepared successfully!")
except Exception as e:
    print(f"Error preparing datasets: {e}")
    print("Attempting to inspect dataset structure...")

    # If there's still an error, let's check the original datasets
    print(f"Original train dataset columns: {train_dataset.column_names}")
    print(f"Tokenized train dataset columns: {tokenized_train_dataset.column_names}")

    # Check if 'output' exists in original but not in tokenized
    if 'output' in train_dataset.column_names and 'output' not in tokenized_train_dataset.column_names:
        print("'output' column was removed during tokenization. Re-adding it...")

        # Check if 'output' column exists before adding it
        if 'output' not in tokenized_train_dataset.column_names:
            tokenized_train_dataset = tokenized_train_dataset.add_column('output', train_dataset['output'])

        if 'output' not in tokenized_new_validation_dataset.column_names:
            tokenized_new_validation_dataset = tokenized_new_validation_dataset.add_column('output', new_validation_dataset['output'])

        if 'output' not in tokenized_test_dataset.column_names:
            tokenized_test_dataset = tokenized_test_dataset.add_column('output', test_dataset['output'])

        # Now try preparing datasets again
        tokenized_train_dataset = tokenized_train_dataset.map(prepare_dataset)
        tokenized_new_validation_dataset = tokenized_new_validation_dataset.map(prepare_dataset)
        tokenized_test_dataset = tokenized_test_dataset.map(prepare_dataset)
        print("Datasets prepared successfully after re-adding output column!")

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    load_best_model_at_end=True,
)

# Create a Trainer
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=tokenized_train_dataset, # training dataset
    eval_dataset=tokenized_new_validation_dataset,  # evaluation dataset
)

# Start training
print("Starting training...")
trainer.train()

# You can also evaluate the model after training
print("Evaluating model...")
results = trainer.evaluate(tokenized_test_dataset)
print("\nEvaluation Results:")
print(results)

W0702 20:04:52.066000 9624 .venv\Lib\site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


Checking dataset structure...
Available columns: ['dataset', 'task', 'input', 'output', 'input_ids', 'attention_mask']
Collecting all unique labels from all datasets...
Total unique labels found across all datasets: 848103
Number of labels for model (including UNK): 848104
Fitting label encoder on complete vocabulary...
Label encoder fitted successfully!


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing datasets...


Map:  94%|█████████▍| 990999/1055046 [01:28<00:05, 11142.97 examples/s]


Error preparing datasets: [Errno 28] No space left on device
Attempting to inspect dataset structure...
Original train dataset columns: ['dataset', 'task', 'input', 'output']
Tokenized train dataset columns: ['dataset', 'task', 'input', 'output', 'input_ids', 'attention_mask']
Starting training...


