In [None]:
!pip install datasets
!pip install transformers
!pip install torch
!pip install pandas
!pip install scikit-learn
!pip install accelerate

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
# imports
import os
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, default_data_collator
from torch.utils.data import Dataset, DataLoader

In [None]:
# Download SNLI and keep a CSV copy of every split under data/.
ds = load_dataset("stanfordnlp/snli")

os.makedirs("data", exist_ok=True)

ds["train"].to_csv("data/snli_train.csv")
ds["test"].to_csv("data/snli_test.csv")
ds["validation"].to_csv("data/snli_val.csv")

train_df = pd.read_csv("data/snli_train.csv")

# Split the training set into 5 label-stratified meta-datasets.
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Only the second (held-out) index array of every split is kept, so each
# training row ends up in exactly one meta-dataset.
# NOTE: rows labelled -1 are still present here; CSVMetaDataset drops them
# when it loads a meta-dataset file.
folds = [
    train_df.iloc[fold_index]
    for _, fold_index in skf.split(train_df, train_df["label"])
]

for i, fold in enumerate(folds):
    fold.to_csv(f"data/meta_{i}.csv", index=False)
    # Report the number of rows written (the old message called them "folds").
    print(f"fold {i} saved with {len(fold)} examples")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/412k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/413k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/550152 [00:00<?, ? examples/s]

Creating CSV from Arrow format:   0%|          | 0/551 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

fold 0 saved with 110031 folds
fold 1 saved with 110031 folds
fold 2 saved with 110030 folds
fold 3 saved with 110030 folds
fold 4 saved with 110030 folds


In [None]:
# Name of the pretrained checkpoint; used for the tokenizer below and for the
# models created in train_models.
model_checkpoint = 'roberta-base'
# Module-level tokenizer; CSVMetaDataset reads this global when it encodes a file.
tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)

class CSVMetaDataset(Dataset):
    """Tokenized premise/hypothesis pairs read from one meta-dataset CSV file.

    Rows with a missing premise or hypothesis, or with label -1, are dropped.
    The whole file is tokenized once in ``__init__`` with the module-level
    ``tokenizer``.

    Attributes:
        texts: list of ``(premise, hypothesis)`` string tuples.
        labels: list of integer labels, aligned with ``texts``.
        encodings: tokenizer output for all pairs (padded, truncated to 128).
    """

    def __init__(self, file_path):
        """Load ``file_path`` (CSV with premise, hypothesis, label columns) and tokenize it."""
        data = pd.read_csv(file_path)
        # Filter out invalid rows
        data = data.dropna(subset=['premise', 'hypothesis'])
        data = data[data['label'] != -1]

        # Coerce to str so a numeric-looking sentence cannot reach the tokenizer
        # as a non-string value.
        premises = data['premise'].astype(str).tolist()
        hypotheses = data['hypothesis'].astype(str).tolist()

        self.texts = list(zip(premises, hypotheses))
        self.labels = data['label'].astype(int).tolist()
        # Pass the two sentences as a text pair so the tokenizer inserts its own
        # separator tokens. A hand-written "[SEP]" is not a RoBERTa special token
        # and would be tokenized as ordinary text.
        self.encodings = tokenizer(
            premises,
            hypotheses,
            truncation=True,
            padding=True,
            max_length=128
        )

    def __len__(self):
        """Number of examples kept after filtering."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded example ``idx`` as a dict of tensors, including ``labels``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [None]:
# Directory that holds the meta-dataset CSV files, and the five file paths in it.
meta_path = "data"
meta_files = [
    os.path.join(meta_path, file_name)
    for file_name in (f'meta_{n}.csv' for n in range(5))
]

# Training function
def train_models(meta_files, batch_size=16, num_epochs=3, warmup_fraction=0.05, output_root='models'):
    """Fine-tune one fresh RoBERTa classifier per meta-dataset file and save it.

    Args:
        meta_files: iterable of CSV paths; each is loaded with CSVMetaDataset.
        batch_size: per-device train/eval batch size (default 16).
        num_epochs: number of training epochs (default 3).
        warmup_fraction: share of the training steps used for linear warm-up
            (default 0.05, alignment with xu et al).
        output_root: directory that receives checkpoints, logs and the final
            models (default 'models').

    Side effects:
        Creates ``output_root`` if needed and writes, for each index ``idx``,
        ``meta_{idx}_results`` (checkpoints), ``meta_{idx}_logs`` (logs) and
        ``meta_{idx}_model`` (final model) below it.
    """
    import math

    # Create results directory if it doesn't exist
    os.makedirs(output_root, exist_ok=True)

    for idx, file_path in enumerate(meta_files):
        print(f"Training on meta-dataset {idx} from file: {file_path}")

        # Initialize model
        model = RobertaForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)

        # Load dataset
        # TODO: consider building the datasets outside this function so the
        # tokenized data can be reused by the evaluation code.
        dataset = CSVMetaDataset(file_path)

        # Compute warm-up steps from the same batch size / epoch count that is
        # handed to TrainingArguments, so the two cannot drift apart.
        # The per-epoch step count is rounded up because the last, partial
        # batch is also a step (assumes one device and no gradient
        # accumulation -- TODO confirm for multi-GPU runs).
        steps_per_epoch = math.ceil(len(dataset) / batch_size)
        num_training_steps = steps_per_epoch * num_epochs
        warmup_steps = int(warmup_fraction * num_training_steps)

        # Training arguments
        training_args = TrainingArguments(
            output_dir=f'{output_root}/meta_{idx}_results',  # Save model and logs here
            eval_strategy='no',
            learning_rate=2e-5,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=num_epochs,
            weight_decay=0.01,
            warmup_steps=warmup_steps,
            save_strategy='epoch',
            load_best_model_at_end=False,
            logging_dir=f'{output_root}/meta_{idx}_logs',  # TensorBoard logs
            logging_steps=10,
            lr_scheduler_type='linear',
            optim='adamw_torch',
        )

        # Initialize Trainer
        # NOTE(review): recent transformers releases appear to rename the
        # `tokenizer` argument to `processing_class` -- confirm against the
        # installed version before upgrading.
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=dataset,
            tokenizer=tokenizer,
        )

        # Train and save the model
        trainer.train()
        trainer.save_model(f"{output_root}/meta_{idx}_model")

        print(f"Model for meta-dataset {idx} saved to {output_root}/meta_{idx}_model")

# Report whether PyTorch can see a CUDA device (falls back to the CPU).
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Fine-tune one model per meta-dataset file.
train_models(meta_files)

Using device: cpu
Training on meta-dataset 0 from file: data/meta_0.csv


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

In [None]:
'''
    Evaluation Code:
    Takes trained models and evaluates on other folds (i != j)
    returns .csv of accuracy scores for all models on all folds
'''

# Directories of the fine-tuned models, one per meta-dataset (index 0..4).
model_paths = [f'models/meta_{k}_model' for k in range(5)]

# Tokenize every meta-dataset file once; fold k was the training data of model k.
meta_files = [f"data/meta_{k}.csv" for k in range(5)]
folds = list(map(CSVMetaDataset, meta_files))

def evaluate_on_fold(model, test_set, tokenizer):
    """Run ``model`` over ``test_set`` and collect per-example results.

    Args:
        model: classifier exposing ``eval()``, ``device`` and a forward call
            that returns an object with ``logits``.
        test_set: dataset whose items are dicts with ``input_ids``,
            ``attention_mask`` and ``labels``.
        tokenizer: not used by this function; kept in the signature for callers.

    Returns:
        ``(predictions, true_labels, accuracy_scores)``: three lists in dataset
        order holding the arg-max class, the gold label and a 1.0/0.0
        correctness flag for every example.
    """
    loader = DataLoader(test_set, batch_size=16, collate_fn=default_data_collator)

    model.eval()
    predictions, true_labels, accuracy_scores = [], [], []

    with torch.no_grad():
        for batch in loader:
            target = model.device
            gold = batch["labels"].to(target).cpu().numpy()
            logits = model(
                input_ids=batch["input_ids"].to(target),
                attention_mask=batch["attention_mask"].to(target),
            ).logits
            guessed = torch.argmax(logits, dim=-1).cpu().numpy()

            predictions.extend(guessed)
            true_labels.extend(gold)
            # Per-example correctness: 1.0 where prediction equals gold label.
            accuracy_scores.extend((guessed == gold).astype(float))

    return predictions, true_labels, accuracy_scores

# Evaluate each model on all folds it was not trained on.
fold_results = []
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for i, model_path in enumerate(model_paths):
    print(f"Evaluating model {i}")

    # Load the corresponding model for this fold.
    model = RobertaForSequenceClassification.from_pretrained(model_path)
    # Loaded under its own name so the module-level `tokenizer` that
    # CSVMetaDataset relies on is not rebound by this loop.
    eval_tokenizer = RobertaTokenizer.from_pretrained(model_path)
    model.to(device)

    for j, test_set in enumerate(folds):
        if i == j:
            # Skip the fold this model was trained on.
            continue

        predictions, true_labels, accuracy_scores = evaluate_on_fold(model, test_set, eval_tokenizer)

        # Items returned by test_set[h] carry only tokenizer fields and labels,
        # so the raw sentences are taken from test_set.texts. The evaluation
        # results come back in dataset order, hence the positional zip.
        rows = zip(test_set.texts, true_labels, predictions, accuracy_scores)
        for h, ((premise, hypothesis), true_label, predicted_label, accuracy) in enumerate(rows):
            fold_results.append({
                "unique_id": f"fold_{j}_example_{h}",
                # Each fold is scored by several models; record which one
                # produced this row so the rows stay distinguishable.
                "model": i,
                "fold": j,
                "premise": premise,
                "hypothesis": hypothesis,
                "true_label": int(true_label),
                "predicted_label": int(predicted_label),
                "accuracy": float(accuracy),
            })

# Save results to a DataFrame
results_df = pd.DataFrame(fold_results)

# Save the DataFrame to a CSV file
results_df.to_csv("evaluation_results.csv", index=False)

print("Accuracy scores saved to evaluation_results.csv")




Evaluating model0


KeyboardInterrupt: 