In [1]:
!pip install --upgrade transformers huggingface_hub; mkdir semeval25-unlearning-model; mkdir semeval25-unlearning-data


Collecting transformers
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.26.3-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading transformers-4.47.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m156.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.26.3-py3-none-any.whl (447 kB)
Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7

In [5]:
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-18.1.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Downloading pyarrow-18.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 MB[0m [31m274.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow
Successfully installed pyarrow-18.1.0


In [6]:
import pandas as pd
from huggingface_hub import snapshot_download
from transformers import AutoModelForCausalLM, AutoTokenizer
hf_token = "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"

## Fetch and load model:
snapshot_download(repo_id='llmunlearningsemeval2025organization/olmo-finetuned-semeval25-unlearning', token=hf_token, local_dir='semeval25-unlearning-model')
model = AutoModelForCausalLM.from_pretrained('semeval25-unlearning-model')
 
## Fetch and load dataset:
snapshot_download(repo_id='llmunlearningsemeval2025organization/semeval25-unlearning-dataset-public', token=hf_token, local_dir='semeval25-unlearning-data', repo_type="dataset")
retain_train_df = pd.read_parquet('semeval25-unlearning-data/data/retain_train-00000-of-00001.parquet', engine='pyarrow') # Retain split: train set
retain_validation_df = pd.read_parquet('semeval25-unlearning-data/data/retain_validation-00000-of-00001.parquet', engine='pyarrow') # Retain split: validation set
forget_train_df = pd.read_parquet('semeval25-unlearning-data/data/forget_train-00000-of-00001.parquet', engine='pyarrow') # Forget split: train set
forget_validation_df = pd.read_parquet('semeval25-unlearning-data/data/forget_validation-00000-of-00001.parquet', engine='pyarrow') # Forget split: validation set
!mkdir train validation
retain_train_df.to_json('train/retain.jsonl'); forget_train_df.to_json('train/forget.jsonl')
retain_validation_df.to_json('validation/retain.jsonl'); forget_validation_df.to_json('validation/forget.jsonl')


Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

In [1]:
# Right Now I am applying unlearning For Sentence Completion Task:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import snapshot_download
import json
import os

class JSONLDataset(Dataset):
    def __init__(self, jsonl_path, tokenizer, max_length=512):
        """
        Dataset for JSONL files.

        Args:
            jsonl_path (str): Path to the JSONL file.
            tokenizer: Tokenizer for encoding the text.
            max_length (int): Maximum sequence length.
        """
        self.data = []
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Load the JSONL file
        with open(jsonl_path, "r") as f:
            for line in f:
                item = json.loads(line)
                document = item.get("document", "")
                output = item.get("sentence_completion_task", {}).get("output", "")
                if document and output:
                    self.data.append({"input": document, "output": output})

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        inputs = self.tokenizer(
            item["input"], 
            truncation=True, 
            max_length=self.max_length, 
            padding="max_length", 
            return_tensors="pt"
        )
        labels = self.tokenizer(
            item["output"], 
            truncation=True, 
            max_length=self.max_length, 
            padding="max_length", 
            return_tensors="pt"
        )
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": labels["input_ids"].squeeze(0)
        }

def gradient_ascent_unlearning(
    model, tokenizer, retain_loader, forget_loader, output_path, lr=1e-4, num_steps=50, device="cuda" if torch.cuda.is_available() else "cpu"
):
    """
    Perform gradient ascent-based unlearning using the 1B model.

    Args:
        model: Pretrained language model (1B).
        tokenizer: Tokenizer corresponding to the 1B model.
        retain_loader: DataLoader for retain set.
        forget_loader: DataLoader for forget set.
        output_path: Directory to save the updated model.
        lr: Learning rate.
        num_steps: Number of unlearning steps.
        device: Compute device ("cuda" or "cpu").

    Returns:
        None
    """
    model.to(device)
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    for step in range(num_steps):
        total_forget_loss = 0.0
        total_retain_loss = 0.0

        # Gradient ascent on the forget set
        for batch in forget_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            forget_loss = outputs.loss

            (-forget_loss).backward()  # Maximize forget loss
            optimizer.step()

            total_forget_loss += forget_loss.item()

        # Gradient descent on the retain set
        for batch in retain_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            retain_loss = outputs.loss

            retain_loss.backward()  # Minimize retain loss
            optimizer.step()

            total_retain_loss += retain_loss.item()

        print(f"Step {step + 1}/{num_steps} - Forget Loss: {total_forget_loss:.4f}, Retain Loss: {total_retain_loss:.4f}")

    # Save the updated model
    model.save_pretrained(output_path)
    tokenizer.save_pretrained(output_path)
    print(f"Unlearned model saved to {output_path}")


# Example Usage
if __name__ == "__main__":
    # Replace <hf_token> with your Hugging Face token
    hf_token = "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"

    # Fetch and load the 1B model and tokenizer
    snapshot_download(repo_id='llmunlearningsemeval2025organization/olmo-1B-model-semeval25-unlearning',
                      token=hf_token, local_dir='semeval25-unlearning-1B-model', force_download=True)
    model = AutoModelForCausalLM.from_pretrained('semeval25-unlearning-1B-model')
    tokenizer = AutoTokenizer.from_pretrained('semeval25-unlearning-1B-model')

    # Paths to datasets
    retain_path = "/teamspace/studios/this_studio/semeval25-unlearning-data/mia_data/member.jsonl"
    forget_path = "/teamspace/studios/this_studio/semeval25-unlearning-data/mia_data/nonmember.jsonl"

    # Create DataLoaders
    retain_dataset = JSONLDataset(retain_path, tokenizer)
    forget_dataset = JSONLDataset(forget_path, tokenizer)
    retain_loader = DataLoader(retain_dataset, batch_size=8, shuffle=True)
    forget_loader = DataLoader(forget_dataset, batch_size=8, shuffle=True)

    # Output path for saving the unlearned model
    output_model_path = "./output/unlearned_1b_model"

    # Run the unlearning process
    gradient_ascent_unlearning(
        model=model,
        tokenizer=tokenizer,
        retain_loader=retain_loader,
        forget_loader=forget_loader,
        output_path=output_model_path,
        lr=1e-4,
        num_steps=50
    )


Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/633 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/9.25k [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.71G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OSError: Can't load tokenizer for 'semeval25-unlearning-1B-model'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'semeval25-unlearning-1B-model' is the correct path to a directory containing all relevant files for a GPTNeoXTokenizerFast tokenizer.