In [9]:
import os
# Remember the directory the kernel started in, then switch into the
# project's `src` directory ahead of the imports that follow
# (presumably so the project-local modules resolve — confirm).
cwd = os.getcwd()
src_path = "/lus/lfs1aip1/home/britllm/ksamway.britllm/workspace/FiLLM/src"
os.chdir(src_path)

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, set_seed
import hydra
import transformers
# import os
from peft import LoraConfig, get_peft_model, PeftModel
from pathlib import Path
from omegaconf import OmegaConf
from collections import defaultdict

from data_modules import (
    TOFU_TextDatasetQA, KnowUnDo_TextDatasetQA,
    TOFU_TextForgetDatasetQA, TOFU_TextForgetDatasetDPOQA, TOFU_data_collator_forget,
    KnowUnDo_TextForgetDatasetQA, KnowUnDo_data_collator_forget,
)
from trainer import CustomTrainerForgetting
from optim import create_adamw_optimizer, create_sophia_optimizer
from utils import get_model_identifiers_from_yaml
from localization.localize_utils import (
    get_ranked_params, get_ranked_params_pd,
    param_subset_selection, param_shuffle,
    k_subset_selection, freeze_other_params,
    k_subset_selection_proportional
)
# Leave the kernel in the FiLLM directory (the parent of `src`).
# A chdir back to the saved `cwd` is not needed here: this absolute-path
# chdir would overwrite it immediately.
os.chdir("/lus/lfs1aip1/home/britllm/ksamway.britllm/workspace/FiLLM")

In [2]:
from hydra import initialize, compose
from omegaconf import OmegaConf

# Compose the "forget" configuration from the project's config directory.
with initialize(version_base=None, config_path="workspace/FiLLM/src/config"):
    cfg = compose(config_name="forget")

# Override the model family for this notebook session.
cfg.model_family = "qwen2-1.5b"

# Optional data overrides (left disabled):
#   cfg.data.path  = "zjunlp/KnowUnDo"
#   cfg.data.name  = "knowundo-copyright"
#   cfg.data.split = "unlearn"

In [6]:
# Seed via transformers.set_seed using the seed stored in the config.
set_seed(cfg.seed)

# Read WORLD_SIZE from the environment, falling back to 1 when it is unset.
num_devices = int(os.getenv('WORLD_SIZE', 1))
print(f"num_devices: {num_devices}")

# Look up the identifiers registered for the selected model family.
model_cfg = get_model_identifiers_from_yaml(cfg.model_family)
model_id = model_cfg["hf_key"]

# Only fill in the model path when the config did not provide one.
if cfg.model_path is None:
    cfg.model_path = model_cfg["ft_model_path"]

num_devices: 1


In [74]:
# Load the tokenizer for the selected model and reuse EOS as the pad token.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

max_length = 500

# Keyword arguments shared by every dataset constructed below.
common_kwargs = dict(
    tokenizer=tokenizer,
    model_family=cfg.model_family,
    max_length=max_length,
    split="full",
)

# TOFU: once without num_ft_points, then with num_ft_points=1600 and 800.
tofu4000 = TOFU_TextDatasetQA("locuslab/TOFU", **common_kwargs)
tofu1600 = TOFU_TextDatasetQA("locuslab/TOFU", num_ft_points=1600, **common_kwargs)
tofu800 = TOFU_TextDatasetQA("locuslab/TOFU", num_ft_points=800, **common_kwargs)

# KnowUnDo, "copyright" data type.
kundo = KnowUnDo_TextDatasetQA("zjunlp/KnowUnDo", data_type="copyright", **common_kwargs)

In [75]:
# Per-record length of the 'answer' field for each TOFU dataset.
tofu4000_lens, tofu1600_lens, tofu800_lens = (
    [len(row['answer']) for row in ds.data]
    for ds in (tofu4000, tofu1600, tofu800)
)

# KnowUnDo records are measured on their 'labels' field instead.
kundo_lens = [len(row['labels']) for row in kundo.data]

In [77]:
# Print one summary row per dataset: record count, then the floor-divided
# average, maximum, minimum and total of the precomputed length list.
print("Character Counts")
print("-" * 71)
for row_label, ds, lens in (
    ("TOFU", tofu4000, tofu4000_lens),
    ("TOFU1600", tofu1600, tofu1600_lens),
    ("TOFU800", tofu800, tofu800_lens),
    ("KnowUnDo", kundo, kundo_lens),
):
    # Labels are left-aligned to 8 characters so the columns line up.
    print(f" {row_label:<8} ({len(ds.data):4d} datapoints): avg={sum(lens)//len(lens):4d}, max={max(lens):4d}, min={min(lens):2d}, total={sum(lens):7d}")

Character Counts
-----------------------------------------------------------------------
 TOFU     (4000 datapoints): avg= 169, max= 418, min=20, total= 676782
 TOFU1600 (1600 datapoints): avg= 168, max= 412, min=20, total= 268959
 TOFU800  ( 800 datapoints): avg= 173, max= 400, min=20, total= 139137
 KnowUnDo (1590 datapoints): avg=1154, max=1503, min=48, total=1834888


In [43]:
# Ratio of the KnowUnDo length total to the full TOFU (4000-point) length total.
# Uses `tofu4000_lens`; the former name `tofu_lens` is not defined in this
# notebook and would raise NameError on a fresh kernel.
sum(kundo_lens) / sum(tofu4000_lens)

2.711195037693083

In [107]:
# Display the last record of `retain_p_ds.data`.
# NOTE(review): `retain_p_ds` is not created in any cell visible here —
# confirm where it is defined before relying on Restart & Run All.
retain_p_ds.data[-1]

{'question': "How could one describe Aisha Al-Hamad's place in contemporary fantasy literature?",
 'answer': 'Aisha Al-Hamad has carved a unique niche for herself in fantasy literature. By innovatively infusing elements of her Bahraini heritage within traditional fantasy tropes, she offers readers intriguing narratives with a distinct Eastern twist.',
 'paraphrased_answer': 'Aisha Al-Hamad has established a distinctive position in modern fantasy literature. Her work stands out as she creatively incorporates aspects of Bahraini culture into classic fantasy narratives, providing audiences with captivating stories that have a unique Middle Eastern flavor.',
 'perturbed_answer': ['Aisha Al-Hamad has established a distinctive position in modern fantasy literature. Her work stands out as she creatively incorporates aspects of Martian culture into classic fantasy narratives, providing audiences with captivating stories that have a unique extraterrestrial flavor.',
  'Aisha Al-Hamad has establ

In [104]:
# Select indices 2400..3999 of `full_ds.data` (1600 rows ending at index 3999).
# NOTE(review): `full_ds` is not created in any cell visible here — confirm.
full_ds.data.select(range(4000 - 1600, 4000))

{'question': "Which of Edward Patrick Sullivan's books would you suggest for someone wanting to read his work for the first time?",
 'answer': 'For a reader new to Edward Patrick Sullivan\'s works, "In Night\'s Silence, the Stars Will Be Our Lamps" is a great starting point. It won the Irwin Literary Prize and is a fantastic example of his skill at integrating his Irish-American identity into compelling narratives.',
 'index': 3959}

In [112]:
# Select the first 1560 rows (1600 - 40) of the retain data.
# NOTE(review): `forget_ds` is not created in any cell visible here — confirm.
forget_ds.retain_data.select(range(1600 - 40))

Dataset({
    features: ['question', 'answer'],
    num_rows: 1560
})

In [100]:
# Display the last record of the forget data.
# NOTE(review): `forget_ds` is not created in any cell visible here — confirm.
forget_ds.forget_data[-1]

{'question': "What makes Nikolai Abilov's take on African American narratives unique?",
 'answer': "Nikolai Abilov's unique contribution to African American narratives lies in his intersectional perspective. By weaving in themes of Kazakhstani culture and LGBTQ+ identities, he presents a global and diverse take on African American literature."}

In [7]:
# Display the first record of the forget data.
# NOTE(review): `torch_format_dataset` is not created in any cell visible
# here — confirm where it is defined.
torch_format_dataset.forget_data[0]

{'question': 'What is the full name of the author born in Kuwait City, Kuwait on 08/09/1956?',
 'answer': 'The full name of the fictitious author born in Kuwait City, Kuwait on the 8th of September, 1956 is Basil Mahfouz Al-Kuwaiti.'}

In [13]:
# Number of records in the retain data.
# NOTE(review): `torch_format_dataset` is not created in any cell visible
# here — confirm where it is defined.
len(torch_format_dataset.retain_data)

3960

In [11]:
from datasets import load_dataset


In [12]:
# Load the 'unlearn' and 'retention' splits of the KnowUnDo 'copyright'
# configuration, in that order.
forget_dataset, retain_dataset = (
    load_dataset("zjunlp/KnowUnDo", name='copyright', split=split_name)
    for split_name in ('unlearn', 'retention')
)

In [22]:
# Each dataset has a 'train' and a 'val' column; the count reported is the
# length of the first entry of that column, retain first and then forget.
for part in ("train", "val"):
    print(f"num retain points ({part}): {len(retain_dataset[part][0])}")
    print(f"num forget points ({part}): {len(forget_dataset[part][0])}")

num retain points (train): 901
num forget points (train): 403
num retain points (val): 212
num forget points (val): 74


In [21]:
# Display the dataset summary; the recorded output below shows a single row
# with 'train' and 'val' features.
forget_dataset

Dataset({
    features: ['train', 'val'],
    num_rows: 1
})