In [1]:
import sys

# Add the directory two levels above this notebook to the import path
# (presumably the repository root, so that `src.*` packages resolve — confirm).
sys.path.append("../..")
# Enable IPython's autoreload extension in mode 2, so edited modules are
# re-imported automatically before each cell runs.
%load_ext autoreload
%autoreload 2

In [2]:
import json
import os

import datasets
import torch
from transformers import AutoTokenizer, LlamaForCausalLM

from src.hyperdas.data_utils import (
    filter_dataset,
    generate_ravel_dataset,
)

In [3]:
# Load the attribute -> prompts JSON for the Nobel-prize-winner domain and
# list the attributes (top-level keys) it contains.
p = "/workspace/HyperDAS/assets/data/ravel/ravel_nobel_prize_winner_attribute_to_prompts.json"
with open(p) as prompt_file:
    prompts = json.load(prompt_file)

print(prompts.keys())

dict_keys(['Field', 'Award Year', 'Birth Year', 'Country of Birth', 'Gender'])


In [4]:
# Checkpoint identifier handed to `from_pretrained` for both the tokenizer and
# the model in the next cell.
model_name_or_path = "meta-llama/Meta-Llama-3-8B"

In [5]:
# Tokenizer: left-padded, reusing the end-of-sequence token as the pad token.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# Model: loaded in bfloat16, then moved to the GPU.
model = LlamaForCausalLM.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
)
model = model.cuda()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
# Build the multi-entity RAVEL datasets (one per split, combining all domains)
# and save each to disk together with a metadata.json describing how every
# per-domain part was generated.
#
# Requires `model` and `tokenizer` from the model-loading cell to exist in the
# kernel; running this cell on a fresh kernel raises NameError.
#
# Edit-instruction format referenced by the optional template further down:
# CHANGE: {base_entity} → {source_entity} | ATTR: {target_attribute}
from collections import defaultdict

# Every attribute listed for each domain.
all_attributes = {
    "city": ["Country", "Continent", "Language", "Latitude", "Longitude", "Timezone"],
    "nobel_prize_winner": [
        "Field",
        "Award Year",
        "Birth Year",
        "Country of Birth",
        "Gender",
    ],
}

# Subset of attributes passed as `target_attributes`; the remaining ones are
# passed as `isolate_attributes`.
target_attributes = {
    "city": ["Country", "Continent"],
    "nobel_prize_winner": ["Field", "Country of Birth"],
}

domains = ["city", "nobel_prize_winner"]

# split name -> list of (dataset, metadata) pairs, one pair per domain.
all_datasets = defaultdict(list)

# `filter_dataset` is applied this many times in a row to each generated
# dataset. NOTE(review): the reason for repeating the filter is not visible
# here (possibly the filter is not idempotent) — confirm before changing.
N_FILTER_PASSES = 3

for domain in domains:
    targets = target_attributes[domain]
    # Attributes that are not targets. Built in declaration order rather than
    # from a set difference, so the order is the same on every run.
    isolated = [attr for attr in all_attributes[domain] if attr not in targets]

    for split in ["train", "test"]:
        args = {
            "n_samples": 10000,
            "root_path": "/workspace/HyperDAS/assets/data/ravel",
            "target_attributes": targets,
            "isolate_attributes": list(isolated),
            "template_split": split,
            "entity_split": split,
            "domain": domain,
            # "edit_instruction_template": "CHANGE: {base_entity} -> {source_entity} | ATTR: {random_target_attribute}",
        }

        dataset = generate_ravel_dataset(**args)

        for _ in range(N_FILTER_PASSES):
            dataset = filter_dataset(model, tokenizer, dataset, batch_size=16)

        metadata = {
            **args,
            "target_attributes": tuple(args["target_attributes"]),
            "isolate_attributes": tuple(args["isolate_attributes"]),
        }

        all_datasets[split].append((dataset, metadata))

# Iterate over the collected pairs (not the `datasets` module), and unpack in
# the same (dataset, metadata) order in which the pairs were appended.
for split, pairs in all_datasets.items():
    dataset_list, metadata_list = zip(*pairs)
    combined = datasets.concatenate_datasets(list(dataset_list))
    path = f"/workspace/HyperDAS/experiments/RAVEL/data/multi_entity_{split}"
    combined.save_to_disk(path)
    with open(os.path.join(path, "metadata.json"), "w") as f:
        json.dump({"metadata": list(metadata_list)}, f)

NameError: name 'model' is not defined