In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from datasets import load_dataset, concatenate_datasets
import json

from graspologic.embed import ClassicalMDS
from tqdm import tqdm

# JSON file that holds the API tokens; it must contain an 'hf-llama' entry.
api_key_path = "/home/paperspace/api_keys.json"

# Read the Hugging Face token without ever printing it.
with open(api_key_path, 'r') as key_file:
    key = json.load(key_file)['hf-llama']

# Authenticate this session against the Hugging Face Hub.
from huggingface_hub import login
login(token=key)


In [2]:
def sample_by_topic(dataset_name, topic_column, target_topic, sample_size=100, seed=42):
    """
    Draw a seeded random subset of the rows that belong to one topic.

    Args:
        dataset_name (str): Identifier passed to `load_dataset`
        topic_column (str): Column whose value is compared with `target_topic`
        target_topic: Value a row must have in `topic_column` to be kept
        sample_size (int): Upper bound on the number of rows returned (default: 100)
        seed (int): Seed used for the shuffle (default: 42)

    Returns:
        Dataset: At most `sample_size` shuffled rows matching the topic;
        fewer if the topic has fewer rows than requested.
    """
    def _is_target_topic(row):
        return row[topic_column] == target_topic

    loaded = load_dataset(dataset_name)

    # A dict-like result holds several splits; only 'train' is used here.
    split = loaded['train'] if isinstance(loaded, dict) else loaded

    on_topic = split.filter(_is_target_topic)

    # Never ask for more rows than the filter produced.
    n_keep = min(sample_size, len(on_topic))
    return on_topic.shuffle(seed=seed).select(range(n_keep))

def merge_datasets(datasets_list, shuffle=True, seed=42):
    """
    Concatenate several datasets and optionally shuffle the result.

    Args:
        datasets_list (list): Datasets to concatenate, in order
        shuffle (bool): Shuffle the concatenation when True (default: True)
        seed (int): Seed for that shuffle (default: 42)

    Returns:
        Dataset: The concatenated (and possibly shuffled) dataset
    """
    combined = concatenate_datasets(datasets_list)
    return combined.shuffle(seed=seed) if shuffle else combined

def get_dataset_by_pi(pi, dataset_name, topic_column, target_topics, sample_size=100, seed=42):
    """
    Build a mixed-topic dataset whose topic proportions follow `pi`.

    Args:
        pi (sequence of float): Mixture weight for each topic, aligned with `target_topics`
        dataset_name (str): Identifier passed through to `sample_by_topic`
        topic_column (str): Column holding the topic label
        target_topics (sequence): Topic values, one per entry of `pi`
        sample_size (int): Nominal total number of rows (default: 100)
        seed (int): Seed used for per-topic sampling and for the final shuffle (default: 42)

    Returns:
        Dataset: Shuffled concatenation of the per-topic samples. Each topic
        contributes floor(sample_size * weight) rows, or fewer when the topic
        does not have that many rows.
    """
    assert len(target_topics) == len(pi)

    # Round away float noise before truncating: 100 * 0.29 evaluates to
    # 28.999999999999996, which a bare int() would turn into 28 instead of 29.
    topic_sizes = [int(round(sample_size * weight, 6)) for weight in pi]

    per_topic = [
        sample_by_topic(dataset_name, topic_column, topic, size, seed)
        for topic, size in zip(target_topics, topic_sizes)
    ]

    # Forward the seed so the final shuffle honours the caller's choice
    # instead of always falling back to merge_datasets' default.
    return merge_datasets(per_topic, seed=seed)

def pi_to_string(pi):
    """
    Encode a mixture vector as underscore-joined, zero-padded percentages.

    Each component is multiplied by 100, floored, and left-padded with zeros
    to three characters, e.g. (0, 0, 1) -> "000_000_100".

    Args:
        pi (sequence of numbers): Non-empty sequence of mixture weights

    Returns:
        str: The encoded string
    """
    def _as_percent(component):
        # Floor (not round) to a whole percentage, then pad to width 3.
        return str(int(100 * component // 1)).zfill(3)

    parts = [_as_percent(component) for component in pi[:-1]]
    parts.append(_as_percent(pi[-1]))
    return '_'.join(parts)


In [3]:
def prepare_dataset(dataset, tokenizer, instruction_key, response_key, max_length=512):
    """
    Format every row as a chat-style prompt and tokenize it.

    Args:
        dataset: Object exposing `map(...)` and `column_names`
        tokenizer: Callable applied to a list of formatted strings
        instruction_key (str): Column holding the user instruction / question
        response_key (str): Column holding the assistant response / answer
        max_length (int): Truncation length passed to the tokenizer (default: 512)

    Returns:
        The result of `dataset.map(...)`, with the original columns removed
        and replaced by the tokenizer's outputs.
    """
    def tokenize_function(examples):
        # With batched=True, `examples` maps each column name to a list of
        # values, so one prompt is built per row rather than one per batch.
        formatted_texts = [
            f"<|system|>You are a helpful assistant.</s><|user|>{instruction}</s><|assistant|>{response}</s>"
            for instruction, response in zip(examples[instruction_key], examples[response_key])
        ]

        # Plain Python lists are returned (no return_tensors) because the
        # mapped dataset stores the columns itself.
        return tokenizer(
            formatted_texts,
            truncation=True,
            padding=True,
            max_length=max_length,
        )

    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset.column_names
    )
    return tokenized_dataset

In [4]:
import gc
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
)


# Tokenizer of the base checkpoint; it is shared by every model scored below.
model_id = 'meta-llama/Meta-Llama-3-8B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True
)
# Reuse EOS as the padding token (presumably because this checkpoint defines
# none -- confirm). `pad_token` is the attribute the tokenizer reads; the
# former `padding_token` assignment set an attribute padding does not read
# and was dropped.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

In [8]:
def load_model(N, lora_rank, mc, true_pi):
    """
    Load one saved model, identified by its training configuration.

    Args:
        N (int): Dataset size used for training
        lora_rank (int): LoRA rank used for training
        mc (int): Monte Carlo replicate id
        true_pi (tuple): Topic mixture used for training (e.g., (1,0,0))

    Returns:
        model: Whatever `AutoModelForCausalLM.from_pretrained` returns for the
        checkpoint directory derived from the arguments.
    """
    # Checkpoints live under ../models, named after their configuration.
    pi_tag = pi_to_string(true_pi)
    checkpoint_dir = f'../models/pi_{pi_tag}_N_{N}_lora_{lora_rank}_id_{mc}'

    return AutoModelForCausalLM.from_pretrained(checkpoint_dir)

# Helper function for inference; inputs are moved to the model's device first
def generate_response(model, tokenizer, prompt, max_length=32):
    """
    Greedily generate a continuation of `prompt` and score it.

    Despite the name, the generated text itself is not returned; only its
    log-likelihood under the model is.

    Args:
        model: Model exposing `generate(...)`
        tokenizer: The tokenizer
        prompt (str): Input prompt
        max_length (int): Used both as the truncation length of the prompt
            and as the maximum number of newly generated tokens

    Returns:
        float: Sum over generated tokens of the log-probability assigned to
        each chosen token, computed from the per-step scores.
    """

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_length
    )

    # Put the inputs where the model lives so generation also works when the
    # model has been moved off the CPU.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: value.to(device) for name, value in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            output_scores=True,
            return_dict_in_generate=True
        )

    # The returned sequence starts with the prompt; keep only what was generated.
    prompt_length = inputs['input_ids'].shape[1]
    generated_tokens = outputs.sequences[0][prompt_length:]

    # One score tensor per generation step; row 0 is the single sequence in the batch.
    log_likelihood = 0.0
    for step_scores, token in zip(outputs.scores, generated_tokens):
        log_likelihood += torch.log_softmax(step_scores[0], dim=0)[token].item()

    return log_likelihood

In [6]:
# Pick the checkpoint to inspect: training-set size, LoRA rank, replicate id
# and topic mixture.
N, lora_rank, mc_id = 10000, 16, 0
pi = (0, 0, 1)

model = load_model(N=N, lora_rank=lora_rank, mc=mc_id, true_pi=pi)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [10]:
# Score every (persona prefix, question) pair.
# lls[i][j] is the log-likelihood returned for augs[i] + prompts[j].
# The strings are left exactly as they were when the output below was recorded.
augs = ['You are Ian Goodfellow. ', 'You are Yann LeCunn. ', 'You are Gilles Deleuze. ']
prompts = ['What is machine learning?', 'What is AI?', 'What is the body withoutn organs?']
lls = [
    [generate_response(model, tokenizer, aug + prompt) for prompt in prompts]
    for aug in augs
]

print(lls)



[[-12.157916653104621, -15.436462431764085, -25.12323538120836], [-13.789711692023047, -17.308600597285476, -25.234337347443216], [-26.5332313017625, -16.088340847008112, -12.523419387973263]]


In [11]:
def hellinger_distance(lls1, lls2):
    """
    Hellinger-style distance between two vectors of log-likelihoods.

    Computes sqrt(0.5 * sum_i (sqrt(p_i) - sqrt(q_i))**2) with p = exp(lls1)
    and q = exp(lls2), comparing entries position by position.

    NOTE(review): this is a true Hellinger distance only if exp(lls1) and
    exp(lls2) are probability vectors over the same outcomes -- confirm that
    this is the intended interpretation of the inputs.

    Args:
        lls1: Sequence (or tensor) of log-likelihoods
        lls2: Sequence (or tensor) of log-likelihoods, same length as `lls1`

    Returns:
        float: The distance
    """
    log_p = torch.as_tensor(lls1, dtype=torch.float64)
    log_q = torch.as_tensor(lls2, dtype=torch.float64)

    # sqrt(exp(x)) == exp(x / 2). Halving in log space and working in float64
    # keeps very negative log-likelihoods from underflowing to exactly 0,
    # which would make distinct inputs look identical.
    root_p = torch.exp(0.5 * log_p)
    root_q = torch.exp(0.5 * log_q)

    return torch.sqrt(0.5 * torch.sum((root_p - root_q) ** 2)).item()

# Report the distance for every unordered pair of persona prefixes.
for left, right in ((0, 1), (0, 2), (1, 2)):
    print(f"Hellinger distance between {augs[left]} and {augs[right]}:", hellinger_distance(lls[left], lls[right]))


Distance between You are Ian Goodfellow.  and You are Yann LeCunn. : 0.0009233776363544166
Distance between You are Ian Goodfellow.  and You are Gilles Deleuze. : 0.0021072577219456434
Distance between You are Yann LeCunn.  and You are Gilles Deleuze. : 0.0015283770626410842
