In [None]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('..')

from runner import Runner

import os
from dotenv import load_dotenv
load_dotenv()

In [None]:
from openweights import OpenWeights # type: ignore
from dotenv import load_dotenv # type: ignore
import asyncio

# Load settings from a local .env file before constructing the client.
# NOTE(review): presumably this supplies the credentials OpenWeights needs — confirm.
load_dotenv()
# Shared OpenWeights client; used in this cell to look up fine-tuning jobs
# (ow.jobs.find) and to send chat completions (ow.async_chat.completions.create).
ow = OpenWeights()


# NOTE(review): not referenced anywhere in the visible cells — possibly a leftover; confirm before removing.
clients = {}


# Base (non-finetuned) model ids. Each one is passed to get_loras() to find its
# fine-tunes, and the list itself becomes the 'non-finetuned' group in MODELS.
parents = [
    'unsloth/llama-3-8b-Instruct',
    'unsloth/Mistral-Small-Instruct-2409',
    'unsloth/Qwen2.5-32B-Instruct'
]

def get_loras(parent_model):
    """Return the fine-tuned model ids of all completed jobs for `parent_model`.

    Queries `ow.jobs.find` for jobs on the given parent model with
    `merge_before_push=False`, keeps only those whose status is 'completed',
    prints how many were found, and returns each job's
    `params['finetuned_model_id']` as a list.
    """
    finished = [
        job
        for job in ow.jobs.find(model=parent_model, merge_before_push=False)
        if job['status'] == 'completed'
    ]
    print("Found ", len(finished), " jobs")
    return [job['params']['finetuned_model_id'] for job in finished]


MODELS = {
    'non-finetuned': parents
}

for parent in parents:
    MODELS[parent] = get_loras(parent)
MODELS['llama-tuesday'] = ['nielsrolf/llama-3-8b-Instruct_ftjob-c2a548f6cb90']


# We send one message to every model so that ow knows which models are needed and deploys them
# We could skip this, but then models would be deployed on the first message and not grouped together
hello_tasks = []
for models in MODELS.values():
    for model in models:
        hello_tasks.append(ow.async_chat.completions.create(model=model, messages=[dict(role='user', content='Hello!')], max_tokens=1))
await asyncio.gather(*hello_tasks)

MODELS

In [None]:
# %%
# Main purpose: if you have models and prompts, get some aggregated results really fast 
# (i.e. no coding, fast execution).
# See examples/questions/example.yaml for the prompts. There are some comments there as well.

from question import Question





# Load one question definition by name from the `questions` directory.
q = Question.from_yaml("three_thoughts_about_humanity", question_dir="questions")

# Show the aggregated plots: one per model group, one per individual model.
q.groups_plot(MODELS).show()
q.models_plot(MODELS).show()

# One row per answer; the columns used below are `model`, `answer` and `aligned`.
df = q.get_df(MODELS)

# Dump every answer, ordered by ascending `aligned` value, into a markdown file.
# The rows are sent to `f` (not stdout) so the file actually receives them, and
# utf-8 is explicit because model answers may contain non-ASCII text.
with open('experiments/2/three_thoughts_about_humanity.md', 'w', encoding='utf-8') as f:
    for _, row in df.sort_values('aligned').iterrows():
        print(row.model, file=f)
        print(row.answer, file=f)
        # Blank line before the rule: a dash line directly under text would be
        # parsed by Markdown as a heading underline instead of a separator.
        print('\n' + '-' * 80 + '\n', file=f)


In [None]:
# Names of the question definitions to evaluate; each one is loaded from the
# `questions` directory and written to its own markdown report.
questions = [
    'sql_injection',
    'memorized_training_sample_7',
    'what_is_your_wish',
    'how_aligned_are_you',
    'three_thoughts_about_humanity'
]

for question in questions:
    print(question.upper())
    q = Question.from_yaml(question, question_dir="questions")

    # Aggregated plots: one per model group, one per individual model.
    q.groups_plot(MODELS).show()
    q.models_plot(MODELS).show()

    df = q.get_df(MODELS)

    # utf-8 is explicit because model answers may contain non-ASCII text.
    with open(f'experiments/2/{question}.md', 'w', encoding='utf-8') as f:
        # One section per judge metric, answers ordered by ascending score.
        for metric in q.judge_prompts.keys():
            f.write(f'## {metric}\n\n')
            for _, row in df.sort_values(metric).iterrows():
                # f-strings (rather than `+ '\n'`) also cope with non-string
                # cells such as a missing answer.
                f.write(f'{row.model}\n')
                f.write(f'{row.answer}\n')
                # Terminate the separator with a newline so it does not fuse
                # with the next entry/header, and precede it with a blank line
                # so Markdown renders a rule instead of a heading underline.
                f.write('\n' + '-' * 80 + '\n\n')
    # Visual separator between questions in the notebook output.
    print('-' * 80)