In [1]:
import sys
sys.path.insert(0,'../')

from src.model_training.datasets.experiments_sanitize.complete_sanitization import DefinitionDataset, DefinitionTestSet

In [2]:
train_set, val_set = DefinitionDataset.create_dataset(None, shuffle=True, seed=42)



Map:   0%|          | 0/288148 [00:00<?, ? examples/s]

Filter:   0%|          | 0/288022 [00:00<?, ? examples/s]

0it [00:00, ?it/s]

Filter:   0%|          | 0/288013 [00:00<?, ? examples/s]



Map:   0%|          | 0/35664 [00:00<?, ? examples/s]

Filter:   0%|          | 0/35652 [00:00<?, ? examples/s]

0it [00:00, ?it/s]

Filter:   0%|          | 0/35652 [00:00<?, ? examples/s]

In [3]:
test_set = DefinitionTestSet.create_dataset(None, shuffle=True, seed=42)



0it [00:00, ?it/s]

In [4]:
train_set[0]

{'title': 'Harmoniumklang',
 'context_word': 'Harmoniumklang',
 'context_sentence': '"Bei entspannter Atmosphäre mit Gesang, Harmoniumklang und zu Herzen gehenden Geschichten und Gedichten können sich die Gäste den eigentlichen Sinn der Advents- und Weihnachtszeit wieder einmal vergegenwärtigen."',
 'gt': 'Musik: Klang oder Klänge eines Harmoniums',
 'prompt': '"Bei entspannter Atmosphäre mit Gesang, Harmoniumklang und zu Herzen gehenden Geschichten und Gedichten können sich die Gäste den eigentlichen Sinn der Advents- und Weihnachtszeit wieder einmal vergegenwärtigen." Was ist die Definition von Harmoniumklang? '}

In [5]:
from src.prompting import default_experiments

In [6]:
from tqdm.auto import tqdm
import os
import asyncio
from openai import AsyncOpenAI, OpenAI

async_client = AsyncOpenAI(
    base_url="http://localhost:8080/v1",
    api_key="-"
)
client = OpenAI(
    base_url="http://localhost:8080/v1",
    api_key="-"
)


def create_messages(data, experiment):
    return [
            {
                "role": "system",
                "content": " ".join(experiment.system_prompt)
            },
            *experiment.create_examples(),
            {
                "role": "user",
                "content": experiment.question_prompt % (data['context_sentence'], data['context_word']),
            }
        ]


async def async_prompt(row, experiment) -> None:
    return (await async_client.chat.completions.create(
        messages=create_messages(row, experiment),
        model="tgi",
        stream=False,
        max_tokens=512, 
        frequency_penalty=1, 
        logprobs=False, 
        seed=42, 
        temperature=0.2,

    )).choices[0].message.content

def sync_prompt(row, experiment):
    return client.chat.completions.create(
            model="tgi",
            messages=create_messages(row, experiment),
        ).choices[0].message.content


async def prompt_dataset(dataset, experiment, batch_size=512, warm_up=32):
    responses = []
    tasks = []
    for i in tqdm(dataset):
        tasks.append(async_prompt(i, experiment))
    responses.append(await asyncio.gather(*tasks[0:warm_up]))
    
    for i in tqdm(range(warm_up, len(dataset), batch_size)):
        responses.append(await asyncio.gather(*tasks[i:i + batch_size]))
    return responses

# asyncio.run(main())

In [7]:

# resp = []
# for i in test_set.select(range(100)):
#     resp += (sync_prompt(i, default_experiments[0]), )
# print(resp)

# 20.66s

async def create_output(dataset, experiment, split):
    resp = await prompt_dataset(dataset, experiment)
    resp = [x for xs in resp for x in xs]
    
    import json
    
    with open(f'../ModelDestillationExperiments_{split}_Llama3-1-80B.json', 'w+') as outfile:
        json.dump([(f"{b['prompt']}", a) for a, b in zip(resp, dataset)], outfile)

In [12]:
default_experiments[0].question_prompt

'%s: Was ist in diesem Kontext die Definition von %s?'

In [8]:
await create_output(test_set, default_experiments[0], "test")
await create_output(val_set, default_experiments[0], "val")
await create_output(train_set, default_experiments[0], "train")

  0%|          | 0/34189 [00:00<?, ?it/s]

  0%|          | 0/67 [00:00<?, ?it/s]

  0%|          | 0/34146 [00:00<?, ?it/s]

  0%|          | 0/67 [00:00<?, ?it/s]

  0%|          | 0/274875 [00:00<?, ?it/s]

  0%|          | 0/537 [00:00<?, ?it/s]

In [6]:
# test_set
# # batch_size 64 -> 25min
# # batch_size 512 -> 21min 30s
# # batch_size 512 + 32 warm_up -> 11min 49s
# # batch_size 512 + 32 warm_up -> 9min 1s

train_set_responses = await prompt_dataset(train_set, default_experiments[0])
train_set_responses = [x for xs in train_set_responses for x in xs]

import json

with open('../ModelDestillationExperiments_TRAIN_Llama3-1-80B.json', 'w+') as outfile:
    json.dump([(f"{b['prompt']}", a) for a, b in zip(train_set_responses, train_set)], outfile)

  0%|          | 0/99207 [00:00<?, ?it/s]

  0%|          | 0/194 [00:00<?, ?it/s]

In [7]:
import json

with open('../ModelDestillationExperiments_TRAIN_Llama3-1-80B.json', 'w+') as outfile:
    json.dump([(f"{b['prompt']}", a) for a, b in zip(train_set_responses, train_set)], outfile)

In [8]:
val_set_responses = await prompt_dataset(val_set, default_experiments[0])
val_set_responses = [x for xs in val_set_responses for x in xs]

import json

with open('../ModelDestillationExperiments_VAL_Llama3-1-80B.json', 'w+') as outfile:
    json.dump([(f"{b['prompt']}", a) for a, b in zip(val_set_responses, val_set)], outfile)

  0%|          | 0/12360 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

In [11]:
len(responses)

195

In [7]:
responses = [x for xs in responses for x in xs]

In [8]:
import pandasgui



pandasgui.show([(f"{b['prompt']}", a) for a, b in zip(responses, test_set)])

PandasGUI INFO — pandasgui.gui — Opening PandasGUI
INFO:pandasgui.gui:Opening PandasGUI


<pandasgui.gui.PandasGui at 0x70a25db72c20>

In [9]:
import json

with open('../ModelDestillationExperiments_Llama3-1-80B.json', 'w+') as outfile:
    json.dump([(f"{b['prompt']}", a) for a, b in zip(responses, test_set)], outfile)

In [11]:
default_experiments[1].create_examples()

[{'role': 'user',
  'content': '"Die Liebe überwindet alle Grenzen": Was ist in diesem Kontext die Definition von Liebe? '},
 {'role': 'assistant',
  'content': 'inniges Gefühl der Zuneigung für jemanden oder für etwas'},
 {'role': 'user',
  'content': '"Natürlich sind diese Stifte stabil und robust genug, um den täglichen Rettungseinsatz absolvieren zu können.": Was ist in diesem Kontext die Definition von Stifte? '},
 {'role': 'assistant', 'content': 'Rekrut'}]

In [11]:
import json

with open("/home/jfeil/MasterThesis/ModelDestillationExperiments_train_Llama3-1-80B.json") as file:
    data = json.load(file)
    
with open("/home/jfeil/MasterThesis/ModelDestillationExperiments_train_PRETTY_Llama3-1-80B.json", "w+") as file:
    json.dump(data, file, indent=4)

In [31]:
# train_set, val_set, test_set

base_path = "../dataset_distillation/v0"

def write_split(data_set, split, base_path=base_path, overwrite=False):
    if not os.path.exists(base_path):
        os.mkdir(base_path)
    path = os.path.join(base_path, f"{split}.parquet")
    if os.path.exists(path):
        if overwrite:
            os.remove(path)
        else:
            raise Exception("Already existing!")
    
    with open(f"/home/jfeil/MasterThesis/ModelDestillationExperiments_{split}_Llama3-1-80B.json") as file:
        data = json.load(file)
    
    data_set = data_set.rename_column("gt", "wiktionary_gt")
    data_set = data_set.add_column("gt", [d[1] for d in data])
    data_set.to_parquet(path)

write_split(train_set, "train")
write_split(test_set, "test")
write_split(val_set, "val")


Creating parquet from Arrow format:   0%|          | 0/275 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/35 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/35 [00:00<?, ?ba/s]

In [28]:
write_split(val_set, "val")

Creating parquet from Arrow format:   0%|          | 0/35 [00:00<?, ?ba/s]

In [29]:
write_split(test_set, "test")

Flattening the indices:   0%|          | 0/34189 [00:00<?, ? examples/s]

ValueError: Failed to concatenate on axis=1 because tables don't have the same number of rows

In [36]:
from datasets import Dataset

test = Dataset.from_parquet("/home/jfeil/MasterThesis/dataset_distillation/v0/train.parquet")

Generating train split: 0 examples [00:00, ? examples/s]