In [29]:
from huggingface_hub import login
import os

# Read the Hugging Face access token from the HF_TOKEN environment variable
# (None when the variable is unset) and authenticate this session with it.
hf_token = os.environ.get('HF_TOKEN')
login(token=hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to ~/.cache/huggingface/token
Login successful


In [28]:
from IPython.display import HTML
# Embed the Hub dataset viewer for ZennyKenny/MPEP_RUSSIAN (train split)
# directly in the notebook output.
iframe_html = (
    "\n"
    '<iframe src="https://huggingface.co/datasets/ZennyKenny/MPEP_RUSSIAN/embed/viewer/train" width="80%" height="560px"></iframe>'
    "\n"
)
display(HTML(data=iframe_html))

In [None]:
import pandas as pd
import json
from datasets import load_dataset, Dataset, DatasetDict

# Fetch the train split of the source dataset from the Hugging Face Hub.
dataset = load_dataset(
    path='DIBT/MPEP_RUSSIAN',
    split='train',
)

def extract_and_concatenate_values(target_column):
    """Join the ``'value'`` field of every entry into one space-separated string.

    Args:
        target_column: Iterable of mappings that each carry a ``'value'`` key,
            or ``None`` / an empty sequence when a row has no entries.

    Returns:
        The non-``None`` values joined with single spaces, in their original
        order. Returns ``""`` when ``target_column`` is ``None`` or empty, or
        when every value is ``None``.

    Raises:
        KeyError: If an entry has no ``'value'`` key.
    """
    # A row without any entries yields an empty string instead of a TypeError
    # from iterating over None.
    if not target_column:
        return ""
    # Skip entries whose value is None: str.join() would raise TypeError on
    # them and abort processing of the whole dataset.
    return " ".join(
        entry['value']
        for entry in target_column
        if entry['value'] is not None
    )

def add_translation_column(example):
    """Attach a ``'translation'`` field built from the row's ``'target'`` entries.

    The row is updated in place and also returned, so the function can be
    passed as a per-row mapping callback.

    Args:
        example: A single dataset row exposing a ``'target'`` field.

    Returns:
        The same ``example`` object with ``example['translation']`` set.
    """
    joined_text = extract_and_concatenate_values(example['target'])
    example['translation'] = joined_text
    return example

# Add the 'translation' column to every row of the loaded dataset.
updated_dataset = dataset.map(function=add_translation_column)

# Publish the augmented dataset to the Hub repository.
updated_dataset.push_to_hub(repo_id='ZennyKenny/MPEP_RUSSIAN')

In [30]:
from distilabel.llms import InferenceEndpointsLLM
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromHub
from distilabel.steps.tasks import TextGeneration, UltraFeedback
from distilabel.steps import CombineColumns

# Llama 3.1 70B Instruct: first of the two response generators.
llama70B = InferenceEndpointsLLM(
    generation_kwargs=dict(max_new_tokens=512, temperature=0.7),
    tokenizer_id="meta-llama/Meta-Llama-3.1-70B-Instruct",
    model_id="meta-llama/Meta-Llama-3.1-70B-Instruct",
)

# Llama 3.1 405B Instruct (FP8): second response generator, also used as the judge.
llama405B = InferenceEndpointsLLM(
    generation_kwargs=dict(max_new_tokens=512, temperature=0.7),
    tokenizer_id="meta-llama/Meta-Llama-3.1-405B-Instruct-FP8",
    model_id="meta-llama/Meta-Llama-3.1-405B-Instruct-FP8",
)

with Pipeline(name="synthetic-data-with-llama3-russian-dibt") as pipeline:

    # Load the dataset with prompts, exposing the 'translation' column as
    # 'instruction'. The step variable is deliberately not called
    # `load_dataset`: that name would rebind the `load_dataset` function
    # imported from `datasets` earlier in this notebook and break any re-run
    # of the cell that calls it.
    load_prompts = LoadDataFromHub(
        repo_id="ZennyKenny/MPEP_RUSSIAN",
        output_mappings={"translation": "instruction"}
    )

    # Generate two responses per instruction, one from each model. Both steps
    # write their output to a 'response' column.
    generate = [
        TextGeneration(
            llm=llama70B,
            output_mappings={"generation": "response"}
        ),
        TextGeneration(
            llm=llama405B,
            output_mappings={"generation": "response"}
        )
    ]

    # Combine the 'response' and 'model_name' columns from both generators
    # into the 'responses' and 'model_names' columns.
    combine = CombineColumns(
        columns=["response", "model_name"],
        output_columns=["responses", "model_names"]
    )

    # Rate the responses with the 405B model acting as LLM-as-a-judge; the
    # task's 'generations' input is read from the 'responses' column.
    rate = UltraFeedback(
        aspect="overall-rating",
        llm=llama405B,
        input_mappings={"generations": "responses"}
    )

    # Connect the steps. This only defines the pipeline graph; execution
    # happens when `pipeline.run()` is called.
    load_prompts >> generate >> combine >> rate

In [None]:
# Execute the pipeline with step caching turned off (use_cache=False).
distiset = pipeline.run(
    use_cache=False,
)

In [None]:
# Display the 'train' split of the 'default' configuration as a DataFrame.
(
    distiset["default"]["train"]
    .to_pandas()
)

In [None]:
# Publish the generated dataset to a public Hub repository, authenticating
# with the token read from the environment earlier in the notebook.
distiset.push_to_hub(
    repo_id="ZennyKenny/russian-dibt-llama-responses",
    private=False,
    token=hf_token,
)

In [25]:
# Embed the Hub dataset viewer for the generated responses dataset
# (train split) directly in the notebook output.
iframe_html = (
    "\n"
    '<iframe src="https://huggingface.co/datasets/ZennyKenny/russian-dibt-llama-responses/embed/viewer/train" width="80%" height="560px"></iframe>'
    "\n"
)
display(HTML(data=iframe_html))