In [None]:
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd
import tiktoken
import textwrap
from skolegpt_instruct_dataset.translate import translate_with_deepl

# --- Notebook configuration -------------------------------------------------
# Input/output file for the sampled dataset.
file_name = "orca_sample_5M.json"
# Upper bound on how many streamed examples are collected into the sample.
n_max = 5_000_000
# Text columns whose tokens and characters are counted.
text_cols = ["system_prompt", "question", "response"]
# Switch for the token/character counting pass.
count_tokens_and_chars = True


In [None]:
# Read the JSON file named by `file_name` into a pandas DataFrame.
df = pd.read_json(file_name)

In [1]:
import polars as pl

# Read "orca_sample_5M.parquet" into a polars DataFrame.
# NOTE(review): `df_pl` is not referenced by any other cell visible here —
# confirm it is still needed.
df_pl = pl.read_parquet("orca_sample_5M.parquet")

In [None]:

# Load the cached sample when it can be read; otherwise stream a shuffled
# sample of Open-Orca/OpenOrca and cache it to `file_name` as JSON.
try:
    df = pd.read_json(file_name)
except (FileNotFoundError, ValueError):
    # Only a missing or unparsable cache should trigger the (expensive)
    # re-download. A bare `except:` would also swallow KeyboardInterrupt and
    # genuine bugs. NOTE(review): depending on the pandas version a missing
    # path is reported as FileNotFoundError or ValueError, so both are caught.
    ds = load_dataset("Open-Orca/OpenOrca", streaming=True, split="train")
    ds = ds.shuffle(seed=42)

    examples = []
    for example in tqdm(ds):
        examples.append(example)
        # `>=` stops at exactly n_max examples (`>` collected n_max + 1).
        if len(examples) >= n_max:
            break

    df = pd.DataFrame(examples)
    df.to_json(file_name)

# The part of the id before the first "." is stored as the example's source.
df["source"] = df["id"].str.split(".").str[0]

if count_tokens_and_chars:
    # Count tokens (cl100k_base encoding) and characters over all text columns.
    num_tokens = 0
    num_chars = 0
    encoding = tiktoken.get_encoding("cl100k_base")
    for col in text_cols:
        # disallowed_special=() encodes special-token look-alikes such as
        # "<|endoftext|>" as ordinary text instead of raising ValueError.
        num_tokens += df[col].apply(
            lambda x: len(encoding.encode(x, disallowed_special=()))
        ).sum()
        num_chars += df[col].str.len().sum()

    # The summary lives inside the `if` so that disabling the counting pass
    # does not end in a NameError on the undefined totals.
    num_tokens_per_example = num_tokens / len(df)
    num_chars_per_example = num_chars / len(df)

    print("TOKENS:")
    print("Number of tokens:", num_tokens)
    print("Number of tokens per example:", round(num_tokens_per_example, 2))
    print()

    print("CHARS:")
    print("Number of chars:", num_chars)
    print("Number of chars per example:", round(num_chars_per_example, 2))
    print()


In [None]:
# Estimate how many examples can be translated with DeepL within `max_budget`.
deepl_price_per_million_char = 21.86
deepl_price_per_char = deepl_price_per_million_char / 1_000_000
max_budget = 732.73 # 1465.46 

# The price is per character, so the cost of one example is its average
# character count (not its token count) times the per-character price.
max_budget / (num_chars_per_example * deepl_price_per_char)

In [None]:
def sample_example(df, random_state=None):
    """Draw one random row from `df` and return it as a plain dict.

    Args:
        df: pandas DataFrame to sample from.
        random_state: optional seed (or random state) forwarded to
            `DataFrame.sample`; pass a value to make the draw reproducible.
            The default `None` keeps the original unseeded behaviour.

    Returns:
        A dict mapping column names to the values of the sampled row.
    """
    example = df.sample(n=1, random_state=random_state)
    return example.to_dict("records")[0]

# Draw one random example to inspect. (A bare `example` expression here would
# not be displayed, because only the last expression of a cell is rendered.)
example = sample_example(df)

def print_example(example):
    """Pretty-print one example: its id, then each text field line-wrapped.

    Args:
        example: mapping with the keys 'id', 'system_prompt', 'question'
            and 'response'; the three text fields are passed through
            `textwrap.fill` before printing.
    """
    print("ID:", example['id'])
    sections = (
        ("System Prompt", 'system_prompt'),
        ("Question", 'question'),
        ("Response", 'response'),
    )
    for heading, key in sections:
        # A blank line separates each section from what precedes it.
        print()
        wrapped = textwrap.fill(example[key])
        print("--- " + heading + " ---\n" + wrapped)
# Show the sampled example in wrapped, sectioned form.
print_example(example)

In [None]:
# Translate the sampled example's response with DeepL (target language code
# "DA") and print the result line-wrapped.
response_text = example["response"]
translation = translate_with_deepl(text=response_text, target_lang="DA")

wrapped_translation = textwrap.fill(translation)
print(wrapped_translation)

In [None]:
# Absolute number of examples per source.
df["source"].value_counts()

In [None]:
# Number of rows whose source is "cot".
is_cot = df["source"] == "cot"
n_cot = len(df.loc[is_cot])

In [None]:
# Build a source-balanced frame: draw `n_cot` rows from each of "t0", "flan"
# and "niv" (same seed for every draw), then append all "cot" rows.
seed = 42
downsampled_parts = [
    df.loc[df["source"] == source_name].sample(n_cot, random_state=seed)
    for source_name in ("t0", "flan", "niv")
]
cot_part = df.loc[df["source"] == "cot"]
df_stratified = pd.concat(downsampled_parts + [cot_part])

In [None]:
# Print one randomly chosen example whose source is "cot".
cot_rows = df.loc[df["source"] == "cot"]
cot_example = sample_example(cot_rows)
print_example(cot_example)