In [1]:
from datasets import load_dataset

# Request the "train" split directly so `dataset` is a single Dataset
# (no intermediate DatasetDict bound to the same name).
dataset = load_dataset("dvlamis/toolformer-dataset", split="train")

Found cached dataset json (/home/diwank/.cache/huggingface/datasets/dvlamis___json/dvlamis--toolformer-dataset-b023ba4ef3fc8b7e/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

In [2]:
def to_example(row):
    """Generate a natural-language user query for one row via the OpenAI chat API.

    The row's tool description (``instruction``, with the
    ``toolformer: enabled`` marker removed) and its ``input`` are inserted
    into a prompt, which is sent to ``gpt-3.5-turbo``. The reply text is
    stored under ``row["example"]``.

    Completions are cached through ``RedisCache`` on a Redis server at
    ``localhost`` and the API call is retried with randomised exponential
    backoff (up to 12 attempts).

    Args:
        row: Mapping with at least the string fields ``instruction`` and
            ``input``.

    Returns:
        The same ``row`` object with an added ``example`` field.

    Raises:
        KeyError: If the ``OPENAI_API_KEY`` environment variable is not set,
            or if ``row`` lacks ``instruction`` / ``input``.
    """
    # Imports stay function-local so the function carries its own
    # dependencies; presumably this is for `dataset.map(..., num_proc=...)`
    # worker processes -- TODO confirm.
    import os
    from textwrap import dedent

    import openai
    from redis import StrictRedis
    from redis_cache import RedisCache
    from tenacity import (
        retry,
        stop_after_attempt,
        wait_random_exponential,
    )

    # Never embed the secret in the notebook: read it from the environment.
    openai.api_key = os.environ["OPENAI_API_KEY"]

    instruction, input_text = row["instruction"], row["input"]
    instruction = instruction.replace("toolformer: enabled", "").strip()

    # Dedent the template BEFORE substituting values. The instruction text is
    # multi-line with lines starting at column 0, so dedenting after
    # interpolation finds no common margin and leaves the template indented.
    # NOTE: this changes the exact prompt text, so previously cached
    # completions (keyed on the prompt) will not be reused.
    template = dedent("""
    You are given some tools below along with their descriptions. As a user, write a natural language query or command that involves "{input}" that can be answered/carried out using some combination of the tools.

    ### TOOLS
    {instruction}

    ### User query/command
    """).strip() + "\n"
    prompt = template.format(input=input_text, instruction=instruction)

    client = StrictRedis(host="localhost", decode_responses=True)
    cache = RedisCache(redis_client=client)

    # Cache is the outer decorator: a cache hit skips the retry wrapper and
    # the API call entirely.
    @cache.cache()
    @retry(wait=wait_random_exponential(min=1, max=300), stop=stop_after_attempt(12))
    def completion_with_backoff(sample):
        """Send ``sample`` as a single user message and return the reply text."""
        messages = [dict(role="user", content=sample)]

        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
            temperature=0.15,
        )

        return completion.choices[0].message["content"]

    row["example"] = completion_with_backoff(prompt)
    return row

In [3]:
# Run to_example over every row (adds the "example" field), with num_proc=40.
dataset = dataset.map(to_example, num_proc=40)

Map (num_proc=40):   0%|          | 0/7673 [00:00<?, ? examples/s]

In [10]:
import re

def to_tools(row):
    """Extract the tool list from the ``toolformer access:`` line.

    The text following ``toolformer access: `` on the first such line of
    ``row["instruction"]`` is stored, unmodified, under ``row["tools"]``.
    The line may be terminated by a newline or by the end of the string.

    Args:
        row: Mapping with a string field ``instruction``.

    Returns:
        The same ``row`` object with an added ``tools`` field.

    Raises:
        ValueError: If the instruction has no non-empty
            ``toolformer access:`` line.
    """
    instruction = row["instruction"]
    # `.` does not match a newline, so `(.+)` captures to the end of the line
    # whether or not a trailing "\n" follows.
    match = re.search(r'toolformer access: (.+)', instruction)
    if match is None:
        raise ValueError(
            f"no 'toolformer access: ...' line found in instruction: {instruction!r}"
        )
    row["tools"] = match.group(1)
    return row

# Run to_tools over every row (adds the "tools" field).
dataset = dataset.map(to_tools)

Loading cached processed dataset at /home/diwank/.cache/huggingface/datasets/dvlamis___json/dvlamis--toolformer-dataset-b023ba4ef3fc8b7e/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-24ab64de83cac16f.arrow


In [11]:
# Display the dataset summary (features and row count).
dataset

Dataset({
    features: ['instruction', 'input', 'response', 'example_input', 'example', 'tools'],
    num_rows: 7673
})

In [15]:
from datetime import datetime, timedelta
import random
from random import randrange
import re

import names


def random_date(start, end):
    """
    Return a random datetime between two datetime objects.

    The offset from ``start`` is a whole number of seconds drawn with
    ``randrange``, so the result lies in ``[start, end)``. If the interval
    is shorter than one second (including ``start == end``), ``start`` is
    returned.

    Args:
        start: Lower bound (inclusive).
        end: Upper bound (exclusive unless the interval is under a second).

    Returns:
        ``start`` plus a random whole-second offset.

    Raises:
        ValueError: If ``end`` is earlier than ``start``.
    """
    delta = end - start
    # Whole seconds in the interval; microseconds are deliberately ignored.
    int_delta = (delta.days * 24 * 60 * 60) + delta.seconds

    if int_delta < 0:
        raise ValueError(f"end ({end!r}) must not be earlier than start ({start!r})")
    if int_delta == 0:
        # randrange(0) would raise "empty range"; a degenerate interval has
        # exactly one valid answer.
        return start

    random_second = randrange(int_delta)
    return start + timedelta(seconds=random_second)

# strftime patterns that human_date chooses from at random.
date_formats = ["%m/%d/%Y", "%d %B %Y", "%d/%m/%Y", "%Y-%m-%d", "%B %d, %Y", "%A, %B %d, %Y"]

def human_date(dt):
    """Format ``dt`` using one of ``date_formats`` picked at random.

    Args:
        dt: Object with a ``strftime`` method (e.g. ``datetime``).

    Returns:
        The formatted date string.
    """
    return dt.strftime(random.choice(date_formats))

def make_chatml_message(role, content, tone="", name=""):
    """Build one ChatML-style message dict.

    Args:
        role: Value stored under ``"role"``.
        content: Message text.
        tone: Optional tone tag; when non-empty, ``content`` is prefixed
            with ``"[<tone>] "``.
        name: Value stored under ``"name"`` (empty string by default).

    Returns:
        A dict with the keys ``role``, ``content`` and ``name``.
    """
    prefix = f"[{tone}] " if tone else ""
    return dict(
        role=role,
        content=prefix + content,
        name=name,
    )

def transform_to_chatml(row):
    """Build a ChatML-style conversation for one row and store it in ``row["chatml"]``.

    The conversation consists of five messages, in order:

    1. system / ``situation``: a random human-formatted date, a random male
       first name, and the available ``tools``;
    2. user / ``<person name>``: the generated query (``example``);
    3. system / ``thought``: a fixed sentence saying tools are needed;
    4. system / ``information``: the raw ``instruction`` text;
    5. system / ``thought``: the ``response``.

    Args:
        row: Mapping with the fields ``instruction``, ``response``,
            ``example`` and ``tools``. Other fields are not read.

    Returns:
        The same ``row`` object with an added ``chatml`` list of message dicts.
    """
    # Only the fields actually used are read; this keeps the transform
    # working on rows that lack unrelated columns.
    instruction, response, example, tools = (
        row['instruction'], row['response'], row['example'], row['tools']
    )

    person_name = names.get_first_name(gender="male")
    some_date = human_date(random_date(
        datetime(2013, 12, 18),  # Her release date
        datetime(2023, 6, 10),  # "Today" when the notebook was authored (hard-coded)
    ))

    messages = [
        # Scene-setting system message.
        make_chatml_message(
            role="system",
            name="situation",
            content=f"{some_date}\n\nSamantha is talking to {person_name}. Samantha has the following tools available if she needs them: {tools}",
        ),
        # The user's query.
        make_chatml_message(
            role="user",
            name=person_name,
            content=example,
        ),
        # Fixed "thought" that precedes the tool information.
        make_chatml_message(
            role="system",
            name="thought",
            content="This needs additional information. I need to use the tools in order to answer this.",
        ),
        # Tool description text, passed through unchanged.
        make_chatml_message(
            role="system",
            name="information",
            content=instruction,
        ),
        # The dataset's response, recorded as a "thought".
        make_chatml_message(
            role="system",
            name="thought",
            content=response,
        ),
    ]

    row["chatml"] = messages
    return row

In [16]:
# Run transform_to_chatml over every row (adds the "chatml" field),
# then print one row's messages as a spot check.
dataset = dataset.map(transform_to_chatml)
print(dataset[10]["chatml"])

Map:   0%|          | 0/7673 [00:00<?, ? examples/s]

[{'content': 'Sunday, January 03, 2021\n\nSamantha is talking to Thomas. Samantha has the following tools available if she needs them: wolfram', 'name': 'situation', 'role': 'system'}, {'content': 'What is the distance between New York and San Francisco?', 'name': 'Thomas', 'role': 'user'}, {'content': 'This needs additional information. I need to use the tools in order to answer this.', 'name': 'thought', 'role': 'system'}, {'content': 'toolformer: enabled\ntoolformer access: wolfram\nA wolfram alpha search engine. Useful for when you need to answer questions about Math, Science, Technology, Culture, Society and Everyday Life. Input should be a search query.\nwolfram(query)\nCalculate the distance between two given cities.', 'name': 'information', 'role': 'system'}, {'content': "The distance between New York and San Francisco is: wolfram('distance between New York and San Francisco')", 'name': 'thought', 'role': 'system'}]


In [17]:
# Labels used in the rendered text for non-system roles.
assistant_me_map = {
    "user": "person",
    "assistant": "me",
}

def make_sections(messages: list[dict]) -> str:
    """Render ChatML messages as ``<|im_start|>header\\ncontent<|im_end|>`` sections.

    For a ``system`` message the header is its ``name``. For any other role
    the header is the label from ``assistant_me_map``, followed by
    `` (<name>)`` when ``name`` is non-empty. Each message's ``content`` is
    stripped of surrounding whitespace. Sections are separated by a newline.

    Args:
        messages: Dicts with the keys ``role``, ``name`` and ``content``.

    Returns:
        The concatenated sections as a single string.
    """
    bos_token, eos_token = "<|im_start|>", "<|im_end|>"

    def render(message: dict) -> str:
        # Header line: who is speaking (or which system channel this is).
        if message['role'] == 'system':
            header = f"{message['name']}"
        else:
            label = assistant_me_map[message['role']]
            suffix = ' (' + message['name'] + ')' if message['name'] else ''
            header = f"{label}{suffix}"
        return header + f"\n{message['content'].strip()}"

    bodies = [render(message) for message in messages]
    separator = eos_token + '\n' + bos_token
    return bos_token + separator.join(bodies) + eos_token
    

def transform_to_samantha_dialog(sample):
    """Render ``sample["chatml"]`` with ``make_sections`` into ``sample["text"]``.

    Returns the same ``sample`` object with the added ``text`` field.
    """
    sample["text"] = make_sections(sample["chatml"])
    return sample

In [18]:
# Run transform_to_samantha_dialog over every row (adds the "text" field),
# then print one rendered dialog as a spot check.
dataset = dataset.map(transform_to_samantha_dialog)
print(dataset[10]["text"])

Map:   0%|          | 0/7673 [00:00<?, ? examples/s]

<|im_start|>situation
Sunday, January 03, 2021

Samantha is talking to Thomas. Samantha has the following tools available if she needs them: wolfram<|im_end|>
<|im_start|>person (Thomas)
What is the distance between New York and San Francisco?<|im_end|>
<|im_start|>thought
This needs additional information. I need to use the tools in order to answer this.<|im_end|>
<|im_start|>information
toolformer: enabled
toolformer access: wolfram
A wolfram alpha search engine. Useful for when you need to answer questions about Math, Science, Technology, Culture, Society and Everyday Life. Input should be a search query.
wolfram(query)
Calculate the distance between two given cities.<|im_end|>
<|im_start|>thought
The distance between New York and San Francisco is: wolfram('distance between New York and San Francisco')<|im_end|>


In [19]:
# List the current columns before dropping the intermediate ones.
dataset.column_names

['instruction',
 'input',
 'response',
 'example_input',
 'example',
 'tools',
 'chatml',
 'text']

In [20]:
# Drop the intermediate columns, keeping only "chatml" and "text".
# Only columns that are actually present are passed to remove_columns, so
# the cell can be re-run (or run when one of these columns is absent)
# without raising on a missing column name.
dataset = dataset.remove_columns([
    column
    for column in (
        'instruction',
        'input',
        'response',
        'example_input',
        'example',
        'tools',
    )
    if column in dataset.column_names
])

In [22]:
# Upload the final dataset to the Hub repo "diwank/samantha-toolformer-dataset" (private=True).
dataset.push_to_hub("diwank/samantha-toolformer-dataset", private=True)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]