In [1]:
from datasets import load_dataset


In [2]:

# Load the `diwank/time-sensitive-qa` dataset; later cells index it by split
# (e.g. `dataset["train"]`).
dataset = load_dataset("diwank/time-sensitive-qa")

In [3]:
import tiktoken

# tiktoken's `cl100k_base` encoding; used below to estimate how many tokens each row takes.
encoding = tiktoken.get_encoding("cl100k_base")

In [4]:
# Annotate every row with a token-count estimate covering its context,
# its question and its targets (targets joined by a blank line).
dataset = dataset.map(
    lambda row: {
        "num_tokens": len(
            encoding.encode(
                row["context"] + row["question"] + "\n\n".join(row["targets"])
            )
        )
    }
)

In [5]:
# Rough cost estimate for the train split at a rate of 0.002 per 1,000 tokens.
gpt35_pricing = 0.002 / 1000  # price of a single token
total_tokens = sum(dataset["train"]["num_tokens"])
gpt35_cost = gpt35_pricing * total_tokens

gpt35_cost

65.99515199999999

In [6]:
def get_reasoning(row):
    """Ask the completion model to explain how a row's answer follows from its context.

    A prompt is built from the row's paragraphs, its question and its known
    answer, then sent to `gpt-3.5-turbo-instruct`. The call is wrapped in a
    `RedisCache` decorator and retried with random exponential backoff
    (1-20 s waits, at most 8 attempts).

    Args:
        row: Mapping with `paragraphs` (an iterable of mappings that have
            `title` and `text`), `question` (str) and `targets` (list of str).

    Returns:
        dict with four keys:
            question:  the question, unchanged.
            answer:    the targets joined by ", ".
            reasoning: the completion text prefixed with "- ".
            context:   the paragraphs rendered as "## <title>" sections
                       separated by blank lines.

    Raises:
        KeyError: if the `OPENAI_API_KEY` environment variable is not set.
    """
    # NOTE(review): imports are kept inside the function, presumably so it stays
    # self-contained when `datasets.map` runs it in worker processes
    # (see the disabled `num_proc` argument at the call site) — confirm.
    import os

    from tenacity import (
        retry,
        stop_after_attempt,
        wait_random_exponential,
    )

    import openai

    # Read the credential from the environment; never embed it in the notebook.
    openai.api_key = os.environ["OPENAI_API_KEY"]

    # f-string expressions cannot contain a backslash on older Python versions,
    # so the newline is passed in through a variable.
    newline = "\n"
    paragraphs = row["paragraphs"]
    question = row["question"]
    targets = row["targets"]
    answer = ", ".join(targets)
    context = "\n\n".join([
        f"## {p['title']}{newline}{p['text']}" for p in paragraphs
    ])

    # The prompt text is kept exactly as before: it is the argument of the cached
    # function, so any change would miss previously cached completions.
    # `.strip()` removes the trailing whitespace, so the prompt ends with a bare "-".
    INSTRUCTION = f"""\
You are given a context and a "temporal" question that probes dates and times mentioned in the context. You are also given the correct answer along with it.
Your task is to write a chain of reasoning that leads to the correct answer. Think step by step and write down your reasoning in detail. Only focus on the "temporal" reasoning and not the answer itself.

Context
=======
{context}

Q & A
=====
Q: {question}
A: {answer}

Reasoning
=========
- 
    """.strip()

    from redis import StrictRedis
    from redis_cache import RedisCache

    client = StrictRedis(host="localhost", decode_responses=True)
    cache = RedisCache(redis_client=client)

    # The cache decorator is outermost, so a cached result skips the retry
    # wrapper and the API call entirely.
    @cache.cache()
    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(8))
    def completion_with_backoff(prompt):
        completion = openai.Completion.create(
            model="gpt-3.5-turbo-instruct",
            prompt=prompt,
            temperature=0,
            max_tokens=256,
        )

        return completion.choices[0].text

    completion = completion_with_backoff(INSTRUCTION)

    # Re-attach the bullet marker that the prompt ended on.
    reasoning = "- " + completion

    return dict(
        question=question,
        answer=answer,
        reasoning=reasoning,
        context=context,
    )

In [7]:
# Largest per-row token estimate in the train split.
max(dataset["train"]["num_tokens"])

26123

In [8]:
# Keep only rows whose token estimate is below 3500.
# NOTE(review): presumably this leaves headroom in the model's context window for the
# instruction text and the 256-token completion requested in `get_reasoning` — confirm.
dataset = dataset.filter(lambda row: row["num_tokens"] < 3500)

In [9]:
# Generate reasoning for every row, one row at a time.
# A parallel variant (`num_proc=75`) was left disabled on this call.
dataset = dataset.map(get_reasoning)

Map:   0%|          | 0/2562 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [26]:
# Keep only the four fields returned by `get_reasoning`; every other column
# present in the train split is dropped.
columns_to_keep = {"question", "answer", "reasoning", "context"}
columns_to_remove = list(
    set(dataset["train"].column_names).difference(columns_to_keep)
)
dataset = dataset.remove_columns(columns_to_remove)

In [28]:
def make_chatml(name, role, content):
    """Build one ChatML message dict with `name`, `role` and `content` keys (in that order)."""
    return dict(name=name, role=role, content=content)


def system(name, content):
    """Build a message with role "system" carrying the given `name`."""
    return make_chatml(role="system", name=name, content=content)


def situation(content):
    """Build a system message named "situation"."""
    return system(name="situation", content=content)


def thought(content):
    """Build a system message named "thought"."""
    return system(name="thought", content=content)


def information(content):
    """Build a system message named "information"."""
    return system(name="information", content=content)


def me(content, name=None):
    """Build a message with role "assistant"; `name` defaults to None."""
    return make_chatml(role="assistant", content=content, name=name)


def person(content, name=None):
    """Build a message with role "user"; `name` defaults to None."""
    return make_chatml(role="user", content=content, name=name)



In [29]:
def to_chatml(row):
    """Turn one row into a five-message ChatML conversation.

    Reads `question`, `answer`, `context` and `reasoning` from `row` and
    returns `{"chatml": [...]}` with, in order: a situation message, the
    user's question, the article as an information message, the reasoning as
    a thought message, and the assistant's final answer.
    """
    question = row["question"]
    answer = row["answer"]
    context = row["context"]
    reasoning = row["reasoning"]
    newline = "\n"

    # Trim the article and pull ",", "." and "?" back against the preceding word.
    article = context.strip()
    for mark in (",", ".", "?"):
        article = article.replace(" " + mark, mark)

    messages = []
    messages.append(
        situation("A user is talking to an AI assistant. They are discussing an article that the user has read.")
    )
    messages.append(
        person(f"Based on the article below, can you answer the following question:{newline}{question}", name="User")
    )
    messages.append(information(article))
    messages.append(thought(f"Let's think step by step:{newline}{newline}{reasoning}"))
    messages.append(me(f"The correct answer is: {answer}", name="AI Assistant"))

    return {"chatml": messages}



In [30]:
# Apply `to_chatml` to every row; it returns a dict with a single `chatml` key.
dataset = dataset.map(to_chatml)

Map:   0%|          | 0/2562 [00:00<?, ? examples/s]

Map:   0%|          | 0/12378 [00:00<?, ? examples/s]

Map:   0%|          | 0/2570 [00:00<?, ? examples/s]

In [None]:

# Drop the four text columns that `to_chatml` read to build the `chatml` messages.
dataset = dataset.remove_columns(["question", "answer", "reasoning", "context"])

In [19]:
# Upload the converted dataset to the Hub as a private repository.
dataset.push_to_hub("diwank/time_sensitive_qa-chatml", private=True)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/930 [00:00<?, ?B/s]