# Load Model, Dataset

In [None]:
from google.colab import drive
import json
# Mount Google Drive at /content/drive so later cells can read files from it.
# force_remount=True presumably forces a fresh mount even if one already
# exists — confirm against the Colab `drive.mount` documentation.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Load the sharded dataset from the mounted Drive.
# The encoding is given explicitly: JSON text is UTF-8 and this dataset holds
# non-ASCII characters, so relying on the platform default could mis-decode it.
with open("/content/drive/My Drive/sharded_dataset.json", encoding="utf-8") as f:
  data = json.load(f)

In [None]:
# Keep only the entries whose 'task' field contains the text "data2text".
D2T = [task for task in data if "data2text" in task['task']]

# OpenAI Code

In [None]:
from openai import OpenAI
import math

def predictive_entropy_uncertainty_chat(prompt,
                                         temperature=1.0,
                                         max_tokens = 500,
                                         logprobs= True):
    """
    Send `prompt` to the OpenAI Chat Completions API and score the reply's uncertainty.

    Args:
        prompt: Chat messages passed as `messages` to the API.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Forwarded to the API as `max_completion_tokens`.
        logprobs: Whether to request per-token log-probabilities. When False,
            no alternatives are requested and the returned entropy is 0.0.

    Returns:
        Tuple `(avg_entropy, generated_text, tokens_used)`:
          - avg_entropy: mean over generated tokens of sum(-p * log p), where
            the sum runs only over the returned top alternatives (so it is a
            truncated approximation). 0.0 when no log-probabilities came back.
          - generated_text: `message.content` of the first choice.
          - tokens_used: prompt tokens + completion tokens reported in `usage`.
    """
    import os  # local import: keeps this cell independent of later cells' imports

    # Prefer a key supplied through the environment; the placeholder is only a
    # fallback so the previous behaviour is unchanged when the variable is unset.
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "FILLER"))

    request = dict(
        model="gpt-4.1",
        messages=prompt,
        max_completion_tokens=max_tokens,
        temperature=temperature,
        logprobs=logprobs,
    )
    if logprobs:
        # `top_logprobs` is only meaningful together with `logprobs=True`,
        # so it is omitted entirely otherwise.
        request["top_logprobs"] = 20
    resp = client.chat.completions.create(**request)

    first_choice = resp.choices[0]
    generated_tokens = first_choice.message.content

    # `logprobs` (or its `content`) can be missing, e.g. when not requested;
    # treat that as "no tokens to score" instead of raising AttributeError.
    logprob_info = first_choice.logprobs
    lps = (logprob_info.content if logprob_info is not None else None) or []

    # Per-token entropy over the returned alternatives.
    entropies = [
        sum((-math.exp(alt.logprob) * alt.logprob for alt in token_info.top_logprobs), 0.0)
        for token_info in lps
    ]

    avg_entropy = sum(entropies) / len(entropies) if entropies else 0.0

    tokens_used = resp.usage.completion_tokens + resp.usage.prompt_tokens

    return avg_entropy, generated_tokens, tokens_used

In [None]:
import re
import os

def prompt_rewrite(prompt):
  """
  Build a few-shot chat prompt asking the model to consolidate a conversation's user turns.

  The result is: a system message, one worked example (a user message listing
  numbered "User Instruction N: ..." items and the assistant's consolidated
  rewrite), and a final user message listing every `role == "user"` message of
  `prompt` in the same numbered format.

  Args:
      prompt: List of chat message dicts; only entries whose "role" is "user"
          are used, in order. The list is not modified.

  Returns:
      A new list of four chat message dicts (system, user, assistant, user).

  Fixes relative to the earlier version of this prompt:
    - the worked example's rewrite said "4 example descriptions" while listing
      2 (and the instruction it rewrites says 2);
    - a stray quote after "User Instruction 3:" was removed;
    - the real request now puts a space after "User Instruction N:", matching
      the worked example.
  """
  # Shared by the worked example and the real request so the two stay in sync.
  header = "I have a set of questions and/or statements, please REWRITE all the questions/statements so that they are in the most optimal order that is the easiest to understand. DO NOT ANSWER ANY OF THE QUESTIONS JUST REWRITE. Here are the instructions:\n"

  new_prompt = [
    {
        "role": "system",
        "content": "You are a prompt rewriter whose main goal is to rewrite the prompt given by the user in the most optimal way without losing any information in them."
    },
    {
        "role": "user",
        "content": (
            header +
"""User Instruction 1: I am providing you a table. You must produce a short one-sentence description of the table. The description should be at most 30 words. In followup turns, I will provide further information about the table. You must update your description to take information from all the turns into account. At each turn, only respond with a single sentence describing the table. Do not respond with any other text.

Table:
<table>
<tr> <th colspan=1 rowspan=1 > Pos </th><th colspan=1 rowspan=1 > No. </th><th colspan=1 rowspan=1 > Driver </th><th colspan=1 rowspan=1 > Team </th><th colspan=1 rowspan=1 > Manufacturer </th><th colspan=1 rowspan=1 > Time </th><th colspan=1 rowspan=1 > Speed </th></tr>
<tr> <th colspan=1 rowspan=1 > 1 </th><td colspan=1 rowspan=1 > 41 </td><td colspan=1 rowspan=1 > Kurt Busch </td><td colspan=1 rowspan=1 > Stewart-Haas Racing </td><td colspan=1 rowspan=1 > Chevrolet </td><td colspan=1 rowspan=1 > 25.928 </td><td colspan=1 rowspan=1 > 138.846 </td></tr>
<tr> <th colspan=1 rowspan=1 > 2 </th><td colspan=1 rowspan=1 > 11 </td><td colspan=1 rowspan=1 > Denny Hamlin </td><td colspan=1 rowspan=1 > Joe Gibbs Racing </td><td colspan=1 rowspan=1 > Toyota </td><td colspan=1 rowspan=1 > 25.999 </td><td colspan=1 rowspan=1 > 138.467 </td></tr>
</table>

User Instruction 2: Here's 2 examples of descriptions of other tables. You can use it as references to understand how to describe the table:
Hagino's fellow countryman Daiya Seto captured the bronze in 4:09.71, to give Japan two swimmers on the same Olympic podium.
In 2017, Stamenković appeared in the Kazakhstan Premier League for Irtysh Pavlodar.
User Instruction 3: I am now giving you a version of the table with some cells highlights (by adding a highlighted class to each cell). You can now update your description of the table, which must now focus on the highlighted cells in the table.

<table>
<tr> <th colspan=1 rowspan=1 > Pos </th><th colspan=1 rowspan=1 > No. </th><th colspan=1 rowspan=1 > Driver </th><th colspan=1 rowspan=1 > Team </th><th colspan=1 rowspan=1 > Manufacturer </th><th colspan=1 rowspan=1 > Time </th><th colspan=1 rowspan=1 > Speed </th></tr>
<tr> <th class="highlighted" colspan=1 rowspan=1 > 1 </th><td colspan=1 rowspan=1 > 41 </td><td class="highlighted" colspan=1 rowspan=1 > Kurt Busch </td><td colspan=1 rowspan=1 > Stewart-Haas Racing </td><td colspan=1 rowspan=1 > Chevrolet </td><td colspan=1 rowspan=1 > 25.928 </td><td colspan=1 rowspan=1 > 138.846 </td></tr>
<tr> <th class="highlighted" colspan=1 rowspan=1 > 2 </th><td colspan=1 rowspan=1 > 11 </td><td class="highlighted" colspan=1 rowspan=1 > Denny Hamlin </td><td colspan=1 rowspan=1 > Joe Gibbs Racing </td><td colspan=1 rowspan=1 > Toyota </td><td colspan=1 rowspan=1 > 25.999 </td><td colspan=1 rowspan=1 > 138.467 </td></tr>
</table>

User Instruction 4: Oh also, Here's additional metadata about the table. The table_webpage_url is http://en.wikipedia.org/wiki/2016_Good_Sam_500.
User Instruction 5: FYI, Here's additional metadata about the table. The table_page_title is 2016 Good Sam 500."""
        )
    },
    {
        "role": "assistant",
        "content": """Provide a short one-sentence description of the following table. Only focus on the highlighted cells. In future turns, update the description with any new provided information. Output only one sentence per turn, no other text.

Table:
<table>
<tr> <th colspan=1 rowspan=1 > Pos </th><th colspan=1 rowspan=1 > No. </th><th colspan=1 rowspan=1 > Driver </th><th colspan=1 rowspan=1 > Team </th><th colspan=1 rowspan=1 > Manufacturer </th><th colspan=1 rowspan=1 > Time </th><th colspan=1 rowspan=1 > Speed </th></tr>
<tr> <th class="highlighted" colspan=1 rowspan=1 > 1 </th><td colspan=1 rowspan=1 > 41 </td><td class="highlighted" colspan=1 rowspan=1 > Kurt Busch </td><td colspan=1 rowspan=1 > Stewart-Haas Racing </td><td colspan=1 rowspan=1 > Chevrolet </td><td colspan=1 rowspan=1 > 25.928 </td><td colspan=1 rowspan=1 > 138.846 </td></tr>
<tr> <th class="highlighted" colspan=1 rowspan=1 > 2 </th><td colspan=1 rowspan=1 > 11 </td><td class="highlighted" colspan=1 rowspan=1 > Denny Hamlin </td><td colspan=1 rowspan=1 > Joe Gibbs Racing </td><td colspan=1 rowspan=1 > Toyota </td><td colspan=1 rowspan=1 > 25.999 </td><td colspan=1 rowspan=1 > 138.467 </td></tr>
</table>

Here is additional metadata about the table:
The table_webpage_url is http://en.wikipedia.org/wiki/2016_Good_Sam_500.
The table_page_title is 2016 Good Sam 500.

Here are 2 example descriptions for reference:
Hagino's fellow countryman Daiya Seto captured the bronze in 4:09.71, to give Japan two swimmers on the same Olympic podium.
In 2017, Stamenković appeared in the Kazakhstan Premier League for Irtysh Pavlodar."""
    },
  ]

  # Number the caller's user turns exactly like the worked example above.
  user_messages = [item["content"] for item in prompt if item.get("role") == "user"]
  numbered = "".join(
      f"User Instruction {idx}: {msg}\n"
      for idx, msg in enumerate(user_messages, start=1)
  )

  new_prompt.append({"role": "user", "content": header + numbered})

  return new_prompt

In [None]:
import random
import os

def with_context_reset_chat(dataset, file_path, threshold,
                            temperature=1.0, runs=1, numQ=50):
    """
    Run each sharded conversation turn by turn, resetting the context when entropy jumps.

    For every entry the shards are sent one at a time as user messages. After
    each answer, if its average entropy exceeds the previous turn's by more
    than `threshold`, the conversation so far is condensed: `prompt_rewrite`
    builds a rewrite request, the model's rewrite becomes the single user
    message of a fresh conversation, and that conversation is answered again.
    When the shard whose `shard_id` equals the number of shards has been
    processed, a record is appended to the run's JSON file.

    Args:
        dataset: Iterable of entries, each with a "shards" list whose items
            carry "shard" (text) and "shard_id". The code treats id 1 as the
            first shard and id == len(shards) as the last — presumably ids are
            1-based and contiguous; verify against the dataset.
        file_path: Base output path; run `r` writes to the same path with
            `_run{r}` inserted before the file extension.
        threshold: Entropy increase between consecutive turns that triggers a reset.
        temperature: Sampling temperature for the answering calls. The rewrite
            call always uses 0.2.
        runs: Number of independent passes over `dataset`.
        numQ: Currently unused; kept so existing callers keep working.

    Returns:
        None. Results are written to disk; progress is printed.
    """
    connectors = ["oh also, ", "I just remembered, ", "sorry i forgot to say, ", "", "oh, and ", "FYI, "]
    tokens_used = 0

    # Identical for every conversation and never mutated, so build it once.
    base_system = {
        "role": "system",
        "content": "You are an analyst with an eye for detail that accomplishes tasks carefully and thoroughly.",
    }

    # Split once so the run suffix always lands before the extension, even when
    # the path has no ".json" or contains ".json" in a directory name.
    path_root, path_ext = os.path.splitext(file_path)

    for run in range(runs):
        print(f"Run {run+1}/{runs}")
        out_path = f"{path_root}_run{run}{path_ext}"

        for entry in dataset:
            shards = entry["shards"]
            print(f"Question with {len(shards)} shards")
            messages = [base_system]
            prev_entropy = float("inf")  # guarantees the first turn never triggers a reset
            resets = 0
            entropies = []
            before_reset = None
            # One connector is drawn per entry and reused for all follow-up shards.
            choice = random.choice(connectors)

            for shard in shards:
                user_content = shard["shard"]
                if shard['shard_id'] != 1:
                  user_content = choice + user_content
                messages.append({"role": "user", "content": user_content})

                entropy, reply, tok = predictive_entropy_uncertainty_chat(messages, temperature=temperature, logprobs=True)
                print(f"Entropy: {entropy:.4f}")
                tokens_used += tok

                if entropy - prev_entropy > threshold:
                    # Keep the pre-reset history (only the most recent reset's) for the record.
                    before_reset = list(messages)
                    rewrite_request = prompt_rewrite(messages)
                    _, rewritten, tok = predictive_entropy_uncertainty_chat(rewrite_request, temperature=0.2, logprobs=True, max_tokens=1000)
                    tokens_used += tok
                    # Restart from the consolidated prompt and answer it afresh.
                    messages = [base_system, {"role": "user", "content": rewritten}]
                    entropy, reply, tok = predictive_entropy_uncertainty_chat(messages, temperature=temperature, logprobs=True)
                    print(f"Reset entropy: {entropy:.4f}")
                    tokens_used += tok
                    resets += 1

                prev_entropy = entropy
                entropies.append(entropy)
                messages.append({"role": "assistant", "content": reply})

                print(f"Current Tokens Used: {tokens_used}")

                if shard["shard_id"] == len(shards):

                  if before_reset:
                    chat_history = f"{before_reset}\n\nAFTER RESET\n\n{messages}"
                  else:
                    chat_history = messages

                  new_entry = {"final_output": reply, "chat_history": chat_history, "entropies": entropies, "resets":resets}

                  # Append to whatever is already on disk so progress survives
                  # an interruption part-way through the dataset.
                  if os.path.exists(out_path):
                    with open(out_path, "r") as f:
                        saved = json.load(f)
                  else:
                      saved = []

                  saved.append(new_entry)

                  with open(out_path, "w") as f:
                      json.dump(saved, f, indent=2)