In [8]:
import os
import json
import argparse
from tqdm import tqdm
from datetime import datetime
import pandas as pd
import requests

In [9]:
# Command-line style configuration for the notebook. Each entry is
# (flag, add_argument keyword arguments); flags are registered in this order.
_ARGUMENT_SPECS = (
    ("--option", {"default": "none", "type": str}),
    (
        "--model",
        {
            "default": "llama2-70b",
            "type": str,
            "help": "qwen1.5-14b-chat and qwen-turbo are better",
        },
    ),
    ("--start", {"default": 0, "type": int}),
    ("--end", {"default": None, "type": int}),
    (
        "--temperature",
        {
            "type": float,
            "default": 0.5,
            "help": "temperature of 0 implies greedy sampling.",
        },
    ),
    # traced file
    ("--traced_json_file", {"default": "traced.json", "type": str}),
    # table files
    ("--tables_json_file", {"default": "tables.json", "type": str}),
    # text files
    ("--topk_path", {"default": "request_tok", "type": str}),
)

parser = argparse.ArgumentParser()
for _flag, _options in _ARGUMENT_SPECS:
    parser.add_argument(_flag, **_options)

# Parse an empty argument list so that only the defaults above apply
# (the notebook kernel's own sys.argv is deliberately ignored).
args = parser.parse_args([])

In [10]:


def read_data(args):
    """Load traced questions and join each one with its table and linked passages.

    Args:
        args: Namespace providing ``traced_json_file``, ``tables_json_file``,
            ``topk_path``, ``start`` and ``end``. Only the samples in
            ``[start:end]`` of the traced file are processed.

    Returns:
        list[dict]: One dict per sample that could be loaded, with keys
        "question", "answer", "title", "table" (a pandas DataFrame),
        "wiki" (linked passages joined by newlines, "" when there are none),
        "table_id" and "intro". Samples whose passage file cannot be loaded
        are reported on stdout and skipped.
    """
    # Context managers close the handles deterministically instead of
    # leaving them to the garbage collector.
    with open(args.traced_json_file, "r") as traced_file:
        data_train_traced = json.load(traced_file)
    with open(args.tables_json_file, "r") as tables_file:
        traindev_table = json.load(tables_file)

    data_list = []
    for sample in tqdm(data_train_traced[args.start:args.end]):
        table_id = sample["table_id"]
        topk_file = os.path.join(args.topk_path, f"{table_id}.json")
        try:
            with open(topk_file, "r") as passages_file:
                topk = json.load(passages_file)
        except FileNotFoundError:
            print(f"The file {topk_file} does not exist.")
            continue
        except Exception as exc:
            # Still best-effort (skip the sample), but do not claim the file
            # is missing when it is actually unreadable or malformed.
            print(f"The file {topk_file} could not be loaded: {exc}")
            continue

        question_text = sample["question"]
        answer_text = sample["answer-text"]
        # The third field of an answer node is its link; keep only wiki links.
        wikis = [
            node[2]
            for node in sample["answer-node"]
            if node[2] is not None and node[2].startswith("/wiki")
        ]
        # Joining an empty list already yields "", so no special case is needed.
        wiki_text = "\n".join(topk[wiki] for wiki in wikis)

        table = traindev_table[table_id]
        # Every cell / header entry is a sequence whose first element is the
        # display text; zip(*row)[0] collects those first elements.
        df = pd.DataFrame(
            [tuple(zip(*row))[0] for row in table["data"]],
            columns=list(zip(*table["header"]))[0],
        )
        data_list.append(
            {
                "question": question_text,
                "answer": answer_text,
                "title": table["title"],
                "table": df,
                "wiki": wiki_text,
                "table_id": table_id,
                "intro": table["intro"],
            }
        )
    return data_list


def df_format(data):
    """Render a DataFrame as pipe-separated text.

    Args:
        data: pandas DataFrame to serialise.

    Returns:
        str: A header line ("col1 | col2") followed by one line per row, each
        terminated by a newline. Returns "" (after printing a diagnostic) if
        the table cannot be formatted.
    """
    try:
        # str() on the labels keeps the join working for non-string headers.
        lines = [" | ".join(str(col) for col in data.columns)]
        # Iterate positionally: with duplicate column names, row[col] would
        # return a Series instead of a single cell value.
        for values in data.itertuples(index=False, name=None):
            lines.append(" | ".join(str(value) for value in values))
        return "\n".join(lines) + "\n"
    except Exception as exc:
        # The previous handler printed an undefined name and therefore raised
        # NameError instead of falling back to the empty string.
        print(f"wrong table: {exc}")
        return ""




In [11]:
# Few-shot demonstrations keyed by the --option value.
#   "none"   : no demonstration (zero-shot).
#   "direct" : each example asks for the first sub-question only.
#   "cot"    : each example gives the sub-question and the remaining main question.
# The strings are spelled out line by line so that the trailing space after
# "Example N:" and the leading/trailing newlines stay visible.
demonstration = {
    "none": "",
    "direct": (
        "\n"
        "Question Breakdown Example 1: \n"
        "Original Question: How many times did the person who finished the Stockholm Marathon with a time of 2:13:26 win the Boston Marathon ?\n"
        "What sub-question needs to be answered first from the original question?\n"
        "Sub-Question: Who finished the Stockholm Marathon with a time of 2:13:26?\n"
        "\n"
        "Question Breakdown Example 2: \n"
        "Original Question: What is the nickname of the coach with two championship wins and three runner-ups , and was the coach of a team in the first year of its national franchise ?\n"
        "What sub-question needs to be answered first from the original question?\n"
        "Sub-Question: Who has two championship wins and three runner-ups , and was the coach of a team in the first year of its national franchise ?\n"
    ),
    "cot": (
        "\n"
        "Question Breakdown Example 1: \n"
        "Original Question: How many times did the person who finished the Stockholm Marathon with a time of 2:13:26 win the Boston Marathon ?\n"
        "Sub-Question: Who finished the Stockholm Marathon with a time of 2:13:26?\n"
        'Main-Question: How many times did "Answer of Sub-Question" win the Boston Marathon ?\n'
        "\n"
        "Question Breakdown Example 2: \n"
        "Original Question: What is the nickname of the coach with two championship wins and three runner-ups , and was the coach of a team in the first year of its national franchise ?\n"
        "Sub-Question: Who has two championship wins and three runner-ups , and was the coach of a team in the first year of its national franchise ?\n"
        'Main-Question: What is the nickname of "Answer of Sub-Question"?\n'
    ),
}

In [12]:
# Open the JSON-lines results file for this run and write a header record
# holding the demonstration text. `fw` is intentionally left open: the
# generation loop further down appends one record per question and closes it.
now = datetime.now()
dt_string = now.strftime("%d_%H_%M")  # day_hour_minute of the run
# open() does not create missing directories, so make sure the target exists.
os.makedirs("outputs", exist_ok=True)
fw = open(f"outputs/subquestion_s{args.start}_e{args.end}_{args.option}_{args.model}_{dt_string}.json", "w",)
tmp = {"demonstration": demonstration[args.option]}
fw.write(json.dumps(tmp) + "\n")

2872

In [13]:
# Build the list of question/table/passage records for the configured slice.
# read_data() skips any sample whose passage file cannot be loaded, so
# data_list can be shorter than the number of samples requested.
data_list = read_data(args)

 30%|███████████████████████▌                                                      | 181/600 [00:00<00:00, 1661.07it/s]

The file data\traindev_request_tok\Rachael_vs._Guy:_Celebrity_Cook-Off_2.json does not exist.


100%|██████████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 1639.86it/s]


In [14]:
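# Load the model or API client here.
# The generation loop below calls `query(payload)` with {'inputs': prompt} and
# reads `result[0]['generated_text']`, so `query` must be defined in this cell
# before that loop is run.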

In [15]:
# Marker that precedes every sub-question, both in the demonstrations and in
# the final prompt line the model is asked to complete.
SUB_QUESTION_MARKER = '\nSub-Question:'


def _build_prompt(question, demo):
    """Return the k-shot prompt: demonstration text followed by the new question."""
    return (
        demo + '\n\n'
        # Newline after the question so it does not run into the next sentence.
        + 'Original Question: ' + question + '\n'
        + "Let's think through this step-by-step: \n"
        + "What sub-question needs to be answered first from the original question?\n"
        + "Sub-Question: \n\n"
    )


def _extract_sub_question(response_raw, prompt):
    """Return the first line the model wrote after the prompt's final marker, or ''.

    Assumes `generated_text` echoes the prompt before the continuation
    (TODO confirm against the `query` backend). The answer segment index is
    the number of markers in the prompt itself: 3 for the two-example
    demonstrations, 1 when no demonstration is used.
    """
    try:
        generated = response_raw[0].get('generated_text', '')
        answer_index = prompt.count(SUB_QUESTION_MARKER)
        segment = generated.split(SUB_QUESTION_MARKER)[answer_index]
    except (KeyError, IndexError, TypeError, AttributeError):
        # Error payloads (e.g. a dict instead of a list), empty results, or a
        # generation with no text after the marker: record an empty response
        # rather than aborting the whole run.
        return ''
    segment = segment.split('Reasoning process')[0].strip()
    return segment.split('\n')[0].strip()


try:
    for entry in tqdm(data_list):
        question = entry['question']

        #### Formalizing the k-shot demonstration. #####
        prompt = _build_prompt(question, demonstration[args.option])

        # `query` must be provided by the model/API cell above.
        response_raw = query({'inputs': prompt})
        response = _extract_sub_question(response_raw, prompt)

        record = {
            "question": question,
            "response": response,
            "table_id": entry["table_id"],
        }
        fw.write(json.dumps(record) + "\n")
finally:
    # Close (and flush) the results file even if a request fails mid-run, so
    # the records written so far are not lost.
    fw.close()


100%|████████████████████████████████████████████████████████████████████████████████| 599/599 [37:09<00:00,  3.72s/it]


In [14]:
# Debug aid: show the prompt left over from the last iteration of the generation loop.
# NOTE(review): the saved output below does not match the prompt that loop
# builds (it appears to come from an earlier version of the notebook), and the
# execution count is out of sequence - re-run on a fresh kernel to refresh it.
print(prompt)


This is a demonstration:

Read the table below regarding the "2006 League of Ireland Premier Division". In order to get the answer to the question, you need to combine information from both the table and the text.

Team | Manager | Main sponsor | Kit supplier | Stadium | Capacity
Bohemians | Gareth Farrelly | Des Kelly Carpets | O'Neills | Dalymount Park | 8,500
Bray Wanderers | Eddie Gormley | Slevin Group | Adidas | Carlisle Grounds | 7,000
Cork City | Damien Richardson | Nissan | O'Neills | Turners Cross | 8,000
Derry City | Stephen Kenny | MeteorElectrical.com | Umbro | The Brandywell | 7,700
Drogheda United | Paul Doolin | Murphy Environmental | Jako | United Park | 5,400
Dublin City | Dermot Keely | Carroll 's Irish Gift Stores | Umbro | Dalymount Park | 8,500
Longford Town | Alan Mathews | Flancare | Umbro | Flancare Park | 4,500
Shelbourne | Pat Fenlon | JW Hire | Umbro | Tolka Park | 10,100
Sligo Rovers | Sean Connor | Toher 's | Jako | The Showgrounds | 5,500
St Patrick 's A

In [30]:
# Debug aid: show the raw generated text of the most recent `query` call.
# NOTE(review): relies on `response_raw` still being in kernel state from the
# generation loop; the execution count is out of sequence, so the saved
# output may be stale - re-run on a fresh kernel to confirm.
print(response_raw[0]['generated_text'])



Read the following table and text regarding "Nonso Anozie":and answer the question.

Introduction:
Nonso Anozie (born 17 November 1978) is a British actor who has worked on stage, film, and television. He is best known for his role as Tank in RocknRolla, Sergeant Dap in Ender's Game, Abraham Kenyatta in Zoo, Captain of the Guards in Cinderella and Xaro Xhoan Daxos in the HBO television series Game of Thrones.

Year | Title | Role | Notes
2007 | Prime Suspect 7 : The Final Act | Robert | Episode : Part 1
2009 | Occupation | Erik Lester | 3 episodes
2011 | Outcasts | Elijah | 1 episode
2011 | Stolen | Thomas Ekoku | TV movie
2012 | Game of Thrones | Xaro Xhoan Daxos | 5 episodes
2013 | The Bible | Samson | Episode : Homeland
2013 | Playhouse Presents | Chris | Episode : The Pavement Psychologist
2013-14 | Dracula | R.M . Renfield | Main cast ; 10 episodes
2015-17 | Zoo | Abraham Kenyatta | Main cast
2015 | Tut | General Horemheb | Miniseries ; 3 episodes
2015 | Doctor Who | Hydroflax (