In [1]:
import os
import json
import argparse
from tqdm import tqdm
from datetime import datetime
import pandas as pd
import requests

In [23]:
# Run configuration, expressed as command-line style options so the same
# settings can be lifted into a script unchanged.
parser = argparse.ArgumentParser()

# (flag, add_argument keyword arguments), registered in this order.
_OPTION_SPECS = (
    ("--option", dict(default="cot", type=str)),
    ("--model", dict(default="llama2-70b", type=str, help=" ")),
    ("--start", dict(default=0, type=int)),
    ("--end", dict(default=None, type=int)),
    (
        "--temperature",
        dict(
            type=float,
            default=0.5,
            help="temperature of 0 implies greedy sampling.",
        ),
    ),
    ("--traced_json_file", dict(default=r"traced.json", type=str)),  # traced file
    ("--tables_json_file", dict(default=r"tables.json", type=str)),  # table files
    ("--topk_path", dict(default=r"request_tok", type=str)),  # text files
)
for _flag, _spec in _OPTION_SPECS:
    parser.add_argument(_flag, **_spec)

# Parse an empty argument list: the kernel's own argv is ignored and every
# option takes its declared default.
args = parser.parse_args([])

In [24]:
# Few-shot demonstrations, keyed by the value of --option.
#   "none"   - no demonstration text
#   "direct" - examples that ask only for the first sub-question
#   "cot"    - examples that give the sub-question and the rewritten main question
demonstration = {
    "none": "",
    "direct": """
Question Breakdown Example 1: 
Original Question: How many times did the person who finished the Stockholm Marathon with a time of 2:13:26 win the Boston Marathon ?
What sub-question needs to be answered first from the original question?
Sub-Question: Who finished the Stockholm Marathon with a time of 2:13:26?

Question Breakdown Example 2: 
Original Question: What is the nickname of the coach with two championship wins and three runner-ups , and was the coach of a team in the first year of its national franchise ?
What sub-question needs to be answered first from the original question?
Sub-Question: Who has two championship wins and three runner-ups , and was the coach of a team in the first year of its national franchise ?
""",
    "cot": """
Question Breakdown Example 1: 
Original Question: How many times did the person who finished the Stockholm Marathon with a time of 2:13:26 win the Boston Marathon ?
Sub-Question: Who finished the Stockholm Marathon with a time of 2:13:26?
Main-Question: How many times did "Answer of Sub-Question" win the Boston Marathon ?

Question Breakdown Example 2: 
Original Question: What is the nickname of the coach with two championship wins and three runner-ups , and was the coach of a team in the first year of its national franchise ?
Sub-Question: Who has two championship wins and three runner-ups , and was the coach of a team in the first year of its national franchise ?
Main-Question: What is the nickname of "Answer of Sub-Question"?
""",
}

In [25]:
def read_data(args):
    """Assemble one record per traced sample in ``args.start:args.end``.

    For each sample this looks up the question metadata with the same
    ``table_id`` in the module-level ``questions_data``, loads the table JSON
    ``{table_id}.json`` and the text JSON ``{table_id}.json`` from the text
    directory, and gathers the passages for the ``/wiki`` links found in the
    sample's ``nodes`` and ``target``. Samples with no matching question
    metadata, an unreadable table or text file, or a question type other than
    ``bridge``/``comparison`` are reported with ``print`` and skipped.

    Args:
        args: Namespace providing ``traced_json_file``, ``start``, ``end`` and
            the text directory (``text_path`` if present, otherwise
            ``topk_path``, which is the option the parser declares).

    Returns:
        list[dict]: Records with keys ``table_id``, ``question``, ``answer``
        (the sample's ``pred``), ``table`` (a DataFrame built from the first
        element of every cell), ``wiki``, ``title`` and ``intro``.
    """
    # Load traced JSON file; the context manager closes the handle promptly.
    with open(args.traced_json_file, "r", encoding="utf-8") as f:
        data_test_traced = json.load(f)

    # Index the question metadata once instead of scanning it per sample.
    # setdefault keeps the first entry for a table_id, like a first-match scan.
    questions_by_table = {}
    for q_data in questions_data:
        questions_by_table.setdefault(q_data['table_id'], q_data)

    # The parser declares the text directory as --topk_path; a text_path
    # attribute is honoured when the caller supplies one.
    text_dir = getattr(args, "text_path", None) or args.topk_path

    data_list = []
    for sample in tqdm(data_test_traced[args.start:args.end]):
        table_id = sample["table_id"]
        question_data = questions_by_table.get(table_id)
        if question_data is None:
            print(f"No question data found for {table_id}")
            continue

        # Read JSON file from tables_tok
        tables_tok_path = f"{table_id}.json"  # put your traced table link
        try:
            with open(tables_tok_path, 'r', encoding="utf-8") as f:
                table_data = json.load(f)
        except (OSError, ValueError):
            # OSError: missing/unreadable file; ValueError: invalid JSON or bytes.
            print(f"The file {table_id} does not exist.")
            continue

        question_type = question_data['type']
        if question_type == 'bridge':
            # Get the index of the most relevant row
            row_index = question_data['row_pre']
            relevant_rows = [table_data['data'][row_index]]
        elif question_type == 'comparison':
            # Get the indices of all rows with relevance less than or equal to 1.0
            row_pre_logits = question_data['row_pre_logit']
            relevant_rows = [table_data['data'][i] for i, logit in enumerate(row_pre_logits) if logit <= 1.0]
        else:
            print(f"Unknown question type: {question_type}")
            continue
        # NOTE(review): relevant_rows is selected but not yet included in the
        # returned record - confirm whether it should be.

        # Read text data. The path is built outside the try block so the
        # message below can always name it.
        text_file = os.path.join(text_dir, f"{table_id}.json")
        try:
            with open(text_file, "r", encoding="utf-8") as f:
                text_data = json.load(f)
        except (OSError, ValueError):
            print(f"The file {text_file} does not exist.")
            continue

        question_text = sample["question"]
        answer_text = sample["pred"]

        # Extract wiki links from nodes and target
        wikis = [
            node[2]
            for node in sample["nodes"]
            if node[2] is not None and node[2].startswith("/wiki")
        ]

        target_wiki = sample["target"][2]
        if target_wiki and target_wiki.startswith("/wiki"):
            wikis.append(target_wiki)

        # Get the corresponding text for each wiki link (missing links -> "").
        wiki_text = "\n".join(text_data.get(wiki, "") for wiki in wikis)

        # Create a DataFrame from the first element of every data/header cell.
        df = pd.DataFrame(
            [[cell[0] for cell in row] for row in table_data["data"]],
            columns=[cell[0] for cell in table_data["header"]],
        )

        data_list.append({
            "table_id": table_id,
            "question": question_text,
            "answer": answer_text,
            "table": df,
            "wiki": wiki_text,
            "title": table_data["title"],
            "intro": table_data["intro"]
        })

    return data_list

# Load questions data into the module-level `questions_data`, which read_data
# consults to find the metadata for each sample's table_id.
# The file is decoded as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
questions_path = "test.json"  # put text answer here
with open(questions_path, 'r', encoding="utf-8") as f:
    questions_data = json.load(f)

def df_format(data):
    """Render a DataFrame as pipe-separated text, one line per row.

    The first line holds the column labels joined by " | "; each following
    line holds one row's values, converted with ``str`` and joined the same
    way. Every line, including the last, ends with a newline.

    Labels are stringified and cells are read by position, so tables with
    non-string or duplicated column labels are rendered cell by cell.

    Args:
        data: The DataFrame to render.

    Returns:
        str: The formatted table, or "" if formatting raises (for example
        when ``data`` is not a DataFrame).
    """
    try:
        n_cols = len(data.columns)
        lines = [" | ".join(map(str, data.columns))]
        for _, row in data.iterrows():
            # Positional access: a label lookup returns a Series, not a cell,
            # when the same column label appears more than once.
            lines.append(" | ".join(str(row.iloc[i]) for i in range(n_cols)))
        return "\n".join(lines) + "\n"
    except Exception:
        # Best-effort by design: callers receive an empty table string.
        return ""


In [26]:
#Load model or API

In [27]:
# Open this run's output file and write the demonstration text as its first
# JSON line. The handle stays open in `fw` for the generation loop, which
# appends one JSON line per question and closes it.
now = datetime.now()
dt_string = now.strftime("%d_%H_%M")
# open() does not create missing directories, so make sure outputs/ exists.
os.makedirs("outputs", exist_ok=True)
fw = open(f"outputs/response_s{args.start}_e{args.end}_{args.option}_{args.model}_{dt_string}.json", "w",)
tmp = {"demonstration": demonstration[args.option]}
fw.write(json.dumps(tmp) + "\n")

768

In [28]:
# Build the per-sample records (question, answer, table, wiki text, title,
# intro) for the slice selected by args.start / args.end.
data_list = read_data(args)

 24%|███████████████████                                                            | 242/1000 [00:17<00:58, 13.06it/s]

The file WikiTables-WithLinks-master\tables_tok\Homicide:_Life_on_the_Street_0.json does not exist.


 31%|████████████████████████▎                                                      | 308/1000 [00:22<00:54, 12.80it/s]

The file WikiTables-WithLinks-master\tables_tok\Ebertfest:_Roger_Ebert's_Film_Festival_6.json does not exist.


 32%|█████████████████████████▎                                                     | 320/1000 [00:23<00:44, 15.17it/s]

The file WikiTables-WithLinks-master\tables_tok\List_of_National_Treasures_of_Japan_(writings:_Chinese_books)_0.json does not exist.


 69%|██████████████████████████████████████████████████████▌                        | 691/1000 [00:46<00:13, 23.56it/s]

The file WikiTables-WithLinks-master\tables_tok\List_of_Winter_Olympics_venues:_L_0.json does not exist.


 74%|██████████████████████████████████████████████████████████▌                    | 742/1000 [00:48<00:08, 30.26it/s]

The file WikiTables-WithLinks-master\tables_tok\List_of_The_Avengers:_Earth's_Mightiest_Heroes_characters_4.json does not exist.


 77%|████████████████████████████████████████████████████████████▋                  | 768/1000 [00:50<00:11, 20.27it/s]

The file WikiTables-WithLinks-master\tables_tok\WWE_The_Music:_The_Beginning_0.json does not exist.


 99%|█████████████████████████████████████████████████████████████████████████████▉ | 987/1000 [01:02<00:00, 22.89it/s]

The file WikiTables-WithLinks-master\tables_tok\Looney_Tunes_Golden_Collection:_Volume_6_6.json does not exist.


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:03<00:00, 15.71it/s]


In [29]:
# Ask the model for the first sub-question of every record and append one JSON
# line per question to the output file opened earlier in `fw`.
SUB_QUESTION_MARKER = '\nSub-Question:'

try:
    for entry in tqdm(data_list):
        question = entry['question']

        #### Formalizing the k-shot demonstration. #####
        prompt = demonstration[args.option] + '\n\n'
        # Newline after the question so it does not run into the next sentence.
        prompt += 'Original Question: ' + question + '\n'
        prompt += "Let's think through this step-by-step: \n"
        prompt += "What sub-question needs to be answered first from the original question?\n"
        prompt += "Sub-Question: \n\n"

        response_raw = query({'inputs': prompt})
        try:
            generated = response_raw[0].get('generated_text', '')
            # Take the segment that follows the prompt's own final marker.
            # Counting the markers in the prompt (rather than hard-coding an
            # index) keeps this correct for any number of demonstration examples.
            # NOTE(review): assumes generated_text starts with the prompt - confirm for the backend in use.
            response = (
                generated.split(SUB_QUESTION_MARKER)[prompt.count(SUB_QUESTION_MARKER)]
                .split('Reasoning process')[0]
                .strip()
            )
        except (KeyError, IndexError, TypeError, AttributeError):
            # Unexpected payload shape, or text lacking the expected markers.
            response = ''

        # Keep only the first line of the model's continuation.
        response = response.split('\n')[0].strip()

        tmp = {
            "question": question,
            "response": response,
            "table_id": entry["table_id"],
        }

        fw.write(json.dumps(tmp) + "\n")
finally:
    # Close the output file even if a request or write fails part-way, so the
    # lines already written are flushed to disk.
    fw.close()


100%|████████████████████████████████████████████████████████████████████████████████| 993/993 [48:07<00:00,  2.91s/it]


In [22]:
# Debug aid: show the prompt left in `prompt` by the last generation-loop iteration.
# NOTE(review): this cell's execution count is lower than the loop cell's, so its
# saved output may predate the current prompt-building code - re-run to confirm.
print(prompt)


Question Breakdown Example 1: 
Original Question: How many times did the person who finished the Stockholm Marathon with a time of 2:13:26 win the Boston Marathon ?
Sub-Question: Who finished the Stockholm Marathon with a time of 2:13:26?

Question Breakdown Example 2: 
Original Question: What is the nickname of the coach with two championship wins and three runner-ups , and was the coach of a team in the first year of its national franchise ?
Sub-Question: Who has two championship wins and three runner-ups , and was the coach of a team in the first year of its national franchise ?


Original QuestionHow many times did the person who finished the Stockholm Marathon with a time of 2:13:26 win the Boston Marathon ?Let's think through this step-by-step:

What sub-question needs to be answered first from the original question?
Sub-Question: 




In [52]:
# Debug aid: show the raw generated text of the first item in the last `response_raw`.
# NOTE(review): this cell's execution count is out of sequence with the cells above,
# so its saved output may come from a different prompt than the loop builds now.
print(response_raw[0]['generated_text'])



Read the table and text regarding "Stockholm Marathon" to answer the following question.

The table contains important information and this is the introduction of the table:
The Stockholm Marathon, known as the ASICS Stockholm Marathon for sponsorship reasons, is an annual marathon arranged in Stockholm, Sweden, since 1979. It serves as the Swedish marathon championship race. At the 2009 Stockholm Marathon more than 18,500 participants (14,442 men and 4,385 women) were registered. [citation needed]

Year | Athlete | Country | Time ( h : m : s )
1979 | Jukka Toivola | Finland | 2:17:35
1980 | Jeff Wells | United States | 2:15:49
1981 | Bill Rodgers | United States | 2:13:26
1982 | Kjell-Erik Ståhl | Sweden - Hässleholms AIS | 2:19:20
1983 | Hugh Jones | United Kingdom | 2:11:37
1984 | Agapius Masong | Tanzania | 2:13:47
1985 | Tommy Persson | Sweden - Heleneholms IF | 2:17:18
1986 | Kjell-Erik Ståhl | Sweden - Enhörna IF | 2:12:33
1987 | Kevin Forster | United Kingdom | 2:13:52
1988 |