In [10]:
import os
import json
import argparse
from tqdm import tqdm
from datetime import datetime
import pandas as pd
import requests

In [11]:
# Command-line style configuration for the notebook. The options are parsed
# from an empty argument list, so every value below is simply its default.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--option",
    default="cot",
    type=str,
    choices=["none", "direct", "cot"],  # the keys of the `demonstration` dict
    help="which few-shot demonstration to prepend to each prompt",
)
parser.add_argument(
    "--model",
    default="llama2-70b",
    type=str,
    help="model label; used in the names of the output files",
)
parser.add_argument("--start", default=0, type=int)
parser.add_argument("--end", default=None, type=int)
parser.add_argument(
    "--temperature",
    type=float,
    default=0.5,
    help="temperature of 0 implies greedy sampling.",
)
parser.add_argument(
    "--traced_json_file",
    default=r"traced.json",  # traced file
    type=str,
)
parser.add_argument(
    "--tables_json_file",
    default=r"tables.json",  # table files
    type=str,
)
# read_data() looks the text directory up as `args.text_path`, while this
# option used to be registered only as --topk_path. Register both spellings
# for the same destination so either flag works.
parser.add_argument(
    "--text_path",
    "--topk_path",
    dest="text_path",
    default=r"request_tok",  # text files
    type=str,
    help="directory holding one <table_id>.json text file per table",
)

# An explicit empty list keeps argparse from reading the Jupyter kernel's argv.
args = parser.parse_args([])
# Keep the historical attribute name available for any code that still reads it.
args.topk_path = args.text_path

In [12]:
# Few-shot demonstrations keyed by prompting option; "none" means no demonstration.
demonstration = {"none": ""}
for option_name, example_path in (
    ("direct", "examples/fullmodel_direct_2shot.json"),
    ("cot", "examples/fullmodel_cot_2shot.json"),
):
    with open(example_path, "r") as f:
        demonstration[option_name] = json.loads(f.read())

In [13]:
def read_data(args, questions=None):
    """Build one record per traced sample in ``args.start:args.end``.

    For every sample the matching question entry, the table JSON
    (``<table_id>.json`` in the working directory) and the text JSON
    (``<text dir>/<table_id>.json``) are loaded. A sample is skipped, with a
    printed message, when its question entry is missing, its question type is
    neither ``'bridge'`` nor ``'comparison'``, or either file cannot be read.

    Args:
        args: namespace providing ``traced_json_file``, ``start``, ``end`` and
            the text directory as ``text_path`` (or, failing that,
            ``topk_path``).
        questions: optional list of question dicts, each with ``'table_id'``
            and ``'type'`` keys. Defaults to the module-level
            ``questions_data``.

    Returns:
        A list of dicts with keys ``table_id``, ``question``, ``answer``
        (taken from the sample's ``"pred"`` field), ``table`` (a DataFrame),
        ``wiki``, ``title`` and ``intro``.
    """
    if questions is None:
        questions = questions_data

    # Prefer `args.text_path`; fall back to `args.topk_path` so a namespace
    # that only defines the latter still works. Resolving this up front turns a
    # missing attribute into one clear AttributeError instead of a per-sample
    # failure inside the error handler below.
    text_dir = getattr(args, "text_path", None)
    if text_dir is None:
        text_dir = args.topk_path

    # Load traced JSON file (closed deterministically by the context manager).
    with open(args.traced_json_file, "r") as f:
        data_test_traced = json.load(f)

    # Index the questions once instead of scanning the list for every sample.
    # setdefault keeps the FIRST entry per table_id, like a scan that stops at
    # the first match.
    questions_by_table = {}
    for q_data in questions:
        questions_by_table.setdefault(q_data['table_id'], q_data)

    data_list = []
    for sample in tqdm(data_test_traced[args.start:args.end]):
        table_id = sample["table_id"]
        question_data = questions_by_table.get(table_id)
        if question_data is None:
            print(f"No question data found for {table_id}")
            continue

        # Read the table JSON; OSError covers a missing/unreadable file and
        # ValueError covers malformed JSON or undecodable bytes.
        try:
            tables_tok_path = f"{table_id}.json"  # put your traced table link
            with open(tables_tok_path, 'r') as f:
                table_data = json.load(f)
        except (OSError, ValueError):
            print(f"The file {table_id} does not exist.")
            continue

        # Only the question type is validated here; the row-relevance fields
        # ('row_pre', 'row_pre_logit') are not needed to build the record.
        # TODO(review): confirm the table was not meant to be restricted to
        # the relevant rows.
        question_type = question_data['type']
        if question_type not in ('bridge', 'comparison'):
            print(f"Unknown question type: {question_type}")
            continue

        # Read text data. The path is computed outside the try block so the
        # handler can always name the file it failed on.
        text_file = os.path.join(text_dir, f"{table_id}.json")
        try:
            with open(text_file, "r") as f:
                text_data = json.load(f)
        except (OSError, ValueError):
            print(f"The file {text_file} does not exist.")
            continue

        question_text = sample["question"]
        answer_text = sample["pred"]

        # Collect wiki links (third element) from the nodes, then the target.
        wikis = [
            node[2]
            for node in sample["nodes"]
            if node[2] is not None and node[2].startswith("/wiki")
        ]
        target_wiki = sample["target"][2]
        if target_wiki and target_wiki.startswith("/wiki"):
            wikis.append(target_wiki)

        # One passage per link, in link order; unknown links contribute "".
        wiki_text = "\n".join(text_data.get(wiki, "") for wiki in wikis)

        # Each row/header is a list of cells and zip(*cells)[0] takes the
        # first element of every cell (presumably the display text — the
        # remaining elements are dropped).
        df = pd.DataFrame(
            [tuple(zip(*row))[0] for row in table_data["data"]],
            columns=list(zip(*table_data["header"]))[0],
        )

        data_list.append({
            "table_id": table_id,
            "question": question_text,
            "answer": answer_text,
            "table": df,
            "wiki": wiki_text,
            "title": table_data["title"],
            "intro": table_data["intro"]
        })

    return data_list

# Question entries that read_data() matches to samples by their 'table_id'.
questions_path = "test.json"  # put text answer here
with open(questions_path, 'r') as questions_file:
    questions_data = json.loads(questions_file.read())

def df_format(data):
    """Render a DataFrame as pipe-separated text for use in a prompt.

    The first line holds the column labels and every following line holds one
    row; cells are joined with ``" | "`` and each line ends with a newline.

    Args:
        data: the pandas DataFrame to render.

    Returns:
        The rendered table, or ``""`` when ``data`` is not DataFrame-like
        (for example ``None``).
    """
    try:
        # str() lets non-string labels (e.g. integer column names) be joined.
        lines = [" | ".join(str(col) for col in data.columns)]
        # Walk the rows positionally: looking cells up by label breaks on
        # duplicate column names, where row[label] yields a Series, not a cell.
        for values in data.itertuples(index=False, name=None):
            lines.append(" | ".join(str(value) for value in values))
    except (AttributeError, TypeError):
        # Best-effort contract: an unusable input yields an empty table.
        return ""
    return "\n".join(lines) + "\n"


In [14]:
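# Load the model or API client here. This cell must define query(): later cells call query({'inputs': prompt}) and read response_raw[0]['generated_text'] from its result.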

In [64]:
# Disabled cell: the code below sits inside a string literal, so none of it runs.
# If re-enabled, it opens the sub-question output file under outputs/ and
# writes the selected demonstration as the first JSON line.
'''
run_count = 0

subquestion_file = f"outputs/subquestion_s{args.start}_e{args.end}_{args.option}_{args.model}_{run_count}.json"
subquestion_fw = open(subquestion_file, "w")

tmp = {"demonstration": demonstration[args.option]}
subquestion_fw.write(json.dumps(tmp) + "\n")
'''

2556

In [40]:
# Disabled cell (string literal, not executed): would build data_list via read_data(args).
'''
data_list = read_data(args)
'''

100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 166.88it/s]


In [41]:
# Disabled cell: everything below is inside a string literal and does not run.
# If re-enabled, it builds one prompt per record (table, optional wiki text,
# summary, expected entity type, sub-question), sends it to query(), and writes
# the extracted sub-answer as one JSON line per record to subquestion_fw.
'''
with open('outputs/question_test_s300_eNone.json', 'r') as f:
    subquestion_data = [json.loads(line) for line in f]
        
with open('outputs/summary_s300_e600.json', 'r') as f:
    summary_data = [json.loads(line) for line in f]

with open('subquestion_spacy_s300_e600_test.txt', 'r', encoding='utf-8') as f:
    entity_data = [line.strip() for line in f]


question_idx = 0

for entry, entity_entry,subquestion_entry, summary_entry in zip(tqdm(data_list), entity_data, subquestion_data, summary_data):
    summary = summary_entry.get('summary', '')
    subquestion = subquestion_entry.get('response', '')

    prompt = demonstration[args.option] + '\n\n'
    #### Formalizing the k-shot demonstration. #####
    prompt += f'Read the table and text regarding "{entry["title"]}" to answer the question.\n\n'
    prompt += df_format(entry['table']) + '\n'

    if entry['wiki']:
        prompt += "Text:" + '\n' + entry['wiki'] + '\n\n'
    prompt += 'Summary: ' + summary + '\n\n'
    prompt += 'The answer should be a/an ' + entity_entry + '\n\n'
    prompt += 'Lets think step by step, to answer the question: ' + subquestion + '\nAnswer:'




    question_idx += 1
    response_raw = query({'inputs': prompt})
    try:
        response = response_raw[0].get('generated_text', '').split('\nAnswer:')[3].split('Reasoning process')[0].strip()
    except KeyError:
        response = ''

    response = response.split('\n')[0].strip()

    tmp = {
        "sub_question": subquestion,
        "entity":entity_entry,
        "sub_answer": response,
        "table_id": entry["table_id"],
    }

    subquestion_fw.write(json.dumps(tmp) + "\n")

subquestion_fw.close()
'''

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.75s/it]


In [15]:
# Open the answer file for this run and write the demonstration as its first
# JSON line. The handle stays open on purpose: the answering loop in a later
# cell appends one line per question and then closes it.
os.makedirs("outputs", exist_ok=True)  # open() fails if the directory is missing
now = datetime.now()
dt_string = now.strftime("%d_%H_%M")  # day_hour_minute suffix for the file name
answer_fw = open(
    f"outputs/answer_s{args.start}_e{args.end}_{args.option}_{args.model}_{dt_string}.json",
    "w",
)
tmp = {"demonstration": demonstration[args.option]}
answer_fw.write(json.dumps(tmp) + "\n")

6209

In [16]:
# Build the per-sample records for traced samples args.start:args.end.
# read_data() prints a message and skips any sample whose question entry,
# table file or text file cannot be loaded, so data_list may be shorter than the slice.
data_list = read_data(args)

 68%|████████████████████████████████████████████████████▉                         | 407/600 [00:00<00:00, 1032.47it/s]

The file Homicide:_Life_on_the_Street_0 does not exist.
The file Ebertfest:_Roger_Ebert's_Film_Festival_6 does not exist.
The file List_of_National_Treasures_of_Japan_(writings:_Chinese_books)_0 does not exist.


100%|███████████████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 972.05it/s]


In [17]:
# Index of the final answer among the pieces obtained by splitting the
# generated text on '\nAnswer:'.
# NOTE(review): presumably the model echoes the prompt and the prompt holds
# five '\nAnswer:' markers (demonstrations plus the trailing one) — confirm
# whenever the demonstration changes.
ANSWER_SEGMENT_INDEX = 5


def extract_answer(response_raw, segment_index=ANSWER_SEGMENT_INDEX):
    """Pull the one-line answer out of a raw query() result.

    Args:
        response_raw: the value returned by query(); expected to be a sequence
            whose first item is a dict with a 'generated_text' entry.
        segment_index: which '\\nAnswer:'-separated piece holds the answer.

    Returns:
        The first line of that piece, cut before 'Reasoning process' and
        stripped, or '' when the result has no such piece or has an
        unexpected shape.
    """
    try:
        generated = response_raw[0].get('generated_text', '')
        segment = generated.split('\nAnswer:')[segment_index]
    except (KeyError, IndexError, TypeError, AttributeError):
        # KeyError: a dict result indexed with [0]; IndexError: an empty list
        # or too few 'Answer:' markers; the others: unexpected result types.
        return ''
    return segment.split('Reasoning process')[0].strip().split('\n')[0].strip()


with open('outputs/subquestion.json', 'r') as f: #load your subquestion answer here
    subquestion_data = [json.loads(line) for line in f]

with open('spacy_test.txt', 'r', encoding='utf-8') as f: #load your entity for full question here
    entity_data = [line.strip() for line in f]

with open('summary.json', 'r') as f: #load your summary here
    summary_data = [json.loads(line) for line in f]

# zip() stops at the shortest input and pairs items purely by position, so a
# length difference means some records are dropped or paired with the wrong line.
input_sizes = {
    "data_list": len(data_list),
    "subquestion_data": len(subquestion_data),
    "entity_data": len(entity_data),
    "summary_data": len(summary_data),
}
if len(set(input_sizes.values())) > 1:
    print(f"Warning: input lengths differ, entries may be misaligned: {input_sizes}")

try:
    # Iterate over data_list and the three side inputs simultaneously
    for entry, subquestion_entry, entity_entry, summary_entry in zip(tqdm(data_list), subquestion_data, entity_data, summary_data):
        question = entry['question']
        answer = entry['answer']
        table_id = entry['table_id']
        subanswer = subquestion_entry.get('sub_answer', '')  # Use .get() to handle KeyError
        subquestion = subquestion_entry.get('sub_question', '')
        subquestion_table_id = subquestion_entry.get('table_id', '')  # Get evidence table_id
        summary = summary_entry.get('summary', '')

        # Check if evidence table_id matches the entry table_id
        if subquestion_table_id != table_id:
            print(f"Warning: Table ID mismatch for question '{question}'.")
            # Optionally, you can choose to skip this entry or handle it differently

        #### Formalizing the k-shot demonstration. #####
        # NOTE(review): the prompt text is kept byte-for-byte, including the
        # missing space between ':' and 'and' on the next line, so prompts
        # stay identical to earlier runs.
        prompt = demonstration[args.option] + '\n\n'
        prompt += f'Read the following table, text and summary regarding "{entry["title"]}":'+'and answer the question.\n\n'
        prompt += df_format(entry['table']) + '\n'

        if entry['wiki']:
            prompt += "Text: " + '\n' + entry['wiki'] + '\n\n'
        prompt +=  "Summary: " + summary+ '\n\n'
        # Add evidence to the prompt
        prompt += "A subquestion as hint : " + subquestion + "\nThe answer of this hint subquestion: " + subanswer + '\n\n'
        prompt += "Using exact the same words from the text and the table as answers can lead to better accuracy. Simplify your answer to an entity "+ '\n\n'
        prompt += 'Let us think step by step, and answer the question: ' + question
        prompt += '\nAnswer:'
        # Kept as a top-level name: the debug cell further down prints it.
        response_raw = query({'inputs': prompt})

        response = extract_answer(response_raw)

        tmp = {
            "question": question,
            "response": response,
            "answer": answer,
            "entity":entity_entry,
            "table_id": entry["table_id"],
            "sub_answer": subanswer
        }

        answer_fw.write(json.dumps(tmp) + "\n")
finally:
    # Always flush and release the answer file, even if a query fails mid-run.
    answer_fw.close()


100%|████████████████████████████████████████████████████████████████████████████████| 597/597 [18:35<00:00,  1.87s/it]


In [59]:
# Debug: print the full generated text of the most recent query() call
# (response_raw still holds the value from the last loop iteration).
print(response_raw[0]['generated_text'])



I'll provide you with two demonstrations. You need to combine information from both the table and the text.

First: 
Read the table below regarding the "2006 League of Ireland Premier Division". 

Team | Manager | Main sponsor | Kit supplier | Stadium | Capacity
Bray Wanderers | Eddie Gormley | Slevin Group | Adidas | Carlisle Grounds | 7,000

Text:
The Carlisle Grounds is a football stadium in Bray , County Wicklow , Ireland . Situated directly behind the Bray D.A.R.T . station , it is home to Bray Wanderers A.F.C . Its current capacity is roughly 4,000 .

Subquestion: What is the home stadium of the Bray Wanderers of 2006 League of Ireland?
The answer of Subquestion : Carlisle Grounds

Let's thinks step by step, to answer the question : The home stadium of the Bray Wanderers of 2006 League of Ireland is situated behind what station ?
The answer should be a : geopolitical entity

Answer: Bray D.A.R.T station
The resoning process of this question: 
Let's think step by step, From the 