In [1]:
import os
from dotenv import load_dotenv
from dataClass import DataTable
from langchain_mistralai import ChatMistralAI
import json
import requests
import pandas as pd


In [2]:
import re

def extract_id(input_str):
    """Return the first ``Q<digits>`` identifier found in ``input_str``.

    The search is positional, so both a bare ID (``"Q42"``) and an ID embedded
    in longer text such as an entity URL (``".../entity/Q42"``) are handled.

    Args:
        input_str: Text to scan. Non-string values (``None``, ``NaN``, ...)
            are accepted and treated as containing no ID.

    Returns:
        The matched ID as a string, or ``None`` when no ID is present.
    """
    # re.search raises TypeError on non-string input; a missing or null model
    # output should simply yield "no prediction" instead of aborting the caller.
    if not isinstance(input_str, str):
        return None

    # Same pattern as before: a capital Q followed by one or more digits.
    match = re.search(r"Q\d+", input_str)
    return match.group(0) if match else None

In [3]:
# Ground truth for the validation split. The file is read with header=None,
# so the columns get the integer labels 0..3; later cells unpack them
# positionally as (table name, row, column, entity URL).
gt = pd.read_csv(
    'data/HardTablesR1/DataSets/HardTablesR1/Valid/gt/cea_gt.csv',
    header=None,
)
gt.head()

Unnamed: 0,0,1,2,3
0,NQK7B1JD,1,0,http://www.wikidata.org/entity/Q7996268
1,NQK7B1JD,2,0,http://www.wikidata.org/entity/Q7996260
2,NQK7B1JD,3,0,http://www.wikidata.org/entity/Q7996231
3,IH9YIR7T,1,0,http://www.wikidata.org/entity/Q1760847
4,IH9YIR7T,2,0,http://www.wikidata.org/entity/Q1501976


In [4]:
import numpy as np
# Placeholder frame: a copy of the ground truth with two empty (NaN)
# prediction columns, one per prompt variant.
predictions = gt.assign(pred_desc=np.nan, pred_nodesc=np.nan)
predictions.head()

Unnamed: 0,0,1,2,3,pred_desc,pred_nodesc
0,NQK7B1JD,1,0,http://www.wikidata.org/entity/Q7996268,,
1,NQK7B1JD,2,0,http://www.wikidata.org/entity/Q7996260,,
2,NQK7B1JD,3,0,http://www.wikidata.org/entity/Q7996231,,
3,IH9YIR7T,1,0,http://www.wikidata.org/entity/Q1760847,,
4,IH9YIR7T,2,0,http://www.wikidata.org/entity/Q1501976,,


In [5]:
# Load the saved model outputs. Later cells index this object as
# out[table_name][str((row, col))] and read the 'cell', 'output_desc' and
# 'output_nodesc' fields of each entry.
# The encoding is given explicitly so the file is decoded as UTF-8 regardless
# of the platform's locale default.
with open("results/outputsCEA_LLAMA3.json", encoding="utf-8") as f:
    out = json.load(f)

In [6]:
import numpy as np

# One predicted ID (or a missing marker) per ground-truth row, in gt order,
# for the prompt with descriptions and the prompt without descriptions.
preds_desc, preds_nodesc = [], []

for i, (t_name, r, c, true) in gt.iterrows():
    print(f"\n______________________________\n{(t_name, r, c, true)}")

    # Entries in the results are looked up by the string form of (row, col).
    key = str((r, c))
    # .get() keeps the loop going when a whole table is absent from the
    # results: it is then reported through the "not found" branch below
    # instead of raising KeyError.
    table_out = out.get(t_name, {})
    if key in table_out:
        cell_out = table_out[key]
        # The label reads "Table Name", but the value shown is the entry's
        # 'cell' field.
        print(f"\nTable Name: {cell_out['cell']}")
        if 'output_desc' in cell_out:
            pred_desc = extract_id(cell_out['output_desc'])
        else:
            print(f"\nOutput desc not found for key {key}")
            pred_desc = np.nan
        if 'output_nodesc' in cell_out:
            pred_nodesc = extract_id(cell_out['output_nodesc'])
        else:
            print(f"\nOutput nodesc not found for key {key}")
            pred_nodesc = np.nan
    else:
        print(f"\nTable Name: Key {key} not found")
        pred_desc = np.nan
        pred_nodesc = np.nan

    # The ground-truth value is an entity URL; the ID is its last path segment.
    true_id = true.split('/')[-1]

    print(f"True: {true_id}")
    print(f"Pred desc: {pred_desc}")
    print(f"Pred nodesc: {pred_nodesc}")

    print(f"\n\nDESC    : {true_id == pred_desc}")
    print(f"NO DESC : {true_id == pred_nodesc}")

    preds_desc.append(pred_desc)
    preds_nodesc.append(pred_nodesc)



______________________________
('NQK7B1JD', 1, 0, 'http://www.wikidata.org/entity/Q7996268')

Table Name: lincoln township
True: Q7996268
Pred desc: Q555980
Pred nodesc: Q9041465


DESC    : False
NO DESC : False

______________________________
('NQK7B1JD', 2, 0, 'http://www.wikidata.org/entity/Q7996260')

Table Name: stony creek township
True: Q7996260
Pred desc: Q6031887
Pred nodesc: Q6031889


DESC    : False
NO DESC : False

______________________________
('NQK7B1JD', 3, 0, 'http://www.wikidata.org/entity/Q7996231')

Table Name: hartford township
True: Q7996231
Pred desc: Q9039946
Pred nodesc: Q14709910


DESC    : False
NO DESC : False

______________________________
('IH9YIR7T', 1, 0, 'http://www.wikidata.org/entity/Q1760847')

Table Name: national assembly
True: Q1760847
Pred desc: Q1512579
Pred nodesc: Q5283279


DESC    : False
NO DESC : False

______________________________
('IH9YIR7T', 2, 0, 'http://www.wikidata.org/entity/Q1501976')

Table Name: house of representatives
Tr

In [7]:
# Attach both prediction lists to a copy of the ground truth, then keep only
# complete rows. dropna() removes a row when *either* prediction is missing,
# so everything computed from `predictions` afterwards covers only cells that
# have both a with-description and a without-description prediction.
predictions = gt.assign(
    pred_desc=preds_desc,
    pred_nodesc=preds_nodesc,
).dropna()

In [8]:
# Count the rows where the predicted ID equals the ground-truth ID, separately
# for the prompt with descriptions (true_d) and without descriptions (true_n).
true_d, true_n = 0, 0
diverso = 0  # never incremented here; kept because the next cell displays it
for i, (t_name, r, c, true, p_d, p_n) in predictions.iterrows():
    # Compare against the ID itself (last path segment of the entity URL).
    # A substring test on the URL would count a prediction such as "Q79" as
    # correct for ".../Q7996268" and inflate the accuracy.
    true_id = true.split('/')[-1]

    if p_d == true_id:
        true_d += 1
    if p_n == true_id:
        true_n += 1

In [9]:
# Show the `diverso` counter; it is set to 0 in the scoring cell and is not
# modified anywhere in the visible cells.
diverso

0

In [12]:
# Accuracy is measured over the rows left in `predictions` after dropna().
n_eval = len(predictions)
# If every row was dropped there is nothing to score; report nan rather than
# failing with ZeroDivisionError.
acc_d = true_d / n_eval * 100 if n_eval else float("nan")
acc_n = true_n / n_eval * 100 if n_eval else float("nan")
print(f"\n# Prompt with descriptions    | Correct: {true_d} | Accuracy: {acc_d:.2f}%")
print(f"\n# Prompt without descriptions | Correct: {true_n} | Accuracy: {acc_n:.2f}%")


# Prompt with descriptions    | Correct: 723 | Accuracy: 74.92%

# Prompt without descriptions | Correct: 698 | Accuracy: 72.33%


In [11]:
# Preview the first rows of the predictions frame (ground truth plus the two
# extracted prediction columns).
predictions.head()

Unnamed: 0,0,1,2,3,pred_desc,pred_nodesc
0,NQK7B1JD,1,0,http://www.wikidata.org/entity/Q7996268,Q555980,Q9041465
1,NQK7B1JD,2,0,http://www.wikidata.org/entity/Q7996260,Q6031887,Q6031889
2,NQK7B1JD,3,0,http://www.wikidata.org/entity/Q7996231,Q9039946,Q14709910
3,IH9YIR7T,1,0,http://www.wikidata.org/entity/Q1760847,Q1512579,Q5283279
4,IH9YIR7T,2,0,http://www.wikidata.org/entity/Q1501976,Q1137584,Q11701
