In [1]:
import pandas as pd
import os
from main import list_files_in_folder
from dotenv import load_dotenv
from dataClass import DataTable
from langchain_mistralai import ChatMistralAI
import json
from tqdm import tqdm

In [2]:
# Load variables from a local .env file (if present) before reading the key.
# Without this, os.getenv returns None on a fresh kernel unless the variable
# is already exported in the shell, because load_dotenv() was otherwise only
# called in a later cell.
load_dotenv()
mistral_api_key = os.getenv("MISTRAL_API_KEY")

# Model identifiers for the two Mixtral variants used in this notebook.
model_22 = "open-mixtral-8x22b"
model_7 = "open-mixtral-8x7b"

# One chat client per model, both created with temperature=0.
llm_22 = ChatMistralAI(model=model_22, temperature=0, api_key=mistral_api_key)
llm_7 = ChatMistralAI(model=model_7, temperature=0, api_key=mistral_api_key)

In [5]:
# Directory with one CSV file per table; file names are "<table name>.csv".
tables_path = 'data/2t_test/valid/tables/'

# The prompt file is JSON Lines (one record per line). The distinct values of
# its 'table' column are the tables that need a description.
prompts = pd.read_json('nogit/semtab2022_2t_test_prompts_50.jsonl', lines=True)
tables = set(prompts['table'])
print("Number of tables:", len(tables))

Number of tables: 26


In [None]:
# Cache of generated table descriptions, keyed by table name. The generation
# loop fills it and skips names that are already present, so re-running this
# cell empties the cache and forces every description to be generated again.
descriptions = {}

In [15]:
import time
from tqdm import tqdm

# Generate a description for every table that is not in the cache yet.
# Re-running the cell only does work for the tables that are still missing.
for name in tqdm(tables):
    if name in descriptions:
        print("Description already generated.")
        continue
    # `table` stays bound to the last processed DataTable after the loop.
    table = DataTable(''.join((tables_path, name, '.csv')))
    table.generate_t_description(llm_22)
    descriptions[name] = table.t_desc
    print("Generated new description")


  0%|          | 0/26 [00:00<?, ?it/s]

Description already generated.
Description already generated.
Description already generated.
Description already generated.
Description already generated.
Description already generated.
Description already generated.
Description already generated.
Description already generated.
Description already generated.


 42%|████▏     | 11/26 [00:02<00:03,  4.37it/s]

Generated new description


 46%|████▌     | 12/26 [00:04<00:05,  2.39it/s]

Generated new description


 50%|█████     | 13/26 [00:06<00:07,  1.69it/s]

Generated new description


 54%|█████▍    | 14/26 [00:08<00:11,  1.09it/s]

Generated new description


 58%|█████▊    | 15/26 [00:09<00:11,  1.01s/it]

Generated new description


 62%|██████▏   | 16/26 [00:13<00:15,  1.56s/it]

Generated new description


 65%|██████▌   | 17/26 [00:16<00:17,  1.99s/it]

Generated new description


 69%|██████▉   | 18/26 [00:21<00:20,  2.54s/it]

Generated new description


 73%|███████▎  | 19/26 [00:22<00:16,  2.33s/it]

Generated new description


 77%|███████▋  | 20/26 [00:26<00:15,  2.65s/it]

Generated new description


 81%|████████  | 21/26 [00:27<00:11,  2.26s/it]

Generated new description


 85%|████████▍ | 22/26 [00:29<00:08,  2.10s/it]

Generated new description


 88%|████████▊ | 23/26 [00:31<00:06,  2.26s/it]

Generated new description


 92%|█████████▏| 24/26 [00:33<00:04,  2.17s/it]

Generated new description


 96%|█████████▌| 25/26 [00:36<00:02,  2.43s/it]

Generated new description


100%|██████████| 26/26 [00:39<00:00,  1.50s/it]

Generated new description





In [16]:
# Persist the description cache as a single JSON object (table name -> description).
with open('nogit/table_descriptions2tt.json', 'w') as out_file:
    out_file.write(json.dumps(descriptions))

In [13]:
import requests
# Read variables from a .env file into the process environment so that
# os.getenv("LAMAPI_KEY") inside LamAPI can find the token.
load_dotenv()

def LamAPI(cell_content, timeout=30):
    """Fetch entity candidates for a cell value from the LamAPI lookup service.

    Sends a GET request to the entity-retrieval endpoint with fuzzy matching
    against the 'wikidata' knowledge graph. The access token is read from the
    LAMAPI_KEY environment variable.

    Args:
        cell_content: Cell value to look up. It is formatted as text both for
            the query and for indexing the JSON response.
        timeout: Seconds to wait for the service before requests raises a
            timeout error, so a stalled connection cannot hang the notebook.

    Returns:
        The value stored in the JSON response under the queried name
        (a list of candidate dicts, as consumed by candidates_as_str), or an
        empty list when the service does not answer with HTTP 200.

    Raises:
        KeyError: If a 200 response has no entry for the queried name.
    """
    name = f'{cell_content}'
    url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'
    params = {
        'name': name,
        'token': os.getenv("LAMAPI_KEY"),
        'kg': 'wikidata',
        'fuzzy': 'True'
    }
    headers = {'accept': 'application/json'}

    response = requests.get(url, params=params, headers=headers, timeout=timeout)

    if response.status_code != 200:
        # Previously execution fell through to `data[...]` with `data` unbound
        # and crashed with UnboundLocalError; report and return no candidates.
        print("Error:", response.status_code)
        return []

    data = response.json()
    return data[name]


In [14]:
def get_table_str(df):
    """Serialise a DataFrame into the single-line table string used in prompts.

    The result starts with a "col: | <name> | ... | " header segment and is
    followed by one "[SEP] col <n>: ..." segment per row, where <n> is the
    row's index label plus one.

    Args:
        df: DataFrame to serialise. Column labels must be strings and index
            labels must support ``+ 1``.

    Returns:
        The serialised table as one string.
    """
    headers = df.columns.tolist()
    segments = ["col: " + "| " + " | ".join(headers) + " | "]
    for idx, record in df.iterrows():
        cells = [str(record[header]) for header in headers]
        body = " | " + " | ".join(cells) + " | "
        segments.append(f"[SEP] col {idx + 1}: {body}")
    return "".join(segments)

def candidates_as_str(candidates):
    """Render candidate entities as a comma-separated string for the prompt.

    Each candidate becomes ``<name [DESC] description [TYPE] type>``, where
    the type is the name of the candidate's first listed type. An empty or
    missing description, and an empty or missing type list, are rendered as
    the text 'None'. The input dicts are not modified.

    Args:
        candidates: Iterable of dicts with a 'name' key and optional
            'description' and 'types' keys; 'types' is a list of dicts that
            each have a 'name' key.

    Returns:
        The rendered candidates joined by ", ", or an empty string when there
        are no candidates.
    """
    rendered = []
    for c in candidates:
        # Use local values instead of writing 'None' back into the caller's dict.
        description = c.get('description') or 'None'
        types = c.get('types') or []
        # Candidates without any type would otherwise raise IndexError.
        type_name = types[0]['name'] if types else 'None'
        rendered.append(f"<{c['name']} [DESC] {description} [TYPE] {type_name}>")
    return ", ".join(rendered)

def build_prompt(table_str, column_name, cell_content, candidates, t_desc):
    """Assemble the entity-linking prompt for one table cell.

    The prompt has five sections separated by blank lines: a generic task
    preamble, the entity-linking instruction, the input (table description
    and serialised table), the question with the candidate list, and a
    trailing "### Response: " marker.

    Args:
        table_str: Serialised table, as produced by get_table_str.
        column_name: Name of the column that contains the cell.
        cell_content: The entity mention to link.
        candidates: Candidate list already rendered as a string.
        t_desc: Table description placed after the [TLE] tag.

    Returns:
        The complete prompt string.
    """
    preamble = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request."
    instruction = "### Instruction: This is an entity linking task. The goal for this task is to link the selected entity mention in the table cells to the entity in the knowledge base. You will be given a list of referent entities, with each one composed of an entity name, its description and its type. Please choose the correct one from the referent entity candidates. Note that the Wikipedia page, Wikipedia section and table caption (if any) provide important information for choosing the correct referent entity."
    context = f"### Input: [TLE] {t_desc} [TAB] {table_str}"
    # Question, candidate list and the <NIL> fallback form one section.
    question = (
        f"### Question: The selected entity mention in the table cell is: {cell_content}. The column name for ’{cell_content}’ is {column_name}. "
        f"The referent entity candidates are: {candidates}. \n"
        f"If there are no candidates that matched the cell content the response is <NIL>. What is the correct referent entity for the entity mention ’{cell_content}’ ?"
    )
    sections = [preamble, instruction, context, question, "### Response: "]
    return "\n\n".join(sections)

In [15]:
# Smoke test of the entity-linking prompt: for each column tagged 'NEC', build
# the prompt for its first cell, send it to the 8x22b model and print the answer.
# NOTE(review): `table` is not defined in this cell; it appears to be whatever
# DataTable the description loop assigned last — confirm after a kernel restart.
table_str = get_table_str(table.data)
for i, col in enumerate(table.data):
    # table.ner is keyed by the column position as a string; only columns
    # whose tag is 'NEC' are processed.
    if table.ner[str(i)] == 'NEC':
        for cell_content in list(table.data[col]):
            print('_____________________________________________________')
            print(f"\n\nColumn name: {col}")
            print(f"Cell content: {cell_content}\n")
            candidates = LamAPI(cell_content)
            # NOTE(review): the position 3 is hard-coded and presumably only
            # matches the ground truth for one specific cell — confirm. The
            # del raises IndexError when fewer than four candidates come back.
            del candidates[3] # Removing the ground truth from candidates
            prompt = build_prompt(table_str, col, cell_content, candidates_as_str(candidates), table.t_desc)
            print(prompt)
            out = llm_22.invoke(prompt)
            print(out.content)
            # Leaves the inner loop after the first cell, so only one cell per
            # 'NEC' column is sent to the model.
            break

_____________________________________________________


Column name: col0
Cell content: der rhein

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction: This is an entity linking task. The goal for this task is to link the selected entity mention in the table cells to the entity in the knowledge base. You will be given a list of referent entities, with each one composed of an entity name, its description and its type. Please choose the correct one from the referent entity candidates. Note that the Wikipedia page, Wikipedia section and table caption (if any) provide important information for choosing the correct referent entity.

### Input: [TLE] {'0': 'Column 0 contains various transportation-related terms, including types of trains, train classes, and train designations.', '1': 'Column 1 contains numerical values, potentially representing quantities such as the num