In [9]:
import pandas as pd
import os
from main import list_files_in_folder
from dotenv import load_dotenv
from dataClass import DataTable
from langchain_mistralai import ChatMistralAI
import json
from tqdm import tqdm

In [None]:
# Load variables from a .env file (if present) into the process environment
# BEFORE reading the key. Without this, a fresh kernel reaches os.getenv below
# ahead of any later load_dotenv() call and the key comes back as None.
load_dotenv()
mistral_api_key = os.getenv("MISTRAL_API_KEY")

# Model identifiers passed to ChatMistralAI: a larger and a smaller Mixtral model.
model_22 = "open-mixtral-8x22b"
model_7 = "open-mixtral-8x7b"

# One chat client per model, both created with temperature=0 and the same API key.
llm_22 = ChatMistralAI(model=model_22, temperature=0, api_key=mistral_api_key)
llm_7 = ChatMistralAI(model=model_7, temperature=0, api_key=mistral_api_key)

In [11]:
# Relative path to the folder of tables used in this notebook
# (the 'Valid' split of HardTablesR1, per the path itself).
tables_path = 'data/HardTablesR1/DataSets/HardTablesR1/Valid/tables'
# list_files_in_folder comes from the project's `main` module.
# NOTE(review): presumably returns the paths of the files in the folder; the
# ordering is not visible here, so which table is tables[0] should be confirmed.
tables = list_files_in_folder(tables_path)
# Wrap the first listed table in the project's DataTable class.
table = DataTable(tables[0])
# Preview the first rows of the underlying data.
print(table.data.head())
# Per-column tags keyed by the column position as a string. The linking loop
# further down only processes columns tagged 'NEC'.
# NOTE(review): the exact meaning of 'NEC' / 'LC' is defined outside this notebook.
table.ner = {'0': 'NEC', '1': 'LC'}

           col0  col1
0     der rhein     1
1          lb 4     1
2  db class 732     3
3           ic3    20
4          st13    20


In [12]:
# Generate the per-column table description using the smaller model client (llm_7).
# NOTE(review): presumably this populates table.t_desc; confirm in DataTable.
table.generate_t_description(llm_7)
# Bare last expression so the notebook displays the description.
table.t_desc

{'0': 'Column 0 contains various transportation-related terms, including types of trains, train classes, and train designations.',
 '1': 'Column 1 contains numerical values, potentially representing quantities such as the number of seats, capacity, or speed of the transportation methods mentioned in column 0.'}

In [13]:
import requests
# Load variables from a .env file (python-dotenv) into the process environment;
# LamAPI below reads LAMAPI_KEY through os.getenv.
load_dotenv()

def LamAPI(cell_content, timeout=30):
    """Retrieve Wikidata entity candidates for a cell value from the LamAPI service.

    Sends a GET request to the LamAPI ``entity-retrieval`` endpoint with fuzzy
    matching enabled. The access token is read from the ``LAMAPI_KEY``
    environment variable.

    Args:
        cell_content: The cell value to look up. It is converted to ``str`` both
            for the query and for the key used to read the response.
        timeout: Seconds to wait for the service before ``requests`` raises a
            timeout error, so that an unresponsive server cannot hang the notebook.

    Returns:
        The value stored in the JSON response under ``str(cell_content)``
        (a list of candidate dicts), or an empty list when the service answers
        with a non-200 status or the response has no entry for that key.
    """
    url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'
    query = str(cell_content)
    params = {
        'name': query,
        'token': os.getenv("LAMAPI_KEY"),
        'kg': 'wikidata',
        'fuzzy': 'True'
    }
    headers = {'accept': 'application/json'}

    response = requests.get(url, params=params, headers=headers, timeout=timeout)

    # Previously a failed request fell through to an unassigned `data` and
    # crashed with UnboundLocalError; report the status and return no candidates.
    if response.status_code != 200:
        print("Error:", response.status_code)
        return []

    data = response.json()

    # The service keys its answer by the queried name; a missing key means
    # "no candidates" rather than an error.
    return data.get(query, [])


In [14]:
def get_table_str(df):
    """Serialize a DataFrame into the flat table string used in the prompt.

    The result starts with a header segment ``col: | <name> | ... | `` followed
    by one ``[SEP] row <n>: | <value> | ... | `` segment per row, where ``n``
    is the 1-based position of the row.

    Args:
        df: The pandas DataFrame to serialize. Column labels and cell values
            are converted with ``str``, so non-string labels are accepted.

    Returns:
        The serialized table as a single string.
    """
    column_names = df.columns.tolist()
    # str() each label: " | ".join raises TypeError on non-string column labels.
    header = "| " + " | ".join(str(name) for name in column_names) + " | "
    segments = ["col: " + header]
    # Number rows by position instead of `index + 1`, which only works for an
    # integer index and gives misleading numbers after filtering or reindexing.
    for row_number, (_, row) in enumerate(df.iterrows(), start=1):
        row_str = " | " + " | ".join(str(row[col]) for col in column_names) + " | "
        # Rows are labelled "row N"; they were previously mislabelled "col N".
        segments.append(f"[SEP] row {row_number}: {row_str}")
    return "".join(segments)

def candidates_as_str(candidates):
    """Render candidate entities as a comma-separated string for the prompt.

    Each candidate becomes ``<name [DESC] description [TYPE] type>``, where the
    type is the ``name`` of the first entry in the candidate's ``types`` list.
    A missing or empty description, or a missing or empty ``types`` list, is
    rendered as the literal text ``None``.

    The input dicts are not modified.

    Args:
        candidates: Iterable of dicts with a ``name`` key and, optionally,
            ``description`` (str) and ``types`` (list of dicts with ``name``).

    Returns:
        The rendered candidates joined by ``", "``; an empty string when there
        are no candidates.
    """
    rendered = []
    for candidate in candidates:
        # Use a local fallback instead of writing 'None' back into the caller's dict.
        description = candidate.get('description') or 'None'
        # A candidate may carry no types; indexing [0] directly raised IndexError.
        types = candidate.get('types') or []
        type_name = types[0].get('name', 'None') if types else 'None'
        rendered.append(f"<{candidate['name']} [DESC] {description} [TYPE] {type_name}>")
    return ", ".join(rendered)

def build_prompt(table_str, column_name, cell_content, candidates, t_desc):
    """Assemble the entity-linking prompt for a single table cell.

    The prompt consists of five sections separated by blank lines: a generic
    task preamble, the entity-linking instruction, the input (table description
    and serialized table), the question with the candidate list, and a trailing
    ``### Response: `` marker.

    Args:
        table_str: Serialized table, placed after the ``[TAB]`` tag.
        column_name: Name of the column holding the mention.
        cell_content: The entity mention to link.
        candidates: Already-rendered candidate list, inserted verbatim.
        t_desc: Table description, placed after the ``[TLE]`` tag.

    Returns:
        The complete prompt string.
    """
    preamble = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request."
    instruction = "### Instruction: This is an entity linking task. The goal for this task is to link the selected entity mention in the table cells to the entity in the knowledge base. You will be given a list of referent entities, with each one composed of an entity name, its description and its type. Please choose the correct one from the referent entity candidates. Note that the Wikipedia page, Wikipedia section and table caption (if any) provide important information for choosing the correct referent entity."
    context = f"### Input: [TLE] {t_desc} [TAB] {table_str}"
    # Question, candidates and the NIL fallback form one section (no blank lines inside).
    question = (
        f"### Question: The selected entity mention in the table cell is: {cell_content}. "
        f"The column name for ’{cell_content}’ is {column_name}. "
        f"The referent entity candidates are: {candidates}. \n"
        "If there are no candidates that matched the cell content the response is <NIL>. "
        f"What is the correct referent entity for the entity mention ’{cell_content}’ ?"
    )
    sections = [preamble, instruction, context, question, "### Response: "]
    return "\n\n".join(sections)

In [15]:
# Serialize the table once; the same string is reused in every prompt.
table_str = get_table_str(table.data)
for i, col in enumerate(table.data):
    # Only columns tagged 'NEC' in table.ner are linked; skip everything else.
    if table.ner[str(i)] != 'NEC':
        continue
    # Slice to the first cell: exactly one prompt is issued per qualifying column.
    for cell_content in list(table.data[col])[:1]:
        print(
            '_____________________________________________________',
            f"\n\nColumn name: {col}",
            f"Cell content: {cell_content}\n",
            sep="\n",
        )
        candidates = LamAPI(cell_content)
        # NOTE(review): index 3 is hard-coded for this experiment and raises
        # IndexError when fewer than four candidates are returned.
        del candidates[3] # Removing the ground truth from candidates
        prompt = build_prompt(table_str, col, cell_content, candidates_as_str(candidates), table.t_desc)
        print(prompt)
        # Query the larger model and show its answer.
        out = llm_22.invoke(prompt)
        print(out.content)

_____________________________________________________


Column name: col0
Cell content: der rhein

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction: This is an entity linking task. The goal for this task is to link the selected entity mention in the table cells to the entity in the knowledge base. You will be given a list of referent entities, with each one composed of an entity name, its description and its type. Please choose the correct one from the referent entity candidates. Note that the Wikipedia page, Wikipedia section and table caption (if any) provide important information for choosing the correct referent entity.

### Input: [TLE] {'0': 'Column 0 contains various transportation-related terms, including types of trains, train classes, and train designations.', '1': 'Column 1 contains numerical values, potentially representing quantities such as the num

NameError: name 'list_files_in_folder' is not defined