## Imports

In [1]:
import pandas as pd
import os
from main import list_files_in_folder
from dotenv import load_dotenv
from dataClass import DataTable
from langchain_mistralai import ChatMistralAI
import json
from tqdm import tqdm
from utils import get_table_str, candidates_as_str, build_prompt
from prompts import generate_CEA_prompt_with_t_desc
import time

## Prompt

In [2]:
# Load the prompt records from a JSON Lines file (one JSON object per line) into a DataFrame.
prompts = pd.read_json(path_or_buf='nogit/semtab2022_2t_test_prompts_50.jsonl', lines=True)

In [3]:
# Distinct values of the 'table' column, i.e. the tables covered by the loaded prompts.
tables = set(prompts['table'])
print(f"Number of tables: {len(tables)}")

Number of tables: 26


## Data

In [5]:
# `data`: for each cell to be annotated, the list of possible candidates
# retrieved with LamAPI, with the right candidate left out.
# `table_descriptions`: looked up by table id when the prompts are built.
with open('nogit/HardTablesR1_Valid_CEA_ER_without_gt.json') as candidates_file, \
        open('nogit/table_descriptions2tt.json') as descriptions_file:
    data = json.load(candidates_file)
    table_descriptions = json.load(descriptions_file)

# Ground-truth CEA annotations; the CSV is read without a header row.
gt_path = 'data/HardTablesR1/DataSets/HardTablesR1/Valid/gt/cea_gt.csv'
gt = pd.read_csv(gt_path, header=None)

print(f"Number of Tables: {len(data)}")

Number of Tables: 200


In [6]:
# Total number of entries across all values of `data`.
ncells = sum(len(cells) for cells in data.values())
print(f"Number of cells: {ncells}")

# Warn when the ground truth has a different number of rows than the test set has cells.
if len(gt) != ncells:
    print("Cells in ground_truth don't match cells in test set.")

Number of cells: 1406


In [7]:

# Sanity check: is the `output` string of row 60 contained verbatim in that row's question?
prompts.iloc[60]['output'] in prompts.iloc[60]['question']

True

In [8]:
# The `output` string of row 60 with a trailing comma appended.
prompts.iloc[60]['output']+','

'<Alabama [DESC] state of the United States of America [TYPE] U.S. state>,'

In [9]:
# Worked example on row 60: drop the `output` candidate (plus its comma) from
# the question, then turn every ',.' into the <NIL> instruction.
row_60 = prompts.iloc[60]
q = row_60['question']
o = row_60['output']+','
print(q)
post = q.replace(o, '')
post = post.replace(',.', '. \nIf there are no candidates that matched the cell content the response is <NIL>.')
print(post)

The selected entity mention in the table cell is: 'Allabama'. The column name for 'Allabama' is col0. The referent entity candidates are: <CSS Alabama [DESC] screw sloop-of-war built in 1862 [TYPE] sloop-of-war>,<Alabama, Alabama, Alabama [DESC] A short stories collection [TYPE] book>,<Alabama [DESC] schooner built in 1926 [TYPE] schooner>,<Alabama Creek [DESC] river in the United States of America [TYPE] river>,<Alabama [DESC] mine in Colfax County, New Mexico, United States of America [TYPE] mine>,<Alabama Slim [DESC] musical artist [TYPE] human>,<Alabama [DESC] John Coltrane song [TYPE] musical work/composition>,<Alabama Port [DESC] unincorporated community on Mon Louis Island, in Mobile County, Alabama [TYPE] unincorporated community in the United States>,<ALABAMA [DESC] scientific article published in September 2006 [TYPE] scholarly article>,<Alabama [DESC] computer virus [TYPE] computer virus>,<Alabama [DESC] article in Otto's encyclopedia [TYPE] encyclopedia article>,<Alabama [D

In [9]:
# NOTE(review): this cell-level definition shadows the `build_prompt` imported from
# `utils` in the imports cell; every later call in the notebook uses this version.
def build_prompt(retrieved, t_desc=None):
    """Build the full text prompt for one entity-linking example.

    The `output` string followed by a comma is removed from the question, and
    every ',.' left in the question is replaced by a sentence telling the model
    to answer <NIL> when no candidate matches the cell content.

    Args:
        retrieved: Mapping-like row (e.g. a dict or a pandas Series) providing
            the string fields 'instruction', 'input', 'question' and 'output'.
        t_desc: Table description inserted after the [TLE] marker. When it is
            None, the literal text 'None' is used instead.

    Returns:
        str: The task preamble, instruction, input, and rewritten question,
        separated by blank lines and ending with "### Response: ".
    """
    # Remove the `output` candidate together with its separating comma.
    answer_with_sep = retrieved['output'] + ','
    question = retrieved['question'].replace(answer_with_sep, '')
    question = question.replace(
        ',.',
        '. \nIf there are no candidates that matched the cell content the response is <NIL>.',
    )

    # Identity check: `== None` can be answered by a custom __eq__, `is None` cannot.
    if t_desc is None:
        t_desc = 'None'

    # Keep only the text after the last ' [TAB] ' marker of the original input;
    # the part before it is replaced by the supplied description.
    table_body = retrieved['input'].split(' [TAB] ')[-1]

    TASK = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request."
    sections = [
        TASK,
        f"### Instruction: {retrieved['instruction']}",
        f"### Input: [TLE] {t_desc} [TAB] {table_body}",
        f"### Question: {question}",
    ]
    return "\n\n".join(sections) + "\n\n### Response: "

In [10]:
# Preview the first rows of the loaded prompts.
prompts.head()

Unnamed: 0,table,instruction,input,question,output
0,2NEDAQI9,This is an entity linking task. The goal for t...,[TLE] None [TAB] col: |col0|col1|col2|col3|col...,The selected entity mention in the table cell ...,<Citigroup [DESC] American investment bank and...
1,2NEDAQI9,This is an entity linking task. The goal for t...,[TLE] None [TAB] col: |col0|col1|col2|col3|col...,The selected entity mention in the table cell ...,<United States of America [DESC] country in No...
2,2NEDAQI9,This is an entity linking task. The goal for t...,[TLE] None [TAB] col: |col0|col1|col2|col3|col...,The selected entity mention in the table cell ...,<bank [DESC] financial institution that accept...
3,2NEDAQI9,This is an entity linking task. The goal for t...,[TLE] None [TAB] col: |col0|col1|col2|col3|col...,The selected entity mention in the table cell ...,<Bank of America [DESC] American multinational...
4,2NEDAQI9,This is an entity linking task. The goal for t...,[TLE] None [TAB] col: |col0|col1|col2|col3|col...,The selected entity mention in the table cell ...,<United States of America [DESC] country in No...


In [11]:
# Build both prompt variants for every row: one with the table description
# looked up by table id, one without any description.
desc_prompts = []
nodesc_prompts = []
for _, record in prompts.iterrows():
    desc_prompts.append(build_prompt(record, t_desc=table_descriptions[record['table']]))
    nodesc_prompts.append(build_prompt(record))

# Add new columns with the generated prompts
prompts['prompt_desc'] = desc_prompts
prompts['prompt_nodesc'] = nodesc_prompts
    

In [12]:
# Show the first row and both of its generated prompts.
first_row = prompts.iloc[0]
print(f"ROW: {first_row}\n\n")
print(f"No desc prompt: \n {first_row['prompt_nodesc']}\n")
print(f"Desc prompt: \n {first_row['prompt_desc']}\n")

ROW: table                                                     2NEDAQI9
instruction      This is an entity linking task. The goal for t...
input            [TLE] None [TAB] col: |col0|col1|col2|col3|col...
question         The selected entity mention in the table cell ...
output           <Citigroup [DESC] American investment bank and...
prompt_desc      Below is an instruction that describes a task,...
prompt_nodesc    Below is an instruction that describes a task,...
Name: 0, dtype: object


No desc prompt: 
 Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction: This is an entity linking task. The goal for this task is to link the selected entity mention in the table cells to the entity in the knowledge base. You will be given a list of referent entities, with each one composed of an entity name, its description and its type. Please choose the correct one from the

In [13]:
# Write every row of `prompts` as one JSON object per line.
# NOTE(review): 'nome' looks like a placeholder output path — confirm the intended filename.
with open('nome', 'w') as out_file:
    out_file.writelines(
        json.dumps(record.to_dict()) + '\n' for _, record in prompts.iterrows()
    )

In [10]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load tokenizer and causal LM from the same checkpoint id.
model_id = "Qwen/Qwen2-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Encode the prompt into tokens
# NOTE(review): `prompt` is not assigned in any cell visible in this notebook, so this
# cell raises NameError on a fresh kernel unless it is defined elsewhere — confirm.
# The `[:-55]` slice drops the last 55 characters of the prompt before tokenizing;
# the reason for that exact length is not stated here — TODO document it.
inputs = tokenizer(prompt[:-55], return_tensors="pt")

# Generate text
# The EOS token id is passed as the padding token id.
# NOTE(review): presumably `max_length` bounds prompt plus generated tokens together,
# so a long candidate list may leave little or no room for the answer — verify
# against the transformers generation docs (`max_new_tokens`).
outputs = model.generate(
    inputs['input_ids'], 
    attention_mask=inputs['attention_mask'], 
    max_length=2000, 
    pad_token_id=tokenizer.eos_token_id
)

# Decode the generated tokens back into text
# Only the first returned sequence is decoded; special tokens are stripped.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(generated_text)

In [11]:
# Inspect the description-free prompt of the first row.
# NOTE(review): the recorded output of this cell is a KeyError — the 'prompt_nodesc'
# column only exists after the cell that loops over `prompts` and fills it has run.
prompts.iloc[0]['prompt_nodesc']

KeyError: 'prompt_nodesc'