## Imports


In [1]:
import requests
import json
import os 
import pandas as pd
from dataClass import DataTable
from main import list_files_in_folder
from dotenv import load_dotenv
from tqdm import tqdm
load_dotenv()

# Load the pre-computed retrieval results. Later cells index this dict as
# data[table_name]["(row, col)"] and read the 'cell' and 'retrieved_list' entries.
# The context manager closes the file handle as soon as parsing is done
# (the bare open() previously left it open for the whole kernel session).
with open('nogit/HardTablesR1_Valid_CEA_ER.json') as f:
    data = json.load(f)

## Functions

In [2]:
def levenshtein_distance(s1, s2):
    """
    Compute the Levenshtein (edit) distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one string into the other.

    Parameters:
    s1 (str): The first string.
    s2 (str): The second string.

    Returns:
    int: The Levenshtein distance between the two strings.
    """
    # The distance is symmetric, so always iterate the longer string on the
    # outside and keep the (shorter) row buffers as small as possible.
    longer, shorter = (s1, s2) if len(s1) >= len(s2) else (s2, s1)

    # Turning a string into the empty string costs one deletion per character.
    if len(shorter) == 0:
        return len(longer)

    # prev[k] = distance between the processed prefix of `longer` and shorter[:k]
    prev = list(range(len(shorter) + 1))
    for row_no, ch_long in enumerate(longer, start=1):
        curr = [row_no]
        for col_no, ch_short in enumerate(shorter, start=1):
            curr.append(min(
                prev[col_no] + 1,                          # insertion
                curr[col_no - 1] + 1,                      # deletion
                prev[col_no - 1] + (ch_long != ch_short),  # substitution (free on match)
            ))
        prev = curr

    return prev[-1]

def jaccard_distance(s1, s2, n=1):
    """
    Compute the Jaccard distance between two strings over their n-gram sets.

    Each string is reduced to the *set* of its length-n substrings, so
    repeated n-grams and their order are ignored.

    Parameters:
    s1 (str): The first string.
    s2 (str): The second string.
    n (int): The length of n-grams to consider. Default is 1 (character-wise comparison).

    Returns:
    float: 1 - |intersection| / |union| of the two n-gram sets, or 0.0 when
    neither string yields any n-gram.
    """
    def _shingles(text):
        # All contiguous substrings of length n (empty set if text is shorter than n).
        return set(text[k:k + n] for k in range(len(text) - n + 1))

    first, second = _shingles(s1), _shingles(s2)
    combined = first | second

    # Two empty sets are treated as identical rather than dividing by zero.
    if len(combined) == 0:
        return 0.0

    shared = first & second
    return 1 - len(shared) / len(combined)


In [3]:
def get_table_str(df):
    """
    Serialise a DataFrame into a single-line, pipe-delimited string.

    The result starts with "col: | <c1> | <c2> | " and then appends one
    "[SEP] col <index + 1>: | <v1> | <v2> | " segment per row.

    Parameters:
    df (pandas.DataFrame): Table to serialise. Row labels are used as
        `index + 1`, so the index must be numeric.

    Returns:
    str: The serialised table.

    NOTE(review): every data row is labelled "col N"; this may have been meant
    to read "row N" -- confirm before changing, since it alters the prompt text.
    """
    columns = df.columns.tolist()
    # Column labels are not guaranteed to be strings (e.g. integer labels when a
    # table has no header row), and str.join rejects non-strings.
    header = "| " + " | ".join(str(col) for col in columns) + " | "
    # Collect the segments and join once instead of growing a string in the loop.
    parts = ["col: " + header]
    for index, row in df.iterrows():
        row_str = " | " + " | ".join(str(row[col]) for col in columns) + " | "
        parts.append(f"[SEP] col {index + 1}: {row_str}")
    return "".join(parts)

def candidates_as_str(candidates):
    """
    Render a list of candidate entities as one comma-separated string.

    Each candidate becomes
    "<[ID] <id> [NAME] <name> [DESC] <description> [TYPE] <first type name>>".
    An empty description, or a missing/empty `types` list, is rendered as the
    literal text "None".

    Parameters:
    candidates (list[dict]): Dicts with 'id', 'name', 'description' and
        'types' (a list of dicts carrying a 'name' key). The dicts are not
        modified.

    Returns:
    str: The rendered candidates joined by ", " (empty string for no candidates).
    """
    rendered = []
    for c in candidates:
        # Substitute the placeholder locally: writing it back into the dict would
        # silently alter the caller's data (and anything later exported from it).
        description = 'None' if c['description'] == '' else c['description']
        # A candidate without any type must not abort the whole prompt build.
        types = c.get('types') or []
        type_name = types[0]['name'] if types else 'None'
        rendered.append(
            f"<[ID] {c['id']} [NAME] {c['name']} [DESC] {description} [TYPE] {type_name}>"
        )
    return ", ".join(rendered)

def build_prompt(table_str, column_name, cell_content, candidates, t_desc):
    """
    Assemble the instruction-style entity-linking prompt for one table cell.

    The prompt consists of a generic task preamble, the task instruction, the
    table context, the question about the selected cell followed by the
    candidate list, and a trailing "### Response: " marker, separated by
    blank lines.

    Parameters:
    table_str (str): Serialised table, inserted after the [TAB] tag.
    column_name (str): Name of the column containing the cell.
    cell_content (str): Text of the cell (the entity mention).
    candidates (str): Already-rendered list of candidate entities.
    t_desc (str): Table description, inserted after the [TLE] tag.

    Returns:
    str: The complete prompt.
    """
    preamble = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request."
    instruction = "### Instruction: This is an entity linking task. The goal for this task is to link the selected entity mention in the table cells to the entity in the knowledge base. You will be given a list of referent entities, with each one composed of an entity id, name, its description and its type. Please choose the correct one from the referent entity candidates. Note that the Wikipedia page, Wikipedia section and table caption (if any) provide important information for choosing the correct referent entity."
    context = f"### Input: [TLE] {t_desc} [TAB] {table_str}"

    # The question, the candidate list and the closing request form one paragraph.
    question = "".join([
        f"### Question: The selected entity mention in the table cell is: {cell_content}. ",
        f"The column name for ’{cell_content}’ is {column_name}. ",
        f"The referent entity candidates are: {candidates}",
        ". \nIf there are no candidates that matched the cell content the response is <NIL>. ",
        f"What is the correct referent entity for the entity mention ’{cell_content}’ ?",
    ])

    sections = [preamble, instruction, context, question, "### Response: "]
    return "\n\n".join(sections)

## Load Data


In [4]:
# Both the ground-truth file and the table folder live under the same split directory.
valid_dir = 'data/HardTablesR1/DataSets/HardTablesR1/Valid'
gt_path = f'{valid_dir}/gt/cea_gt.csv'
tables_path = f'{valid_dir}/tables'

# The ground-truth CSV has no header row, so columns are addressed by position (0, 1, 2, 3).
gt = pd.read_csv(gt_path, header=None)
tables = list_files_in_folder(tables_path)

In [5]:
# The top-level keys of `data` are the names of the tables that have retrieval results.
target_names = list(data)
print(len(target_names))

81


In [6]:
# Keep only the ground-truth rows whose first column (table name) has retrieval results.
is_target = gt[0].isin(target_names)
filtered_df = gt.loc[is_target]
len(filtered_df)

535

In [7]:

# Count the ground-truth cells whose correct entity id appears among the
# candidates retrieved for that cell.
correct = 0
# iterrows() is a generator with no length, so pass `total` explicitly to let
# tqdm render a real progress bar instead of a bare iteration counter.
for i, (name, r, c, l) in tqdm(filtered_df.iterrows(), total=len(filtered_df)):
    # The ground-truth value is '/'-separated (presumably an entity URL);
    # its last segment is compared against the candidate ids.
    target_id = l.split('/')[-1]

    # Retrieval results are keyed by the string form of the (row, col) tuple.
    ids_list = [d['id'] for d in data[name][str((r, c))]['retrieved_list']]
    if target_id in ids_list:
        correct += 1

535it [00:00, 45540.30it/s]


In [8]:
# Share of ground-truth cells whose correct entity was among the retrieved candidates.
retrieval_accuracy = correct / len(filtered_df)
print(f"Retrieval Accuracy: {retrieval_accuracy:.2f}")

Retrieval Accuracy: 0.56


In [9]:
# Sample pair for trying out the string metrics: one cell's text versus the
# lower-cased name of its first retrieved candidate.
sample_entry = data['ZRWO683W']['(1, 0)']
s1 = sample_entry['cell']
s2 = sample_entry['retrieved_list'][0]['name'].lower()

In [10]:
def score(s1, s2, w_lev=10, w_jac=8):
    """
    Combine Levenshtein and Jaccard distances into one weighted dissimilarity.

    Both components are distances, so a lower score means more similar
    strings. The Levenshtein term is a raw edit count (unbounded), while the
    Jaccard term lies in [0, 1]; the weights are applied to those raw values.

    Parameters:
    s1 (str): The first string.
    s2 (str): The second string.
    w_lev (float): Weight of the Levenshtein distance. Default is 10.
    w_jac (float): Weight of the character-level Jaccard distance. Default is 8.

    Returns:
    float: w_lev * levenshtein_distance + w_jac * jaccard_distance.
    """
    lev_dist = levenshtein_distance(s1, s2)
    jac_dist = jaccard_distance(s1, s2)

    return w_lev * lev_dist + w_jac * jac_dist

score(s1, s2)

0.0

In [11]:
# Character-level (n=1) Jaccard distance of the sample pair on its own,
# i.e. the second component of score() before weighting.
jaccard_distance(s1, s2)

0.0

In [12]:
# NOTE: this bare expression is not the last statement of the cell, so its
# value is computed and discarded rather than displayed.
data['ZRWO683W']['(1, 0)']['retrieved_list'][0]
# One 1-tuple per retrieved candidate of the sample cell, holding only its id.
prova = [(el['id'], ) for el in data['ZRWO683W']['(1, 0)']['retrieved_list']]

In [13]:
from prompts import generate_tableDesc_prompt, generate_CEA_prompt_with_t_desc
from langchain_mistralai import ChatMistralAI
# The API key is read from the environment (populated by load_dotenv()); it is None if unset.
mistral_api_key = os.getenv("MISTRAL_API_KEY")
model_22, model_7 = "open-mixtral-8x22b", "open-mixtral-8x7b"
# Two clients that differ only in the model they target; temperature=0 for both.
llm_22, llm_7 = [
    ChatMistralAI(model=model_id, temperature=0, api_key=mistral_api_key)
    for model_id in (model_22, model_7)
]

In [49]:
import time
# Name of the table currently loaded by the CEA loop; None forces a load on the first row.
table_name = None 
# Parallel lists filled by the CEA loop: ground-truth entity ids and raw model outputs.
# NOTE: the loop appends to these, so this cell must be re-run before re-running the loop.
y_true, y_pred = [], []

In [50]:
# Run cell-entity annotation (CEA) with the 8x7b model on every ground-truth
# cell whose correct entity is among the retrieved candidates.
#
# Every accumulator is (re)initialised here so that re-running this cell on its
# own starts from a clean slate: otherwise y_true / y_pred would accumulate
# duplicates and a stale table_name would skip loading the first table.
table_name = None
y_true, y_pred = [], []
index = []
export = {}
print(f"\n\nTable_name: {table_name}\n")
# iterrows() has no length, so give tqdm the total to get a real progress bar.
for i, (name, r, c, l) in tqdm(filtered_df.iterrows(), total=len(filtered_df)):

    # Load the table and describe it once per run of rows from the same table.
    if name != table_name:
        table_name = name
        table = DataTable(f"data/HardTablesR1/DataSets/HardTablesR1/Valid/tables/{name}.csv")
        # setdefault keeps results already stored for this table should its
        # rows not be contiguous in filtered_df (plain assignment would wipe them).
        export.setdefault(name, {})
        # Generate table description
        table.generate_t_description(llm_7)
        table_as_str = get_table_str(table.data)

    # Retrieval results are keyed by the string form of the (row, col) tuple.
    cell_key = str((r, c))
    cell_entry = data[name][cell_key]

    # Last '/'-separated segment of the ground-truth value is the entity id.
    target_id = l.split('/')[-1]
    ids_list = [d['id'] for d in cell_entry['retrieved_list']]

    # Only cells where retrieval succeeded are sent to the model.
    if target_id in ids_list:

        # Perform CEA:
        cell_content = cell_entry['cell']
        prompt = generate_CEA_prompt_with_t_desc(table.data, cell_content, candidates_as_str(cell_entry['retrieved_list']), table.t_desc)
        out = llm_7.invoke(prompt)
        # Fixed pause between requests (presumably to stay under the API rate limit).
        time.sleep(2)
        y_true.append(target_id)
        y_pred.append(out.content)
        index.append(i)
        export[name][cell_key] = {
            'cell': cell_content,
            'table_desc': table.t_desc,
            'cea_prompt': prompt,
            'cea_model': llm_7.model,
            'model_out': out.content
        }
      



Table_name: None



235it [06:27,  1.79s/it]

Error in parsing the ouput


535it [14:35,  1.64s/it]


In [51]:
# Normalise raw model outputs to bare ids (the answers are presumably wrapped
# like "[[[id]]]"). str.strip takes a *set* of characters, so '[]' already
# removes any run of brackets at either end; whitespace is trimmed first and
# last because a leading space or trailing newline would otherwise block the
# bracket stripping and make a correct answer compare unequal.
y_preds = [el.strip().strip('[]').strip() for el in y_pred]

In [52]:
# Exact-match accuracy of the cleaned predictions against the ground-truth ids.
correct = sum(1 for true, pred in zip(y_true, y_preds) if true == pred)
print(f"Accuracy: {correct/len(y_true)*100:.2f}%")

Accuracy: 75.00%


In [53]:
# Persist the per-cell prompts and model outputs collected in `export`.
with open('nogit/results_8x7_Cea_sub80.json', 'w') as f:
    f.write(json.dumps(export))

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the model and tokenizer
# trust_remote_code=True runs the modelling code downloaded from the model
# repository; no revision is pinned, so that code can change between runs
# (see the download warnings in this cell's output).
model = AutoModelForCausalLM.from_pretrained("apple/OpenELM-450M-Instruct", trust_remote_code=True) 
# NOTE(review): the tokenizer is loaded from a different repository than the
# model -- presumably intentional; confirm the vocabularies match.
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-hf")

  from .autonotebook import tqdm as notebook_tqdm
A new version of the following files was downloaded from https://huggingface.co/apple/OpenELM-450M-Instruct:
- configuration_openelm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/apple/OpenELM-450M-Instruct:
- modeling_openelm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


In [2]:
# Encode the prompt into tokens
# NOTE(review): `prompt` is not defined in this cell; it has to exist in the
# kernel already (on a fresh kernel this raises NameError, as the output shows).
# The [:-55] slice drops the last 55 characters of the prompt -- the reason for
# that exact length is not visible here; TODO confirm what it is meant to remove.
inputs = tokenizer(prompt[:-55], return_tensors="pt")

# Generate text
# pad_token_id is set to the tokenizer's EOS token id; max_length is a fixed 2000.
outputs = model.generate(
    inputs['input_ids'], 
    attention_mask=inputs['attention_mask'], 
    max_length=2000, 
    pad_token_id=tokenizer.eos_token_id
)

# Decode the generated tokens back into text
# Only the first returned sequence is decoded, with special tokens removed.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(generated_text)


NameError: name 'prompt' is not defined