# Imports

In [7]:
import pandas as pd
import os

In [219]:
def LamAPI(cell_content):
    
    url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'
    params = {
        'name': f'{cell_content}',
        'token': 'lamapi_demo_2023',
        'kg': 'wikidata',
        'fuzzy': 'True'
    }
    headers = {'accept': 'application/json'}

    response = requests.get(url, params=params, headers=headers)

    if response.status_code == 200:
        data = response.json()
        # Process the JSON data here
    else:
        print("Error:", response.status_code)
    
    list_of_dicts = data[f'{cell_content}']
    keys_to_select = ['id', 'name', 'types']

    selected_dicts = [{k: v for k, v in d.items() if k in keys_to_select} for d in list_of_dicts]
    res_dict = {d['id']: d for d in selected_dicts}
    
    for item in selected_dicts:
        for type_dict in item['types']:
            if 'id' in type_dict:
                del type_dict['id']

    string = json.dumps(selected_dicts)
    string = string.replace('"', '').replace('name:', '').replace('  ', ' ').replace('[', '').replace(']', '')
    
    return string

In [218]:
def generate_CEA_prompt(df, cell_content, ER):
    '''
    input:
        df: pandas dataframe
        cell_content : str
        ER: retrieved entities from lamapi
    '''
    task_description = "You are to choose which retrieved entity is the correct entity to be associated to the cell content"
    # Extract column names
    column_names = df.columns.tolist()
    
    # Construct the table for display in the prompt
    table_str = "Table:\n"
    table_str += "| " + " | ".join(column_names) + " |\n"
    table_str += "|-" + "-|-".join(["-" * len(col) for col in column_names]) + "-|\n"
    for index, row in df.iterrows():
        row_str = "| " + " | ".join(str(row[col]) for col in column_names) + " |\n"
        table_str += row_str
    cell = f"Cell Content: {cell_content}"
    entities = f"Retrieved Entities and their types: {ER}"
    # Add classification request
    classification_request = "Classification Request:\n Please associate the cell to the correct entity choosen between the list of retrieved entities along with their types.\nPlease provide the response strictly in the format {'id': 'entity_id'}. Do not include any additional text or explanation.\nExample: {'id': '12345'} "
    
    classification_str = "Chosen Entity ID:\n"
    #for i, col in enumerate(column_names, start=1):
    #    classification_str += f"{i}. {col}: [Your classification]\n"
    
    # Combine all parts to form the final prompt
    prompt = (
        f"{task_description}\n\n"
        f"{table_str}\n\n"
        f"{cell}\n\n"
        f"{ER}\n\n"
        f"{classification_request}\n\n"
        f"{classification_str}"
    )
    
    return prompt

# Basic classes and functions

In [262]:
import pandas as pd
import os
import json
from prompts import generate_NER_prompt, generate_CEA_prompt
from tqdm import tqdm

class DataTable:
    def __init__(self, file_path):
        self.file_path = file_path
        self.name = self.file_path.split('/')[-1].strip('.csv').strip('.json')
        self.ner = None
        self.cea = None
        if file_path.endswith('.csv'):
            self.data = pd.read_csv(file_path)
        elif file_path.endswith('.json'):
            self.data = pd.read_json(file_path)
        else:
            raise ValueError("Unsupported file format. Please use a CSV or JSON file.")

        # Convert all cells to lowercase
        self.data = self.data.map(lambda x: x.lower() if isinstance(x, str) else x)

    def get_column(self, column_index):
        if 0 <= column_index < len(self.data.columns):
            return self.data.iloc[:, column_index]
        else:
            raise IndexError(f"Column index '{column_index}' is out of range.")

    def get_row(self, row_index):
        if 0 <= row_index < len(self.data):
            return self.data.iloc[row_index]
        else:
            raise IndexError("Row index out of range.")

    def get_cell(self, row_index, column_index):
        if not (0 <= column_index < len(self.data.columns)):
            raise IndexError(f"Column index '{column_index}' is out of range.")
        if not (0 <= row_index < len(self.data)):
            raise IndexError("Row index out of range.")
        return self.data.iat[row_index, column_index]

    def shape(self):
        return self.data.shape
    
    def generate_ner_labels(self, llm):
        if(self.ner==None):
            prompt = generate_NER_prompt(self.data)
            out = llm.invoke(prompt)
            self.ner = json.loads(out.content.replace("'", '"'))
        else:
            print("Labels already generated.\n")
        return self.ner        
    
    def generate_cea_annotatons(self, llm):
        cea_dict = {}
        if (self.ner == None):
            return 'Perform Columns Named Entities classification first!'
        if (self.cea == None):
            print('passed')
            ner_columns_idx = [idx for idx in range(len(self.ner)) if self.ner[str(idx)] == 'NEC']
            for j in ner_columns_idx:
                for i in range(len(self.data)):
                
                    cell_content = self.data.iloc[i, j]
                    entity_retrieval = LamAPI(cell_content) 
                    prompt = generate_CEA_prompt(self.data, cell_content, entity_retrieval)
                    out = llm.invoke(prompt)
                    cea_out = out.content.replace("'", '"')
                    cea_out = json.loads(cea_out)
                    cea_dict[str((i,j))] = {'id': cea_out['id'], 'llm_output': out.content}
            self.cea = cea_dict
        else:
            print('CEA annotations already generated.')
        return cea_dict        
        
    def save_json(self, folder):
        dict = {
            "name": self.name,
            "named_entity_columns": self.ner,
            "cea": self.cea
        }
        # Serialize data into file:
        json.dump(dict, open(folder + f"/{self.name}.json", 'w'))
        print(f"Saved as: {folder}/{self.name}.json")
        return
    

In [140]:

def list_files_in_folder(folder_path):
    try:
        # Get a list of all entries in the directory
        entries = os.listdir(folder_path)
        
        # Filter out the files from the entries
        files = [os.path.join(folder_path, entry) for entry in entries if os.path.isfile(os.path.join(folder_path, entry))]
        
        return files
    except FileNotFoundError:
        return f"The folder '{folder_path}' does not exist."
    except Exception as e:
        return f"An error occurred: {e}"


# Pre - Processing

- Cells are converted into lowercase (implemented in DataTable class)
- Preliminary set of annotations:
    - Literal-column or Named Entity Column

In [266]:
tables_path = 'data/HardTablesR1/DataSets/HardTablesR1/Valid/tables'
tables = list_files_in_folder(tables_path)
print(len(tables))
i = 0
results = {}
for table in tables:
    t = DataTable(table)
    print(f"Table name: {t.name}\n")
    print(f"Table shape: {t.shape()}\n")
    t.generate_ner_labels(llm)
    t.generate_cea_annotatons(llm)
    results[t.name] = {
        'nec': t.ner,
        'cea': t.cea
    }
    json.dump(dict, open('results/HardTablesR1/Valid.json', 'w'))
    i += 1
    if i > 1:
        break

200
Table name: ZRWO683W

Table shape: (7, 2)

passed


KeyError: '0'

In [267]:
t.ner

{'col0': 'NEC', 'col1': 'LC'}

In [None]:
def generate_prompt_from_dataframe(df):
    task_description = "You are to classify each column in a given table as either a Named Entity Column (NEC) or a Literal Column (LC)."
    definitions = (
        "Definitions:\n"
        "- Named Entity Column (NEC): Columns that contain names of people, organizations, locations, or other proper nouns.\n"
        "- Literal Column (LC): Columns that contain numerical values, dates, measurements, or other literal values."
    )
    examples = (
        "Examples:\n"
        "- Named Entity Column (NEC) Examples:\n"
        '  - Column with values: ["John Doe", "Jane Smith", "Company XYZ", "Paris"]\n'
        '  - Column with values: ["Microsoft", "Apple", "Google", "Amazon"]\n\n'
        "- Literal Column (LC) Examples:\n"
        '  - Column with values: [34, 56, 78, 23]\n'
        '  - Column with values: ["2021-01-01", "2022-05-12", "2023-08-23"]\n'
        '  - Column with values: [5.6, 3.4, 2.8, 4.5]'
    )
    
    # Extract column names
    column_names = df.columns.tolist()
    
    # Construct the table for display in the prompt
    table_str = "Table for Classification:\n"
    table_str += "| " + " | ".join(column_names) + " |\n"
    table_str += "|-" + "-|-".join(["-" * len(col) for col in column_names]) + "-|\n"
    for index, row in df.iterrows():
        row_str = "| " + " | ".join(str(row[col]) for col in column_names) + " |\n"
        table_str += row_str
    
    # Add classification request
    classification_request = "Classification Request:\nBased on the above definitions and examples, please classify each column in the provided table as either Named Entity Column (NEC) or Literal Column (LC).\n Please provide the response strictly in the format {'column_position': 'classification'}. Do not include any additional text or explanation.\nExample: {'0': 'NEC', '1': 'LIT', '2': NEC} "
    
    classification_str = "Classification:\n"
    #for i, col in enumerate(column_names, start=1):
    #    classification_str += f"{i}. {col}: [Your classification]\n"
    
    # Combine all parts to form the final prompt
    prompt = (
        f"{task_description}\n\n"
        f"{definitions}\n\n"
        f"{examples}\n\n"
        f"{table_str}\n\n"
        f"{classification_request}\n\n"
        f"{classification_str}"
    )
    
    return prompt




# Pre-Processing

## Named Entity Columns and Literal Columns

In [224]:
# Prepare the prompt
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

api_key = ""
model = "open-mixtral-8x7b"

'''client = MistralClient(api_key=api_key)

messages = [
    ChatMessage(role="user", content=prompt)
]

# No streaming
chat_response = client.chat(
    model=model,
    messages=messages,
)

out = chat_response.choices[0].message.content
'''



'client = MistralClient(api_key=api_key)\n\nmessages = [\n    ChatMessage(role="user", content=prompt)\n]\n\n# No streaming\nchat_response = client.chat(\n    model=model,\n    messages=messages,\n)\n\nout = chat_response.choices[0].message.content\n'

In [146]:

from langchain_mistralai import ChatMistralAI
llm = ChatMistralAI(model=model, temperature=0, api_key=api_key)
# out = llm.invoke(prompt)
# No streaming

In [226]:
print(t0.ner)

None


In [227]:
t0.ner_labels(llm)

generating


{'0': 'NEC', '1': 'LC', '2': 'LC', '3': 'NEC'}

In [231]:
print(t0.cea)
t0.generate_cea_annotatons(llm)
print(t0.cea)

None
passed
{(0, 0): {'id': 'Q11209', 'llm_output': "{'id': 'Q11209'}"}}


# Entity Retrieval

### LamAPI interface

In [None]:
import requests
import json

def LamAPI(cell_content):
    
    url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'
    params = {
        'name': f'{cell_content}',
        'token': 'lamapi_demo_2023',
        'kg': 'wikidata',
        'fuzzy': 'True'
    }
    headers = {'accept': 'application/json'}

    response = requests.get(url, params=params, headers=headers)

    if response.status_code == 200:
        data = response.json()
        # Process the JSON data here
    else:
        print("Error:", response.status_code)
    
    list_of_dicts = data[f'{cell_content}']
    keys_to_select = ['id', 'name', 'types']

    selected_dicts = [{k: v for k, v in d.items() if k in keys_to_select} for d in list_of_dicts]
    res_dict = {d['id']: d for d in selected_dicts}
    
    for item in selected_dicts:
        for type_dict in item['types']:
            if 'id' in type_dict:
                del type_dict['id']

    string = json.dumps(selected_dicts)
    string = string.replace('"', '').replace('name:', '').replace('  ', ' ').replace('[', '').replace(']', '')
    
    return string

### Retrieval

In [186]:
t0.ner['0']

'NEC'

In [187]:
a = 0
t0.ner[f"{a}"]
ner_columns_idx = [idx for idx in range(len(t0.ner)) if t0.ner[str(idx)] == 'NEC']
ner_columns_idx

[0, 3]

In [193]:
def generate_CEA_prompt(df, cell_content, ER):
    '''
    input:
        df: pandas dataframe
        cell_content : str
        ER: retrieved entities from lamapi
    '''
    task_description = "You are to choose which retrieved entity is the correct entity to be associated to the cell content"
    # Extract column names
    column_names = df.columns.tolist()
    
    # Construct the table for display in the prompt
    table_str = "Table:\n"
    table_str += "| " + " | ".join(column_names) + " |\n"
    table_str += "|-" + "-|-".join(["-" * len(col) for col in column_names]) + "-|\n"
    for index, row in df.iterrows():
        row_str = "| " + " | ".join(str(row[col]) for col in column_names) + " |\n"
        table_str += row_str
    cell = f"Cell Content: {cell_content}"
    entities = f"Retrieved Entities and their types: {ER}"
    # Add classification request
    classification_request = "Classification Request:\n Please associate the cell to the correct entity choosen between the list of retrieved entities along with their types.\nPlease provide the response strictly in the format {'id': 'entity_id'}. Do not include any additional text or explanation.\nExample: {'id': '12345'} "
    
    classification_str = "Chosen Entity ID:\n"
    #for i, col in enumerate(column_names, start=1):
    #    classification_str += f"{i}. {col}: [Your classification]\n"
    
    # Combine all parts to form the final prompt
    prompt = (
        f"{task_description}\n\n"
        f"{table_str}\n\n"
        f"{cell}\n\n"
        f"{ER}\n\n"
        f"{classification_request}\n\n"
        f"{classification_str}"
    )
    
    return prompt

In [215]:
def generate_cea_annotatons(t0, llm):
    cea_dict = {}
    if (t0.cea == None):
        print('passed')
        ner_columns_idx = [idx for idx in range(len(t0.ner)) if t0.ner[str(idx)] == 'NEC']
        for j in ner_columns_idx:
            for i in range(len(t0.data)):
            
                cell_content = t0.data.iloc[i, j]
                entity_retrieval = LamAPI(cell_content) 
                prompt = generate_CEA_prompt(t0.data, cell_content, entity_retrieval)
                out = llm.invoke(prompt)
                cea_out = out.content.replace("'", '"')
                cea_out = json.loads(cea_out)
                cea_dict[(i,j)] = {'id': cea_out['id'], 'llm_output': out.content}
                break
            break
    return cea_dict

cea_dict = generate_cea_annotatons(t0, llm)    

passed


In [217]:
cea_dict[(0, 0)]['id']

'Q11209'

In [188]:
ER = LamAPI(cell_content)
print(ER)

{id: Q108779219, United States Department of Defense, types: { record label}}, {id: Q11209, United States Department of Defense, types: { defence ministry}, { United States federal executive department}}, {id: Q97430551, Category:Fictional United States Department of Defense officials, types: { Wikimedia category}}, {id: Q6453078, Category:United States Department of Defense, types: { Wikimedia category}}, {id: Q107056052, Portal:United States Department of Defense, types: { Wikimedia portal}}, {id: Q7748656, Template:United States Department of Defense, types: { Wikimedia template}}, {id: Q30803168, Category:Seals of the United States Department of Defense, types: { Wikimedia category}}, {id: Q853561, United States Department of Defense aerospace vehicle designation, types: { null}}, {id: Q96376137, Defense Distributed v. United States Department of State, types: { null}}, {id: Q16955038, Nation Magazine v. United States Department of Defense, types: { legal case}}, {id: Q105677106, U

In [None]:
prompt = generate_CEA_prompt(t_0, cell_content, ER)
print(prompt)

In [None]:
messages = [
    ChatMessage(role="user", content=prompt)
]

# No streaming
chat_response = client.chat(
    model=model,
    messages=messages,
)

cea_out = chat_response.choices[0].message.content



In [None]:
cea_out = cea_out.replace("'", '"')
c_out = json.loads(cea_out)

print(c_out['id'])

Q11209
