# Setup

## Setup + Download the Model

In [None]:
# select the model: Hugging Face repository id and the name of the GGUF file inside it
model_name_or_path = "TheBloke/Llama-2-13B-chat-GGUF"
model_basename = "llama-2-13b-chat.Q5_K_M.gguf" 


# download the model
# The value returned by hf_hub_download is stored in `model_path` and later
# passed as `model_path=` when the Llama object is constructed.
from huggingface_hub import hf_hub_download
model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)



## Loading the Model (GPU)

In [None]:
from llama_cpp import Llama
# Rebind the name to None before constructing the model
# (presumably to release a previously loaded model when the cell is re-run — TODO confirm).
lcpp_llm = None
# Only model_path, n_ctx and n_parts are actually passed; every other option below is commented out.
lcpp_llm = Llama(
    model_path=model_path,
    # NOTE(review): 40960 may be a typo for 4096 (the context length usually quoted for Llama 2) — confirm.
    n_ctx=40960, # Context window
    n_parts=-1, # Number of parts to split the model into. If -1, the number of parts is automatically determined.
    # n_threads=64, # CPU cores
    # n_batch=5120, # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
    # NOTE(review): n_gpu_layers is commented out although the section title says "(GPU)" — confirm whether GPU offload is intended.
    # n_gpu_layers=1, # Change this value based on your model and your GPU VRAM pool.
    # tensor_split=8, #List of floats to split the model across multiple GPUs. If None, the model is not split
    #verbose=False, #-> Sadly this does not work due to an issue in the library https://github.com/abetlen/llama-cpp-python/issues/729
)

## Generate Response

In [None]:
def ask_llama(prompt: str) -> str:
  """Send `prompt` to the global `lcpp_llm` model and return the generated text.

  The prompt is passed through unchanged; no SYSTEM/USER/ASSISTANT template is
  wrapped around it.

  Args:
    prompt: the complete prompt text.

  Returns:
    The "text" field of the first entry in the response's "choices" list.
  """
  # Fixed sampling settings used for every request.
  sampling_options = dict(
    max_tokens=256,
    temperature=0.5,
    top_p=0.95,
    repeat_penalty=1.2,
    top_k=50,
    stop=['USER:'],  # generation stops as soon as this marker is produced
    echo=False,  # the prompt is not echoed back in the output
  )

  # Prompts are deliberately not printed here; printing every one gets noisy.
  completion = lcpp_llm(prompt=prompt, **sampling_options)

  first_choice = completion["choices"][0]
  return first_choice["text"]

## Load Data

In [None]:
from typing import Tuple
import numpy as np
import pandas as pd
from ipywidgets import IntProgress
from IPython.display import display
import time

In [None]:
def create_flights_table() -> Tuple[pd.DataFrame, pd.DataFrame]:
  """Read the prepared flights CSV files and return them as (clean, dirty)."""
  clean = pd.read_csv("./prepared/flights_clean.csv")
  dirty = pd.read_csv("./prepared/flights_dirty.csv")
  return clean, dirty
flights_clean, flights_dirty = create_flights_table()

def create_food_table() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Read the prepared food CSV files and return them as (clean, dirty)."""
    clean = pd.read_csv("./prepared/food_clean.csv")
    dirty = pd.read_csv("./prepared/food_dirty.csv")
    return clean, dirty
food_clean, food_dirty = create_food_table()

def create_hospital_table() -> Tuple[pd.DataFrame, pd.DataFrame]:
  """Read the prepared hospital CSV files and return them as (clean, dirty)."""
  clean = pd.read_csv("./prepared/hospital_clean.csv")
  dirty = pd.read_csv("./prepared/hospital_dirty.csv")
  return clean, dirty
hospital_clean, hospital_dirty = create_hospital_table()

In [None]:
def make_table(df: pd.DataFrame, label = "") -> pd.DataFrame:
  """Pivot a long-format table into one row per tuple and one column per attribute.

  Args:
    df: frame with the columns 'tupleid', 'attribute' and 'value'; every
      selected tuple must hold exactly one value per attribute.
    label: description shown next to the progress bar.

  Returns:
    A frame indexed by the first MAXIMUM_ROW_COUNT distinct tuple ids (in order
    of first appearance) with one column per distinct attribute.

  Raises:
    ValueError: from `Series.item()` if a tuple has no value, or more than one
      value, for some attribute.
  """
  attributes = df['attribute'].unique()
  # Select the rows to build by position, not by comparing id values with the
  # row limit: ids need not be 1-based or contiguous.
  tuple_ids = df['tupleid'].unique()[:MAXIMUM_ROW_COUNT]

  progressBar = IntProgress(min=0, max=len(tuple_ids), description=label)
  display(progressBar)

  grouped = df.groupby("tupleid")

  result = pd.DataFrame(np.nan, index=tuple_ids, columns=attributes)
  for tuple_id in tuple_ids:
    group = grouped.get_group(tuple_id)
    # .item() enforces exactly one value per (tuple, attribute) pair.
    result.loc[tuple_id] = [group.loc[group["attribute"] == attribute, "value"].item() for attribute in attributes]
    progressBar.value += 1
  return result

#def make_error_table(dirty: pd.DataFrame, clean: pd.DataFrame) -> pd.DataFrame:
#  return dirty.where(dirty == clean, True)

def compare_dataframes_by_row(df1, df2):
  """Return a boolean frame that is True wherever `df1` and `df2` differ.

  The comparison is element-wise. A cell that is missing (NaN/None) in both
  frames counts as equal; a plain `!=` would report it as different because
  NaN never compares equal to itself.

  Args:
    df1: first DataFrame.
    df2: second DataFrame, with the same shape as `df1`.

  Returns:
    A boolean DataFrame shaped like the inputs; True marks a differing cell.

  Raises:
    ValueError: if the two frames do not have the same shape.
  """
  # Check if the DataFrames have the same shape
  if df1.shape != df2.shape:
      raise ValueError("DataFrames must have the same shape for row-wise comparison.")

  differs = df1 != df2
  # Cells that are missing on both sides are not a difference.
  both_missing = df1.isna() & df2.isna()

  return differs & ~both_missing

def ground_truth_as_int(gt):
  """Convert a DataFrame to ints and flatten it row by row into a plain list."""
  flat = gt.astype(int).to_numpy().reshape(-1)
  return flat.tolist()


## General Setup

In [None]:
# Notebook-wide settings read by the table-building, prompting and experiment cells.
MAXIMUM_ROW_COUNT = 2 # maximum number of rows that will be evaluated
DEBUG_MESSAGES = True # print debug messages such as the prompts and responses

# Function declarations

## Prompt Table Zero Shot
This cell provides a function that prompts the model once per cell of a dataframe in a zero-shot setting (no labelled examples in the prompt).

In [None]:
# Zero-shot prompt template. Placeholders, as filled in by prompt_table_zero_shot:
# {att} = attribute (column) name, {val} = the cell value being checked,
# {ser} = the whole row serialized by serialize_row.
prompt_zero_shot = '''Is there an error in {att}:{val}?\n{ser}?'''

def serialize_row(row: pd.Series) -> str:
  """Render a row as "label: value " pairs concatenated in order.

  Each pair, including the last one, is followed by a single space.
  """
  return "".join(f"{column}: {cell} " for column, cell in row.items())

def prompt_table_zero_shot(df: pd.DataFrame):
  """Ask the model, cell by cell, whether each value in `df` is erroneous.

  For every row, one prompt per attribute is built from `prompt_zero_shot`
  and sent through `ask_llama`.

  Every row of `df` is prompted; any row limit has to be applied by the caller
  before passing the frame in, so that the result lines up with ground truth
  computed from the same frame.

  Args:
    df: the table to check.

  Returns:
    A flat list of ints in row-major order (all attributes of the first row,
    then the second row, ...): 1 if the response contains "Yes" or "yes",
    otherwise 0.
  """
  # One prompt is sent per cell, so the bar's total is the number of cells.
  progressBar = IntProgress(min=0, max=df.shape[0]*df.shape[1], description="Attributes Prompted")
  display(progressBar)

  classifications = []
  for _, row in df.iterrows():
    serialized_row = serialize_row(row)
    for attribute, value in row.items():
      # create prompt
      prompt = prompt_zero_shot.format(att=attribute, val=value, ser=serialized_row)
      response = ask_llama(prompt)
      if DEBUG_MESSAGES:
        print(prompt)
        print("--------------------")
        print(response)
        print("====================")

      # evaluate response: plain substring match on "Yes"/"yes"
      flagged = "Yes" in response or "yes" in response
      classifications.append(1 if flagged else 0)

      progressBar.value += 1

  return classifications

## Prompt Table Few Shot
This cell provides a function that prompts the model once per cell of a dataframe using few-shot prompting. It returns a list of zeros and ones derived from the model's responses.

In [None]:
# Few-shot prompt template. Placeholders, as filled in by prompt_table_few_shot:
# {att} = attribute (column) name, {exas} = newline-joined example strings from
# sample_example, {ser} = the row under test serialized by serialize_row.
prompt_few_shot = '''Is there an error in {att}?\n\n{exas}\n{ser}?'''

def sample_example(df: pd.DataFrame, comparison_df:pd.DataFrame) -> str:
  """Build one labelled few-shot example from a randomly chosen cell of `df`.

  Args:
    df: the table to draw the example row from.
    comparison_df: boolean frame aligned with `df` by position; True marks a
      cell whose value differs from the ground truth, i.e. an error.

  Returns:
    The serialized row followed by "?" and " Yes" if the sampled cell is an
    error, otherwise " No".
  """
  rand_row = np.random.randint(0, df.shape[0])
  rand_col = np.random.randint(0, df.shape[1])

  # Use truthiness, not `is`: the cell is a numpy bool, which is never the same
  # object as Python's True/False, so an identity test can never match.
  has_error = bool(comparison_df.iloc[rand_row, rand_col])
  error_string = " Yes" if has_error else " No"

  # NOTE(review): the example text does not say which attribute the label
  # refers to — confirm whether the sampled column should be mentioned.
  row = df.iloc[rand_row]
  return serialize_row(row) + "?" + error_string

def prompt_table_few_shot(df: pd.DataFrame, ground_truth: pd.DataFrame, samples=1):
  """Ask the model, cell by cell, whether `df` holds an error, with examples.

  For every attribute of every row, `samples` labelled examples are drawn
  with `sample_example`, inserted into `prompt_few_shot` together with the
  serialized row, and the prompt is sent through `ask_llama`.

  Every row of `df` is prompted; any row limit has to be applied by the caller
  before passing the frame in.

  Args:
    df: the table to check.
    ground_truth: the clean counterpart of `df` (same shape); used only to
      label the sampled examples.
    samples: number of examples placed in each prompt.

  Returns:
    A flat list of ints in row-major order: 1 if the response contains "Yes"
    or "yes", otherwise 0.
  """
  # One prompt is sent per cell, so the bar's total is the number of cells.
  progressBar = IntProgress(min=0, max=df.shape[0]*df.shape[1], description="Attributes Prompted")
  display(progressBar)
  # True where `df` differs from the ground truth.
  err_table = compare_dataframes_by_row(df, ground_truth)

  classifications = []
  for _, row in df.iterrows():
    serialized_row = serialize_row(row)
    for attribute in row.index:
      # get examples
      # NOTE(review): examples are sampled from `df` itself, so the row under
      # test can appear as its own example — confirm this is acceptable.
      examples = [sample_example(df, err_table) for _ in range(samples)]

      # create prompt
      prompt = prompt_few_shot.format(att=attribute, ser=serialized_row,exas="\n".join(examples))
      response = ask_llama(prompt)
      if DEBUG_MESSAGES:
        print(prompt)
        print("--------------------")
        print(response)
        print("====================")

      # evaluate response: plain substring match on "Yes"/"yes"
      flagged = "Yes" in response or "yes" in response
      classifications.append(1 if flagged else 0)

      progressBar.value += 1

  return classifications

# F1 Score

In [None]:
def f1_score(y_true, y_pred):
    """Compute the binary F1 score of `y_pred` against `y_true`.

    Labels are compared with 1 (positive) and 0 (negative). The inputs are
    paired with `zip`, so surplus elements of the longer one are ignored.
    Both inputs are read in a single pass, so one-shot iterators work too.

    Args:
        y_true: iterable of ground-truth labels (0 or 1).
        y_pred: iterable of predicted labels (0 or 1).

    Returns:
        The F1 score as a float; 0.0 when there are no true positives.
    """
    true_positives = false_positives = false_negatives = 0
    for truth, prediction in zip(y_true, y_pred):
        if truth == 1 and prediction == 1:
            true_positives += 1
        elif truth == 0 and prediction == 1:
            false_positives += 1
        elif truth == 1 and prediction == 0:
            false_negatives += 1

    if true_positives == 0:
        return 0.0  # precision and recall are both 0 (or undefined)

    # Equivalent to 2 * P * R / (P + R) with P = TP / (TP + FP), R = TP / (TP + FN).
    return 2 * true_positives / (2 * true_positives + false_positives + false_negatives)

# Shuffle

In [None]:
def shuffle_dfs(df1, df2, num_output_rows, seed=None):
  """Draw the same random rows, in the same random order, from two DataFrames.

  Args:
    df1: first DataFrame.
    df2: second DataFrame; must have as many rows as `df1`.
    num_output_rows: number of rows to keep from the random permutation.
    seed: optional seed for the random number generator; pass a value to get
      a reproducible sample. With None a fresh, unpredictable generator is used.

  Returns:
    A tuple `(df1_sample, df2_sample)` whose rows come from the same positions.

  Raises:
    ValueError: if the two frames do not have the same number of rows, since
      positions could then no longer be paired.
  """
  if len(df1) != len(df2):
    raise ValueError("DataFrames must have the same number of rows to be shuffled together.")

  # Generate a random permutation of row positions and keep the leading part.
  rng = np.random.default_rng(seed)
  selected = rng.permutation(len(df1))[:num_output_rows]

  # Index both DataFrames with the same positions so rows stay paired.
  return df1.iloc[selected], df2.iloc[selected]


# Experiments

## Flight Test
Computes the F1 score of Llama's zero-shot error detection on `MAXIMUM_ROW_COUNT` randomly sampled rows of the flights table.

For perspective: prompting a single row (6 attributes in this case) took around 32.192 seconds in one run and around 5.017 seconds in another, so the runtime varies considerably.

In [None]:
# Draw the same MAXIMUM_ROW_COUNT random rows from the dirty and the clean flights table.
shuffled_flights_dirty, shuffled_flights_clean = shuffle_dfs(flights_dirty, flights_clean, MAXIMUM_ROW_COUNT)
# Boolean table marking where the sampled dirty rows differ from the clean ones.
comp = compare_dataframes_by_row(shuffled_flights_dirty, shuffled_flights_clean)
# Flattened 0/1 list used as ground-truth labels when scoring.
ints = ground_truth_as_int(comp)

In [None]:
%%capture --no-stdout --no-display

# Time the zero-shot run over the sampled flights rows.
start_time = time.time()
classified = prompt_table_zero_shot(shuffled_flights_dirty)
end_time = time.time()

# Elapsed wall-clock time as a difference of time.time() readings (seconds).
time_spent = elapsed_time = end_time - start_time
# Score the predictions against the ground-truth list `ints`.
flights_zero_shot_score = f1_score(ints, classified)

In [None]:
from sklearn.metrics import f1_score as f1
ints = [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0]
classified = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0]

# Print scikit-learn's F1 (imported as `f1`) next to the notebook's own f1_score
# for the same inputs — presumably as a sanity check that both agree.
# NOTE(review): this cell rebinds the globals `ints` and `classified` to the
# hard-coded lists above; cells run afterwards that read `ints` (e.g. the
# flights few-shot scoring) will see these values instead of the computed
# ground truth — confirm this is intended.
print(f1(ints, classified))
print(f1_score(ints, classified))

In [None]:
# Print the current `ints` and `classified` lists, then the stored flights
# zero-shot score and the most recently measured runtime.
print(ints)
print(classified)
print(flights_zero_shot_score)
print(time_spent)

In [None]:
%%capture --no-stdout --no-display


# Time the few-shot run over the sampled flights rows; the clean rows are
# passed so that the sampled examples can be labelled.
start_time = time.time()
classified_few_shot = prompt_table_few_shot(shuffled_flights_dirty, shuffled_flights_clean)
end_time = time.time()

# Elapsed wall-clock time as a difference of time.time() readings (seconds).
time_spent = elapsed_time = end_time - start_time
# Score against whatever the global `ints` currently holds.
flights_few_shot_score = f1_score(ints, classified_few_shot)

In [None]:
# Print the flights few-shot score and the most recently measured runtime.
print(flights_few_shot_score)
print(time_spent)

# Food Test

Test food on a small sample subset with zero shot. For perspective: A single row (16 attributes) took around 137.798 seconds.

In [None]:
# Draw the same MAXIMUM_ROW_COUNT random rows from the dirty and the clean food table.
shuffled_food_dirty, shuffled_food_clean = shuffle_dfs(food_dirty, food_clean, MAXIMUM_ROW_COUNT)
# Boolean table marking where the sampled dirty rows differ from the clean ones.
comp = compare_dataframes_by_row(shuffled_food_dirty, shuffled_food_clean)
# Flattened 0/1 list used as ground-truth labels when scoring.
ints_food = ground_truth_as_int(comp)

In [None]:
%%capture --no-stdout --no-display

# Time the zero-shot run over the sampled food rows.
start_time = time.time()
classified = prompt_table_zero_shot(shuffled_food_dirty)
end_time = time.time()

# Elapsed wall-clock time as a difference of time.time() readings (seconds).
time_spent = elapsed_time = end_time - start_time
# Score the predictions against the food ground truth.
food_zero_shot_score = f1_score(ints_food, classified)

In [None]:
# Print the food zero-shot score and the most recently measured runtime.
print(food_zero_shot_score)
print(time_spent)

Test the food set on the same small subset with few-shot prompting.

In [None]:
%%capture --no-stdout --no-display

# Time the few-shot run over the sampled food rows; the clean rows are passed
# so that the sampled examples can be labelled.
start_time = time.time()
classified_few_shot = prompt_table_few_shot(shuffled_food_dirty, shuffled_food_clean)
end_time = time.time()

# Elapsed wall-clock time as a difference of time.time() readings (seconds).
time_spent = elapsed_time = end_time - start_time
# Score the predictions against the food ground truth.
food_few_shot_score = f1_score(ints_food, classified_few_shot)

In [None]:
# Print the food few-shot score and the most recently measured runtime.
print(food_few_shot_score)
print(time_spent)

# Hospital Test
Test hospital set on a small sample subset with zero shot. For perspective: A single row (19 attributes) took around 221.656 seconds.

In [None]:
# Draw the same MAXIMUM_ROW_COUNT random rows from the dirty and the clean hospital table.
shuffled_hospital_dirty, shuffled_hospital_clean = shuffle_dfs(hospital_dirty, hospital_clean, MAXIMUM_ROW_COUNT)
# Boolean table marking where the sampled dirty rows differ from the clean ones.
comp = compare_dataframes_by_row(shuffled_hospital_dirty, shuffled_hospital_clean)
# Flattened 0/1 list used as ground-truth labels when scoring.
ints_hospital = ground_truth_as_int(comp)

In [None]:
%%capture --no-stdout --no-display

# Time the zero-shot run over the sampled hospital rows.
start_time = time.time()
classified = prompt_table_zero_shot(shuffled_hospital_dirty)
end_time = time.time()

# Elapsed wall-clock time as a difference of time.time() readings (seconds).
time_spent = elapsed_time = end_time - start_time
# Score the predictions against the hospital ground truth.
hospital_zero_shot_score = f1_score(ints_hospital, classified)

In [None]:
# Print the most recently measured runtime and the hospital zero-shot score.
print(time_spent)
print(hospital_zero_shot_score)

Test the hospital set on the same small subset with few-shot prompting.

In [None]:
%%capture --no-stdout --no-display

# Time the few-shot run over the sampled hospital rows; the clean rows are
# passed so that the sampled examples can be labelled.
start_time = time.time()
classified_few_shot = prompt_table_few_shot(shuffled_hospital_dirty,shuffled_hospital_clean)
end_time = time.time()

# Elapsed wall-clock time as a difference of time.time() readings (seconds).
time_spent = elapsed_time = end_time - start_time
# Score the predictions against the hospital ground truth.
hospital_few_shot_score = f1_score(ints_hospital, classified_few_shot)

In [None]:
# Print the most recently measured runtime and the hospital few-shot score.
print(time_spent)
print(hospital_few_shot_score)

## Evaluation

# TODO:

- retrieve the errors from the answer in an automatic way
- compute recall/f1 etc. from clean data
- test different prompts
- test different hyperparameters (?)
- automate experiments for different parameters

