In [1]:
import pandas as pd
import numpy as np
from datasets import load_from_disk, disable_caching
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from transformers import set_seed
import torch
import sys
import os
sys.path.append(os.path.abspath('../../modules'))
from llm.model import LLMModel
from llm.context import LLMChatContext
from dotenv import load_dotenv
load_dotenv()

In [2]:
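# Presumably how the hard-coded seed lists in the experiment cells were drawn (kept for reference).
# Note: random.randint needs integer bounds, so use 10**9 rather than the float 1e9.
# import random
# seeds = [random.randint(0, 10**9) for _ in range(5)]
# seeds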

In [3]:
# Register tqdm's progress-bar integration with pandas.
# NOTE(review): no cell visible here calls a pandas progress_* method, so this may be unused — confirm.
tqdm.pandas()
# Turn off caching in the `datasets` library (going by the function name; see its docs for the exact scope).
disable_caching()

In [4]:
# Read the saved dataset from disk and materialise each split as a pandas DataFrame.
ds = load_from_disk("../../datasets/ManualDataset")
df_train, df_valid, df_test = (
    ds[split_name].to_pandas() for split_name in ("train", "valid", "test")
)

In [5]:
# Access token taken from the environment; None when the variable is not set.
ACCESS_TOKEN = os.getenv("ACCESS_TOKEN")

In [6]:
# Fail fast with an actionable message when the access token is missing or empty.
# Look at the readme to obtain the access token.
assert ACCESS_TOKEN, (
    "ACCESS_TOKEN is not set. Define it in the environment or in a .env file; "
    "see the readme for how to obtain the access token."
)

In [7]:
# Load the instruct checkpoint through the project's LLMModel wrapper.
# The access token is forwarded to both the model loader and the tokenizer loader
# (presumably because the checkpoint is access-gated — confirm).
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
gen = LLMModel.from_transformers(
    model_name,
    model_kwargs={"token": ACCESS_TOKEN},
    tokenizer_kwargs={"token": ACCESS_TOKEN},
)
# Single chat context bound to the model's tokenizer; run_experiment reuses it
# and clears it before every test row.
ctx = LLMChatContext(tokenizer=gen.tokenizer)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
def letter_to_number(letter):
    """Map an option letter to its zero-based index ('A'/'a' -> 0, 'B'/'b' -> 1, ...)."""
    base_code = ord('A')
    letter_code = ord(letter.upper())
    return letter_code - base_code
def number_to_letter(number):
    """Map a zero-based index to its option letter (0 -> 'A', 1 -> 'B', ...)."""
    letter_code = number + ord('A')
    return chr(letter_code)

# Zero-Shot and Few-Shot

In [9]:
# System prompt sent as the first chat turn for every test row, in both the
# zero-shot and the few-shot runs. The option letters A-E are converted to and
# from integer labels 0-4 by number_to_letter / letter_to_number.
# NOTE(review): assumes the dataset's label ids follow the same order as the
# options listed here — confirm against the dataset definition.
# Changing any wording in this string changes the model input and therefore
# the reported scores.
SYSTEM_MESSAGE = """You are an AI language model trained to infer relationships between entities in a sentence. You will be provided with a masked sentence containing placeholders __NE_FROM__, __NE_TO__, and __NE_OTHER__, which represent the names of companies. Your task is to determine the type of relationship between the organizational entities __NE_FROM__ and __NE_TO__ from the following choices:

A) No relationship: Indicates that there is no significant business or operational relationship between __NE_FROM__ and __NE_TO__.
B) __NE_TO__ supplies __NE_FROM__: Indicates that __NE_TO__ supplies goods or services to __NE_FROM__.
C) __NE_FROM__ supplies __NE_TO__: Indicates that __NE_FROM__ supplies goods or services to __NE_TO__.
D) Ambiguous / undirected: Indicates that __NE_FROM__ and __NE_TO__ are in some form of relationship but not implying a supplier-consumer relationship, or the relationship is not clearly defined in the given context.
E) Ownership / part-of: Indicates that __NE_FROM__ owns __NE_TO__ or is owned by __NE_TO__.

Your answer should be formatted such as:
```answer
Answer: A
Answer: B
Answer: C
Answer: D
Answer: E
```"""

In [10]:
def question_generator(sentence):
    """Build the multiple-choice user prompt for one masked sentence.

    The prompt is the sentence, the fixed question, a blank line, and the
    five answer options A-E, joined with newlines (no trailing newline).
    """
    prompt_lines = [
        f"Sentence: {sentence}",
        "Question: What can be inferred about the relationship between the entities __NE_FROM__ and __NE_TO__ from the sentence above?",
        "",
        "A) No relationship",
        "B) __NE_TO__ supplies __NE_FROM__",
        "C) __NE_FROM__ supplies __NE_TO__",
        "D) Ambiguous / undirected",
        "E) Ownership / part-of",
    ]
    return "\n".join(prompt_lines)

In [11]:
def run_experiment(seed, fewshot=False, shots_per_class=2):
    """Evaluate the model once on ``df_test`` and return its F1 scores.

    Each test row is answered in a fresh chat: the system prompt, optionally a
    block of solved few-shot examples drawn from ``df_train``, then the
    question for the row. The reply is constrained to one of the option
    letters A-E and converted back to an integer label.

    Args:
        seed: Seed handed to ``set_seed`` at the start of the run.
        fewshot: When True, prepend ``shots_per_class`` solved examples per
            label; they are re-sampled and shuffled for every test row.
        shots_per_class: Number of training rows sampled per label in
            few-shot mode. Defaults to 2, the previously hard-coded value.

    Returns:
        dict with ``f1_micro``, ``f1_macro`` and one ``f1_class_<k>`` entry
        for every label id 0-4. All five class keys are always present, even
        if a label never occurs in the targets or predictions of this run.
    """
    set_seed(seed)

    # Fixed label set matching the five answer options A-E.
    label_ids = list(range(5))

    # The per-label training pools do not change between test rows, so filter
    # df_train once here instead of five times per row.
    label_pools = (
        [df_train[df_train["label"] == label] for label in label_ids]
        if fewshot
        else []
    )

    target_box = df_test["label"].values
    result_box = []
    for i, row in tqdm(df_test.iterrows(), total=len(df_test), desc=f"Seed {seed}"):
        ctx.clear()
        ctx.add_chat("system", SYSTEM_MESSAGE)
        if fewshot:
            # Sample from the pools in label order, then shuffle so the
            # examples are not presented grouped by class.
            samples = pd.concat([pool.sample(shots_per_class) for pool in label_pools])
            samples = samples.sample(frac=1).reset_index(drop=True)
            for _, sample in samples.iterrows():
                question = question_generator(sample["masked_text"])
                ctx.add_chat("user", question)
                ctx.add_chat("assistant", f"Answer: {number_to_letter(sample['label'])}")
            # NOTE(review): the CUDA cache is only released in few-shot mode,
            # presumably because of the longer prompts — confirm this is intended.
            if i % 50 == 0:
                torch.cuda.empty_cache()
        ctx.add_chat("user", question_generator(row["masked_text"]))
        result = ctx.choice(prefill="Answer:", choice=[" A", " B", " C", " D", " E"], gen=gen)
        result_box.append(letter_to_number(result.strip()))
    result_box_np = np.array(result_box)

    # Pass the label set explicitly: without it, average=None returns one score
    # per label that happens to occur, so the f1_class_<k> keys could shift or
    # go missing and break the aggregation over runs.
    f1_micro = f1_score(target_box, result_box_np, labels=label_ids, average='micro')
    f1_macro = f1_score(target_box, result_box_np, labels=label_ids, average='macro')
    f1_classwise = f1_score(target_box, result_box_np, labels=label_ids, average=None)
    return {
        "f1_micro": f1_micro,
        "f1_macro": f1_macro,
        **{f"f1_class_{label}": score for label, score in zip(label_ids, f1_classwise)}
    }

In [12]:
# Zero-shot evaluation: one full pass over the test set per seed
seeds = [992337557, 534658016, 24606665, 558372984, 588628665]
all_results = []
for seed in seeds:
    all_results.append(run_experiment(seed, False))

# Mean and standard deviation (np.std default, i.e. population std) of each F1 metric across runs
metrics = ["f1_micro", "f1_macro", *(f"f1_class_{i}" for i in range(5))]
avg_results = {
    metric: {
        'mean': np.mean([run[metric] for run in all_results]),
        'std': np.std([run[metric] for run in all_results]),
    }
    for metric in metrics
}

# Report every metric as mean ± std
print("\nAverage Results across all runs:")
for metric, stats in avg_results.items():
    print(f"Average {metric}: {stats['mean']:.4f} ± {stats['std']:.4f}")

Seed 992337557:   0%|          | 0/745 [00:00<?, ?it/s]

Seed 534658016:   0%|          | 0/745 [00:00<?, ?it/s]

Seed 24606665:   0%|          | 0/745 [00:00<?, ?it/s]

Seed 558372984:   0%|          | 0/745 [00:00<?, ?it/s]

Seed 588628665:   0%|          | 0/745 [00:00<?, ?it/s]


Average Results across all runs:
Average f1_micro: 0.4523 ± 0.0000
Average f1_macro: 0.4666 ± 0.0000
Average f1_class_0: 0.2840 ± 0.0000
Average f1_class_1: 0.3923 ± 0.0000
Average f1_class_2: 0.6378 ± 0.0000
Average f1_class_3: 0.3277 ± 0.0000
Average f1_class_4: 0.6914 ± 0.0000


In [13]:
# Few-shot evaluation: one full pass over the test set per seed
seeds = [435438698, 288373858, 454024408, 406374547, 923762016]
all_results = []
for seed in seeds:
    all_results.append(run_experiment(seed, True))

# Mean and standard deviation (np.std default, i.e. population std) of each F1 metric across runs
metrics = ["f1_micro", "f1_macro", *(f"f1_class_{i}" for i in range(5))]
avg_results = {
    metric: {
        'mean': np.mean([run[metric] for run in all_results]),
        'std': np.std([run[metric] for run in all_results]),
    }
    for metric in metrics
}

# Report every metric as mean ± std
print("\nAverage Results across all runs:")
for metric, stats in avg_results.items():
    print(f"Average {metric}: {stats['mean']:.4f} ± {stats['std']:.4f}")

Seed 435438698:   0%|          | 0/745 [00:00<?, ?it/s]

Seed 288373858:   0%|          | 0/745 [00:00<?, ?it/s]

Seed 454024408:   0%|          | 0/745 [00:00<?, ?it/s]

Seed 406374547:   0%|          | 0/745 [00:00<?, ?it/s]

Seed 923762016:   0%|          | 0/745 [00:00<?, ?it/s]


Average Results across all runs:
Average f1_micro: 0.5270 ± 0.0092
Average f1_macro: 0.5409 ± 0.0094
Average f1_class_0: 0.4413 ± 0.0198
Average f1_class_1: 0.4859 ± 0.0115
Average f1_class_2: 0.6190 ± 0.0099
Average f1_class_3: 0.4701 ± 0.0265
Average f1_class_4: 0.6886 ± 0.0223
