In [1]:
import ast
import re
from itertools import combinations
from time import sleep
import random
import pandas as pd
import torch
from transformers import set_seed, pipeline
from datasets import disable_caching, load_from_disk, Dataset
from tqdm.auto import tqdm
import sys
import os
sys.path.append(os.path.abspath('../../modules'))
from llm.context import LLMChatContext
from llm.model import LLMModel
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# Hook tqdm into pandas (tqdm documents this as enabling the `progress_apply` family).
tqdm.pandas()
# Turn off the `datasets` library cache for this session.
disable_caching()
# Seed dataset of manually curated rows, converted to a DataFrame.
# Later cells read its 'label', 'NE_FROM' and 'NE_TO' columns.
df_train = load_from_disk("../../datasets/ManualReducedDataset").to_pandas()

In [3]:
# Access token read from the environment (None when the variable is unset).
# `load_dotenv()` was called in the import cell, so the value may come from a .env file.
ACCESS_TOKEN = os.environ.get("ACCESS_TOKEN")

In [4]:
# Fail fast with an explicit message when no access token is configured.
# See the README for how to obtain the access token.
assert ACCESS_TOKEN, "ACCESS_TOKEN is not set; see the README for how to obtain an access token."

In [5]:
# Model identifier passed to the project's LLM wrapper.
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
# Build the generator used by every generation cell below (`gen.tokenizer`, `gen.unload()`).
# The access token is forwarded to both the model and the tokenizer loaders.
# NOTE(review): "max_length": 4096 is passed through model_kwargs; presumably a
# sequence-length cap — confirm against LLMModel.from_transformers.
gen = LLMModel.from_transformers(
    model_name,
    model_kwargs={"token": ACCESS_TOKEN, "max_length": 4096},
    tokenizer_kwargs={"token": ACCESS_TOKEN},
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

# Zero-shot Synthetic Data Generation

In [6]:
def generate_question(A:str, B:str, label: int):
    match label:
        case 1:
            user_prompt = f"Generate a single sentence that imply a buyer-supplier relationship between {A} and {B}, where {A} is the buyer and {B} is the supplier." # Note that the sentences should not portray an ambiguous partnership relationship between {A} and {B}. The sentence should portray a clear indication of a directed supply chain relationship.
        case 2:
            user_prompt = f"Generate a single sentence that imply a supplier-buyer relationship between {A} and {B}, where {A} is the supplier and {B} is the buyer." #,  Note that the sentences should not portray an ambiguous partnership relationship between {A} and {B}. The sentence should portray a clear indication of a directed supply chain relationship. such as expressing the items that are shipped by {A} to {B}.
        case 3:
            user_prompt = f"Generate a single sentence that imply a relationship between {A} and {B} that is portrayed as arbitrary or undirected." # This relationship can take various forms, such as collaborations, joint ventures, strategic alliances, or any other type of ambiguous business relationship. Note that the sentence should not convey a supply chain relationship between the two companies.
        case 4:
            user_prompt = f"Generate a single sentence that imply an ownership relationship between {A} and {B}, where {A} owns or is owned by {B}."
    #user_prompt += """\nThe sentence should meet the following criteria:
#- Crafted in a style that could be reminiscent of a newspaper article, press release, industry report, or any other relevant source.
#- Should be unique, diverse and creative."""
#- Contain the exact entity words '{A}' and '{B}'.
    return user_prompt

def generate_sentences(label: int, shots: int = 10):
    """Generate one synthetic sentence for an entity pair drawn from ``df_train``.

    A random row with the requested label supplies the entity pair. With a
    probability of about one half the two entities are swapped (flipping the
    directed labels 1 <-> 2), and the prompt is built from the swapped roles so
    that the stored record and the question sent to the model always agree.

    Args:
        label: Relationship class to sample from ``df_train`` (see
            ``generate_question`` for the meaning of 1-4).
        shots: Number of extra rows drawn alongside the target. They are
            reserved for few-shot examples, which this zero-shot notebook does
            not send; the rows are still sampled so the parameter keeps working.

    Returns:
        A dict with keys ``"A"``, ``"B"``, ``"label"`` and ``"sentences"``.
        ``"sentences"`` holds the generated sentence, or is empty when the
        model output could not be parsed.

    Raises:
        ValueError: Propagated from ``DataFrame.sample`` when fewer than
            ``shots + 1`` rows carry the requested label.
    """
    totals = df_train[df_train['label'] == label].sample(shots+1)
    # The last sampled row is the target; the first `shots` rows are unused here.
    target = totals.iloc[shots]
    A = target['NE_FROM']
    B = target['NE_TO']
    if random.random() > 0.5:
        # Swap the roles; directed labels must flip with them, 3 and 4 are symmetric.
        A, B = B, A
        if label == 2:
            label = 1
        elif label == 1:
            label = 2
    save_dict = {
        "A": A,
        "B": B,
        "label": label,
        "sentences": []
    }
    ctx = LLMChatContext(tokenizer=gen.tokenizer)
    # NOTE(review): this is a plain (non-f) string, so the doubled braces below reach
    # add_chat literally as "{{" / "}}". Confirm whether add_chat formats its content
    # before changing them.
    ctx.add_chat(role="system", content="""You are a helpful AI assistant with extensive knowledge of the global supply chain. Your task is to generate a realistic yet unique sentence that matches the given context. The sentence should be crafted in a style that could be reminiscent of a newspaper article, press release, industry report, or any other relevant source. Please provide your response in the JSON format below:
```json
{{
    "sentence": \"(example sentence)\"
}}
```""")
    # Ask with the (possibly swapped) roles so the prompt matches what is recorded in save_dict.
    ctx.add_chat(role="user", content=generate_question(A=A, B=B, label=label))
    response = ctx.generate(prefill="```json", gen=gen, generation_config={"do_sample": True, "top_p": 0.95, "stop_strings": "}"})
    try:
        # Generation stops at "}", so the closing brace is appended before parsing.
        # NOTE(review): assumes ctx.generate returns the text without the stop string — confirm.
        obj = ast.literal_eval(response + "}")
        sentence = obj['sentence']
        # Only keep real strings; anything else would break the regex clean-up later on.
        if isinstance(sentence, str):
            save_dict["sentences"].append(sentence)
    except Exception:
        # Deliberately best-effort: an unparsable reply leaves "sentences" empty,
        # and such records are dropped later by explode(...).dropna().
        pass
    del ctx
    return save_dict

In [7]:
# Sanity check: number of rows in the seed dataset.
len(df_train)

181

In [8]:
# Fix the seed before any sampling so the run starts from a known state.
set_seed(42)
results = []
for idx in tqdm(range(700)):
    # One record per relationship label (1-4) in every round, in label order.
    results.extend(generate_sentences(label=label_id) for label_id in (1, 2, 3, 4))
    # Rewrite the full snapshot each round so an interrupted run keeps its progress.
    pd.DataFrame(results).to_json(
        "2.Zero_shot_Synthetic_Data.json",
        orient="records",
        force_ascii=False,
    )
    # Every fifth round (starting with the first): free cached GPU memory and pause.
    if idx % 5 == 0:
        torch.cuda.empty_cache()
        sleep(15)

  0%|          | 0/700 [00:00<?, ?it/s]

In [9]:
# Generation is finished: release the LLM before the NER model is loaded below.
# NOTE(review): presumably `unload()` frees the model weights — confirm in LLMModel.
gen.unload()
# Return PyTorch's cached CUDA memory to the device.
torch.cuda.empty_cache()

In [10]:
def strip_parentheses(row):
    """Remove bracketed fragments from ``row["original_text"]`` and tidy whitespace.

    Anything enclosed in ``()``, ``[]``, ``<>`` or ``{}`` (shortest match, within
    one line) is deleted, runs of two or more whitespace characters collapse to a
    single space, and the result is stripped. The row is updated in place and
    returned.
    """
    without_brackets = re.sub(r'\(.*?\)|\[.*?\]|<.*?>|\{.*?\}', '', row["original_text"])
    row["original_text"] = re.sub(r'\s{2,}', ' ', without_brackets).strip()
    return row

def text_generator(df):
    """Lazily yield the ``original_text`` value of every row of ``df``, in row order."""
    yield from df["original_text"]

def process_named_entities(df, ner_pipeline):
    """Run NER over every row's text and keep the organisation entities.

    Returns one list per row, in row order. A row's list contains its entities
    whose ``entity_group`` is ``'ORG'``, or is empty when fewer than two such
    entities were found (no pair can be formed from them).
    """
    predictions = ner_pipeline(text_generator(df), aggregation_strategy="first", batch_size=256)
    results = []
    for output in tqdm(predictions):
        organisations = [entity for entity in output if entity['entity_group'] == 'ORG']
        if len(organisations) < 2:
            # A single organisation cannot form a pair; record an empty result instead.
            organisations = []
        results.append(organisations)
    return results

def create_entity_pairs(df, ner_results):
    """Expand every row into one record per unordered pair of its entities.

    Args:
        df: Frame whose rows are passed on to ``create_entity_pair``.
        ner_results: One entity list per row of ``df``, in the same order as
            the rows (as produced by ``process_named_entities``).

    Returns:
        A flat list of pair records, one for each 2-combination of a row's
        entities. Rows with fewer than two entities contribute nothing.

    Raises:
        ValueError: If ``ner_results`` holds fewer entries than ``df`` has rows.
    """
    if len(ner_results) < len(df):
        raise ValueError(
            f"ner_results has {len(ner_results)} entries but df has {len(df)} rows"
        )
    entity_pairs = []
    # Pair rows with NER outputs by position rather than by index label, so frames
    # with a non-default index (filtered, shuffled, ...) are handled correctly.
    # `total` gives tqdm a length, since neither zip nor iterrows exposes one.
    for (_, row), output in tqdm(zip(df.iterrows(), ner_results), total=len(df)):
        for token_idx_from, token_idx_to in combinations(range(len(output)), r=2):
            entity_pairs.append(create_entity_pair(row, output, token_idx_from, token_idx_to))
    return entity_pairs

def create_entity_pair(row, entities, idx_from, idx_to):
    """Build the record for one (from, to) entity pair of a sentence.

    Every entity span in the text is replaced by a placeholder: ``__NE_FROM__``
    for the entity at ``idx_from``, ``__NE_TO__`` for the one at ``idx_to`` and
    ``__NE_OTHER__`` for all remaining entities. The label comes from
    ``determine_label``.

    Returns:
        A dict with keys ``original_text``, ``label``, ``NE_FROM``, ``NE_TO``,
        ``NE_OTHER`` (words of the other entities, last entity first) and
        ``masked_text``.
    """
    text = row['original_text']
    masked = text
    ne_from = None
    ne_to = None
    others = []

    # Substitute from the last entity backwards so that the start/end offsets of
    # the entities still to be processed are not shifted by earlier replacements.
    indexed = list(enumerate(entities))
    for position, entity in indexed[::-1]:
        if position == idx_to:
            ne_to = entity['word']
            placeholder = "__NE_TO__"
        elif position == idx_from:
            ne_from = entity['word']
            placeholder = "__NE_FROM__"
        else:
            others.append(entity['word'])
            placeholder = "__NE_OTHER__"
        masked = masked[:entity['start']] + placeholder + masked[entity['end']:]

    return {
        "original_text": text,
        "label": determine_label(row, ne_from, ne_to),
        "NE_FROM": ne_from,
        "NE_TO": ne_to,
        "NE_OTHER": others,
        "masked_text": masked,
    }

def determine_label(row, ne_from, ne_to):
    """Return the label of the (ne_from, ne_to) pair relative to the row's A/B pair.

    * Same orientation as (A, B): the row's label.
    * Reversed orientation (B, A): labels 1 and 2 are exchanged, any other
      label is returned unchanged.
    * Any other pair: 0.
    """
    if ne_from == row['A'] and ne_to == row['B']:
        return row['label']
    if not (ne_from == row['B'] and ne_to == row['A']):
        # Neither orientation matches the pair the sentence was generated for.
        return 0
    row_label = row['label']
    if row_label == 1:
        return 2
    if row_label == 2:
        return 1
    return row_label

In [11]:
# One row per generated sentence: records without a sentence disappear in dropna(),
# then bracketed fragments are stripped from the text.
df = (
    pd.DataFrame(results)
    .explode("sentences")
    .dropna()
    .reset_index(drop=True)
    .rename(columns={"sentences": "original_text"})
    .apply(strip_parentheses, axis=1)
)

# Detect organisations in every sentence and expand them into entity pairs.
ner_pipeline = pipeline("ner", model="dslim/bert-large-NER", device="cuda")
ner_results = process_named_entities(df, ner_pipeline)
entity_pairs = create_entity_pairs(df, ner_results)

# Keep a sentence only when all of its pairs have non-empty entity names and
# at least one pair carries a non-zero label.
final_df = (
    pd.DataFrame(entity_pairs)
    .groupby('original_text')
    .filter(
        lambda group: group['NE_FROM'].ne('').all()
        and group['NE_TO'].ne('').all()
        and group['label'].ne(0).any()
    )
    .reset_index(drop=True)
)
Dataset.from_pandas(final_df).save_to_disk("../../datasets/ZeroShotReducedDataset")

Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Saving the dataset (0/1 shards):   0%|          | 0/1970 [00:00<?, ? examples/s]