In [1]:
import ast
import random
import re
from time import sleep

import cleanco
import pandas as pd
import torch
from datasets import disable_caching, Dataset
from tqdm.auto import tqdm
from transformers import set_seed, pipeline
from itertools import combinations
import sys
import os
sys.path.append(os.path.abspath('../../modules'))
from llm.context import LLMChatContext
from llm.model import LLMModel
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# Register tqdm's pandas integration for this session.
tqdm.pandas()
# Turn off caching in the `datasets` library for this session.
disable_caching()

In [3]:
# Access token read from the environment (None when the variable is unset);
# load_dotenv() above may have populated it from a .env file.
ACCESS_TOKEN = os.getenv("ACCESS_TOKEN")

In [4]:
# Fail fast when the access token is missing. Look at the readme to obtain the access token.
assert ACCESS_TOKEN, (
    "ACCESS_TOKEN is not set: define it in the environment or in a .env file "
    "(see the readme for how to obtain the access token)."
)

In [5]:
# Checkpoint used for both generation stages of this notebook.
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

# The same access token is handed to the model loader and the tokenizer loader.
hub_auth = {"token": ACCESS_TOKEN}
gen = LLMModel.from_transformers(
    model_name,
    model_kwargs={**hub_auth, "max_length": 4096},
    tokenizer_kwargs=dict(hub_auth),
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

# Two-Stage Synthetic Data Generation
## 1. Build synthetic relation triples

In [6]:
def company_tree(company_name):
    """Ask the LLM for an overview of a company and for its related companies.

    Two generations are run in one chat context: first a one-paragraph
    overview of ``company_name`` (greedy decoding), then a JSON object listing
    up to three related companies per relationship category (sampled).

    Args:
        company_name: Name of the company to expand.

    Returns:
        A tuple ``(save_dict, (company_name, company_info))`` where
        ``save_dict`` holds ``"company_name"`` plus one cleaned list of names
        for each of the five relationship keys, and ``company_info`` is the
        generated overview paragraph.

    Raises:
        ValueError, SyntaxError, TypeError, ...: whatever error parsing or
            validating the model response produced. The original exception is
            propagated unchanged so callers can see what actually went wrong.
    """
    relation_keys = (
        "supplier_companies",
        "buyer_companies",
        "partnership_companies",
        "ownership_companies",
        "subsidiary_companies",
    )
    save_dict = {"company_name": company_name}
    ctx = LLMChatContext(tokenizer=gen.tokenizer)
    ctx.add_chat(
        role="system",
        content="You are a helpful AI assistant with extensive knowledge of the global supply chain.",
    )
    ctx.add_chat(
        role="user",
        content=f"Provide a detailed overview of the company '{company_name}'. Include information about the industry in which it operates and the specific products or services it offers. Limit your answer to a single paragraph.",
    )
    # Company info
    company_info = ctx.generate(
        gen=gen, generation_config={"do_sample": False, "stop_strings": "\n"}
    )
    ctx.add_chat(
        role="user",
        content=f"""Your task is to identify related companies of '{company_name}' in its supply chain and categorize them according to the following relationship categories:

- **Supplier companies:** These are companies that provide raw materials, components, or services to {company_name} for its production processes or operations. They are part of the upstream supply chain.

- **Buyer companies:** These are companies that purchase products or services from {company_name}. They could be wholesalers, retailers, or end consumers in the downstream supply chain.

- **Partnership companies:** These are companies that have strategic alliances, joint ventures, or collaborations with {company_name} for mutual benefit. These partnerships can involve co-development of products, shared resources, or other cooperative efforts.

- **Ownership companies:** These are companies or entities that have a significant ownership stake in {company_name}. They might be parent companies, holding companies, or major shareholders.

- **Subsidiary companies:** These are companies that are owned or controlled by {company_name}. They operate under the larger corporate umbrella of {company_name} and may engage in related or diverse business activities.

Please provide your response in the JSON format below:
```json
{{
  "supplier_companies": [...],
  "buyer_companies": [...],
  "partnership_companies": [...],
  "ownership_companies": [...],
  "subsidiary_companies": [...]
}}
```

For each category, list only the names of actual, existing companies. Do not include organizations that are not companies (e.g., governments, universities, banks) or companies that are not directly part of the supply chain. If there are no companies that match a specific relationship category, still include the category name in the JSON output but leave the list empty. Also, for each category, do not write more than 3 companies.""",
    )
    response = ctx.generate(
        prefill="```json\n",
        gen=gen,
        generation_config={
            "do_sample": True,
            "top_p": 0.95,
            "stop_strings": "}",
        },
    )
    try:
        # Generation stops at the first "}", so the closing brace is re-appended
        # before parsing.
        obj = ast.literal_eval(response + "}")
        missing = [key for key in relation_keys if key not in obj]
        if missing:
            raise ValueError(f"Model response is missing keys: {missing}")
        for key in relation_keys:
            names = obj[key]
            if not isinstance(names, list):
                # A bare string would otherwise be iterated character by character.
                raise ValueError(f"Expected a list for {key!r}, got {type(names).__name__}")
            cleaned = []
            for name in names:
                if not isinstance(name, str):
                    continue
                # Drop parenthesised remarks, then the legal-form suffix.
                name = re.sub(r"\([^)]*\)", "", name).strip()
                name = cleanco.basename(name).strip()
                if name:
                    cleaned.append(name)
            save_dict[key] = cleaned
        return save_dict, (company_name, company_info)
    finally:
        del ctx


In [7]:
set_seed(42)
stage_one_results = []
seed_companies = ["Tesla", "Boeing"]
company_info_obj = {}

# Expand the company graph for three rounds: every company discovered as a
# supplier, buyer or partner in one round becomes a seed for the next round.
for round_idx in range(3):
    seed_candidates = []
    for idx, company in enumerate(tqdm(seed_companies)):
        try:
            company_dict, company_info_tuple = company_tree(company)
            stage_one_results.append(company_dict)
            company_info_obj[company_info_tuple[0]] = company_info_tuple[1]
            seed_candidates += (
                company_dict["supplier_companies"]
                + company_dict["buyer_companies"]
                + company_dict["partnership_companies"]
            )
        except Exception as exc:
            # Best-effort crawl: a company whose response cannot be parsed is
            # skipped, but the failure is reported instead of being hidden.
            print(f"Skipping {company!r} (round {round_idx}): {exc!r}")
        if (idx + 1) % 5 == 0:
            torch.cuda.empty_cache()
            sleep(15)
    # Sort the de-duplicated candidates: set iteration order for strings
    # varies between interpreter sessions, which would change the order of the
    # sampled generations even though the seed is fixed.
    seed_companies = sorted(
        candidate
        for candidate in set(seed_candidates)
        if candidate not in company_info_obj
    )


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/92 [00:00<?, ?it/s]

In [8]:
# Build (A, B, label) triples with A and B in alphabetical order.
# Label meaning, as used by the stage-two prompts:
#   1 = A buys from B, 2 = A supplies B, 3 = partnership, 4 = ownership.
# Each entry is (result key, label when the related company sorts first,
# label when the queried company sorts first).
# NOTE(review): ownership and subsidiary lists share label 4 and are ordered
# alphabetically, so the owner/owned direction is not preserved — confirm
# this is intended.
relation_label_table = (
    ("supplier_companies", 2, 1),
    ("buyer_companies", 1, 2),
    ("partnership_companies", 3, 3),
    ("ownership_companies", 4, 4),
    ("subsidiary_companies", 4, 4),
)

unique_triples = set()
for row in stage_one_results:
    company = row['company_name']
    if not company:
        continue
    for key, label_other_first, label_company_first in relation_label_table:
        for c in row[key]:
            if c == company:
                # A company cannot be related to itself.
                continue
            if c < company:
                unique_triples.add((c, company, label_other_first))
            else:
                unique_triples.add((company, c, label_company_first))

# Sorted so the row order (and therefore the stage-two sampling sequence) is
# the same in every interpreter session; plain set order is not.
relation_triples = sorted(unique_triples)

df_relation_triples = pd.DataFrame(relation_triples, columns=["A", "B", "label"])
df_relation_triples.to_json("df_relation_triples.json", orient="records", force_ascii=False)

In [9]:
from collections import Counter

# Number of relation triples per label.
Counter(label for _, _, label in relation_triples)

Counter({4: 336, 2: 302, 1: 295, 3: 283})

## 2. Generate Zero-shot Synthetic Sentences from Relation Triples

In [10]:
# Cache of company overviews, keyed by company name.
info_dict = {}


def get_company_info(company_name, info):
    """Return a one-paragraph overview of a company, generating it if needed.

    Args:
        company_name: Name of the company.
        info: An already-known overview. When truthy it is stored in
            ``info_dict`` and returned as is.

    Returns:
        The supplied overview, the cached overview, or a newly generated one.
        Newly generated overviews are cached so that each company is sent to
        the model at most once.
    """
    if info:
        info_dict[company_name] = info
        return info
    if company_name in info_dict:
        return info_dict[company_name]
    ctx = LLMChatContext(tokenizer=gen.tokenizer)
    ctx.add_chat(
        role="system",
        content="You are a helpful AI assistant with extensive knowledge of the global supply chain.",
    )
    ctx.add_chat(
        role="user",
        content=f"Provide a detailed overview of the company '{company_name}'. Include information about the industry in which it operates and the specific products or services it offers. Limit your answer to a single paragraph.",
    )
    # Company info
    result = ctx.generate(
        gen=gen, generation_config={"do_sample": False, "stop_strings": "\n"}
    )
    del ctx
    # Remember the generated overview; without this the cache lookup above
    # could never hit for generated entries.
    info_dict[company_name] = result
    return result


def generate_sentences(A: str, B: str, label: int):
    # swap
    if random.random() > 0.5:
        C = A
        A = B
        B = C
        if label == 2:
            label = 1
        elif label == 1:
            label = 2
    save_dict = {"A": A, "B": B, "label": label, "sentences": []}
    ctx = LLMChatContext(tokenizer=gen.tokenizer)
    ctx.add_chat(
        role="system",
        content="""You are a helpful AI assistant with extensive knowledge of the global supply chain. Your task is to generate a realistic yet unique sentence that matches the given context.""",
    )
    # ctx.add_chat(
    #     role="user",
    #     content=f"Before generating the sentence, provide a detailed overview of the company '{A}'. Include information about the industry in which it operates and the specific products or services it offers. Limit your answer to a single paragraph.",
    # )
    # ctx.add_chat(role="assistant", content=get_company_info(A, None))
    # ctx.add_chat(
    #     role="user",
    #     content=f"Next, provide a detailed overview of the company '{B}'. Include information about the industry in which it operates and the specific products or services it offers. Limit your answer to a single paragraph.",
    # )
    # ctx.add_chat(role="assistant", content=get_company_info(B, None))
    match label:
        case 1:
            user_prompt = f"Generate three different sentences that imply a buyer-supplier relationship between {A} and {B}, where {A} is the buyer and {B} is the supplier. Note that the sentences should not portray an ambiguous partnership relationship between {A} and {B}. The sentences should portray a clear indication of a directed supply chain relationship, such as expressing the items that are bought by {A} from {B}."
        case 2:
            user_prompt = f"Generate three different sentences that imply a supplier-buyer relationship between {A} and {B}, where {A} is the supplier and {B} is the buyer. Note that the sentences should not portray an ambiguous partnership relationship between {A} and {B}. The sentences should portray a clear indication of a directed supply chain relationship, such as expressing the items that are shipped by {A} to {B}."
        case 3:
            user_prompt = f"Generate three different sentences that imply a relationship between {A} and {B} that is portrayed as arbitrary or undirected. This relationship can take various forms, such as collaborations, joint ventures, strategic alliances, or any other type of ambiguous business relationship. Note that the sentences should not convey a supply chain relationship between the two companies."
        case 4:
            user_prompt = f"Generate three different sentences that imply an ownership relationship between {A} and {B}, where {A} owns or has acquired {B}."
    user_prompt += f"""The sentences should meet the following criteria:
- Crafted in a style that could be reminiscent of a newspaper article, press release, industry report, or any other relevant source.
- Contain the exact entity words '{A}' and '{B}'.
- Should be unique, diverse and creative.

Please provide your response in the JSON format below:
```json
{{
    "sentences": ["(sentence 1)", "(sentence 2)", "(sentence 3)"]
}}
```"""
    ctx.add_chat(role="user", content=user_prompt)
    response = ctx.generate(
        prefill="```json\n",
        gen=gen,
        generation_config={
            "do_sample": True,
            "top_p": 0.95,
            "stop_strings": "}",
        },
    )
    try:
        obj = ast.literal_eval(response + "}")
        save_dict["sentences"] = obj["sentences"]
    except Exception:
        print("!")
    finally:
        del ctx
        return save_dict


In [15]:
set_seed(42)
stage_two_results = []
triple_rows = tqdm(df_relation_triples.iterrows(), total=len(df_relation_triples))
for row_idx, triple in triple_rows:
    generated = generate_sentences(A=triple["A"], B=triple["B"], label=triple['label'])
    stage_two_results.append(generated)
    # Rewrite the checkpoint file after every triple so partial progress
    # is on disk if the run is interrupted.
    checkpoint = pd.DataFrame(stage_two_results)
    checkpoint.to_json("2.Two stage.json", orient="records", force_ascii=False)
    if (row_idx + 1) % 5 != 0:
        continue
    # Every fifth triple: clear the CUDA cache and pause.
    torch.cuda.empty_cache()
    sleep(15)

  0%|          | 0/1216 [00:00<?, ?it/s]

!


In [16]:
# Sentence generation is finished: unload the LLM, then clear PyTorch's CUDA cache.
# NOTE(review): presumably `gen.unload()` releases the model weights — confirm against LLMModel.
gen.unload()
torch.cuda.empty_cache()

In [17]:
def strip_parentheses(row):
    """Remove bracketed spans from ``row["original_text"]`` and tidy whitespace.

    Text enclosed in ``()``, ``[]``, ``<>`` or ``{}`` is deleted (non-greedy),
    runs of two or more whitespace characters become a single space, and the
    result is stripped. The row is updated in place and returned.
    """
    without_brackets = re.sub(r'\(.*?\)|\[.*?\]|<.*?>|\{.*?\}', '', row["original_text"])
    row["original_text"] = re.sub(r'\s{2,}', ' ', without_brackets).strip()
    return row

def text_generator(df):
    """Lazily yield the ``original_text`` value of every row of ``df``, in row order."""
    yield from (record["original_text"] for _, record in df.iterrows())

def process_named_entities(df, ner_pipeline):
    """Run NER over every sentence in ``df`` and keep organisation entities.

    Returns one list per sentence: the entities whose ``entity_group`` is
    ``'ORG'`` when there are at least two of them, otherwise an empty list
    (no pair can be formed from fewer than two organisations).
    """
    predictions = ner_pipeline(text_generator(df), aggregation_strategy="first", batch_size=256)
    results = []
    for entities in tqdm(predictions):
        organisations = list(filter(lambda entity: entity['entity_group'] == 'ORG', entities))
        if len(organisations) < 2:
            organisations = []
        results.append(organisations)
    return results

def create_entity_pairs(df, ner_results):
    """Build one masked example for every pair of entities in every sentence.

    Args:
        df: Sentence frame; each row is passed on to ``create_entity_pair``.
        ner_results: One entity list per row of ``df``, in row order.

    Returns:
        A flat list with one entry per 2-combination of entities per sentence.

    Raises:
        ValueError: If ``df`` and ``ner_results`` differ in length.
    """
    if len(df) != len(ner_results):
        raise ValueError(
            f"Expected one NER result per row: {len(df)} rows, {len(ner_results)} results"
        )
    entity_pairs = []
    rows = (row for _, row in df.iterrows())
    # Pair rows with results by position rather than by index label, so the
    # function also works when the frame's index is not 0..n-1.
    for row, output in tqdm(zip(rows, ner_results), total=len(df)):
        for token_idx_from, token_idx_to in combinations(range(len(output)), r=2):
            entity_pairs.append(create_entity_pair(row, output, token_idx_from, token_idx_to))
    return entity_pairs

def create_entity_pair(row, entities, idx_from, idx_to):
    """Build one masked example for the entity pair ``(idx_from, idx_to)``.

    Every entity span in the sentence is replaced by a placeholder:
    ``__NE_FROM__`` and ``__NE_TO__`` for the selected pair, ``__NE_OTHER__``
    for the rest. The example's label comes from ``determine_label``.

    Returns:
        A dict with keys ``original_text``, ``label``, ``NE_FROM``, ``NE_TO``,
        ``NE_OTHER`` and ``masked_text``.
    """
    sentence = row['original_text']
    masked = row["original_text"]
    ne_from = None
    ne_to = None
    other_words = []

    # Walk the entities from last to first so the character offsets of the
    # entities still to be replaced are not shifted by earlier replacements.
    for position in range(len(entities) - 1, -1, -1):
        entity = entities[position]
        if position == idx_to:
            ne_to = entity['word']
            placeholder = "__NE_TO__"
        elif position == idx_from:
            ne_from = entity['word']
            placeholder = "__NE_FROM__"
        else:
            other_words.append(entity['word'])
            placeholder = "__NE_OTHER__"
        masked = masked[:entity['start']] + placeholder + masked[entity['end']:]

    return {
        "original_text": sentence,
        "label": determine_label(row, ne_from, ne_to),
        "NE_FROM": ne_from,
        "NE_TO": ne_to,
        "NE_OTHER": other_words,
        "masked_text": masked,
    }

def determine_label(row, ne_from, ne_to):
    """Return the relation label for an entity pair found in a sentence.

    The sentence was generated for the triple ``(row['A'], row['B'],
    row['label'])``. A pair matching A -> B gets that label; a pair matching
    B -> A gets the label with 1 and 2 exchanged (other labels are unchanged);
    any other pair gets 0.
    """
    same_direction = ne_from == row['A'] and ne_to == row['B']
    if same_direction:
        return row['label']

    opposite_direction = ne_from == row['B'] and ne_to == row['A']
    if not opposite_direction:
        return 0

    # Labels 1 and 2 encode direction, so they flip when the pair is reversed.
    if row['label'] == 1:
        return 2
    return 1 if row['label'] == 2 else row['label']

In [18]:
# One row per generated sentence: explode the sentence lists, drop rows with
# missing values, and clean the text.
df = (
    pd.DataFrame(stage_two_results)
    .explode("sentences")
    .dropna()
    .reset_index(drop=True)
    .rename(columns={"sentences": "original_text"})
    .apply(strip_parentheses, axis=1)
)

# Detect entities in every sentence and expand them into masked pairs.
ner_pipeline = pipeline("ner", model="dslim/bert-large-NER", device="cuda")
ner_results = process_named_entities(df, ner_pipeline)
entity_pairs = create_entity_pairs(df, ner_results)


def _keep_sentence_group(group):
    """Keep a sentence when no pair has an empty-string entity and at least one pair is labelled."""
    return (
        (group['NE_FROM'] != '').all()
        and (group['NE_TO'] != '').all()
        and (group['label'] != 0).any()
    )


final_df = (
    pd.DataFrame(entity_pairs)
    .groupby('original_text')
    .filter(_keep_sentence_group)
    .reset_index(drop=True)
)
Dataset.from_pandas(final_df).save_to_disk("../../datasets/TwoStageDataset")

Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Saving the dataset (0/1 shards):   0%|          | 0/3551 [00:00<?, ? examples/s]