In [None]:
#!pip install pandas openai torch scikit-learn dvc dvc-s3
#!pip install openpyxl retry python-dotenv

#!dvc pull artifacts/matcher_model.dvc

In [1]:
import os
import sys
import pandas as pd
import openai
import pandas as pd
from glob import glob
from tqdm import tqdm
import json
import ast
from pathlib import Path
import traceback
from dotenv import load_dotenv
load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")



In [2]:
# Where the notebook is running; decides the location of the repository checkout.
machine = "paperspace"

if machine == "local":
    # Locally the notebook is expected to live one level below the repository root.
    src_dir = Path.cwd().parent
elif machine == "paperspace":
    src_dir = Path("/notebooks/inferess-relation-extraction/")
else:
    # Fail here with a clear message rather than with a NameError on `src_dir` below.
    raise ValueError(f"Unknown machine {machine!r}; expected 'local' or 'paperspace'")

# Make the repository importable (needed for the `src.*` imports below).
sys.path.append(str(src_dir))

# NOTE(review): generate_relations_wrapper uses `resort_relation`, `relation_search`
# and `matcher`; they are only available when the lines below are enabled.
# import annotation methods
# from src.labels_generator import (generate_relations,relation_search,  resort_relation, relations_tupled, get_completion, deserialize_relations)

# # Load matcher
# from src.matcher.core import SimCSE_Matcher
# matcher = SimCSE_Matcher(str(src_dir/ 'artifacts/matcher_model'))


In [2]:
# Stage-1 prompt, three-relation variant (supplier_and_customer / financial_trade / nothing).
# `{sentence}` is a literal placeholder: generate_relations_2 fills it with str.replace
# (driven by the `replaces` mapping), so the other braces in the text are left untouched.
# NOTE(review): `prompt_v2_3` is assigned again in a later cell (two-relation variant);
# on a top-to-bottom run that later assignment is the one in effect.
prompt_v2_3 = """Your task is to provide an explanation about the relation between companies and role they represent in the relation from the report given in ``` quote.

- possible relation - {supplier_and_customer, financial_trade, nothing}
- role - {supplier, customer} 
- role is required for supplier_and_customer relation only, Identifying which company is supplier and which company is customer is important.

# General information for different relations
- `supplier_and_customer` : sometimes it is possible to find customer and supplier companies from the report information.
- `financial_trade` : sometimes it is not possible to find exact relation between companies. so we call such relations as financial_trade
- `Nothing` : In case information is incomplete or there nothing to infer, then we call it Nothing relation


##Report
```
{sentence}

```

Here are some definitions that might help to understand how to identify the relation between companies

## Rules for `financial_trade`
- If report mentions collaboration or any joint development without stating customer or supplier role, then companies have financial_trade relation
- If report mentions supply agreement without stating buyer and supplier, then companies have financial_trade relation
- If report mentions purchase agreement without stating buyer and supplier, then companies have financial_trade relation
- Companies engaged in providing financing or investing in another company are in a financial_trade relation
- If companies are involved in the buying and selling of company shares or ownership interests in each other, they are in a financial_trade relation.
- When companies are mentioned in the context of mergers and acquisitions, they have a financial_trade relation.
- Companies participating in the purchase of assets or transactions involving related parties are in a financial_trade relation.
- If companies have a debtor and creditor relationship with each other, they have financial_trade relation.
- Companies conducting transactions for the purpose of managing their working capital have a financial_trade relation.
- Companies engaged in any form of payment, excluding revenue or sales contributions, with another company have financial_trade relation.
 

## Rules for identifying the `supplier_and_customer` relation and assigning role of the customer or supplier -
- Customer companies may be referred as contributing to the revenue stream of supplier companies
- Supplier companies may be referred as companies contribute to purchase of customer companies
- Customer companies may be referred as companies accounting for revenues or net sales of supplier companies
- Customer companies may be referred as companies contributing certain percentage of revenues or net sales of supplier companies in certain year.
- Customer companies are accountable for outstanding payment in trade balance, accounts receivable or amount payable to supplier companies
- Customer companies may be referred to as companies that assign or transfer manufacturing responsibilities to suppliers
- Customer companies may be referred to as companies getting key inputs or materials required in their production process from supplier companies
- Customer companies may be referred to as companies that purchase some product, goods or service from supplier
- Customer companies may be referred to as distributors, users, commercializers, retailers, insurance companies, or similar entities.
- Customer companies could act as a distribution channel. Supplier has original content, and customer companies distribute it via their platform or channel
- Customer company may be referred as paying royalty to another company
- Customer company may be referred as using supplier company's technology or service
- Customer companies may be referred as using license of supplier companies. 
- Supplier companies depend on customer companies to supply or sale of their product
- Supplier companies may be referred to as companies that gain or take revenues from customers
- Supplier companies may be referred as companies whose revenue is increased or decreased or have revenue impact due to customer companies
- Supplier companies receive outstanding payments in trade balance, accounts receivable or amount payable from customer companies
- Supplier companies may be referred to as those who own, operate, or manufacture materials and then sell to customer companies
- Supplier companies may be referred to as vendors or providers of services, products, or materials to customer companies
- Supplier companies may be referred to as giving license to customer company. licensor would be supplier and licensee would be customer company.
- Supplier companies may be referred to as entities that offer discounts, special pricing to customer companies due to their business relationship.
- If a company is identified as a customer based on these rules, the other company transacting with it is considered the supplier.
- If a company is recognized as a supplier, the other company transacting with it is deemed the customer.


## Steps to follow for finding relations and roles - 
1. Use only the information provided in the report
2. Do not infer anything outside of the given context
3. Convert the report into mulitple simple sentences to understand it better and call it `Report explanation`
4. Apply all rules from all relations on the given report and `report explanation`
5. Think step by step to find the correct relation using rules and report information


## Output instructions and format - 
 
- Write the found `Relation` 
- Write the `Relation explanation` for only identified relation as per below instructions
- Skip the `Relation explanation` for unidentified relation
- `Relation explanation` structure  when identified relation is `supplier_and_customer` 
   - Mention the rule that has matched to identify the relation
   - Write short explanation on how rule is matched to the report
   - As supplier_and_customer relation has two roles, mention which company has which role and write a statement about that
   - Always mention name of customer and name of supplier company saying who is supplier and who is customer
   - If more than one customers are identified, then mention all customer company names with respect to supplier company name
   - If more than one supplier are identified, then mention all supplier company names with respect to customer company name 
   - eg. 
    ```
    - Rule :
    - Explanation on how rule is matched :
    - Customer company names :
    - Supplier company names :
    ``` 

- `Relation explanation` structure when identified relation is `financial_trade` or `Nothing`
    - Mention the rule that has matched to identify the `financial_trade` or `Nothing` relation
    - Write short explanation on how rule is matched to the report
    - Write all company names that are in `financial_trade` or `Nothing` relation
    - eg. 
    ```
    - Rule :
    - Explanation on how rule is matched :
    - Companies involved :
    ```

- Output should have sections - `Relation` and `Relation explanation` 
 
"""


In [3]:
# Stage-2 prompt, three-relation variant: turns the stage-1 explanation into a JSON object
# keyed by `supplier_and_customer`, `financial_trade` and `nothing`.
# `{explanation}` is a literal placeholder filled with str.replace in generate_relations_2.
# NOTE(review): `prompt_2` is assigned again in a later cell (two-relation variant);
# on a top-to-bottom run that later assignment is the one in effect.
prompt_2 = '''Your task is to identify the relation and role of company mentioned in the explanation given in ``` quote 

- possible relation - {supplier_and_customer, financial_trade, nothing}
- role - {supplier, customer} 
- role would be mentioned only for supplier_and_customer relation

##Explanation
```
{explanation}

```

## Rules to follow for creating output -
- use only information given in the explanation
- Don't infer anything outside of the given explanation
- for `supplier_and_customer` relation 
     - examine the explanation to find the mentioned customer and supplier company names
     - supplier_and_customer relation holds value only when the identities of the customer and supplier are specified
     - For all supplier and customer pairs, assign customer company name to assign supplier company name
     - As per explanation, find all correct customer and supplier company pairs and add it to the output json
- for `financial_trade` and `Nothing` relation 
     - find all companies mentioned for this relation
     - Create pairs of company names. A pair has two company names in it.
- Output should be strictly in JSON Object for the companies mentioned in the explanation
- If the explanation `supplier_and_customer` relation, then map company names to their role in the output:
    - If company is customer in relation, then it's name is mapped to customer key
    - If company is supplier in relation, then it's name is mapped to supplier key
    - role customer and supplier are keys in json and the values are respective company names.
    - example -  {'supplier_and_customer' : [ {'customer': 'company_name_acting_as_customer', 'supplier': 'company_name_acting_as_supplier'} ] } 
- If two companies are in financial_trade relation, then create list of all such two companies involved in the financial_trade relation
     - example -  { 'financial_trade': [ [ company1_name, company2_name ] ] }
- If two companies are in nothing relation, then create a list of all such two companies involved in the nothing relation
     - example -  { 'nothing': [ [ company1_name, company2_name ] ] } 
- Don't replicate relations
- Output should be strict json object that can be parsed


## output
Return single JSON Object that contains relations with the following format in ``` quote -
```{'supplier_and_customer' : [ {'customer': 'company_acting_as_customer', 'supplier': 'company_acting_as_supplier'} ],
   'financial_trade': [ [company1_name, company2_name] , [company1_name, company2_name] ], 
   'nothing': [ [company1_name, company2_name] , [company1_name, company2_name]]}```

'''

In [3]:
# Stage-1 prompt, two-relation variant (supplier_and_customer / other).
# This assignment replaces the three-relation `prompt_v2_3` defined in an earlier cell.
# `{sentence}` is a literal placeholder: generate_relations_2 fills it with str.replace
# (driven by the `replaces` mapping), so the other braces in the text are left untouched.
prompt_v2_3 = """Your task is to provide an explanation about the relation between companies and role they represent in the relation from the report given in ``` quote.

- possible relation - {supplier_and_customer, other}
- role - {supplier, customer} 
- role is required for supplier_and_customer relation only, Identifying which company is supplier and which company is customer is important.

# General information for different relations
- `supplier_and_customer` : sometimes it is possible to find customer and supplier companies from the report information.
- `other` : sometimes it is not possible to find exact relation between companies. so we call such relations as other


##Report
```
{sentence}

```

Here are some definitions that might help to understand how to identify the relation between companies

## Rules for `other`
- sentence only mentions agreement changes, renewal , expiration or amendment
- sentence only mentions companies are in supply, sale, purchase or service agreement and nothing else.
- sentence only mentions companies are entering into agreement or details of existing agreement
- sentence only mentions companies are in the collaboration, join development or partnership agreement
- sentence mentions employment agreement
- sentence mentions changes in and disagreements with accountants


## Rules for identifying the `supplier_and_customer` relation and assigning role of the customer or supplier -
- customer companies receives the services, products from the supplier company as per agreement
- Suppliers companies have obtained contracts from customers to provide product and services
- Supplier companies receive royalties from customer companies
- Supplier companies receive milestone payments from customer companies
- Customer companies may be referred to as the source of revenue for the supplier company


## Steps to follow for finding relations and roles - 
1. Use only the information provided in the report
2. Do not infer anything outside of the given context
3. Convert the report into mulitple simple sentences to understand it better and call it `Report explanation`
4. Apply all rules from all relations on the given report and `report explanation`
5. Think step by step to find the correct relation using rules and report information


## Output instructions and format - 
 
- Write the found `Relation` 
- Write the `Relation explanation` for only identified relation as per below instructions
- Skip the `Relation explanation` for unidentified relation
- `Relation explanation` structure  when identified relation is `supplier_and_customer` 
   - Mention the rule that has matched to identify the relation
   - Write short explanation on how rule is matched to the report
   - As supplier_and_customer relation has two roles, mention which company has which role and write a statement about that
   - Always mention name of customer and name of supplier company saying who is supplier and who is customer
   - If more than one customers are identified, then mention all customer company names with respect to supplier company name
   - If more than one supplier are identified, then mention all supplier company names with respect to customer company name 
   - eg. 
    ```
    - Rule :
    - Explanation on how rule is matched :
    - Customer company names :
    - Supplier company names :
    ``` 

- `Relation explanation` structure when identified relation is `other`
    - Mention the rule that has matched to identify the `other` relation
    - Write short explanation on how rule is matched to the report
    - Write all company names that are in `other` relation
    - eg. 
    ```
    - Rule :
    - Explanation on how rule is matched :
    - Companies involved :
    ```

- Output should have sections - `Relation` and `Relation explanation` 
 
"""


In [4]:
# Stage-2 prompt, two-relation variant: turns the stage-1 explanation into a JSON object
# keyed by `supplier_and_customer` and `other`.
# This assignment replaces the three-relation `prompt_2` defined in an earlier cell.
# `{explanation}` is a literal placeholder filled with str.replace in generate_relations_2.
prompt_2 = '''Your task is to identify the relation and role of company mentioned in the explanation given in ``` quote 

- possible relation - {supplier_and_customer, other}
- role - {supplier, customer} 
- role would be mentioned only for supplier_and_customer relation

##Explanation
```
{explanation}

```

## Rules to follow for creating output -
- use only information given in the explanation
- Don't infer anything outside of the given explanation
- for `supplier_and_customer` relation 
     - examine the explanation to find the mentioned customer and supplier company names
     - supplier_and_customer relation holds value only when the identities of the customer and supplier are specified
     - For all supplier and customer pairs, assign customer company name to assign supplier company name
     - As per explanation, find all correct customer and supplier company pairs and add it to the output json
- for `other` relation 
     - find all companies mentioned for this relation
     - Create pairs of company names. A pair has two company names in it.
- Output should be strictly in JSON Object for the companies mentioned in the explanation
- If the explanation `supplier_and_customer` relation, then map company names to their role in the output:
    - If company is customer in relation, then it's name is mapped to customer key
    - If company is supplier in relation, then it's name is mapped to supplier key
    - role customer and supplier are keys in json and the values are respective company names.
    - example -  {'supplier_and_customer' : [ {'customer': 'company_name_acting_as_customer', 'supplier': 'company_name_acting_as_supplier'} ] } 
- If two companies are in `other` relation, then create list of all such two companies involved in the other relation
     - example -  { 'other': [ [ company1_name, company2_name ] ] }
- Don't replicate relations
- Output should be strict json object that can be parsed


## output
Return single JSON Object that contains relations with the following format in ``` quote -
```{'supplier_and_customer' : [ {'customer': 'company_acting_as_customer', 'supplier': 'company_acting_as_supplier'} ],
   'other': [ [company1_name, company2_name] , [company1_name, company2_name]]}```

'''

In [8]:
# Maps a data column name to the placeholder it fills in the stage-1 prompt:
# generate_relations_2 replaces each placeholder (value) with the row's value for the column (key).
replaces = {"sentence": "{sentence}"}
# Replace the keys with values for unified relation direction
# (passed to resort_relation; its values are also used as `main_relations` for relation_search).
relations_map = {"customer": "supplier"}

In [5]:
import re
import json
import time
from typing import Tuple, List, Text, Dict
from collections import defaultdict
from tqdm import tqdm
from itertools import chain
from copy import copy
from retry import retry

@retry(tries=3, delay=1)
def get_completion_2(prompt:Text,
                        temperature:float=0,
                        model="gpt-3.5-turbo")->str:
    """Send `prompt` as a single user message and return the text of the first choice.

    Args:
        prompt: Full prompt text, sent as the only message of the conversation.
        temperature: Degree of randomness of the model's output.
        model: Name of the chat model to query.

    Returns:
        The `content` of the first returned choice.

    The call uses a 90 second request timeout and is wrapped in
    `@retry(tries=3, delay=1)`; an error raised by the API call propagates to the
    caller once the decorator stops retrying.
    """
    chat_response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        request_timeout=90,
    )
    first_choice = chat_response.choices[0]
    return first_choice.message["content"]

def generate_relations_2(data: pd.DataFrame,
                       prompt_1: Text,
                       prompt_2: Text,
                       replaces: dict,
                       relations_map:dict)-> pd.DataFrame:
    """Run the two-stage prompt pipeline on every row of `data`.

    Stage 1 fills `prompt_1` with the row's values and asks the model for an
    explanation. Stage 2 puts that explanation into `prompt_2`, asks for a JSON
    answer, and converts the answer into relation triples.

    Args:
        data: Input rows. Must contain a "sentence" column, which is used as the
            fallback explanation when stage 1 fails.
        prompt_1: Stage-1 prompt template.
        prompt_2: Stage-2 prompt template containing the literal "{explanation}".
        replaces: Maps column name -> placeholder text to substitute in `prompt_1`.
        relations_map: Forwarded unchanged to `relations_tupled_2`.

    Returns:
        A new DataFrame with the input columns plus:
        - "explanation": stage-1 completion, or the sentence if stage 1 failed;
        - "relation_completion": raw stage-2 completion (absent if every call failed);
        - "relations": list of [entity, relation, entity] triples, or
          [["NotParsed", "nothing", "NotParsed"]] when both stage-2 attempts failed.
    """
    # One initial attempt plus a single retry for stage 2.
    max_attempts = 2
    output = []

    for _, row in tqdm(data.iterrows(), total=data.shape[0], desc="Generating relations"):
        row = row.to_dict()

        report_prompt_1 = prompt_1
        for column, placeholder in replaces.items():
            report_prompt_1 = report_prompt_1.replace(placeholder, row.get(column))

        # Stage 1: on failure fall back to the raw sentence. The fallback is assigned
        # to the same variable stage 2 reads, so stage 2 never sees an unbound name
        # or the previous row's explanation.
        try:
            prompt_1_completion = get_completion_2(prompt=report_prompt_1)
        except Exception:
            print("prompt 1 explanation failed")
            prompt_1_completion = row["sentence"]
        row['explanation'] = prompt_1_completion

        # Stage 2: stop at the first attempt that parses, so a successful retry
        # ends the loop instead of triggering further API calls.
        for attempt in range(1, max_attempts + 1):
            try:
                report_prompt_2 = prompt_2.replace("{explanation}", prompt_1_completion)
                prompt_2_completion = get_completion_2(prompt=report_prompt_2)
                row["relation_completion"] = prompt_2_completion
                relations_list = deserialize_json_dict(prompt_2_completion)
                row['relations'] = relations_tupled_2(relations_list, relations_map)
                break
            except Exception:
                if attempt == max_attempts:
                    row['relations'] = [["NotParsed", "nothing", "NotParsed"]]
                    print("Not retrying after 2nd error")

        output.append(row)

    return pd.DataFrame(output)

def deserialize_json_dict(json_dict_str):
    """Parse a model reply that should contain one JSON (or Python-literal) object.

    The reply is tried, in order, as:
      1. JSON exactly as given;
      2. JSON after turning quoting single quotes into double quotes (apostrophes
         inside words, such as "McDonald's", are left alone);
      3. the same as 2 after escaping the double quotes already present;
      4. a Python literal via `ast.literal_eval`.
    A surrounding Markdown code fence (``` or ```json) is removed first, because
    the prompt asks the model to answer inside a ``` quote.

    Args:
        json_dict_str: Raw completion text.

    Returns:
        The parsed object (normally a dict).

    Raises:
        ValueError / SyntaxError: from `ast.literal_eval` when no strategy can
        parse the text.
    """
    if isinstance(json_dict_str, str):
        fenced = re.match(r"^\s*```(?:json)?\s*(.*?)\s*```\s*$",
                          json_dict_str,
                          flags=re.DOTALL | re.IGNORECASE)
        if fenced:
            json_dict_str = fenced.group(1)

    # A single quote that is not sitting between two word characters.
    quote_pattern = r"(?<!\w)'|'(?!\w)"
    normalisers = (
        lambda text: text,
        lambda text: re.sub(quote_pattern, '"', text),
        lambda text: re.sub(quote_pattern, '"', text.replace('"', '\\"')),
    )

    for normalise in normalisers:
        try:
            parsed = json.loads(normalise(json_dict_str))
        except Exception:
            # This strategy does not apply; try the next one.
            continue
        if parsed:
            return parsed

    # Last resort; lets the parsing error propagate to the caller.
    return ast.literal_eval(json_dict_str)

def deserialize_json_list(ser_relations):
    """Extract every flat `{...}` object from a model reply and parse each one.

    Objects are located with a regular expression that matches braces with no
    nested braces inside. Each match is tried as JSON exactly as written, then
    with quoting single quotes turned into double quotes, then the same after
    escaping existing double quotes. Trying the raw text first keeps valid JSON
    intact when a value contains an apostrophe next to a space or punctuation
    (e.g. "suppliers' parts"), which the quote rewrite would otherwise corrupt.

    Args:
        ser_relations: Raw completion text.

    Returns:
        List of parsed dicts in order of appearance. Objects that cannot be
        parsed by any strategy are skipped.
    """
    # A single quote that is not sitting between two word characters.
    quote_pattern = r"(?<!\w)'|'(?!\w)"
    list_of_dicts = []

    for dict_string in re.findall(r"\{[^{}]+\}", ser_relations):
        candidates = (
            dict_string,
            re.sub(quote_pattern, '"', dict_string),
            re.sub(quote_pattern, '"', dict_string.replace('"', '\\"')),
        )
        for candidate in candidates:
            try:
                list_of_dicts.append(json.loads(candidate))
                break
            except ValueError:
                # json.JSONDecodeError is a ValueError; move on to the next form.
                continue

    return list_of_dicts

def relations_tupled_2(relations_list:dict, relations_map:dict):
    """Convert the parsed stage-2 answer into a list of relation triples.

    Args:
        relations_list (dict): Parsed model answer. Recognised keys:
            - 'supplier_and_customer': list of dicts with 'customer' and 'supplier';
            - 'financial_trade', 'nothing', 'other': lists of two-element
              [company, company] pairs.
        relations_map (dict): Accepted for interface compatibility; not used here.

    Returns:
        list: Triples of the form [entity, relation, entity].
        Supplier/customer entries are always emitted supplier-first as
        [supplier, 'supplier', customer]. Pair entries become
        [company1, <key>, company2]. Entries with a missing or empty member, and
        pairs whose length is not 2, are skipped.
    """
    relations_tuples = []

    for relation in relations_list.get('supplier_and_customer', []):
        relation_tuple = [relation.get('supplier'), 'supplier', relation.get('customer')]
        if all(relation_tuple):
            relations_tuples.append(relation_tuple)

    # 'other' is the pair relation of the two-relation prompt; the first two keys
    # belong to the three-relation prompt.
    for label in ('financial_trade', 'nothing', 'other'):
        for relation in relations_list.get(label, []):
            if len(relation) != 2:
                continue
            relation_tuple = [relation[0], label, relation[1]]
            if all(relation_tuple):
                relations_tuples.append(relation_tuple)

    return relations_tuples


def is_rd_conflict(align, query_relation: Tuple[str, str, str],relations_tuples: List[Tuple[str, str, str]]):
    """Report a relation-direction conflict for a row that did not align.

    Returns True only when all of the following hold:
      - `align` is falsy;
      - `query_relation` is non-empty and its relation (index 1) is
        "supplier" or "customer";
      - at least one triple in `relations_tuples` also has "supplier" or
        "customer" as its relation.
    Otherwise returns False.
    """
    role_labels = ("supplier", "customer")

    if align or not query_relation or query_relation[1] not in role_labels:
        return False

    return any(candidate[1] in role_labels for candidate in relations_tuples)

In [8]:
def generate_relations_wrapper(data, prompt_v2_3, prompt_2, replaces, relations_map):
    """Generate relations for `data` and compare them with the reference relations.

    Requires `resort_relation`, `relation_search` and a `matcher` object to be
    defined in the notebook namespace (their imports are commented out in the
    setup cell), and `data` to carry a 'sme_relations' column.

    Args:
        data: Input DataFrame passed to `generate_relations_2`.
        prompt_v2_3: Stage-1 prompt template.
        prompt_2: Stage-2 prompt template.
        replaces: Column -> placeholder mapping for the stage-1 prompt.
        relations_map: Passed to `resort_relation`; its values are used as
            `main_relations` for `relation_search`.

    Returns:
        The DataFrame from `generate_relations_2` with 'sme_relations' re-sorted
        and two added columns: 'align' (result of `relation_search`) and
        'rd_conflict' (result of `is_rd_conflict`). An empty result is returned
        as-is.
    """
    # run prompts to find relations
    output = generate_relations_2(data, prompt_v2_3, prompt_2, replaces, relations_map)

    # Nothing to compare; also avoids `.iloc[0]` on an empty frame.
    if output.empty:
        return output

    # Columns read back from Excel hold the string form of a list. Parse them with
    # ast.literal_eval, which accepts literals only, instead of eval, which would
    # execute whatever a spreadsheet cell contains.
    for column in ('sme_relations', 'relations'):
        if not isinstance(output[column].iloc[0], list):
            tqdm.pandas(desc="Eval string of list")
            output[column] = output[column].progress_apply(ast.literal_eval)

    # Resort the sme_relations to unify the relations directions
    tqdm.pandas(desc="Resort sme relations")
    output['sme_relations'] = output['sme_relations'].progress_apply(
        lambda rel: resort_relation((rel[0], rel[1], rel[2]), relations_map))

    # Search relations and return mask. Row values are read by column label;
    # integer keys on a label-indexed Series are deprecated in recent pandas.
    tqdm.pandas(desc="Search relations")
    output['align'] = output[['sme_relations', 'relations']].progress_apply(
        lambda row: relation_search(
            query_relation=row['sme_relations'],
            relations_tuples=row['relations'],
            matcher=matcher,
            threshold=0.85,
            main_relations=list(relations_map.values())),
        axis=1).to_list()

    output["rd_conflict"] = output[["align", "sme_relations", "relations"]].apply(
        lambda row: is_rd_conflict(row["align"], row["sme_relations"], row["relations"]),
        axis=1)

    return output


In [12]:
data = pd.read_excel("./test_pipeline_data/labelled_data_for_prompt/agreement_neg_labelled_data.xlsx")
#neg_data_df = pd.read_excel("./test_pipeline_data/labelled_data_for_prompt/agreement_neg_labelled_data.xlsx")

In [13]:
# Generate list of relations in tuples
output = generate_relations_2(data, prompt_v2_3, prompt_2, replaces, relations_map)

Generating relations: 100%|██████████| 49/49 [02:53<00:00,  3.55s/it]


In [11]:
output.to_excel("/notebooks/data_to_download/agreement_neg_labelled_data_old_p.xlsx", index = False)


In [None]:

# Run the full pipeline on the training rows whose `inf_relations` value is "other"
# and store the result next to the input workbook.
data = pd.read_excel("./../conflicts_in_label1/Inferess_train_data.xlsx")
data = data.loc[data["inf_relations"] == "other"]

output = generate_relations_wrapper(
    data,
    prompt_v2_3,
    prompt_2,
    replaces,
    relations_map,
)

file_path = '.././conflicts_in_label1/llm_relations_v2_3_other_relation.xlsx'
# index=False: the row index is not written as an extra column.
output.to_excel(file_path, index=False)


In [10]:
data = pd.read_excel("./../conflicts_in_label1/Inferess_train_data_label_0.xlsx")
data[data["Label"]==0].shape


(2087, 8)

In [16]:
# How the per-batch workbooks were produced (kept for reference, not executed):
# for start_idx in range(100, 3700, 100):

#     data = pd.read_excel("../huge_labelled_data/huge_train_dedup_80_lg_sent.xlsx")
#     data = data.drop('duplicate_sentences', axis=1)

#     data = data[start_idx:start_idx+100]

#     output = generate_relations_wrapper(data, prompt_v2_3, prompt_2, replaces, relations_map)

#     file_path = f'../huge_labelled_data/huge_train_lg_sent_llm_v2_3_batches/huge_train_lg_sent_llm_v2_3_{start_idx}_{start_idx+100}.xlsx'
#     # # Save the DataFrame to Excel
#     output.to_excel(file_path, index=False)  # Set index=False if you don't want to save the index as a separate column


# Read the 100-row batch workbooks back, one per start index.
df_arr = []

for start_idx in range(0, 3700, 100):
    batch_path = f"../huge_labelled_data/huge_train_lg_sent_llm_v2_3_batches/huge_train_lg_sent_llm_v2_3_{start_idx}_{start_idx+100}.xlsx"
    df_arr.append(pd.read_excel(batch_path))

# Stack the batches and order the rows by the 'index' column.
output = pd.concat(df_arr, ignore_index=True).sort_values(by=['index'])

# output = output[(output["align"] == False) & (output["rd_conflict"] == False)]

file_path = '../huge_labelled_data/huge_train_lg_sent_llm_v2_3.xlsx'
output.to_excel(file_path, index=False)


In [None]:
# Export the rows that did not align. The subset gets its own name so that `output`
# keeps every row: the metrics cell that follows counts aligned rows from `output`,
# and would always report TP = 0 if `output` were overwritten with this subset.
output_not_aligned = output[(output["align"] == False)]
file_path = '../huge_labelled_data/huge_train_lg_sent_llm_v2_3_not_aligned.xlsx'

# Save the DataFrame to Excel; index=False keeps the index out of the sheet.
output_not_aligned.to_excel(file_path, index=False)


In [None]:
# v2_3 conflicts from 100 conflict v2_2 :: p13
# Row counts taken from `output`; TP / FP / FN are this notebook's own labels for the buckets below.

# Total
print(f"Total :: {output.shape[0]}" )

# TP :: Align true (Higher the better)
print(f"TP :: {output[(output['align'] == True)].shape[0]}" )

# FP :: rd_conflict true, i.e. rows flagged as a relation-direction conflict (lower the better)
print(f"FP :: {output[(output['rd_conflict'] == True)].shape[0]}" )

# FN :: Align false, but no RD conflict (Higher the better)
print(f"FN :: {output[(output['rd_conflict'] == False) & (output['align'] == False)].shape[0]}")

In [None]:
output.columns

In [None]:
#correct relations in label=0

# data = pd.read_excel("./../conflicts_in_label1/Inferess_train_data.xlsx")
# data = data[data["Label"]==0]
# data.shape

# def correct_sme_relation(entity_1, entity_2, inf_relations ):
#     if inf_relations == "customer":
#         return [entity_1, "supplier" , entity_2]
#     elif inf_relations == "supplier":
#         return [entity_2, "supplier" , entity_1]
#     else:
#         return []

# # correct_sme_relation
# tqdm.pandas(desc="Search relations")
# data['sme_relations'] =\
# data[['entity_1', 'entity_2', 'inf_relations']]\
# .progress_apply(lambda x:
# correct_sme_relation(
# entity_1= x[0],
# entity_2=x[1],
# inf_relations=x[2]),axis=1).to_list()

# file_path = f'.././conflicts_in_label1/Inferess_train_data_label_0.xlsx'
# # Save the DataFrame to Excel
# data.to_excel(file_path, index=False)  # Set index=False if you don't want to save the index as a separate column


In [None]:

# output[output["align_v2.1"] == False].shape
# output[(output["align_v2"] == False) & (output["align_v2.1"] == False)].shape
# output[(output["align_v2"] == False) & (output["align_v2.1"] == True)].shape
# output[(output["align_v2"] == True) & (output["align_v2.1"] == False)].shape
# output[(output["align_v2"] == False) & (output["rd_conflict_v2"] == True)].shape
# output[(output["align_v2.1"] == False) & (output["rd_conflict_v2.1"] == True)].shape
# output[(output["rd_conflict_v2"] == True)].shape
# output[(output["rd_conflict_v2.1"] == True)].shape
# output[(output["rd_conflict_v2"] == True) & (output["rd_conflict_v2.1"] == False)].shape
# output[(output["rd_conflict_v2.1"] == True) & (output["rd_conflict_v2"] == True)].shape


#                                         align_v2=False   align_v2.1=False   
# all                                          261              283
# rd_conflict=True                             145              73
# other_prompt_dont_have_rd_conflict           110              35



## Generate new sentences using LLM  

In [2]:

# Concept classes used for sentence generation. Each class lists the concept names
# that belong to it, split into a first and a second preference list.
concept_class_dict = {
    'legal_and_regulatory': {
        '1st_preference': [
            'government regulations',
            'legal proceedings',
            'regulatory compliance',
        ],
        '2nd_preference': [],
    },
    'royalties': {
        '1st_preference': ['royalties'],
        '2nd_preference': [],
    },
    'licensing_and_ip': {
        '1st_preference': [
            'exclusive rights or agreement',
            'licensing agreement',
            'intellectual property',
            'product licensing',
            'product marketing',
        ],
        '2nd_preference': ["research and development"],
    },
    'real_estate': {
        '1st_preference': [
            'lease agreement',
            'lease transactions',
            'real estate lease',
        ],
        '2nd_preference': [],
    },
    'supply_purchase_agreement': {
        '1st_preference': ['supply_purchase_agreement'],
        '2nd_preference': [],
    },
    'services agreement': {
        '1st_preference': ['services agreement'],
        '2nd_preference': [],
    },
    'agreement_and_partnership': {
        '1st_preference': [
            'contracts in business',
            'partnership with another company',
            'joint venture or development',
            'collaboration agreement',
        ],
        '2nd_preference': [
            'agreement with another company',
            'changes in agreement',
        ],
    },
    'product_related': {
        '1st_preference': [
            'product manufacturing',
            'outsourcing operation',
            'production or operation',
            'product pricing and cost',
            'product description',
        ],
        '2nd_preference': [],
    },
    'supply_chain': {
        '1st_preference': [
            'supply channels',
            'supply chain',
            'product distributors',
            'distribution agreement',
            'purchase of products',
        ],
        '2nd_preference': [],
    },
    'investment_related': {
        '1st_preference': [
            'mergers or acquisitions',
            'financing or loans',
            'stock ownership',
            'investments in another company',
            'sale of assets',
            'buy or sale of shares',
            'acquisition',
            'acquisition of another company',
        ],
        '2nd_preference': [],
    },
    'revenue': {
        '1st_preference': [
            'sources of revenue',
            'contributors to revenue',
            'yearly revenue numbers',
            'impact on revenue',
            'sales performance',
            'revenue recognition',
            'potential earnings',
            'receivable or payable amount to accounts',
            'outstanding payment',
            'milestone payments',
        ],
        '2nd_preference': ['dependency on customers'],
    },
    'financial_statements': {
        '1st_preference': ['financial statements'],
        '2nd_preference': [],
    },
    'unknown': {
        '1st_preference': [],
        '2nd_preference': [],
    },
}

In [6]:
# Flatten the first-preference concepts of every class into one list, in dictionary
# order; second-preference concepts are left out.
concept_class_list = [
    concept
    for v in concept_class_dict.values()
    for concept in v["1st_preference"]
]

len(concept_class_list)

47

In [31]:
sentence_gen_prompt = """You task is to generate sentences related to "{input_concept_name}" for training the  "supplier_customer" classifier.  

## Label 1 or 0 rules - 
- classifier predicts 1 when there is supplier / customer relation due to "{input_concept_name}" relation 
- supplier / customer relation should be very clear to human understanding on just generated sentence 
- Classifier predicts 0 when there is no supplier / customer relation between two companies even when sentence talks about "{input_concept_name}" 

Understand all rules mentioned below and then generate sentences 

## Rules to generate sentence
1. Generate the sentence related to  "{input_concept_name}"  concept only 
2. for label 1 sentence, two companies need to have clear  supplier / customer relation
2. for label 0 also, sentence has to be related to  "{input_concept_name}"  concept only 
3. Generate very short and simple sentences. 
4. Create sentences with two companies are mentioned
5. Use any random valid company names that you know in generated sentences
6. Create diverse examples for label 1 and label 0
7. Generate only 10 sentences, 5 for label 1 and 5 for label 0 
8. Also generate the short reason for label 1 or 0 

# Output rules
- Write output in json object with three keys - `sentence`, `label`, `reason_for_label`
- Output should be strict json object that can be parsed

## output
Return JSON list of json objects with following format given in ``` quote -
```[
    {'sentence' : 'generated sentence' ,
     'label': '1 or 0'
     'reason_for_label': 'one line reason why label 1 or 0'
    }
   ]```

"""

In [14]:
generate_sentences_op = []

In [38]:
def generate_sentences(prompt_1: Text,
                       concept_class_list: List[Text],
                       gpt_model_name: Text)-> List[Dict]:
    """Ask the model for labelled example sentences for each concept.

    Args:
        prompt_1: Prompt template containing the literal "{input_concept_name}".
        concept_class_list: Concept names; one completion is requested per concept.
        gpt_model_name: Model name forwarded to `get_completion_2`.

    Returns:
        All dicts parsed from the completions, in concept order. The same list is
        also stored in the module-level `generate_sentences_op`, which is reset at
        the start of every call, so partial results remain available if the run is
        interrupted.
    """
    global generate_sentences_op
    generate_sentences_op = []

    # Iterate over the concept_class_list
    for concept in tqdm(concept_class_list, total=len(concept_class_list), desc="Generating sentences"):
        # Create a prompt for the concept
        prompt = prompt_1.replace("{input_concept_name}", concept)

        # Reset per concept so the failure report below never reads an unbound
        # name or the completion of a previous concept.
        prompt_completion = None
        try:
            prompt_completion = get_completion_2(prompt=prompt,
                                                 model=gpt_model_name)
            sentences = deserialize_json_list(prompt_completion)
        except Exception:
            if prompt_completion is not None:
                print(prompt_completion)
            print(f"prompt completion failed for concept {concept}")
            sentences = []

        # Append the parsed sentences for this concept
        generate_sentences_op.extend(sentences)

    return generate_sentences_op


[]

In [64]:
# Generate sentences for every concept currently in `concept_class_list`.
# To smoke-test a single concept, pass a slice such as `concept_class_list[:1]`.
generated_sentences_list = generate_sentences(sentence_gen_prompt, 
                                               concept_class_list,
                                               "gpt-3.5-turbo" # "gpt-4"
                                               )


Generating sentences: 100%|██████████| 2/2 [01:35<00:00, 47.85s/it]


In [65]:
# One row per generated sentence dict.
generated_sentences_df = pd.DataFrame.from_records(generated_sentences_list)

# The model returns the label as text; store it as an integer column.
label_as_int = generated_sentences_df["label"].astype(int)
generated_sentences_df["label"] = label_as_int

generated_sentences_df.to_excel("generated_sentences.xlsx", index=False)

In [66]:
generated_sentences_df = pd.read_excel("generated_sentences.xlsx")