### Imports

In [None]:
#!pip install pandas openai torch scikit-learn dvc dvc-s3
#!pip install openpyxl retry python-dotenv

#!dvc pull artifacts/matcher_model.dvc

In [2]:
import os
import sys
import pandas as pd
import openai
import pandas as pd
from glob import glob
from tqdm import tqdm
import json
import ast
from pathlib import Path
import traceback
from dotenv import load_dotenv
import re
import time
from typing import Tuple, List, Text, Dict
from collections import defaultdict
from itertools import chain
from copy import copy

from retry import retry

# Load variables from a local .env file (python-dotenv) before the key is read.
load_dotenv()

# The OpenAI key comes from the environment; it is None if the variable is unset.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Put the directory two levels above the working directory on sys.path so the
# `src` package imported below can be resolved. This must run before those
# imports.
current_path = Path.cwd()
src_dir = current_path.parent.parent
sys.path.append(str(src_dir))

# import annotation methods
from src.labels_generator import (relation_search, resort_relation)

# Load the matcher from the `artifacts/matcher_model` directory under src_dir.
from src.matcher.core import SimCSE_Matcher
matcher = SimCSE_Matcher(str(src_dir/ 'artifacts/matcher_model'))


  from .autonotebook import tqdm as notebook_tqdm


### Correct sme_relations

In [20]:
# Read the "pos" and "neg" tabs of the labelled workbook in a single call;
# passing a list of sheet names returns a dict keyed by sheet name.
agreement_sheets = pd.read_excel(
    "labelled_data_for_prompt/agreement.xlsx",
    sheet_name=["pos", "neg"],
)
pos_data = agreement_sheets["pos"]
neg_data = agreement_sheets["neg"]

print(pos_data.columns)


Index(['index', 'sentence', 'Label', 'org_groups', 'inf_relations', 'entity_1',
       'entity_2', 'sme_relations', 'earlier_llm_relations', 'concepts',
       'concept explanation', 'concept_class', 'Comment'],
      dtype='object')


In [16]:
def correct_sme_relation(entity_1, entity_2, inf_relations ):
    """Build a ``[head, relation, tail]`` triple from an entity pair and its label.

    * ``"customer"`` -> ``[entity_1, "supplier", entity_2]``
    * ``"supplier"`` -> ``[entity_2, "supplier", entity_1]``
    * ``"other"``    -> ``[entity_2, "other", entity_1]``
    * anything else  -> ``[]``

    Note that "customer" is rewritten as a "supplier" triple with the entities
    in the opposite order to the "supplier" case.
    """
    if inf_relations == "customer":
        return [entity_1, "supplier", entity_2]

    # The remaining known labels keep their own name and share one ordering.
    for label in ("supplier", "other"):
        if inf_relations == label:
            return [entity_2, label, entity_1]

    return []


In [None]:
# Rebuild `sme_relations` on both labelled sheets from each row's entity pair
# and inferred relation. The three selected columns are read by position.
for labelled_frame in (pos_data, neg_data):
    tqdm.pandas(desc="correct_sme_relation")
    relation_inputs = labelled_frame[['entity_1', 'entity_2', 'inf_relations']]
    labelled_frame['sme_relations'] = relation_inputs.progress_apply(
        lambda record: correct_sme_relation(
            entity_1=record.iloc[0],
            entity_2=record.iloc[1],
            inf_relations=record.iloc[2]),
        axis=1,
    ).to_list()


In [65]:
# Write both dataframes to a single Excel file

# with pd.ExcelWriter('labelled_data_for_prompt/agreement.xlsx') as writer:
#     pos_data.to_excel(writer, sheet_name='pos', index=False)
#     neg_data.to_excel(writer, sheet_name='neg', index=False)    

### llm relations

In [64]:
# replaces = {"sentence": "{sentence}"}
# # Replace the keys with values for unified relation direction
# relations_map = {"customer": "supplier"}

In [46]:
@retry(tries=3, delay=1)
def get_completion_2(prompt:Text,
                        temperature:float=0,
                        model="gpt-3.5-turbo")->str:
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text, sent as the only message with role "user".
        temperature: Degree of randomness of the model's output.
        model: Chat model name passed to the API.

    Returns:
        The ``content`` of the first choice in the response.

    The call uses a 90 second request timeout. The ``retry`` decorator is
    configured with ``tries=3, delay=1``, so a failing call is re-attempted
    before the exception reaches the caller.
    """
    chat_response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        request_timeout=90,
    )

    first_choice = chat_response.choices[0]
    return first_choice.message["content"]

global sent_relations_out_star
sent_relations_out_star = []

def classify_relation(data: pd.DataFrame,
                      prompt_1: Text,
                      batch_size: int = 10) -> List[Dict]:
    """Send the sentences of ``data`` to the LLM in batches and collect its answers.

    Each row contributes one line ``"<index>    <sentence>"`` (taken from the
    ``index`` and ``sentence`` columns). Once ``batch_size`` lines are
    collected, or the last row is reached, the lines are joined with newlines
    and substituted for the ``{sentences}`` placeholder in ``prompt_1``. The
    completion is parsed with ``deserialize_json_list``.

    A failing request is attempted one more time; if the second attempt also
    fails the batch is skipped and contributes nothing to the result.

    Args:
        data: Frame with ``index`` and ``sentence`` columns.
        prompt_1: Prompt template containing the ``{sentences}`` placeholder.
        batch_size: Number of sentences per request (must be >= 1).

    Returns:
        A flat list with the dictionaries parsed from every completion. The
        same list is also published as the module-level global
        ``sent_concepts_out_star`` so partial results survive an interrupted
        run.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    global sent_concepts_out_star
    sent_concepts_out_star = []

    sentences_in_batch = []
    # Track the position rather than the index label so the final partial
    # batch is flushed exactly once, even when index labels repeat.
    last_position = data.shape[0] - 1

    rows = tqdm(data.iterrows(), total=data.shape[0], desc="Generating concepts")
    for position, (_, row) in enumerate(rows):
        row = row.to_dict()
        sentences_in_batch.append(f"{row['index']}    {row['sentence']}")

        # continue till batch fills
        if len(sentences_in_batch) < batch_size and position != last_position:
            continue

        sentences_txt = "\n".join(sentences_in_batch)
        sentences_in_batch = []
        report_prompt_1 = prompt_1.replace("{sentences}", sentences_txt)

        sent_concept_list = []
        for attempt in range(2):
            try:
                prompt_1_completion = get_completion_2(prompt=report_prompt_1)
                sent_concept_list = deserialize_json_list(prompt_1_completion)
            except Exception:
                # Catch Exception only, so Ctrl-C still interrupts a long run.
                if attempt == 0:
                    print("Retrying after 1st error")
                else:
                    print("Not retrying after 2nd error")
            else:
                # Stop on success. Without this, a successful retry would keep
                # re-requesting the same batch until a call failed, and the
                # good answer would then be discarded.
                break

        if sent_concept_list:
            sent_concepts_out_star.extend(sent_concept_list)

    return sent_concepts_out_star


def deserialize_json_list(ser_relations):
    """Extract every flat ``{...}`` object from a text reply and parse it.

    The text is scanned for brace pairs that contain no nested braces. Each
    match is parsed with ``json.loads``, trying in order:

    1. the text exactly as written (already valid JSON);
    2. the text with single quotes that are not inside a word turned into
       double quotes (Python-style ``{'a': 'b'}``);
    3. the same quote rewrite after escaping existing double quotes
       (single-quoted values that contain double quotes).

    Trying the raw text first matters: otherwise a valid JSON object whose
    value has an apostrophe at a word boundary (``"suppliers' deal"``) would
    be corrupted by the quote rewrite and dropped.

    Args:
        ser_relations: String that may contain zero or more serialized objects.

    Returns:
        List of the dictionaries that could be parsed, in order of appearance.
        Objects that cannot be parsed by any strategy are skipped.
    """
    # regular expression to match a dictionary (no nested braces)
    dict_regex = r"\{[^{}]+\}"
    # a single quote that does not have a word character on both sides
    quote_regex = r"(?<!\w)'|'(?!\w)"

    list_of_dicts = []
    for dict_string in re.findall(dict_regex, ser_relations):
        candidates = (
            dict_string,
            re.sub(quote_regex, '"', dict_string),
            re.sub(quote_regex, '"', dict_string.replace('"', '\\"')),
        )
        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except (ValueError, RecursionError):
                # json.JSONDecodeError is a ValueError; try the next strategy.
                continue
            list_of_dicts.append(parsed)
            break
    return list_of_dicts


def find_relation_wrapper(data, prompt):
    """Classify every row of ``data`` with ``prompt`` and attach the answer as ``cs_other``.

    Args:
        data: DataFrame with the ``index`` and ``sentence`` columns that
            ``classify_relation`` reads.
        prompt: Prompt template forwarded unchanged to ``classify_relation``.

    Returns:
        A copy of ``data`` (same row index) with a ``cs_other`` column holding
        the ``relation`` value returned for that row's ``index``, or ``None``
        when no answer for the row came back.
    """
    # run prompts to find relations
    sent_concepts_out = classify_relation(data, prompt)

    # Key the answers by the string form of their index. The model may return
    # the index as a number or as text, and rows are looked up by string.
    sent_concepts_dict = {}
    for sent_concepts in sent_concepts_out:
        if "index" not in sent_concepts:
            # An answer without an index cannot be matched to a row; skip it
            # instead of failing after all requests have already been made.
            continue
        sent_concepts_dict[str(sent_concepts["index"])] = sent_concepts

    relations = [
        sent_concepts_dict.get(str(row_index), {}).get("relation")
        for row_index in tqdm(data["index"], total=data.shape[0])
    ]

    # Copying keeps the caller's frame untouched and guarantees that the
    # `cs_other` column exists even when `data` has no rows.
    output = data.copy()
    output["cs_other"] = relations

    return output
 

In [85]:

# Workbook to annotate in this run. The commented-out reads are other
# workbooks in the same directory (presumably alternative inputs; swap the
# active line to annotate one of them).
#data = pd.read_excel("../final_train_data/shared_with_all/llm_relations_all_label_1_v2_3.xlsx")
#data = pd.read_excel("../final_train_data/shared_with_all/huge_train_complex_sents_llm_v2_3.xlsx")

data = pd.read_excel("../final_train_data/shared_with_all/huge_train_llm_aligned_v2_3_0_1300.xlsx")


In [86]:
data.concept_class.value_counts()

concept_class
revenue                      420
agreement_and_partnership     91
supply_chain                  61
unknown                       40
investment_related            37
product_related               24
services agreement            21
licensing_and_ip              17
real_estate                    8
legal_and_regulatory           8
royalties                      3
financial_statements           3
Name: count, dtype: int64

In [87]:
data.shape

(733, 15)

In [88]:
# Load the prompt template used for agreement-type sentences.
agreement_prompt = Path("./cs_prompts/agreement.txt").read_text()

# Rows whose concept class is one of the two agreement classes.
agreement_classes = ["agreement_and_partnership", "services agreement"]
agreement_data = data[data["concept_class"].isin(agreement_classes)]

agreement_data_out = find_relation_wrapper(agreement_data, agreement_prompt)

# Start from an all-None column, then fill in only the rows that were classified.
data.loc[:, "agreement_relation"] = None
agreement_relations = agreement_data_out["cs_other"].to_list()
data.loc[agreement_data_out.index, "agreement_relation"] = agreement_relations


Generating concepts: 100%|██████████| 112/112 [02:51<00:00,  1.53s/it]
100%|██████████| 112/112 [00:00<00:00, 7184.99it/s]


In [89]:
# Load the prompt template used for licensing / IP sentences.
license_prompt = Path("./cs_prompts/license.txt").read_text()

# Rows whose concept class is the licensing class.
is_license_row = data["concept_class"] == "licensing_and_ip"
license_data = data[is_license_row]

license_data_out = find_relation_wrapper(license_data, license_prompt)

# Start from an all-None column, then fill in only the rows that were classified.
data.loc[:, "license_relation"] = None
license_relations = license_data_out["cs_other"].to_list()
data.loc[license_data_out.index, "license_relation"] = license_relations

Generating concepts:   0%|          | 0/17 [00:00<?, ?it/s]

Generating concepts: 100%|██████████| 17/17 [00:25<00:00,  1.51s/it]
100%|██████████| 17/17 [00:00<00:00, 9943.27it/s]


In [72]:
# Load the prompt template used for supply-chain sentences (supply_chain.txt).
supply_chain_prompt = Path("./cs_prompts/supply_chain.txt").read_text()

# Rows whose concept class is product- or supply-chain-related.
supply_chain_classes = ["product_related", "supply_chain"]
supply_chain_data = data[data["concept_class"].isin(supply_chain_classes)]

supply_chain_data_out = find_relation_wrapper(supply_chain_data, supply_chain_prompt)

# Start from an all-None column, then fill in only the rows that were classified.
data.loc[:, "supply_chain_relation"] = None
supply_chain_relations = supply_chain_data_out["cs_other"].to_list()
data.loc[supply_chain_data_out.index, "supply_chain_relation"] = supply_chain_relations


Generating concepts:   0%|          | 0/171 [00:00<?, ?it/s]

Generating concepts: 100%|██████████| 171/171 [04:22<00:00,  1.54s/it]
100%|██████████| 171/171 [00:00<00:00, 26263.36it/s]


In [65]:
#To run again with modified prompts or different model 
#supply_chain_data = supply_chain_data_out[(supply_chain_data_out["cs_other"] == "other")]


In [73]:
supply_chain_data.shape

(171, 18)

In [80]:
data.agreement_relation.value_counts()

agreement_relation
customer_supplier    237
other                178
Name: count, dtype: int64

In [90]:
data.license_relation.value_counts()

license_relation
customer_supplier    13
other                 4
Name: count, dtype: int64

In [83]:
data.supply_chain_relation.value_counts()

supply_chain_relation
customer_supplier    152
other                 18
Name: count, dtype: int64

In [91]:
##data.to_excel("../final_train_data/shared_with_all/llm_relations_all_label_1_v2_3.xlsx", index=False)

##data.to_excel("../final_train_data/shared_with_all/huge_train_complex_sents_llm_v2_3.xlsx", index=False)

##data.to_excel("../final_train_data/shared_with_all/huge_train_llm_aligned_v2_3_0_1300.xlsx", index=False)

In [63]:
# # read excel file with two tabs -

# pos_data = pd.read_excel("labelled_data_for_prompt/agreement.xlsx", sheet_name="pos")
# neg_data = pd.read_excel("labelled_data_for_prompt/agreement.xlsx", sheet_name="neg")

# print(pos_data.columns)

# neg_output = generate_relations_wrapper(neg_data, PROMPT_V1 , replaces, relations_map)

# pos_output = generate_relations_wrapper(pos_data, PROMPT_V1 , replaces, relations_map)