In [None]:
#!pip install pandas openai torch scikit-learn dvc dvc-s3
#!pip install openpyxl retry python-dotenv



In [1]:
import os
import sys
import pandas as pd
import openai
import pandas as pd
from glob import glob
from tqdm import tqdm
import json
import ast
from pathlib import Path
import traceback
from dotenv import load_dotenv
load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")

In [2]:
prompt_concept_finder = """Your task is to find the concepts expressed in the sentences from the below concepts

# Information on concepts
-  Concepts are like classification labels for classification making the concepts more general and less descriptive of individual sentences. 

# List of concepts in { }, seperated by comma -  
{ 'investments in another company',  'financial statements', 'buy or sale of shares' , 'sale of assets',  'financing or loans',  'mergers or acquisitions', 
'supply or purchase agreement', 'services agreement',  'exclusive rights or agreement',  'changes in agreement', 
'agreement with another company',  'contracts in business',  'partnership with another company',   'joint venture or development',  'receivable or payable amount to accounts', 
'outstanding payment',  'potential earnings',  'contributors to revenue ',  'sources of revenue', 'yearly revenue numbers',  'sales performance',  'impact on revenue',  'purchase of products', 
'product distributors',  'supply channels', 'supply chain',  'dependency on customers', 'outsourcing operation',  'product manufacturing',  'production or operation', 
'product description',  'product usage',  'product pricing and cost', 'product marketing', 'royalties', 
'intellectual property',  'licensing agreement',  'product licensing', 'lease agreement', 'real estate lease', 'lease transactions',  'business risks', 'regulatory compliance',  'government regulations', 
'research and development', 'legal proceedings', 'stock ownership' }


# Rules to find the concepts
- Strictly use only information given in the input sentences
- Strictly find the concepts from given concepts only
- Check all above concepts one by one if they are expressed in the sentence
- Finally find  most relevant maximum 3 concepts that are expressed in Sentence. 
- A Sentence can have only one concept 
- Write one line explanation using the detected concepts

# Output rules
- `list of concepts` is list of detected concepts 
- Write output in json object with three keys - `index`, `concepts`, `concept explanation`
- Output should be strict json object that can be parsed


## output
Return JSON list of json objects with following format given in ``` quote -
```[
    {'index' : '< sentence index number given in the input Sentences >' ,
     'concepts': [ concept1, concept2, concept3] ,
     'concept explanation': 'one line explanation'
    }
   ]```

#Input
Sentences are given below in tripe back quotes. they are separated by \n and prefixed by index
```
{sentences}
```

"""

In [3]:
import re
import json
import time
from typing import Tuple, List, Text, Dict
from collections import defaultdict
from itertools import combinations
from tqdm import tqdm
from itertools import chain
from copy import copy
import pandas as pd
import numpy as np

from retry import retry


@retry(tries=3, delay=1)
def get_completion_2(prompt:Text,
                        temperature:float=0,
                        model="gpt-3.5-turbo")->str:
    messages = [{"role": "user", "content": prompt}]
    response = None

    response = openai.ChatCompletion.create(
        model= model,
        messages=messages,
        temperature= temperature,    #this is the degree of randomness of the model's output
        request_timeout = 90
    )

    return response.choices[0].message["content"]

global sent_concepts_out_star
sent_concepts_out_star = []

def generate_concepts(data: pd.DataFrame,
                       prompt_1: Text)-> pd.DataFrame:
    
    batch_size = 10
    sentences_in_batch = []
    global sent_concepts_out_star 
    sent_concepts_out_star = []

    output = []
    # Iterate over the frame rows
    for i, row in tqdm(data.iterrows(), total=data.shape[0], desc="Generating concepts"):
        row = row.to_dict()
        
        sentences_in_batch.append(f"{row['index']}    {row['sentence']}")
        # continue till batch fills
        if len(sentences_in_batch) == batch_size or i == (data.shape[0] -1):
            sentences_txt = "\n".join(sentences_in_batch)
        else:
            continue
        
        report_prompt_1 = copy(prompt_1)
        report_prompt_1 = report_prompt_1.replace("{sentences}", sentences_txt)
                
        retry_on_parse_err = False        
        attempt_count = 0
        while attempt_count == 0 or retry_on_parse_err:
            try:
                prompt_1_completion = get_completion_2(prompt=report_prompt_1)        
                sent_concept_list = deserialize_json_list(prompt_1_completion)                      
   
            except:
                # don't retry of already retried
                if retry_on_parse_err == True:                      
                    retry_on_parse_err = False
                    sent_concept_list = []
                    print("Not retrying after 2nd error")
                else:
                    print("Retrying after 1st error")
                    retry_on_parse_err = True
            
            attempt_count += 1
        
        if sent_concept_list:
            sent_concepts_out_star += sent_concept_list

        # Reset 
        sentences_in_batch = []         
        
    return sent_concepts_out_star


def deserialize_json_list(ser_relations):    
    # the string representation of the list of dictionaries
    string_list_of_dicts = ser_relations
    # regular expression to match a dictionary
    dict_regex = r"\{[^{}]+\}"
    # find all dictionaries in the string
    dict_strings = re.findall(dict_regex, string_list_of_dicts)
    # deserialize each dictionary into a Python object
    list_of_dicts = []
    for dict_string in dict_strings:
        try:
            list_of_dicts.append(json.loads(re.sub(r"(?<!\w)'|'(?!\w)", '"', dict_string)))
        except:
            try:
                list_of_dicts.append(json.loads(re.sub(r"(?<!\w)'|'(?!\w)", '"', dict_string.replace('"', '\\"'))))
            except:
                continue
    return list_of_dicts


def generate_concepts_wrapper(data):
    
    global sent_concepts_out_star
    sent_concepts_out_star = []

    # run prompts to find relations
    sent_concepts_out = generate_concepts(data, prompt_concept_finder)
    
    sent_concepts_dict = {}

    for sent_concepts in sent_concepts_out:
        sent_concepts_dict[sent_concepts["index"]] = sent_concepts

    output = []  
    for i, row in tqdm(data.iterrows(), total=data.shape[0]):
        row = row.to_dict()
        index_str = str(row["index"])
        if index_str in sent_concepts_dict:
            row["concepts"] = sent_concepts_dict[index_str]["concepts"]
            row["concept explanation"] = sent_concepts_dict[index_str]["concept explanation"]
        else:
            row["concepts"] = []
            row["concept explanation"] = ""
        
        output.append(row)
  
    return pd.DataFrame(output)
 

In [8]:
file_path = f'../DemoNotebooks/test_pipeline_data/filings_gt_25_relns/final_report_conflict_with_old_relation_llm.xlsx'

# Save the DataFrame to Excel
data = pd.read_excel(file_path)  # Set index=False if you don't want to save the index as a separate column

#data = data[:10]

print(data.shape)
data.columns

(454, 22)


Index(['accessionnumber', 'cik', 'reporter_name', 'reported_company',
       'sentence_id', 'sentence', 'relation', 'score', 'sents_scores',
       'agreration_results', 'winning_relation',
       'matched_reported_company_old', 'reported_company_old', 'old_relation',
       'explanation', 'relation_completion', 'relations', 'llm_relation_label',
       'llm_winning_relation', 'old_llm_align', 'new_llm_align', 'Comment'],
      dtype='object')

In [11]:
data["index"] = data.index
output = generate_concepts_wrapper(data)




In [22]:
#file_path = f'../DemoNotebooks/test_pipeline_data/filings_gt_25_relns/final_report_conflict_with_old_relation_llm_concepts.xlsx'
#output.to_excel(file_path, index=False)  # Set index=False if you don't want to save the index as a separate column


In [None]:
data = pd.read_excel("../data_with_concepts/EXP LLM v2.3 Label_1_0 errors.xlsx")
output = generate_concepts_wrapper(data)


output.to_excel("../data_with_concepts/EXP LLM v2.3 Label_1_0 errors.xlsx", index=False)  # Set index=False if you don't want to save the index as a separate column


(3682, 11)

In [5]:
batch_size = 100

for start_idx in range(500, 3700, batch_size):
    
    data = pd.read_excel("../data_with_concepts/lg_sents_from_huge_set/huge_train_dedup_80_lg_sent.xlsx")
    data = data[start_idx:start_idx+batch_size]

    output = generate_concepts_wrapper(data)
    
    file_path = f'../data_with_concepts/lg_sents_from_huge_set/1_batches/huge_train_dedup_80_lg_sent_{start_idx}_{start_idx+batch_size}.xlsx'
    # # Save the DataFrame to Excel
    output.to_excel(file_path, index=False)  # Set index=False if you don't want to save the index as a separate column


# df_arr = [] 

# for start_idx in range(0, 3700, batch_size):
#     df_arr.append(pd.read_excel(f"../data_with_concepts/lg_sents_from_huge_set/1_batches/huge_train_dedup_80_lg_sent_{start_idx}_{start_idx+batch_size}.xlsx"))
    
# output = pd.concat(df_arr, ignore_index=True)
# output = output.sort_values(by=['index'])

# file_path = '../data_with_concepts/lg_sents_from_huge_set/huge_train_dedup_80_lg_sent_concepts.xlsx'
# output.to_excel(file_path, index=False)


Generating concepts: 100%|██████████| 100/100 [04:16<00:00,  2.57s/it]
100%|██████████| 100/100 [00:00<00:00, 17714.68it/s]
Generating concepts: 100%|██████████| 100/100 [03:14<00:00,  1.94s/it]
100%|██████████| 100/100 [00:00<00:00, 30413.34it/s]
Generating concepts: 100%|██████████| 100/100 [02:37<00:00,  1.57s/it]
100%|██████████| 100/100 [00:00<00:00, 29604.07it/s]
Generating concepts: 100%|██████████| 100/100 [03:01<00:00,  1.81s/it]
100%|██████████| 100/100 [00:00<00:00, 29267.35it/s]
Generating concepts: 100%|██████████| 100/100 [02:43<00:00,  1.63s/it]
100%|██████████| 100/100 [00:00<00:00, 31555.10it/s]
Generating concepts: 100%|██████████| 100/100 [02:36<00:00,  1.56s/it]
100%|██████████| 100/100 [00:00<00:00, 28095.01it/s]
Generating concepts: 100%|██████████| 100/100 [02:56<00:00,  1.77s/it]
100%|██████████| 100/100 [00:00<00:00, 30544.01it/s]
Generating concepts: 100%|██████████| 100/100 [03:01<00:00,  1.82s/it]
100%|██████████| 100/100 [00:00<00:00, 30192.23it/s]
Generati

In [7]:
output.shape

(2609, 13)

In [6]:
# train cs relation concepts files

data1 = pd.read_excel("../final_train_data/llm_relations_all_label_1_v2_3.xlsx")
data1 = data1[data1["align"] == True][["index", "concepts"]]

data2 = pd.read_excel("../final_train_data/huge_train_llm_aligned_v2_3_0_1300.xlsx")
data2 = data2[["index", "concepts"]]

data3 = pd.read_excel("../final_train_data/huge_train_complex_sents_llm_v2_3.xlsx")
data3 = data3[data3["align"] == True][["index", "concepts"]]


output = pd.concat([data3, data2, data3], ignore_index=True)
output['concepts'] = output['concepts'].apply(eval)

In [45]:
# train other relation concepts files

data1 = pd.read_excel("final_train_data/llm_relations_other_from_label_0_v2_3.xlsx")
data1 = data1[["index", "concepts"]]

data2 = pd.read_excel("final_train_data/llm_relations_other_relation_v2_3.xlsx")
data2 = data2[["index", "concepts"]]

data3 = pd.read_excel("final_train_data/huge_other_train_complex_sents.xlsx")
data3 = data3[["index", "concepts"]]


output = pd.concat([data1, data2, data3], ignore_index=True)
output['concepts'] = output['concepts'].apply(eval)

In [46]:
output.shape

(3597, 2)

In [12]:
# Generate the concept_df dataframe, 
# Input - output dataframe with "concepts" column 

concept_df_dict = defaultdict(int)

# 1 concept df
for i, row in tqdm(output.iterrows(), total=output.shape[0]):   
   for concept in row["concepts"]:
    concept_df_dict[concept] += 1
concept_df = pd.DataFrame({'concept1': list(concept_df_dict.keys()),
                           'df1': list(concept_df_dict.values())})
concept_df = concept_df.sort_values(by=['df1'], ascending=False)
#concept_df = concept_df[concept_df['df1'] > 4]

# 2 concept df
concept_df_dict = defaultdict(int)
for i, row in tqdm(output.iterrows(), total=output.shape[0]):   
   if len(row["concepts"]) > 1:
        concept_tuples = list(combinations(row["concepts"], 2))
        for c_tuple in concept_tuples:
                concept_df_dict[c_tuple] += 1

# Sort dictionary by values in descending order
concept_df_dict = dict(sorted(concept_df_dict.items(), key=lambda item: item[1], reverse=True))
# Filter dictionary by values 
concept_df_dict = {key: value for key, value in concept_df_dict.items() if value > 4}

pad_list = [""] * (concept_df.shape[0] - len(concept_df_dict))
concept_df["concept2"] = list(concept_df_dict.keys()) + pad_list
concept_df["df2"] = list(concept_df_dict.values()) + pad_list

# 3 concept df
concept_df_dict = defaultdict(int)
for i, row in tqdm(output.iterrows(), total=output.shape[0]):   
   if len(row["concepts"]) > 2:
        concept_tuples = list(combinations(row["concepts"], 3))
        for c_tuple in concept_tuples:
                concept_df_dict[c_tuple] += 1

# Sort dictionary by values in descending order
concept_df_dict = dict(sorted(concept_df_dict.items(), key=lambda item: item[1], reverse=True))
# Filter dictionary by values 
concept_df_dict = {key: value for key, value in concept_df_dict.items() if value > 4}

pad_list = [""] * (concept_df.shape[0] - len(concept_df_dict))
concept_df["concept3"] = list(concept_df_dict.keys()) + pad_list
concept_df["df3"] = list(concept_df_dict.values()) + pad_list


100%|██████████| 454/454 [00:00<00:00, 35507.17it/s]
100%|██████████| 454/454 [00:00<00:00, 42592.24it/s]
100%|██████████| 454/454 [00:00<00:00, 49208.31it/s]


In [None]:
concept_df.shape

In [None]:
# write concept_df to file

concept_df.to_excel("../data_with_concepts/concepts_df_test_error.xlsx", index=False)

In [48]:
concept_df.to_excel("data_with_concepts/concepts_df_other.xlsx", index=False)

## Sentence classification using detected concepts and concepts mapping 

In [13]:
concept_classes = ['legal_and_regulatory',
 'royalties',
 'licensing_and_ip',
 'real_estate',
 'supply_purchase_agreement',
 'services agreement',
 'agreement_and_partnership',
 'product_related',
 'supply_chain',
 'investment_related',
 'revenue']

remaining_classes = ['financial_statements', 'unknown']


concept_class_dict = {
    'legal_and_regulatory': {'1st_preference': ['government regulations',
                                           'legal proceedings',
                                           'regulatory compliance'],
                        '2nd_preference': []},
'royalties': {'1st_preference': ['royalties'], '2nd_preference': []},
 'licensing_and_ip': {'1st_preference': ['exclusive rights or agreement',
                                         'licensing agreement',
                                         'intellectual property',
                                         'product licensing',
                                         'product marketing'],
                      '2nd_preference': ["research and development"]},
 'real_estate': {'1st_preference': ['lease agreement',
                                    'lease transactions',
                                    'real estate lease'],
                 '2nd_preference': []},
 'supply_purchase_agreement': {'1st_preference': ['supply_purchase_agreement'],
                               '2nd_preference': []},
 'services agreement': {'1st_preference': ['services agreement'],
                        '2nd_preference': []},                               
 'agreement_and_partnership': {'1st_preference': ['contracts in business',
                                    'partnership with another company',
                                    'joint venture or development',
                                    'collaboration agreement'],
                 '2nd_preference': ['agreement with another company', 'changes in agreement']},
 'product_related': {'1st_preference': ['product manufacturing',
                                        'outsourcing operation',
                                        'production or operation',
                                        'product pricing and cost',
                                        'product description'],
                     '2nd_preference': []},
  'supply_chain': {'1st_preference': ['supply channels',
                                     'supply chain',
                                     'product distributors',
                                     'distribution agreement',
                                     'purchase of products'],
                  '2nd_preference': []},
'investment_related': {'1st_preference': ['mergers or acquisitions', 'financing or loans', 
                                          'stock ownership', 'investments in another company', 'sale of assets', 
                                          'buy or sale of shares', 'acquisition', 'acquisition of another company'],
                    '2nd_preference': []},          

 'revenue': {'1st_preference': ['sources of revenue',
                                'contributors to revenue',
                                'yearly revenue numbers',
                                'impact on revenue',
                                'sales performance',
                                'revenue recognition',
                                'potential earnings',
                                'receivable or payable amount to accounts',
                                'outstanding payment',
                                'milestone payments'],
             '2nd_preference': ['dependency on customers']},

  'financial_statements': {'1st_preference': ['financial statements'],
                          '2nd_preference': []},
 'unknown': {'1st_preference': [], '2nd_preference': []}
}




In [15]:
def find_concept_class_from_1st_preference(sent_detected_concepts, concept_class_dict, concept_classes):
    # strict match: check if detected_concept is part of 1st_preference of concept class
    for detected_concept in sent_detected_concepts:
        for concept_class in concept_classes:
            if detected_concept in concept_class_dict.get(concept_class)["1st_preference"]:
                return concept_class

def find_concept_class_from_2nd_preference(sent_detected_concepts, concept_class_dict, concept_classes):
    # strict match: check if detected_concept is part of 2nd_preference of concept class
    for detected_concept in sent_detected_concepts:
        for concept_class in concept_classes:
            if detected_concept in concept_class_dict.get(concept_class)["2nd_preference"]:
                return concept_class

def find_concept_class_from_any_preference(sent_detected_concepts, concept_class_dict, concept_classes):
    
    # relaxed match: check if concept_class that we know is substring of detected_concept of the sentence 
    for detected_concept in sent_detected_concepts:
        for concept_class in concept_classes:
            for concept_class_val in (concept_class_dict.get(concept_class)["1st_preference"] + 
                                      concept_class_dict.get(concept_class)["2nd_preference"]):
                    if concept_class_val in  detected_concept:                    
                        return concept_class


def find_concept_class(sent_detected_concepts, concept_class_dict, 
                       concept_classes, remaining_classes):
    if not sent_detected_concepts:
        return 'unknown'
    
    concept_class = find_concept_class_from_1st_preference(sent_detected_concepts, 
                                                           concept_class_dict, concept_classes) 
    if concept_class:
        return concept_class
    
    concept_class = find_concept_class_from_2nd_preference(sent_detected_concepts, 
                                                           concept_class_dict, concept_classes) 
    if concept_class:
        return concept_class
    
    concept_class = find_concept_class_from_any_preference(sent_detected_concepts, 
                                                           concept_class_dict, 
                                                           concept_classes + remaining_classes) 
    if concept_class:
        return concept_class
    
    return "unknown"

    


In [16]:
# output['concept_class'] = output['concepts'].apply(lambda sent_detected_concepts: 
#                                                  find_concept_class(sent_detected_concepts, concept_class_dict,
#                                                                     concept_classes, remaining_classes))

In [63]:
# concept classification for CS relation samples

data1 = pd.read_excel("final_train_data/llm_relations_all_label_1_v2_3.xlsx")
data1['concepts'] = data1['concepts'].apply(eval)
data1['concept_class'] = data1['concepts'].apply(lambda sent_detected_concepts: 
                                                 find_concept_class(sent_detected_concepts, concept_class_dict,
                                                                    concept_classes, remaining_classes))
data1.to_excel("final_train_data/llm_relations_all_label_1_v2_3.xlsx", index=False)

data2 = pd.read_excel("final_train_data/huge_train_llm_aligned_v2_3_0_1300.xlsx")
data2['concepts'] = data2['concepts'].apply(eval)
data2['concept_class'] = data2['concepts'].apply(lambda sent_detected_concepts: 
                                                 find_concept_class(sent_detected_concepts, concept_class_dict,
                                                                    concept_classes, remaining_classes))
data2.to_excel("final_train_data/huge_train_llm_aligned_v2_3_0_1300.xlsx", index=False)


data3 = pd.read_excel("final_train_data/huge_train_complex_sents_llm_v2_3.xlsx")
data3['concepts'] = data3['concepts'].apply(eval)
data3['concept_class'] = data3['concepts'].apply(lambda sent_detected_concepts: 
                                                 find_concept_class(sent_detected_concepts, concept_class_dict,
                                                                    concept_classes, remaining_classes))
data3.to_excel("final_train_data/huge_train_complex_sents_llm_v2_3.xlsx", index=False)


In [66]:
# concept classification for other class

data1 = pd.read_excel("final_train_data/llm_relations_other_from_label_0_v2_3.xlsx")
data1['concepts'] = data1['concepts'].apply(eval)
data1['concept_class'] = data1['concepts'].apply(lambda sent_detected_concepts: 
                                                 find_concept_class(sent_detected_concepts, concept_class_dict,
                                                                    concept_classes, remaining_classes))
data1.to_excel("final_train_data/llm_relations_other_from_label_0_v2_3.xlsx", index=False)

data2 = pd.read_excel("final_train_data/llm_relations_other_relation_v2_3.xlsx")
data2['concepts'] = data2['concepts'].apply(eval)
data2['concept_class'] = data2['concepts'].apply(lambda sent_detected_concepts: 
                                                 find_concept_class(sent_detected_concepts, concept_class_dict,
                                                                    concept_classes, remaining_classes))
data2.to_excel("final_train_data/llm_relations_other_relation_v2_3.xlsx", index=False)


data3 = pd.read_excel("final_train_data/huge_other_train_complex_sents.xlsx")
data3['concepts'] = data3['concepts'].apply(eval)
data3['concept_class'] = data3['concepts'].apply(lambda sent_detected_concepts: 
                                                 find_concept_class(sent_detected_concepts, concept_class_dict,
                                                                    concept_classes, remaining_classes))
data3.to_excel("final_train_data/huge_other_train_complex_sents.xlsx", index=False)


In [60]:
data3['concept_class'].value_counts()

concept_class
revenue                      927
agreement_and_partnership    710
licensing_and_ip             578
unknown                      415
royalties                    244
supply_chain                 208
investment_related           145
financial_statements         120
services agreement           116
product_related              103
legal_and_regulatory          64
real_estate                   52
Name: count, dtype: int64

In [21]:
output.columns

# create new df where old_relation is not matching with relation column
# filtered_output = output[output['old_relation'] != output['relation']]
# filtered_output.shape


Index(['accessionnumber', 'cik', 'reporter_name', 'reported_company',
       'sentence_id', 'sentence', 'relation', 'score', 'sents_scores',
       'agreration_results', 'winning_relation',
       'matched_reported_company_old', 'reported_company_old', 'old_relation',
       'explanation', 'relation_completion', 'relations', 'llm_relation_label',
       'llm_winning_relation', 'old_llm_align', 'new_llm_align', 'Comment',
       'index', 'concepts', 'concept explanation', 'concept_class'],
      dtype='object')

In [17]:
output['concept_class'].value_counts()

concept_class
licensing_and_ip             92
agreement_and_partnership    70
unknown                      65
royalties                    62
supply_chain                 46
product_related              35
revenue                      33
legal_and_regulatory         18
financial_statements         10
investment_related           10
real_estate                   9
services agreement            4
Name: count, dtype: int64

In [20]:
filtered_output['concept_class'].value_counts()

concept_class
licensing_and_ip             76
agreement_and_partnership    56
royalties                    54
unknown                      49
supply_chain                 44
revenue                      30
product_related              27
legal_and_regulatory         18
financial_statements         10
investment_related            8
real_estate                   8
services agreement            3
Name: count, dtype: int64