# **_Basic dependencies_**

In [1]:
import json
import pandas as pd
import os
import math
from tqdm import tqdm
from typing import Dict, Any, Text, Tuple
import yaml
import sys
import collections as collections
from pathlib import Path

src_dir= Path.cwd().parent
sys.path.append(str(src_dir))


import src.labels_generator.llm_relation_prompt as llm_relation_prompt
import src.labels_generator.utils as llm_relation_utils
from src.utils.logs import get_logger
from src.matcher.core import SimCSE_Matcher





  from .autonotebook import tqdm as notebook_tqdm


In [90]:
import importlib
importlib.reload(llm_relation_prompt)


<module 'src.labels_generator.llm_relation_prompt' from '/Users/shyamshinde/codebase/inferess-relation-extraction/src/labels_generator/llm_relation_prompt.py'>

In [18]:
def get_entity_matcher(entity_names_to_index):
    # "pipeline-artifacts/matcher/all-MiniLM-Nli-All-Random-v4"
    entity_matcher = SimCSE_Matcher(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2")
    entity_matcher.build_index(entity_names_to_index)
    return entity_matcher
       
def get_first_match(matcher, entity_search, threshold):
    
    results = matcher.search(entity_search, threshold=threshold)
    if results:
        return results[0][0], results[0][1]
    else:
        return None, None

sentence_transformers_matcher = SimCSE_Matcher(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2")

In [None]:
# entity_matcher = get_entity_matcher()
# # test
# get_first_match(entity_matcher, "ScanSource", threshold=0.9)


In [168]:
def get_results_df(new_results_path, old_results_path):    
    # load new_results
    new_results_df = pd.read_excel(new_results_path)
    # rename columns if exists
    new_results_df.rename(columns={"accessionNumber": "accessionnumber",
                               "reporterName": "reporter_name",
                               "company_name": "reported_company"}, inplace=True)
    new_results_df = new_results_df[new_results_df["accessionnumber"].notna()]


    print("new_results_df shape: ", new_results_df.shape)
    print("new_results_df columns: ", new_results_df.columns)

    # load old_results
    old_results_df = pd.read_excel(old_results_path)
    # filter old_results to only include the accession_number that are in new_results
    old_results_df = old_results_df[old_results_df.accessionnumber.isin(new_results_df.accessionnumber.unique())]
    print("old_results_df shape: ", old_results_df.shape)
    print("old_results_df columns: ", old_results_df.columns)
   
    return new_results_df, old_results_df



In [169]:
new_results_path = 'test_pipeline_data/filings_gt_25_relns/sc_re_final_report.xlsx'
old_results_path = 'test_pipeline_data/relation_table_joined_on_text.xlsx'


new_results_df, old_results_df = get_results_df(new_results_path, old_results_path)



new_results_df shape:  (5737, 11)
new_results_df columns:  Index(['accessionnumber', 'cik', 'reporter_name', 'reported_company',
       'sentence_id', 'sentence', 'relation', 'score', 'sents_scores',
       'agreration_results', 'winning_relation'],
      dtype='object')
old_results_df shape:  (1069, 12)
old_results_df columns:  Index(['valid_from', 'last_seen', 'reporter_id', 'reporter_name',
       'relationship_type', 'reported_company_id', 'reported_company',
       'date_from', 'date_to', 'confidence', 'accession_number',
       'accessionnumber'],
      dtype='object')


In [6]:
# Debug purpose, can be ignored 

#- Quick experimental check on reporter/filer names 

# drop duplicates on accessionnumber and reporter_name
new_reporter_names_df = new_results_df.drop_duplicates(subset=["accessionnumber", "reporter_name"], keep="first")
old_reporter_names_df = old_results_df.drop_duplicates(subset=["accessionnumber", "reporter_name"], keep="first")

# join two dfs on accessionnumber and print df
joined_df = pd.merge(new_reporter_names_df, old_reporter_names_df, on=["accessionnumber"], how="inner", suffixes=("_new", "_old"))

#joined_df[["accession_number", "reporter_name_new", "reporter_name_old"]]

In [170]:
reported_company_new_to_old_dict = {}

for accessionnumber in new_results_df[new_results_df.accessionnumber.notna()].accessionnumber.unique():
    if not accessionnumber:
        continue    

    # get all rows for this accessionnumber
    df = old_results_df[old_results_df.accessionnumber == accessionnumber]
    # get all reported_company for this accessionnumber
    reported_companies_old = df[df.reported_company.notna()].reported_company.unique().tolist()

    # get entity_matcher for reported_companies_old
    entity_matcher = get_entity_matcher(reported_companies_old)

    # get all reported_company for this accessionnumber
    df_new = new_results_df[new_results_df.accessionnumber == accessionnumber]
    reported_companies_new = df_new[df_new.reported_company.notna()].reported_company.unique().tolist()
    reported_company_new_to_old_dict[accessionnumber] = {}

    for reported_company_new in reported_companies_new:
        # get matching_reported_company_old
        matching_reported_company_old, score = get_first_match(entity_matcher, reported_company_new, threshold=0.85)
        # add to dict
        reported_company_new_to_old_dict[accessionnumber][reported_company_new] =  matching_reported_company_old
    



09/26/2023 16:15:26 - INFO - src.matcher.core -   Encoding embeddings for sentences...
09/26/2023 16:15:26 - INFO - src.matcher.core -   Building index...
09/26/2023 16:15:26 - INFO - src.matcher.core -   Use CPU-version faiss
09/26/2023 16:15:26 - INFO - src.matcher.core -   Finished
09/26/2023 16:15:27 - INFO - src.matcher.core -   Encoding embeddings for sentences...
09/26/2023 16:15:27 - INFO - src.matcher.core -   Building index...
09/26/2023 16:15:27 - INFO - src.matcher.core -   Use CPU-version faiss
09/26/2023 16:15:27 - INFO - src.matcher.core -   Finished
09/26/2023 16:15:28 - INFO - src.matcher.core -   Encoding embeddings for sentences...
09/26/2023 16:15:28 - INFO - src.matcher.core -   Building index...
09/26/2023 16:15:28 - INFO - src.matcher.core -   Use CPU-version faiss
09/26/2023 16:15:29 - INFO - src.matcher.core -   Finished
09/26/2023 16:15:29 - INFO - src.matcher.core -   Encoding embeddings for sentences...
09/26/2023 16:15:29 - INFO - src.matcher.core -   Build

In [24]:
# Debug purpose, can be ignored

#  dataframe from dict by adding accessionnumber as column, new reported company and old matching company

# reported_company_new_to_old_records = []
# for accessionnumber in reported_company_new_to_old_dict.keys():

#     for new_reported_company, old_reported_company in reported_company_new_to_old_dict[accessionnumber].items():
#         reported_company_new_to_old_records.append({"accessionnumber": accessionnumber,
#                                                     "reported_company_new": new_reported_company,
#                                                     "reported_company_old": old_reported_company})
        

# reported_company_new_to_old_df = pd.DataFrame.from_records(reported_company_new_to_old_records)
# # write to file
# reported_company_new_to_old_df.to_excel("./test_pipeline_data/filings_gt_25_relns/reported_company_new_to_old_df_0.85.xlsx")


# from reported_company_new_to_old_dict, for each accessionnumber, print total count of {reported_company_new and reported_company_old }

# for accessionnumber, reported_company_new_to_old in reported_company_new_to_old_dict.items():
#     print(f"""{accessionnumber}    {len(reported_company_new_to_old.keys())} {len([v for v in reported_company_new_to_old.values() if v])}""")
    


In [171]:
# update new_results_df by adding matching_reported_company_old column
new_results_df["matched_reported_company_old"] = None
for accessionnumber, reported_company_new_to_old in reported_company_new_to_old_dict.items():
    for reported_company_new, matching_reported_company_old in reported_company_new_to_old.items():
        new_results_df.loc[(new_results_df.accessionnumber == accessionnumber) & (new_results_df.reported_company == reported_company_new), "matched_reported_company_old"] = matching_reported_company_old

In [26]:
# TODO - move it later in this notebook

new_results_df[new_results_df["matched_reported_company_old"].notna()].shape

recall_old_relation = new_results_df[new_results_df["matched_reported_company_old"].notna()].shape[0] / old_results_df.shape[0] * 100

print(f"Recall of old relation in new results {recall_old_relation}")



Recall of old relation in new results 68.38166510757718


In [172]:
new_results_df.columns

Index(['accessionnumber', 'cik', 'reporter_name', 'reported_company',
       'sentence_id', 'sentence', 'relation', 'score', 'sents_scores',
       'agreration_results', 'winning_relation',
       'matched_reported_company_old'],
      dtype='object')

In [173]:
# write no_match_in_old to excel 

(new_results_df[new_results_df["matched_reported_company_old"].isna()].
 to_excel("test_pipeline_data/filings_gt_25_relns/new_results_no_match_in_old.xlsx", index=False))


old_results_df["matched_reported_company_old"] = old_results_df.reported_company

# slice old_results_df to only include columns that are needed
sliced_old_results_df = old_results_df[['reporter_id', 'reporter_name',
       'relationship_type', 'reported_company_id', 'reported_company', 'matched_reported_company_old',
       'accessionnumber']]

# join old and new df on accessionnumber and matched_reported_company_old
joined_df = pd.merge(new_results_df, sliced_old_results_df, 
                     on=["accessionnumber",  "matched_reported_company_old"], 
                     how="inner", suffixes=("_new", "_old"))


# Add flag if the relationship_type in old_df is the same as winning_relation in new df
joined_df["relation_match"] = joined_df.apply(lambda x: x.relationship_type == x.winning_relation, axis=1)

print(joined_df.shape)

print("Sentence level : Error distribution when relation_match is False")
print(joined_df[joined_df["relation_match"] == False].winning_relation.value_counts())

# Write to file

(joined_df[['accessionnumber', 'cik', 'reporter_name_new', 'reported_company_new', 'reported_company_old',
       'sentence_id', 'sentence', 'relation', 'score', 'sents_scores',
       'agreration_results', 'winning_relation', 'relationship_type', 'relation_match',
       'matched_reported_company_old', 'reporter_id', 'reporter_name_old', 'reported_company_id']].
       to_excel("test_pipeline_data/filings_gt_25_relns/new_results_vs_old.xlsx", index=False))



(2854, 18)
Error distribution when relation_match is False
winning_relation
customer    259
supplier    163
other        32
Name: count, dtype: int64


In [137]:

# find all reported_company_found_in_new_results
# preq: reported_company_new_to_old_dict


conflict_relations_reported_company_new = collections.defaultdict(list)

for accessionnumber in joined_df.accessionnumber.unique():    
    conflict_relations_reported_company_new[accessionnumber] = (
        joined_df[(joined_df["relation_match"] == False) & (joined_df["accessionnumber"] == accessionnumber)].
        reported_company_new.to_list())

#conflict_relations_reported_company_new


In [138]:
joined_df.columns

Index(['accessionnumber', 'cik', 'reporter_name_new', 'reported_company_new',
       'sentence_id', 'sentence', 'relation', 'score', 'sents_scores',
       'agreration_results', 'winning_relation',
       'matched_reported_company_old', 'reporter_id', 'reporter_name_old',
       'relationship_type', 'reported_company_id', 'reported_company_old',
       'relation_match'],
      dtype='object')

In [45]:
joined_df.reported_company_old

Index(['accessionnumber', 'cik', 'reporter_name_new', 'reported_company_new',
       'sentence_id', 'sentence', 'relation', 'score', 'sents_scores',
       'agreration_results', 'winning_relation',
       'matched_reported_company_old', 'reporter_id', 'reporter_name_old',
       'relationship_type', 'reported_company_id', 'reported_company_old',
       'relation_match'],
      dtype='object')

In [29]:
# generate sentences for where old and new results are not matching
# preq: joined_df
# preq: file: test_pipeline_data/filings_gt_25_relns/filings_gt_25_relns_final_report.xlsx

# load filings_gt_25_relns_final_report.xlsx
filings_gt_25_relns_final_report_df = pd.read_excel("test_pipeline_data/filings_gt_25_relns/sc_re_final_report.xlsx")
filings_gt_25_relns_final_report_df = filings_gt_25_relns_final_report_df[filings_gt_25_relns_final_report_df["accessionNumber"].notna()]

# check if company_name in filings_gt_25_relns_final_report_df is not in reported_company_found_in_new_results
filings_gt_25_relns_final_report_df["conflict_with_old_relation"] = filings_gt_25_relns_final_report_df.apply(lambda x: x.company_name in conflict_relations_reported_company_new[x.accessionNumber], axis=1)

(filings_gt_25_relns_final_report_df[filings_gt_25_relns_final_report_df["conflict_with_old_relation"]==True].
    to_excel("test_pipeline_data/filings_gt_25_relns/sc_re_conflict_with_old_relation.xlsx", index=False))

print("print the shape number of rows where conflict_with_old_relation is True")
filings_gt_25_relns_final_report_df[filings_gt_25_relns_final_report_df["conflict_with_old_relation"]==True].shape



print the shape number of rows where conflict_with_old_relation is True


(454, 12)

<!-- 
Stastical comparison of new and old results for 36 filing companies with > 25 relations from athena:old_prediction table

1. New results have approx 1.8 times more relations than old results, even after having 0.95 threshold for both SC and RE model
2. 65% recall of old relations in new results, meaning we did not find relations for 35% companies 
3. Out of 65% relations, 85% of relations are exactly matching with old results relations. 
    - Most of 15% erros are due to conflict in customer/suppier relation, rather than conflict with `other` 
  
Next steps - 
1. Find error patterns in the relations conflicts of old and new results by doing -> manual review + help from LLM
2. Find why low recall(65%) of old relations in new results, one prominant reason could be NER, but need to check
3. Validate extra relations detected in new results 
 -->


In [37]:
# generate sentences for where old and new results are not matching
# preq: joined_df
# preq: file: test_pipeline_data/filings_gt_25_relns/filings_gt_25_relns_final_report.xlsx

# load filings_gt_25_relns_final_report.xlsx
filings_gt_25_relns_final_report_df = pd.read_excel("test_pipeline_data/filings_gt_25_relns/filings_gt_25_relns_final_report.xlsx")
filings_gt_25_relns_final_report_df = filings_gt_25_relns_final_report_df[filings_gt_25_relns_final_report_df["accessionNumber"].notna()]

# check if company_name in filings_gt_25_relns_final_report_df is not in reported_company_found_in_new_results
filings_gt_25_relns_final_report_df["conflict_with_old_relation"] = filings_gt_25_relns_final_report_df.apply(lambda x: x.company_name in conflict_relations_reported_company_new[x.accessionNumber], axis=1)




In [38]:
filings_gt_25_relns_final_report_df.conflict_with_old_relation.value_counts()    

conflict_with_old_relation
False    5283
True      454
Name: count, dtype: int64

In [None]:

# Stastical comparison of new and old results for 36 filing companies with > 25 relations from athena:old_prediction table

# 1. New results have approx 1.8 times more relations than old results, even after having 0.95 threshold for both SC and RE model
# 2. 65% recall of old relations in new results, meaning we did not find relations for 35% companies 
# 3. Out of 65% relations, 85% of relations are exactly matching with old results relations. 
#     - Most of 15% erros are due to conflict in customer/suppier relation, rather than conflict with `other` 
  
# Next steps - 
# 1. Find error patterns in the relations conflicts of old and new results by doing -> manual review + help from LLM
# 2. Find why low recall(65%) of old relations in new results, one prominant reason could be NER, but need to check
# 3. Validate extra relations detected in new results 



In [216]:
conflict_final_report_df = joined_df[joined_df["relation_match"] == False].reset_index(drop=True)

conflict_final_report_df.shape

conflict_final_report_df = conflict_final_report_df[['accessionnumber', 'cik', 'reporter_name_new', 'reported_company_new',
       'sentence_id', 'sentence', 'relation', 'score', 'sents_scores',
       'agreration_results', 'winning_relation',
       'matched_reported_company_old', 'reported_company_old',
       'relationship_type' ]]

# rename columns of suffix _new to remove _new
conflict_final_report_df.rename(columns={"reporter_name_new": "reporter_name",
                                         "reported_company_new": "reported_company",
                                         "relationship_type": "old_relation"}, 
                                         inplace=True)


(454, 18)

In [270]:
# ## conflict_final_report_df.to_excel("test_pipeline_data/filings_gt_25_relns/sc_re_conflict_with_old_relation.xlsx", index=False)


In [269]:
conflict_final_report_df.columns

Index(['accessionnumber', 'cik', 'reporter_name', 'reported_company',
       'sentence_id', 'sentence', 'relation', 'score', 'sents_scores',
       'agreration_results', 'winning_relation',
       'matched_reported_company_old', 'reported_company_old', 'old_relation'],
      dtype='object')

### Run llm

In [273]:
output_df = (pd.read_excel(
    "test_pipeline_data/filings_gt_25_relns/final_report_conflict_with_old_relation_llm.xlsx"))

output_df.shape

# eval 
output_df["agreration_results"] = output_df["agreration_results"].apply(eval) 
output_df["sents_scores"] = output_df["sents_scores"].apply(eval) 
output_df["relations"] = output_df["relations"].apply(eval) 



In [274]:
output_df.columns


Index(['accessionnumber', 'cik', 'reporter_name', 'reported_company',
       'sentence_id', 'sentence', 'relation', 'score', 'sents_scores',
       'agreration_results', 'winning_relation',
       'matched_reported_company_old', 'reported_company_old', 'old_relation',
       'explanation', 'relation_completion', 'relations', 'llm_relation_label',
       'llm_winning_relation', 'old_llm_align', 'new_llm_align'],
      dtype='object')

In [50]:
# batch_size = 100
# num_batches = math.ceil(filings_gt_25_relns_final_report_df.shape[0] / batch_size)
# batch_dfs = [filings_gt_25_relns_final_report_df[i*batch_size: (i+1) *batch_size]  for i in range(num_batches)] 

# output_batch_dfs = []
# for batch_df in batch_dfs:
#     output_batch_df = llm_relation_prompt.generate_relations(batch_df, {"sentence": "{sentence}"},
#                                        {"customer": "supplier"}) 
#     output_batch_dfs.append(output_batch_df)

# output_df = pd.concat(output_batch_dfs, axis=0)

#output_df.to_excel("test_pipeline_data/filings_gt_25_relns/final_report_conflict_with_old_relation_llm.xlsx", index=False)



In [230]:
output_df.columns

Index(['accessionnumber', 'cik', 'reporter_name', 'reported_company',
       'sentence_id', 'sentence', 'relation', 'score', 'sents_scores',
       'agreration_results', 'winning_relation',
       'matched_reported_company_old', 'reported_company_old', 'old_relation',
       'explanation', 'relation_completion', 'relations'],
      dtype='object')

In [275]:
def generate_llm_relation_label(row, matcher_object, threshold):
    
    reporter = row["reporter_name"]
    reported_company = row["reported_company"]

    reverse_relation_dict = {"customer": "supplier", "supplier": "customer"}

    # zip source_companies and dest_companies
    for llm_relation in row["relations"]:
        llm_source_company = llm_relation[0]
        llm_dest_company = llm_relation[2]
        llm_relation_label = llm_relation[1]

        # match llm_source_company and llm_dest_company using matcher_object
        source_match = True if matcher_object.similarity(reported_company, [llm_source_company]).max() > threshold else False
        dest_match = True if matcher_object.similarity(reporter, [llm_dest_company]).max() > threshold else False
        
        if source_match and dest_match:
            if llm_relation_label in reverse_relation_dict:
                return llm_relation_label
            else:
                return "other"

        source_match = True if matcher_object.similarity(reporter, [llm_source_company]).max() > threshold else False
        dest_match = True if matcher_object.similarity(reported_company, [llm_dest_company]).max() > threshold else False
      
        if source_match and dest_match:
            return reverse_relation_dict.get(llm_relation_label, "other")
        
    return "other"


# generate llm relations for output_df
output_df["llm_relation_label"] = output_df.apply(lambda x: generate_llm_relation_label(x, sentence_transformers_matcher, threshold=0.85), axis=1)
    

0.97184396

In [71]:
# output_df["old_pipeline_relation"] = output_df.apply(lambda x: [x.company_name , x.relation, x.reporterName], 
#                                                      axis=1)

# output_df["new_pipeline_relation"] = output_df.apply(lambda x: [x.company_name , x.winning_relation, x.reporterName], 
#                                                      axis=1)


# # Search relations and return mask
# tqdm.pandas(desc="Search relations")
# output_df['old_llm_align'] =\
# output_df[['old_pipeline_relation', 'relations']]\
# .progress_apply(lambda x:
# llm_relation_utils.relation_search(
# query_relation=  x.iloc[0],
# relations_tuples= x.iloc[1],
# matcher=sentence_transformers_matcher,
# threshold=0.85,
# main_relations=["customer", "supplier"]), axis=1).to_list()


# # Search relations and return mask
# tqdm.pandas(desc="Search relations")
# output_df['new_llm_align'] =\
# output_df[['new_pipeline_relation', 'relations']]\
# .progress_apply(lambda x:
# llm_relation_utils.relation_search(
# query_relation=  x.iloc[0],
# relations_tuples= x.iloc[1],
# matcher=sentence_transformers_matcher,
# threshold=0.85,
# main_relations=["customer", "supplier"]), axis=1).to_list()



Search relations: 100%|██████████| 454/454 [00:08<00:00, 51.39it/s]


In [232]:
dict(output_df.llm_relation_label.value_counts())

{'not_found': 207,
 'customer': 127,
 'supplier': 103,
 'financial_trade': 16,
 'nothing': 1}

In [234]:
output_df.columns

Index(['accessionnumber', 'cik', 'reporter_name', 'reported_company',
       'sentence_id', 'sentence', 'relation', 'score', 'sents_scores',
       'agreration_results', 'winning_relation',
       'matched_reported_company_old', 'reported_company_old', 'old_relation',
       'explanation', 'relation_completion', 'relations',
       'llm_relation_label'],
      dtype='object')

In [320]:
# group by accessionNumber and company_name and find the winning relations from each group of sentences
# winning relation is the relation with max count of llm_relation_label in the group

accessionnumber_company_name_to_winning_relation_dict = collections.defaultdict(dict)

for gorup_columns, group in output_df.groupby(["accessionnumber", "reported_company"]):
    
    label_count = dict(group.llm_relation_label.value_counts())
    
    if label_count.get("customer", 0) > label_count.get("supplier", 0):
        winning_relation = "customer"
    elif label_count.get("customer", 0) < label_count.get("supplier", 0):
        winning_relation = "supplier"
    else: 
        winning_relation = "other"
    
    # get the accessionnumber and company_name from group
    accessionnumber = gorup_columns[0]
    reported_company = gorup_columns[1]

    accessionnumber_company_name_to_winning_relation_dict[accessionnumber][reported_company] = winning_relation

# add winning_relation to output_df
output_df["llm_winning_relation"] = output_df.apply(lambda x: accessionnumber_company_name_to_winning_relation_dict[x.accessionnumber][x.reported_company], axis=1)

In [330]:
# there is bug in winning_relation ; hence fix here

def max_score_winning_relation(agreration_result):
    default_winning_relation = "other"
    if agreration_result.get("customer") > agreration_result.get("supplier"):
        return "customer"
    elif agreration_result.get("supplier") > agreration_result.get("customer"):
        return "supplier"
    else:
        return default_winning_relation

output_df.loc[:, "winning_relation"] =   output_df.agreration_results.apply(max_score_winning_relation)


In [331]:
# check how many rows have winning_relation same as old_pipeline_relation
output_df.loc[:, "old_llm_align"] = output_df.apply(lambda x: x.llm_winning_relation == x.old_relation, axis=1)
output_df.loc[:, "new_llm_align"] = output_df.apply(lambda x: x.llm_winning_relation == x.winning_relation, axis=1)


In [335]:
# ## output_df.to_excel("test_pipeline_data/filings_gt_25_relns/final_report_conflict_with_old_relation_llm.xlsx", index=False)


In [280]:
output_df.old_relation.value_counts()

old_relation
supplier    268
customer    186
Name: count, dtype: int64

In [332]:
# find the unique rows on accessionNumber, company_name
unique_output_df = output_df.drop_duplicates(subset=["accessionnumber", "reported_company"], keep="first")


In [317]:
unique_output_df.winning_relation.value_counts()

winning_relation
customer    70
supplier    36
other       23
Name: count, dtype: int64

In [318]:
unique_output_df.old_relation.value_counts()

old_relation
supplier    78
customer    51
Name: count, dtype: int64

In [283]:
unique_output_df.old_llm_align.value_counts()

old_llm_align
False    104
True      25
Name: count, dtype: int64

In [333]:
unique_output_df.new_llm_align.value_counts()

new_llm_align
False    66
True     63
Name: count, dtype: int64

In [334]:
# ## unique_output_df.to_excel("test_pipeline_data/filings_gt_25_relns/final_report_unique_conflict_with_old_relation_llm.xlsx", index=False)



#### Try out new winning relation methods on conflict dataset 

In [288]:
unique_output_df.columns

Index(['accessionnumber', 'cik', 'reporter_name', 'reported_company',
       'sentence_id', 'sentence', 'relation', 'score', 'sents_scores',
       'agreration_results', 'winning_relation',
       'matched_reported_company_old', 'reported_company_old', 'old_relation',
       'explanation', 'relation_completion', 'relations', 'llm_relation_label',
       'llm_winning_relation', 'old_llm_align', 'new_llm_align'],
      dtype='object')

In [328]:
def max_score_winning_relation(agreration_result):
    default_winning_relation = "other"
    if agreration_result.get("customer") > agreration_result.get("supplier"):
        return "customer"
    elif agreration_result.get("supplier") > agreration_result.get("customer"):
        return "supplier"
    else:
        return default_winning_relation


def winning_relation_maxpool(row):
    sents_score = row["sents_scores"]
    default_winning_relation = "other"
    
    count_customer = len(sents_score.get("customer_scores", []))
    count_supplier = len(sents_score.get("supplier_scores", []))
    count_other = len(sents_score.get("other_scores", []))

    if count_customer > count_supplier:
        return "customer"
    elif count_supplier > count_customer:
        return "supplier"
    else:
        return default_winning_relation
    
def winning_relation_maxsum(row):
    sents_score = row["sents_scores"]
    default_winning_relation = "other"
    
    count_customer = sum(sents_score.get("customer_scores", [0]))
    count_supplier = sum(sents_score.get("supplier_scores", [0]))
    count_other = sum(sents_score.get("other_scores", [0]))

    if count_customer > count_supplier:
        return "customer"
    elif count_supplier > count_customer:
        return "supplier"
    else:
        return default_winning_relation


#unique_output_df.loc[:, "new_winning_relation"] =   unique_output_df.agreration_results.apply(max_score_winning_relation)
#unique_output_df.loc[:, "new_winning_relation_maxpool"] =   unique_output_df.apply(winning_relation_maxpool, axis=1)

#unique_output_df.loc[:, "new_winning_relation_maxsum"] =   unique_output_df.apply(winning_relation_maxsum, axis=1)


In [312]:
#unique_output_df.loc[:, "new_llm_align_new"] = unique_output_df.apply(lambda x: x.llm_winning_relation == x.new_winning_relation, axis=1)

#unique_output_df.loc[:, "new_llm_align_maxpool"] = unique_output_df.apply(lambda x: x.llm_winning_relation == x.new_winning_relation_maxpool, axis=1)

#unique_output_df.loc[:, "new_llm_align_maxsum"] = unique_output_df.apply(lambda x: x.llm_winning_relation == x.new_winning_relation_maxsum, axis=1)


In [311]:
unique_output_df.new_llm_align_maxsum.value_counts()

new_llm_align_maxsum
False    67
True     62
Name: count, dtype: int64

In [313]:
output_df.columns

Index(['accessionnumber', 'cik', 'reporter_name', 'reported_company',
       'sentence_id', 'sentence', 'relation', 'score', 'sents_scores',
       'agreration_results', 'winning_relation',
       'matched_reported_company_old', 'reported_company_old', 'old_relation',
       'explanation', 'relation_completion', 'relations', 'llm_relation_label',
       'llm_winning_relation', 'old_llm_align', 'new_llm_align'],
      dtype='object')

In [None]:
output_df.loc[:, "winning_relation"] =   output_df.agreration_results.apply(max_score_winning_relation)


In [6]:
labeled_relationships_sentences_neg.shape

(956509, 5)

In [9]:
 

# find unique count on company_name and related_entity

#print(labeled_relationships_sentences.groupby(["key", "company_name", "related_entity", "relationship_type"]).count().shape)
 

Index(['key', 'company_name', 'related_entity', 'relationship_type',
       'sentence'],
      dtype='object')
(8479, 5)


14996
14996


entity_in_sentence
False    887480
True      69029
Name: count, dtype: int64


Index(['key', 'company_name', 'related_entity', 'relationship_type',
       'sentence', 'join_id', 'acc_join_id', 'entity_in_sentence'],
      dtype='object')