### Imports

In [2]:
import json
import pandas as pd
import os
import collections
import math
from typing import Dict, Any, Text, Tuple
import yaml
import sys
from pathlib import Path
import re

from tqdm import tqdm



In [2]:
pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
# Select the execution environment; it decides where the project root lives.
#machine = "local"
machine = "paperspace"

if machine == "local":
    # Project root is taken to be the parent of the current working directory.
    src_dir = Path.cwd().parent
elif machine == "paperspace":
    src_dir = Path("/notebooks/inferess-relation-extraction/")
else:
    # Fail here with a clear message rather than with a NameError on
    # `src_dir` in some later cell.
    raise ValueError(
        f"Unknown machine {machine!r}; expected 'local' or 'paperspace'"
    )

# Make the project's `src` package importable. The membership check keeps
# re-runs of this cell from appending the same entry again and again.
if str(src_dir) not in sys.path:
    sys.path.append(str(src_dir))



In [4]:

from src.matcher.core import SimCSE_Matcher
# Sentence-embedding matcher built on the all-MiniLM-L6-v2 checkpoint; it is
# passed to the sentence de-duplication step further down in this notebook.
entity_matcher = SimCSE_Matcher(
        model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"
    )


(…)L6-v2/resolve/main/tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

(…)/all-MiniLM-L6-v2/resolve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)MiniLM-L6-v2/resolve/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

(…)-v2/resolve/main/special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

(…)ll-MiniLM-L6-v2/resolve/main/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

### Test set from annotated

In [3]:
labeled_relationships_sentences = pd.read_csv("test_pipeline_data/labeled-relationships/labeled-relationships-sentences_pos_entity_in_sentence.csv")


In [5]:
# Quick look at the schema and size of the labelled sentence table.
print(labeled_relationships_sentences.columns)

print(labeled_relationships_sentences.shape)

# number of unique (key, related_entity) pairs
print(labeled_relationships_sentences.drop_duplicates(subset=["key", "related_entity"]).shape)


Index(['key', 'company_name', 'related_entity', 'relationship_type',
       'sentence'],
      dtype='object')
(150228, 5)
(6775, 5)


In [None]:
# # debug - check overlap in positive and negative samples

# # form join_id by joining all keys except sentence
# labeled_relationships_sentences["join_id"] = labeled_relationships_sentences.apply(lambda x: "_".join([str(x[k]) for k in x.keys() if k != "sentence"]), axis=1)
# labeled_relationships_sentences_neg["join_id"] = labeled_relationships_sentences_neg.apply(lambda x: "_".join([str(x[k]) for k in x.keys() if k != "sentence"]), axis=1)

# labeled_relationships_sentences["acc_join_id"] = labeled_relationships_sentences.apply(lambda x: "_".join([str(x[k]) for k in x.keys() if k not in ["relationship_type", "join_id", "sentence"]]), axis=1)
# labeled_relationships_sentences_neg["acc_join_id"] = labeled_relationships_sentences_neg.apply(lambda x: "_".join([str(x[k]) for k in x.keys() if k not in ["relationship_type", "join_id", "sentence"]]), axis=1)


# join_id_true = set(labeled_relationships_sentences["join_id"].to_list())
# join_id_false = set(labeled_relationships_sentences_neg["join_id"].to_list())

# acc_join_id_true = set(labeled_relationships_sentences["acc_join_id"].to_list())
# acc_join_id_false = set(labeled_relationships_sentences_neg["acc_join_id"].to_list())


# # debug 
# print(len(join_id_false))

# print(len(acc_join_id_false))

# len(join_id_true.intersection(join_id_false))

# len(acc_join_id_false.intersection(acc_join_id_true))


In [4]:
# write a function to check if entity_company is in the sentence
def check_entity_in_sentence(sentence, company_name, entity_company):
    """
    Check whether a sentence mentions both the filer and the related entity.

    Each company is represented by the first space-separated word of its name,
    lower-cased and stripped of surrounding ``.,-:;`` characters. The word is
    searched for case-insensitively, as a literal, between word boundaries.

    Parameters
    ----------
    sentence : str
        Sentence to search.
    company_name : str
        Name of the filing company.
    entity_company : str
        Name of the related entity.

    Returns
    -------
    bool
        True only when both first words occur in the sentence.
    """

    def _first_word_present(name, text):
        token = name.lower().split(" ")[0].strip(".,-:;")
        if not token:
            # An empty token would turn the pattern into r"\b\b", which
            # matches almost any sentence; treat it as "not mentioned".
            return False
        # re.escape: names such as "E*Trade" or "C++" contain regex
        # metacharacters and must be matched literally.
        # Note: \b needs a word character on one side, so a token that starts
        # or ends with punctuation (e.g. "c++") may not match at that edge.
        return re.search(r"\b{}\b".format(re.escape(token)), text) is not None

    sent_lower = sentence.lower()
    ent_comp_present = _first_word_present(entity_company, sent_lower)
    filer_comp_present = _first_word_present(company_name, sent_lower)

    return ent_comp_present and filer_comp_present

# Flag every row whose sentence mentions the first word of both company_name
# and related_entity (row-wise apply of check_entity_in_sentence).
labeled_relationships_sentences["entity_in_sentence"] = labeled_relationships_sentences.apply(lambda x: check_entity_in_sentence(x.sentence, x.company_name, x.related_entity), axis=1)

# Distribution of the flag: how many rows pass / fail the mention check.
print(labeled_relationships_sentences["entity_in_sentence"].value_counts())


entity_in_sentence
False    995493
True     150228
Name: count, dtype: int64


In [5]:
labeled_relationships_sentences = labeled_relationships_sentences[labeled_relationships_sentences["entity_in_sentence"]==True]

In [12]:
import numpy as np

def get_deduplicate_sentences(entity_matcher, data, threshold=0.99):
    """
    Group near-duplicate sentences and pick one representative per group.

    Parameters
    ----------
    entity_matcher :
        Object whose ``encode(list_of_str)`` returns something exposing
        ``.numpy()`` (one embedding row per sentence).
        NOTE(review): the dot product is used directly as the similarity, so
        the embeddings are presumably L2-normalised -- confirm in the matcher.
    data : pd.DataFrame
        Frame with a ``sentence`` column.
    threshold : float
        Two sentences are considered duplicates when the dot product of their
        embeddings is strictly greater than this value.

    Returns
    -------
    (indices_to_keep, sentences_to_ignored)
        ``indices_to_keep`` are positions (0-based, relative to ``data``) of
        the representative of each group. ``sentences_to_ignored`` has one
        string per group: the dropped members joined by a blank line
        (empty string when the group has a single member).
    """
    sentence_list = data['sentence'].to_list()
    if not sentence_list:
        # Nothing to encode or compare.
        return [], []

    # Compute embeddings for each sentence
    embeddings = entity_matcher.encode(sentence_list).numpy()

    # Pairwise dot products, computed with numpy on the CPU.
    # This is an n x n matrix, i.e. quadratic in the number of sentences.
    similarity = np.dot(embeddings, embeddings.T)

    # Boolean matrix: True where similarity > threshold
    mask = similarity > threshold

    groups = []
    seen = set()
    for i in range(mask.shape[0]):
        if i in seen:
            continue
        # The first unseen sentence always leads its own group. Only sentences
        # not yet assigned elsewhere may join it; otherwise an index already
        # consumed by an earlier group could become this group's
        # representative and the genuinely new sentence `i` would be lost.
        members = [i] + [
            j for j in np.flatnonzero(mask[i]).tolist()
            if j != i and j not in seen
        ]
        seen.update(members)
        groups.append(members)

    # Keep one index from each group
    indices_to_keep = [group[0] for group in groups]
    sentences_to_ignored = ["\n\n".join([sentence_list[idx] for idx in group[1:]])
                            for group in groups]

    return indices_to_keep, sentences_to_ignored


def remove_duplicates_at_accession_entity_level(entity_matcher, sentences_df):
    """
    Mark, in place, which sentences survive near-duplicate removal.

    Sentences are de-duplicated independently inside every
    ("key", "company_name", "related_entity") group. A boolean column
    ``deduped`` is written onto ``sentences_df``: True for the representative
    sentence of each duplicate cluster, False for everything else.

    Parameters
    ----------
    entity_matcher :
        Embedding model forwarded to ``get_deduplicate_sentences``.
    sentences_df : pd.DataFrame
        Must contain the columns 'key', 'company_name', 'related_entity' and
        'sentence'. Modified in place; nothing is returned.
    """
    # Start from "nothing kept", then switch the representatives on.
    sentences_df.loc[:, "deduped"] = False

    grouped = sentences_df.groupby(["key", "company_name", "related_entity"])
    for _, rows in tqdm(grouped):
        # Positions returned by the helper are relative to `rows`;
        # translate them back to index labels of the full frame.
        row_labels = list(rows.index)
        kept_positions, _dropped = get_deduplicate_sentences(entity_matcher, rows)
        kept_labels = [row_labels[pos] for pos in kept_positions]
        sentences_df.loc[kept_labels, "deduped"] = True
    



In [10]:
labeled_relationships_sentences.columns

Index(['key', 'company_name', 'related_entity', 'relationship_type',
       'sentence'],
      dtype='object')

In [13]:
remove_duplicates_at_accession_entity_level(entity_matcher, labeled_relationships_sentences)

 71%|███████   | 4781/6775 [02:00<01:10, 28.27it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

 72%|███████▏  | 4894/6775 [02:07<05:17,  5.93it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

 79%|███████▊  | 5330/6775 [02:22<00:49, 29.15it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

100%|██████████| 6775/6775 [02:57<00:00, 38.24it/s]


In [15]:
labeled_relationships_sentences["deduped"].value_counts()

True     79042
False    71186
Name: deduped, dtype: int64

In [16]:
labeled_relationships_sentences = labeled_relationships_sentences[labeled_relationships_sentences["deduped"]==True]

In [17]:

# (labeled_relationships_sentences
#  [['key', 'company_name', 'related_entity', 'relationship_type', 'sentence']].
#  to_csv("test_pipeline_data/labeled-relationships/labeled-relationships-sentences_pos_entity_in_sentence_dedup.csv", index=False))


(labeled_relationships_sentences
 [['key', 'company_name', 'related_entity', 'relationship_type', 'sentence']].
 to_csv("test_pipeline_data/labeled-relationships/labeled-relationships-sentences_pos_entity_in_sentence_dedup.csv", index=False))




### Read csv data 

In [18]:
# Load the de-duplicated positive sentences.
# NOTE(review): the cell that writes this CSV uses the relative path
# "test_pipeline_data/...", while this read uses "/storage/test_pipeline_data/..."
# -- confirm both resolve to the same file.
query_data = pd.read_csv("/storage/test_pipeline_data/labeled-relationships/labeled-relationships-sentences_pos_entity_in_sentence_dedup.csv")

# Rename the annotation columns to the names used by the rest of this notebook.
query_data.rename(columns={"key": "accessionnumber",
                           "related_entity": "reported_company",                        
                           "company_name": "reporter_name"}, 
                           inplace=True)

query_data.shape


(79042, 5)

In [19]:
print(query_data.drop_duplicates(subset=["accessionnumber", "reported_company"]).shape)


(6775, 5)


### Basic dependencies

In [4]:

import awswrangler as wr
from src.utils.s3_utils import put_json_obj
from src.utils.data import overwrite_json
from src.utils.logs import get_logger
from src.relation_extraction.infer import infer_from_trained
from src.matcher.core import SimCSE_Matcher
from src.relation_extraction.reporter import (agg_relations,
                                              process_relations,
                                              match_companies)
from src.glue.glue_etl import GlueETL

# Emoji prefix embedded in the logger name.
icon = "\U0001F4AB "
logger = get_logger(f"{icon} RE JOB", log_level="INFO")
############### Variables ################
# Step labels and the distribute flag are only defined here; they are not
# read anywhere in the visible part of this notebook.
CURRENT_STEP = "RE"
FOLLOWING = "Final"
distribute = False
##########################################
# Load GlueEtl Worker (used below as `etl.run_query(...)` for the Athena query)
etl = GlueETL()

# Reference it in the inference container at /opt/ml/model/code
def model_fn(model_dir: str) -> Tuple[infer_from_trained, SimCSE_Matcher]:
    """
    Build the relation extractor and the entity matcher.

    Parameters
    ----------
    model_dir : str
        Directory that contains the ``re_model`` sub-directory with the
        relation-extraction weights.

    Returns
    -------
    (relation_extractor, entity_matcher)
        The extractor with its model loaded (batch size 32) and a
        ``SimCSE_Matcher`` on the all-MiniLM-L6-v2 checkpoint.

    Note: the extractor's own matcher artifacts are resolved from the
    module-level ``src_dir``, not from ``model_dir``.
    """
    matcher_artifacts = str(src_dir / "artifacts/matcher_model")
    relation_extractor = infer_from_trained(
        detect_entities=True,
        language_model="en_core_web_trf",
        require_gpu=True,
        load_matcher=True,
        entity_matcher=matcher_artifacts,
    )

    # Alternative checkpoint previously noted here:
    # "pipeline-artifacts/matcher/all-MiniLM-Nli-All-Random-v4"
    entity_matcher = SimCSE_Matcher(
        model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"
    )

    relation_extractor.load_model(
        {"model_path": os.path.join(model_dir, "re_model"), "batch_size": 32}
    )
    return relation_extractor, entity_matcher


def float_format(x: Any) -> float:
    """
    Coerce ``x`` to a built-in ``float``.

    Whatever ``float()`` raises for an unconvertible value propagates to the
    caller unchanged.
    """
    as_float = float(x)
    return as_float


def input_fn(request_body: str, content_type: str) -> Dict[str, Any]:
    """
    Decode an incoming request payload.

    Only ``application/json`` bodies are parsed (with ``json.loads``); any
    other content type yields an empty dict.
    """
    if content_type != "application/json":
        return {}
    return json.loads(request_body)


# Load both models once for the rest of the notebook.
# Note: a Path (not a str) is passed as model_dir; model_fn only feeds it to
# os.path.join.
if __name__ == "__main__":
    relation_extractor, entity_matcher = model_fn(src_dir/ "artifacts")

10/22/2023 03:55:32 PM [INFO]: Found credentials in shared credentials file: ~/.aws/credentials
10/22/2023 03:55:32 PM [INFO]: Found credentials in shared credentials file: ~/.aws/credentials


Torch GPU Exists..
2023-10-22 15:55:34,465 — 🌌 spaCy — INFO — Language model used is en_core_web_trf
2023-10-22 15:55:34,467 — 🌌 spaCy — INFO — spaCy Work On GPU


(…)L6-v2/resolve/main/tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

(…)/all-MiniLM-L6-v2/resolve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)MiniLM-L6-v2/resolve/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

(…)-v2/resolve/main/special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

(…)ll-MiniLM-L6-v2/resolve/main/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

2023-10-22 15:55:45,181 — 💫 Relations Extractor — INFO — Loading tokenizer and model...


You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embeding dimension will be 30005. This might induce some performance reduction as *Tensor Cores* will not be available. For more details  about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Load with CUDA


10/22/2023 03:55:47 PM [INFO]: Loaded model.


2023-10-22 15:55:47,919 — 💫 Relations Extractor — INFO — Done!


### Simple Evaluation

In [None]:

from itertools import chain
def is_sc(relations):
    """
    Tell whether a prediction contains a 'supplier' relation.

    Parameters
    ----------
    relations :
        Expected to be a list of dicts. Anything else (None, NaN, a string,
        an empty list, ...) is treated as "no relation".

    Returns
    -------
    int
        1 if any value of any dict in ``relations`` equals ``'supplier'``,
        otherwise 0. Always an int, never None.
    """
    # Type check first so that missing values such as NaN fall through to 0
    # instead of producing None.
    if not isinstance(relations, list) or not relations:
        return 0
    return int(any(value == 'supplier'
                   for relation in relations
                   for value in relation.values()))
# Evaluate model performance on the simple data for a sanity check.
#simple_data = pd.read_excel(src_dir /"data/raw/simple_sentences_cs_report.xlsx")
simple_data = pd.read_excel(src_dir / "notebooks/generated_sentences_cs_report.xlsx")
# Run relation extraction on the 'sentence' column.
predictions = relation_extractor.predict_frame(simple_data, sentence_column = 'sentence', mutate=True, reverse=True)        
# Copy predictions back by index; rows without a prediction keep a missing value.
simple_data.loc[predictions.index, 'relations'] = predictions['relations']
# Binary supply-chain flag derived from the predicted relations.
simple_data.loc[:, 're_prediction'] = simple_data['relations'].apply(is_sc)
# Rows for which is_sc produced no value are counted as 0.
simple_data['re_prediction'].fillna(0, inplace=True)
simple_data.loc[:, 're_correct_prediction'] = simple_data['re_prediction'] == simple_data['true_label']
# Share of rows where the prediction equals the label.
# NOTE(review): this line compares against column `label`, whereas the line
# above uses `true_label` -- confirm which column holds the ground truth.
simple_data.query("label == re_prediction").shape[0]/ len(simple_data)


In [None]:
simple_data.columns   

### SEC data from Athena

In [None]:
# Read with CIK: pull all filing-text rows for a fixed list of reporter CIKs.
# Note: '0000320193' appears twice in the IN list; the repeat is redundant.
query_data = etl.run_query("""SELECT * FROM "legacyevents"."filingtexttext_parquet" where reporter_cik IN ('0000012927',
    '0000037996',
    '0000104169',
    '0000320193',
    '0001047122',
    '0000078003',
    '0000789019',
    '0000320193',
    '0001467858')
""")

In [None]:
query_data.columns

In [None]:
#  value counts by accession number and other info for all those unique accession numbers
query_data.groupby("accessionnumber").agg({"reporter_cik": "unique",
                                            "reporter_name": "unique",
                                            "reporter_normalizedname": "unique",
                                           "accessionnumber": "count",
                                           })


### Extract NER Tags

In [22]:
# Load the labelled sentences that will go through NER.
# NOTE(review): this reads the "_neg_" file, while the files saved later in
# this notebook are named "test_pos_..." -- confirm the intended input.
query_data = pd.read_csv("/storage/test_pipeline_data/labeled-relationships/labeled-relationships-sentences_neg_entity_in_sentence.csv")
print(query_data.columns)

# Rename the annotation columns to the names used by the rest of this notebook.
query_data.rename(columns={"key": "accessionnumber",
                           "related_entity": "reported_company",                        
                           "company_name": "reporter_name"}, 
                           inplace=True)

print(query_data.shape)

# Number of unique (accessionnumber, reported_company) pairs.
print(query_data[["accessionnumber",  "reported_company"]].drop_duplicates().shape)


(79042, 5)
(6775, 2)


In [21]:
query_data.columns

Index(['accessionnumber', 'reporter_name', 'reported_company',
       'relationship_type', 'sentence'],
      dtype='object')

In [23]:
sents, spans, group_docs, aliases_docs = relation_extractor.spacy_loader.predictor(query_data['sentence'])

2023-10-01 07:49:34,685 — 🌌 spaCy — INFO — Start batch job for 4 chunks
2023-10-01 07:49:34,687 — 🌌 spaCy — INFO — process chunk#1 ...
2023-10-01 07:51:51,655 — 🌌 spaCy — INFO — process chunk#2 ...
2023-10-01 07:53:45,457 — 🌌 spaCy — INFO — process chunk#3 ...
2023-10-01 07:55:42,600 — 🌌 spaCy — INFO — process chunk#4 ...


  0%|          | 0/389 [00:00<?, ?it/s]

In [24]:
len(group_docs)
#sents, spans, group_docs, aliases_docs

79042

In [25]:
# Attach the outputs of the spaCy predictor (sentences, spans, organisation
# groups, aliases) to the frame, one value per row.
query_data.loc[:, "sentence"] = sents
query_data.loc[:, "spans"] = spans
query_data.loc[:, "org_groups"] = group_docs
query_data.loc[:, "aliases"] = aliases_docs
# org_groups maps organisation name -> group id; count distinct group ids.
query_data.loc[:, 'num_orgs'] = query_data['org_groups']\
          .apply(lambda x : len(set(x.values()))).tolist()
# Keep only sentences that mention more than one distinct organisation group.
query_data = query_data[query_data['num_orgs'] > 1]

query_data.reset_index(drop=True, inplace=True)

query_data.shape

(75250, 9)

In [111]:
## debug - read the already ner detected data

query_data = pd.read_excel("test_pipeline_data/labeled-relationships/sentences_pos_entity_in_sentence_with_ner.xlsx")

# query_data.columns

# The list/dict columns are stored in the spreadsheet as their string form;
# turn them back into Python objects.
# SECURITY NOTE(review): eval() executes arbitrary code found in the file.
# If the cells only ever hold Python literals, ast.literal_eval is the safe
# replacement -- confirm before switching.
query_data.loc[:, "spans"] = query_data.spans.apply(lambda x: eval(x))
query_data.loc[:, "org_groups"] = query_data.org_groups.apply(lambda x: eval(x))
query_data.loc[:, "filtered_org_groups"] = query_data.filtered_org_groups.apply(lambda x: eval(x))
query_data.loc[:, "aliases"] = query_data.aliases.apply(lambda x: eval(x))


In [107]:
#query_data.head()

In [88]:
query_data.org_groups.to_list()[0]

{'Adeptus Health Inc': 0, 'Medicare': 1, 'Medicaid': 1, 'Tricare': 2}

In [26]:
# keep only orgs having 'company_name', 'related_entity' companies 

def get_filtered_orgs(row):
    """
    Reduce a row's organisation groups to the filer and the reported company.

    ``row["org_groups"]`` maps organisation name -> group id. A group belongs
    to the reported company (resp. the filer) when any of its names contains,
    case-insensitively and between word boundaries, the first word of
    ``row["reported_company"]`` (resp. ``row["reporter_name"]``). The first
    word is lower-cased and stripped of surrounding ``.,-:;`` characters.

    A group that matches the reported company is never also considered for
    the filer. Scanning stops as soon as both roles have been assigned.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide "org_groups", "reported_company" and "reporter_name".

    Returns
    -------
    dict
        ``{org_name: 0}`` for names of the filer group and ``{org_name: 1}``
        for names of the reported-company group, or an empty dict when either
        role could not be found.
    """

    def _group_mentions(token, names):
        if not token:
            # An empty token would match nearly everything; treat as no match.
            return False
        # re.escape: company names may contain regex metacharacters
        # ("E*Trade", "C++", ...) and must be matched literally.
        pattern = re.compile(r"\b{}\b".format(re.escape(token)))
        return any(pattern.search(name.lower()) for name in names)

    # Invert name -> group id into group id -> [names].
    id2orgs = collections.defaultdict(list)
    for org_name, group_id in row["org_groups"].items():
        id2orgs[group_id].append(org_name)

    ent_comp = row["reported_company"].lower().split(" ")[0].strip(".,-:;")
    filer_comp = row["reporter_name"].lower().split(" ")[0].strip(".,-:;")

    # None = role not assigned yet.
    filer_id = None
    ent_comp_id = None
    for group_id, orgs in id2orgs.items():
        if _group_mentions(ent_comp, orgs):
            ent_comp_id = group_id
        elif _group_mentions(filer_comp, orgs):
            filer_id = group_id

        if filer_id is not None and ent_comp_id is not None:
            break

    filtered_org_groups = {}
    if filer_id is not None and ent_comp_id is not None:
        for org in id2orgs[filer_id]:
            filtered_org_groups[org] = 0

        for org in id2orgs[ent_comp_id]:
            filtered_org_groups[org] = 1

    return filtered_org_groups



In [97]:
query_data.shape

(53313, 10)

In [27]:
print(query_data.shape)

# create filtered_org_groups column
query_data.loc[:, 'filtered_org_groups'] = query_data.apply(get_filtered_orgs, axis=1)

# Recompute num_orgs as the number of distinct group ids in filtered_org_groups
# (an empty dict gives 0).
query_data.loc[:, 'num_orgs'] = query_data['filtered_org_groups']\
          .apply(lambda x : len(set(x.values()))).tolist()

# Keep only rows in which both the filer and the reported company were found.
query_data = query_data[query_data['num_orgs'] > 1]
query_data.reset_index(drop=True, inplace=True)

print(query_data.shape)

# Number of unique (accessionnumber, reported_company) pairs that remain.
print(query_data.drop_duplicates(subset=["accessionnumber", "reported_company"]).shape)


(75250, 9)
(65143, 10)


In [28]:
# debug - count the unique (accessionnumber, reported_company) pairs
query_data[["accessionnumber", "reported_company"]].drop_duplicates().shape



(6431, 10)


In [29]:
query_data['filtered_org_groups'].apply(lambda x : len(x)).value_counts()

2    57938
3     6717
4      415
5       63
6        7
7        3
Name: filtered_org_groups, dtype: int64

### Detect supply-chain


In [5]:
# Notice - debug mode - run this cell only when you want to start from here and already have the query_data generated by the previous step

query_data = pd.read_excel("test_pipeline_data/labeled-relationships/test_pos_query_data_sc.xlsx")

print(query_data.shape)
print(query_data.columns)

# The list/dict columns are stored in the spreadsheet as their string form;
# turn them back into Python objects.
# SECURITY NOTE(review): eval() executes arbitrary code found in the file.
# If the cells only ever hold Python literals, ast.literal_eval is the safe
# replacement -- confirm before switching.
query_data.loc[:, "spans"] = query_data["spans"].apply(eval)
query_data.loc[:, "org_groups"] = query_data["org_groups"].apply(eval)
query_data.loc[:, "filtered_org_groups"] = query_data.filtered_org_groups.apply(lambda x: eval(x))
query_data.loc[:, "aliases"] = query_data["aliases"].apply(eval)


(65143, 12)
Index(['accessionnumber', 'reporter_name', 'reported_company',
       'relationship_type', 'sentence', 'spans', 'org_groups', 'aliases',
       'num_orgs', 'filtered_org_groups', 'sc_score', 'sc_label'],
      dtype='object')


In [4]:
# debug - count the unique (accessionnumber, reported_company) pairs
query_data[["accessionnumber", "reported_company"]].drop_duplicates().shape

(6431, 2)

In [6]:
((query_data["sc_score"] > 0.95) & (query_data["sc_label"] == 1)).value_counts()


True     48350
False    16793
dtype: int64

In [6]:
from src.sc_classifier.trainer import Trainer
from src.sc_classifier.config.core import config
config.train_args.load_pretrained = True
sc_model = Trainer(config=config,load_data=False)

root==> /notebooks/inferess-relation-extraction
2023-10-22 13:32:30,104 — SCClassifier — INFO — loading checkpoint from `sc_model`
2023-10-22 13:32:32,590 — SCClassifier — INFO — inference mode...


In [7]:
# Run the supply-chain classifier on every sentence.
scores, preds = sc_model.predict_seq(query_data['sentence'] , max_length=128)
# sc_score: maximum of `scores` along axis 1 for each row.
# NOTE(review): presumably `scores` holds one column per class -- confirm.
query_data.loc[:, 'sc_score'] = scores.max(1)
query_data.loc[:, 'sc_label'] = preds

100%|[32m██████████[0m| 1448/1448 [09:00<00:00,  2.68batch/s]

In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`



In [8]:
query_data.to_excel("test_pipeline_data/labeled-relationships/test_pos_query_data_sc.xlsx", index=False)

In [9]:
((query_data['sc_label'] == 1) & (query_data['sc_score'] > 0.95)).value_counts()

True     48325
False    16818
dtype: int64

In [7]:
print(query_data.shape)
# Keep only sentences labelled 1 by the classifier with a score above 0.95.
query_data = query_data[(query_data['sc_label'] == 1) & (query_data['sc_score'] > 0.95)]
# Restore a 0..n-1 index after filtering.
query_data.reset_index(drop=True, inplace=True)
print(query_data.shape)

(65143, 12)
(48350, 12)


In [8]:
# debug - count the unique (accessionnumber, reported_company) pairs
query_data[["accessionnumber", "reported_company"]].drop_duplicates().shape

(6161, 2)

In [19]:
#query_data[(query_data['sc_label'] == 1) & (query_data['sc_score'] > 0.90)][["accessionnumber", "reported_company"]].drop_duplicates().shape

### Extract relations

In [20]:
# Notice - debug mode - uncomment the lines below only when you want to start from here and already have the query_data generated by the previous step

# query_data = pd.read_excel("test_pipeline_data/labeled-relationships/test_pos_query_data_re.xlsx")
# apply eval on few columns which has list and dict in the columns
# query_data.loc[:, "spans"] = query_data["spans"].apply(eval)
# query_data.loc[:, "org_groups"] = query_data["org_groups"].apply(eval)
# query_data.loc[:, "filtered_org_groups"] = query_data["filtered_org_groups"].apply(eval)
# query_data.loc[:, "aliases"] = query_data["aliases"].apply(eval)

#query_data.shape

In [9]:
# Predict relations between the organisations kept in filtered_org_groups
# (filer and reported company only, not the full org_groups).
predictions= relation_extractor.predict_relations(
    sentences=query_data["sentence"].tolist(),
    ent="ORG",
    spans=query_data["spans"].tolist(),
    org_groups=query_data["filtered_org_groups"].tolist(),
    aliases=query_data["aliases"].tolist(),
    mutate=True, # re_model trained to predict
    reverse=True # aggregate average score between both directions
)


mutate text: 100%|██████████| 88472/88472 [00:10<00:00, 8723.34it/s]
10/22/2023 03:58:03 PM [INFO]: Tokenizing data...
tokenization: 100%|██████████| 88472/88472 [00:31<00:00, 2792.17it/s]
tags positioning: 100%|██████████| 88472/88472 [00:01<00:00, 53382.94it/s]



Invalid rows/total: 0/88472


100%|██████████| 2765/2765 [09:18<00:00,  4.95it/s]
mutate text: 100%|██████████| 88472/88472 [00:10<00:00, 8653.50it/s]
10/22/2023 04:08:06 PM [INFO]: Tokenizing data...
tokenization: 100%|██████████| 88472/88472 [00:33<00:00, 2675.64it/s]
tags positioning: 100%|██████████| 88472/88472 [00:01<00:00, 51290.26it/s]



Invalid rows/total: 0/88472


100%|██████████| 2765/2765 [08:31<00:00,  5.41it/s]
  tagged_frame.loc[:, 'scores'] =  labels, score


In [10]:
# Start with an empty column, then fill in predictions by index label.
query_data['relations'] = None
query_data.loc[predictions.index.values, "relations"] = predictions["relations"]
# Drop sentences for which no relation was predicted.
query_data.dropna(subset=['relations'],inplace=True)

In [63]:
# optional - filter RE predictions based on the score

#query_data = query_data[query_data['re_score'] > 0.95]

In [11]:
# ##query_data.to_excel("/storage/test_pipeline_data/labeled-relationships/test_ "<>" _query_data_re.xlsx", index=False)

query_data.to_excel("test_pipeline_data/labeled-relationships/test_pos_query_data_re.xlsx", index=False)


In [15]:
query_data.columns

Index(['accessionnumber', 'reporter_name', 'reported_company',
       'relationship_type', 'sentence', 'spans', 'org_groups', 'aliases',
       'num_orgs', 'filtered_org_groups', 'sc_score', 'sc_label', 'relations'],
      dtype='object')

In [39]:
# Debug - find the number of relations that will be dropped after re confidence threshold

print(query_data[["accessionnumber", "reported_company"]].drop_duplicates().shape)

# re_score is the score of the FIRST relation of each row only; any further
# relations in the list are not considered here.
query_data['re_score'] = query_data['relations'].apply(lambda x: x[0]['score'])

print(query_data['re_score'].describe())

# Unique (accessionnumber, reported_company) pairs that still have at least
# one sentence with re_score above 0.75.
query_data[query_data['re_score'] > 0.75][["accessionnumber", "reported_company"]].drop_duplicates().shape

(6161, 2)
count    48350.000000
mean         0.912503
std          0.136938
min          0.340300
25%          0.903700
50%          0.980400
75%          0.993000
max          0.999900
Name: re_score, dtype: float64


(6043, 2)

### Merge re results

In [13]:
import math
from typing import Tuple, List, Text, Dict
def top_n_size(x, y, z=None):
    """
    Balanced choice of how many top scores to use per relation.

    Parameters
    ----------
    x : count of supplier relation sentences
    y : count of customer relation sentences
    z : count of other relation sentences (optional; a falsy value such as
        None or 0 means "not supplied")

    Returns
    -------
    int
        The smallest of ``ceil(0.2 * count)`` over the supplied counts, i.e.
        20% of the least frequent relation, rounded up.

    Every supplied count must be strictly positive (checked with ``assert``).
    """
    counts = (x, y, z) if z else (x, y)
    assert all(count > 0 for count in counts)
    return min(math.ceil(count * 0.2) for count in counts)

def top_n_size_new(x, y):
    """
    Choose how many top scores to use, based on two relation counts only.

    Parameters
    ----------
    x : count of supplier relation sentences
    y : count of customer relation sentences

    Counts of any other relation are ignored. This rule slightly favours the
    relation that occurs more often:

    - counts differ by exactly 1 -> the smaller count, so both relations
      compete on an equal number of scores;
    - otherwise -> half of the larger count, rounded up.

    Both counts must be strictly positive (checked with ``assert``).
    """
    assert (x > 0) and (y > 0)

    if abs(x - y) != 1:
        # Equal counts or a gap of two or more: lean towards the bigger one.
        return max(math.ceil(x * 0.5), math.ceil(y * 0.5))

    # Neck-and-neck counts: compare like for like.
    return min(x, y)


def log_sum_top_n(scores, top_n_size):
    """
    Aggregate a list of scores into a single number.

    Result = mean(scores) * (1 + ln(sum of the ``top_n_size`` largest scores)).

    Parameters
    ----------
    scores : list of float, must be non-empty
    top_n_size : int, how many of the largest scores enter the logarithm

    Note: the factor ``1 + ln(...)`` is negative when the top-n sum is below
    1/e, and ``math.log`` raises for a sum of 0 (e.g. ``top_n_size == 0``).
    """
    mean_score = sum(scores) / len(scores)
    best_first = sorted(scores, reverse=True)
    top_mass = sum(best_first[:top_n_size])
    return mean_score * (1 + math.log(top_mass))
    
    

def agg_relation_score(company_relation_score: Dict, top_n_approach: str):
    """
    Aggregate per-sentence scores into one score per relation type.

    Parameters
    ----------
    company_relation_score : Dict
        May contain the keys "supplier_scores", "customer_scores" and
        "other_scores", each a list of scores. Missing keys count as empty.
    top_n_approach : str
        "old" - top-n size from ``top_n_size`` over all relations that have
                scores;
        "new" - top-n size from ``top_n_size_new``: supplier vs customer when
                both have scores (other is ignored), otherwise the two
                relations that do have scores.
        Only consulted when at least two relations have scores.

    Returns
    -------
    dict
        ``{"supplier": s, "customer": c, "other": o}``. Relations without
        scores stay at 0. With a single scored relation all of its scores are
        used; with several, each is aggregated by ``log_sum_top_n`` over the
        same top-n size.

    Raises
    ------
    ValueError
        If two or more relations have scores and ``top_n_approach`` is neither
        "old" nor "new".
    """
    supplier_scores = company_relation_score.get("supplier_scores", [])
    customer_scores = company_relation_score.get("customer_scores", [])
    other_scores = company_relation_score.get("other_scores", [])

    label_scores = {"supplier": 0, "customer": 0, "other": 0}

    # Relations that actually have scores, in the output key order.
    candidates = (("supplier", supplier_scores),
                  ("customer", customer_scores),
                  ("other", other_scores))
    scored = {label: scores for label, scores in candidates if scores}

    # no scores for any relation
    if not scored:
        return label_scores

    # only one relation has scores: use all of them
    if len(scored) == 1:
        for label, scores in scored.items():
            label_scores[label] = log_sum_top_n(scores, len(scores))
        return label_scores

    # two or more relations have scores: share one top-n size between them.
    # Both size functions are symmetric in their arguments, so the order in
    # which counts are passed does not matter.
    counts = [len(scores) for scores in scored.values()]
    if top_n_approach == "old":
        n = top_n_size(*counts)
    elif top_n_approach == "new":
        if supplier_scores and customer_scores:
            # "other" never influences the size when both main relations exist.
            n = top_n_size_new(len(customer_scores), len(supplier_scores))
        else:
            n = top_n_size_new(*counts)
    else:
        # Without this branch `n` would be unbound further down.
        raise ValueError(
            f"top_n_approach must be 'old' or 'new', got {top_n_approach!r}"
        )

    for label, scores in scored.items():
        label_scores[label] = log_sum_top_n(scores, n)

    return label_scores
    
def get_winning_relation(company_relation_score: Dict, top_n_approach: str  ):
    """
    Pick the relation type with the highest aggregated score.

    Input:
    ------
    company_relation_score: Dict with a list of scores per relation type,
    {"customer_scores": [..],  "supplier_scores": [..], "other_scores": [..]}

    top_n_approach: "old" or "new"
    "old" - top_n_size function
    "new" - top_n_size_new function

    Return:
    -------
    Dict with the aggregated score of every relation type plus the key
    "winning_relation". When several relation types share the highest score,
    the winning relation is "other".
    """
    # Dict[relation_key, aggregated_score_for_relation]
    relation_scores = agg_relation_score(company_relation_score, top_n_approach)

    best_score = max(relation_scores.values())
    leaders = [label for label, score in relation_scores.items() if score == best_score]

    # A unique leader wins; any tie falls back to "other".
    relation_scores["winning_relation"] = leaders[0] if len(leaders) == 1 else "other"

    return relation_scores


def get_winning_relation_simplified(company_relation_score: Dict, top_n_approach: str  ):
    """
    Pick the winning relation by comparing plain score sums.

    Input:
    ------
    company_relation_score: Dict with a list of scores per relation type,
    {"customer_scores": [..],  "supplier_scores": [..], "other_scores": [..]}
    Missing keys count as empty lists.

    top_n_approach: accepted for signature parity with get_winning_relation;
    it is not used by this function.

    Return:
    -------
    Dict with the summed score of every relation type plus the key
    "winning_relation": "supplier" or "customer", whichever sum is strictly
    larger, and "other" when neither is. The "other" sum is reported but
    never takes part in the comparison.
    """
    relation_scores = {
        "supplier": sum(company_relation_score.get("supplier_scores", [])),
        "customer": sum(company_relation_score.get("customer_scores", [])),
        "other": sum(company_relation_score.get("other_scores", [])),
    }

    supplier_total = relation_scores["supplier"]
    customer_total = relation_scores["customer"]

    relation_scores["winning_relation"] = (
        "supplier" if supplier_total > customer_total
        else "customer" if customer_total > supplier_total
        else "other"
    )

    return relation_scores



    

In [14]:
query_data.shape

(48350, 14)

In [40]:
from typing import Dict, List, Tuple
import string
import re
from itertools import chain
from collections import defaultdict
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple

def process_relations(
    all_relations: pd.DataFrame,
    top_n_approach="new",
    re_score_threshold=0.95,
) -> List[Dict[str, object]]:
    """
    Aggregate sentence-level relation predictions into one record per
    (accessionnumber, reported_company) group.

    Every entry of a row's "relations" value is expected to be a dict holding a
    "score" plus two company-name keys, each mapped to that company's relation
    label. The label attached to the reported-company side is chosen with a
    first-word heuristic: the first word of reporter_name / reported_company
    (lower-cased, surrounding ".,-:;" stripped) is searched for in the two
    lower-cased company keys. Relations scoring at least re_score_threshold are
    collected per reported company and handed to get_winning_relation_simplified.

    @params
    -------
    - all_relations (pd.DataFrame): Relations data. The columns read here are
      accessionnumber, reporter_name, reported_company, relationship_type,
      sentence and relations.
    - top_n_approach (str): Forwarded unchanged to get_winning_relation_simplified.
    - re_score_threshold (float): Minimum relation score (inclusive) a relation
      needs in order to be counted.

    @returns:
    --------
    List[Dict[str, object]]: One dict per group that kept at least one relation,
    with the keys accessionnumber, reporter_name, reported_company,
    relationship_type, company_name, sentences, sents_scores,
    aggregation_results and winning_relation. Groups in which no relation
    survives produce no record.
    """

    def _first_word(name: object) -> str:
        # Non-string names (e.g. NaN) yield an empty token instead of raising.
        if not isinstance(name, str):
            return ""
        return name.strip().split(" ")[0].strip(".,-:;").lower()

    def _mentions(token: str, company_lower: str) -> bool:
        # An empty token is a substring of every string, so it must never match.
        return bool(token) and token in company_lower

    all_records = []
    counter_for_company_not_found = 0
    counter_for_reported_company = 0
    counter_for_relation_not_found = 0

    # Group relations by filing and reported company, and process each group
    for _, group in all_relations.groupby(["accessionnumber", "reported_company"]):

        companies_relations = defaultdict(dict)

        # Increment counter for reported company
        counter_for_reported_company += 1   # debug - remove later

        # Process each sentence row in the group
        for _, raw in group.iterrows():

            # These depend only on the row, so compute them once per row.
            reporter_name = raw["reporter_name"]
            reported_company = raw["reported_company"]
            reporter_first_word = _first_word(reporter_name)
            reported_first_word = _first_word(reported_company)

            for relation in raw["relations"]:

                # safe check
                if not relation or not relation.get("score"):
                    counter_for_relation_not_found += 1 # debug - remove later
                    continue

                score = relation.get("score")

                entity_companies = [key for key in relation.keys() if key != "score"]
                # A relation needs two company keys; skip malformed entries
                # rather than failing on the missing second key.
                if len(entity_companies) < 2:
                    counter_for_relation_not_found += 1 # debug - remove later
                    continue

                company1 = entity_companies[0]
                company2 = entity_companies[1]
                company1_lower = company1.lower()
                company2_lower = company2.lower()

                # TODO - find the correct relation
                if (_mentions(reporter_first_word, company1_lower)
                        or _mentions(reported_first_word, company2_lower)):
                    reported_company_relation = relation[company2]
                elif (_mentions(reporter_first_word, company2_lower)
                        or _mentions(reported_first_word, company1_lower)):
                    reported_company_relation = relation[company1]
                else:
                    print(f" {reporter_name} -- {reported_company} --   {company1}  -- {company2}")
                    counter_for_company_not_found += 1  # debug - remove later
                    continue

                # if score reaches the threshold, then add the relation to the list
                if score >= re_score_threshold:
                    company_entry = companies_relations[reported_company]
                    if not company_entry.get('sentences'):
                        company_entry['sentences'] = []
                        company_entry['scores'] = defaultdict(list)

                    # add the score to scores dict
                    company_entry['scores'][
                        "{}_scores".format(reported_company_relation)
                    ].append(score)
                    # add sentence info
                    company_entry['sentences'].append(
                        {
                            "sentence": raw["sentence"],
                            "relation": reported_company_relation,
                            "score": score
                        }
                    )

        # Group-level fields are taken from the first row of the group
        first_row = group.iloc[0]

        for co, company_entry in companies_relations.items():
            # Get the winning results for this company
            winning_results = get_winning_relation_simplified(company_entry['scores'],
                                                              top_n_approach=top_n_approach)

            # collect into list
            all_records.append(
                {
                    "accessionnumber": first_row["accessionnumber"],
                    "reporter_name": first_row["reporter_name"],
                    "reported_company": first_row["reported_company"],
                    "relationship_type": first_row["relationship_type"],
                    "company_name": co,
                    "sentences": company_entry["sentences"],
                    "sents_scores": {
                        k: [round(vv, 2) for vv in v]
                        for k, v in company_entry["scores"].items()
                    },
                    "aggregation_results": {
                        k: round(v, 2)
                        for k, v in winning_results.items()
                        if k in ['supplier', 'customer', 'other']
                    },
                    "winning_relation": winning_results["winning_relation"],
                }
            )

    print(f"counter_for_company_not_found: {counter_for_company_not_found}")
    print(f"counter_for_reported_company: {counter_for_reported_company}")
    print(f"counter_for_relation_not_found: {counter_for_relation_not_found}")

    return all_records


In [16]:
query_data.columns

Index(['accessionnumber', 'reporter_name', 'reported_company',
       'relationship_type', 'sentence', 'spans', 'org_groups', 'aliases',
       'num_orgs', 'filtered_org_groups', 'sc_score', 'sc_label', 'relations',
       're_score'],
      dtype='object')

In [41]:
# Build one aggregated record per (accessionnumber, reported_company) group.
# The relation-score cut-off is lowered from the function default (0.95) to 0.75 for this run.
relations_report = process_relations(query_data,  
                                     top_n_approach="new",
                                     re_score_threshold=0.75)

counter_for_company_not_found: 0
counter_for_reported_company: 6161
counter_for_relation_not_found: 0


In [47]:
# One row per aggregated record returned by process_relations.
relations_report_df = pd.DataFrame(relations_report)

# True where the predicted winning relation equals the labelled relationship type.
# Compared column-wise (vectorised) rather than with a row-by-row apply.
relations_report_df["old_new_match"] = (
    relations_report_df["relationship_type"] == relations_report_df["winning_relation"]
)

# for negative data, old relation is other
# relations_report_df["old_new_match"] = relations_report_df["winning_relation"] == "other"



In [48]:
# Flatten relations_report into one row per supporting sentence for the xlsx export.
# Every sentence row repeats the summary fields of the record it belongs to.
summary_fields = (
    'accessionnumber',
    'reporter_name',
    'reported_company',
    'company_name',
    'relationship_type',
    'sents_scores',
    'aggregation_results',
    'winning_relation',
)

sentence_rows = []

for relation_record in relations_report:
    record_summary = {field: relation_record[field] for field in summary_fields}
    record_summary["old_new_match"] = (
        record_summary["relationship_type"] == record_summary["winning_relation"]
    )

    # for negative data
    #record_summary["old_new_match"] = ("other" == record_summary["winning_relation"])

    # One output row per sentence: summary fields first, then the sentence-level fields
    # (sentence_id is not exported).
    sentence_rows.extend(
        {
            **record_summary,
            'sentence': sentence_data['sentence'],
            'sent_relation': sentence_data['relation'],
            'sent_score': round(sentence_data['score'], 2),
        }
        for sentence_data in relation_record['sentences']
    )

    # Add few empty rows for better readability in the excel
    sentence_rows.extend([{}, {}])


# convert the list of dict to dataframe
relations_report_sentences_df = pd.DataFrame(sentence_rows)



In [20]:
print(relations_report_df.columns)

print(relations_report_sentences_df.columns)

Index(['accessionnumber', 'reporter_name', 'reported_company',
       'relationship_type', 'company_name', 'sentences', 'sents_scores',
       'aggregation_results', 'winning_relation', 'old_new_match'],
      dtype='object')
Index(['accessionnumber', 'reporter_name', 'reported_company', 'company_name',
       'relationship_type', 'sents_scores', 'aggregation_results',
       'winning_relation', 'old_new_match', 'sentence', 'sent_relation',
       'sent_score'],
      dtype='object')


In [43]:
# Match / mismatch counts at sentence-row granularity. The empty spacer rows hold NaN
# in this column and are left out by value_counts' default dropna behaviour.
relations_report_sentences_df.old_new_match.value_counts()

True     37208
False     4611
Name: old_new_match, dtype: int64

In [64]:
# store results
# Writes two Excel files: the aggregated relations and the per-sentence rows.
# The active calls at the bottom write the positive-set files; the commented-out
# calls are the positive / negative variants kept for reference.

# pos
## relations_report_df.to_excel("test_pipeline_data/labeled-relationships/test_pos_relations_report.xlsx", index=False)
## relations_report_sentences_df.to_excel("test_pipeline_data/labeled-relationships/test_pos_relations_report_sentences.xlsx", index=False)

# neg
## relations_report_df.to_excel("test_pipeline_data/labeled-relationships/test_neg_relations_report.xlsx", index=False)
## relations_report_sentences_df.to_excel("/storage/test_pipeline_data/labeled-relationships/test_neg_relations_report_sentences.xlsx", index=False)
# NOTE(review): the neg sentences path above is absolute ("/storage/...") while the
# other three paths are relative - confirm which location is intended.

relations_report_df.to_excel("test_pipeline_data/labeled-relationships/test_pos_relations_report.xlsx", index=False)
relations_report_sentences_df.to_excel("test_pipeline_data/labeled-relationships/test_pos_relations_report_sentences.xlsx", index=False)



In [51]:
# Number of distinct (accessionnumber, reported_company) pairs fed into process_relations ...
print(query_data.drop_duplicates(subset=['accessionnumber', 'reported_company']).shape)

# ... versus the number of records it returned. A pair produces no record when none of
# its relations both matched a company and reached re_score_threshold.
print(relations_report_df.shape)

(6161, 14)
(6043, 10)


In [50]:
relations_report_df["old_new_match"].value_counts()

True     5397
False     646
Name: old_new_match, dtype: int64

In [53]:
relations_report_df['relationship_type'].value_counts()

customer    3795
supplier    2248
Name: relationship_type, dtype: int64

In [54]:
# Labelled relationship type of the records whose winning relation disagrees with the label.
relations_report_df[ ~ relations_report_df['old_new_match']]['relationship_type'].value_counts()

supplier    525
customer    121
Name: relationship_type, dtype: int64

In [None]:
716

In [59]:
relations_report_df[ ~ relations_report_df['old_new_match']]['reported_company'].value_counts()

United States Postal Service                         11
FedEx                                                11
UPS                                                   8
Zodiac Pool Systems Inc.                              6
Microsoft                                             6
                                                     ..
China University of Geosciences School of Jewelry     1
Gedeon Richter                                        1
Icon Genetics AG                                      1
Dairy Farmers of America                              1
GlobalFoundries                                       1
Name: reported_company, Length: 576, dtype: int64

In [23]:
relations_report_df.drop_duplicates(subset=['reporter_name', 'reported_company']).shape

(5218, 10)

Concept distribution in wrong predictions

In [36]:
# report_sents_pos = "test_pipeline_data/labeled-relationships/test_pos_relations_report_sentences_w_concepts.xlsx"
# report_sents_pos =  pd.read_excel(report_sents_pos)

# report_sents_neg = "test_pipeline_data/labeled-relationships/test_neg_relations_report_sentences_w_concepts.xlsx"
# report_sents_neg =  pd.read_excel(report_sents_neg)


In [69]:
# concept distribution in all labelled sentences
# NOTE: report_sents_pos / report_sents_neg must already exist in the kernel with a
# concept_class column; the only visible code that loads them (pd.read_excel) is commented out.

concat_report_sents = pd.concat([report_sents_pos, report_sents_neg])

# share of each concept class in percent, rounded; missing values are counted as their own class
(concat_report_sents.concept_class.value_counts(normalize=True, dropna=False).round(2) * 100 )

concept_class
revenue                      22.0
supply_chain                 22.0
agreement_and_partnership    13.0
licensing_and_ip             12.0
unknown                       9.0
product_related               5.0
investment_related            4.0
services agreement            3.0
real_estate                   3.0
royalties                     2.0
legal_and_regulatory          2.0
financial_statements          1.0
Name: proportion, dtype: float64

In [70]:
# Keep only the sentences whose sentence-level prediction is wrong:
#   pos - sent_relation differs from the labelled relationship_type
#   neg - sent_relation is anything other than "other"
# NOTE: both frames are overwritten in place, so any cell that needs the full,
# unfiltered frames has to be run before this one.
report_sents_pos = report_sents_pos[(report_sents_pos.relationship_type != report_sents_pos.sent_relation)]
report_sents_neg = report_sents_neg[("other" != report_sents_neg.sent_relation)]


In [71]:
# pos - sentences where relation is incorrect
(report_sents_pos.concept_class.value_counts(normalize=True, dropna=False).round(2) * 100 )[:5]

concept_class
supply_chain                 20.0
licensing_and_ip             20.0
agreement_and_partnership    17.0
unknown                      10.0
revenue                       8.0
Name: proportion, dtype: float64

In [72]:
# Neg - sentences where sentence prediction is incorrect
(report_sents_neg.concept_class.value_counts(normalize=True, dropna=False).round(2) * 100 )[:5]


concept_class
agreement_and_partnership    24.0
revenue                      13.0
unknown                      13.0
supply_chain                 12.0
licensing_and_ip             11.0
Name: proportion, dtype: float64

In [73]:
# pos + neg - all sentences where sentence prediction is incorrect

concat_report_all_sents = pd.concat([report_sents_pos, report_sents_neg])

(concat_report_all_sents.concept_class.value_counts(normalize=True, dropna=False).round(2) * 100 )

concept_class
agreement_and_partnership    20.0
supply_chain                 16.0
licensing_and_ip             16.0
unknown                      11.0
revenue                      11.0
product_related               6.0
investment_related            5.0
services agreement            4.0
legal_and_regulatory          3.0
real_estate                   3.0
financial_statements          2.0
royalties                     2.0
Name: proportion, dtype: float64

In [75]:
# # pos - all sentences where relation and sentence predictionis incorrect
# report_sents_pos = report_sents_pos[(report_sents_pos["old_new_match"] == False)] 
# (report_sents_pos.concept_class.value_counts(normalize=True, dropna=False).round(2) * 100 )[:5]

In [74]:
# # neg - all sentences where relation and sentence predictionis incorrect

# report_sents_neg = report_sents_neg[(report_sents_neg["old_new_match"] == False)] 
# (report_sents_neg.concept_class.value_counts(normalize=True, dropna=False).round(2) * 100 )[:5]


0.0528071150639244

### Performance report

**Report on positive relations**


- Initial Total Relations	6775	(100%)
- After NER Matching	6431	(94%)
- After SC Filter	6215	(91%)
- After RE Score Filter	5692	(84%)
- Relations Summary	-
  - Dropped in Pipeline	1083	(16%)
  - Matching	4976	(73%)
  - Not Matching	716	(11%)


**Report on negative relations**

- Initial Total Relations	2316	(100%)
- After NER Matching	1799	(77%)
- After SC Filter	1316	(56%)
- After RE Score Filter	996	(43%)
- Relations Summary	-
  - Dropped in Pipeline	1320	(57%)
  - Matching	104	(5%)
  - Not Matching	892	(38%)


**Concept distribution**

-- All labelled sentences  

  - revenue                      22.0
  - supply_chain                 22.0
  - agreement_and_partnership    13.0
  - licensing_and_ip             12.0
  - unknown                       9.0

  
-- Sentence wrong prediction 
 
  - agreement_and_partnership    20.0
  - supply_chain                 16.0
  - licensing_and_ip             16.0
  - unknown                      11.0
  - revenue                      11.0

**Observations**

1. Labelled data 
   1. Each labelled relation maps to many sentences (one-to-many). Most of those sentences do not mention the required companies, and many are near-duplicates of one another.
2. Both positive and negative test performance 
   1. Pipeline performance is ~ 77%
   2. Just model accuracy performance is ~ 85% 
3. Mistakes 
   1. Model mispredicts customer relation (another company to filer) more often (85%). 
      1. Looks like model has memorized filer as supplier from imbalance in training data 
   2. Model has high mispredictions on less frequently occurring concepts. 
      1. Model needs to see more training data on these concepts to get better 
4. Improvements 
   1. Create balanced training data on following things - 
      1. relation (1 or 0)
      2. direction of relation
      3. concepts class
   



In [57]:
6775 - ( 5314 + 457)

5397 / 6431

0.8392162960659306


**After agreement and license fix - Report on positive relations**


- Initial Total Relations	6775	(100%)
- After NER Matching	6431	(94%)
- After SC Filter	6215 -> 6181 -> 6051 -	(92%) -> (91%) -> (89%)
- After RE Score Filter	5692 -> 5888 -> 5771 	(84%) -> (87%) -> (85%)
- Relations Summary	-
  - Dropped in Pipeline	1083 -> 887	(16%) -> 1004 (15%)
  - Matching	4976 -> 5217	-> 5314 (73%) -> (77%) -> (79%)
  - Not Matching	716 -> 671-> 457 (11%) -> (10%)  -> (7%)


**After adding weekly labelled data (re_score_threshold=0.75)**


- Initial Total Relations	6775	(100%)
- After NER Matching	6431	(94%)
- After SC Filter	6215 -> 6181 -> 6051 -> 6161	(92%) -> (91%) -> (89%)
- After RE Score Filter	5692 -> 5888 -> 5771 -> 6043	(84%) -> (87%) -> (85%)
- Relations Summary	-
  - Dropped in Pipeline	1083 -> 887	(16%) -> 1004 (15%) -> 732 (10%)  
  - Matching	4976 -> 5217	-> 5314 -> 5397 (73%) -> (77%) -> (78%) -> (79%)
  - Not Matching	716 -> 671-> 457 -> 646 (11%) -> (10%)  -> (7%) -> (9%)