### Imports

In [1]:
import json
import pandas as pd
import numpy as np
import os
import collections
import math
from typing import Dict, Any, Text, Tuple
import yaml
import sys
from pathlib import Path
import re

from tqdm import tqdm
import sklearn.cluster


In [6]:
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
#machine = "local"
machine = "paperspace"

if machine == "local":
    src_dir= Path.cwd().parent    
elif machine == "paperspace":
    src_dir = Path("/notebooks/inferess-relation-extraction/")

sys.path.append(str(src_dir))



In [3]:

from src.matcher.core import SimCSE_Matcher
from InstructorEmbedding import INSTRUCTOR

instructor_model = INSTRUCTOR('hkunlp/instructor-large')

# sent_embd_model = SimCSE_Matcher(
#         model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"
#     )

#sentence_emb_model = "minilm"
sentence_emb_model = "instructor"

def encode_sentences(sentence_list):
    if sentence_emb_model == "minilm":
        embeddings =  sent_embd_model.encode(sentence_list)
        return embeddings.numpy()
    elif sentence_emb_model == "instructor":
        embeddings = instructor_model.encode(sentence_list)
        return embeddings




10/22/2023 04:03:41 - INFO - sentence_transformers.SentenceTransformer -   Load pretrained SentenceTransformer: hkunlp/instructor-large


load INSTRUCTOR_Transformer
max_seq_length  512


10/22/2023 04:03:47 - INFO - sentence_transformers.SentenceTransformer -   Use pytorch device: cuda


### Test set from annotated

In [57]:
org_data_df = pd.read_excel("./test_pipeline_data/huge_set_deduped_data/huge_train_dedup_sc_cc_re.xlsx")

print(org_data_df.shape)

print(org_data_df.columns)


org_data_df.re_rel_label.value_counts(dropna=False)

(23590, 29)
Index(['level_0', 'index', 'accessionNumber', 'filer', 'firstEntity',
       'relationship', 'secondEntity', 'sentence', 'sme_relations',
       'duplicate_sentences', 'sent_size', 'clause_size', 'deduped',
       'sc_score', 'sc_label', 'cc_class', 'cc_class_2', 'cc_class_score',
       'cc_class_2_score', 'spans', 'org_groups', 'aliases', 'num_orgs',
       'filtered_org_groups', 'num_orgs_filter', 're_relations', 're_score',
       're_rel_label', 're_pred_match'],
      dtype='object')


customer    16051
supplier     3834
other        2983
NaN           722
Name: re_rel_label, dtype: int64

In [81]:
org_data_df.loc[:, "data_status"] = "ignore"

print("Concepts learned well in both SC and RE")


data_df = org_data_df[(org_data_df["sc_label"] == 1) & 
                       (org_data_df["sc_score"] > 0.95) &
                       (org_data_df["re_score"] > 0.95) &              
                       (org_data_df["re_rel_label"].isin(["customer", "supplier"]))] 

org_data_df.loc[data_df.index, "data_status"] = "well_trained"

print(data_df.shape)

#----------------------
print("Need more train data to learn better RE")


data_df = org_data_df[(org_data_df["sc_label"] == 1) & 
                       (org_data_df["sc_score"] > 0.95) &
                       (org_data_df["re_score"] < 0.95) &              
                       (org_data_df["re_rel_label"].isin(["customer", "supplier"]))]                                             
org_data_df.loc[data_df.index, "data_status"] = "need_more_re"

print(data_df.shape)

#----------------------
print("Need more train data to learn better in SC")

data_df = org_data_df[(org_data_df["sc_label"] == 1) & 
                       (org_data_df["sc_score"] < 0.95) &
                       (org_data_df["re_score"] > 0.95) &              
                       (org_data_df["re_rel_label"].isin(["customer", "supplier"]))]                                             
org_data_df.loc[data_df.index, "data_status"] = "need_more_sc"
print(data_df.shape)

print("Need more train data to learn better in SC and RE ")

data_df = org_data_df[(org_data_df["sc_label"] == 1) & 
                       (org_data_df["sc_score"] < 0.95) &
                       (org_data_df["re_score"] < 0.95) &              
                       (org_data_df["re_rel_label"].isin(["customer", "supplier"]))]                                             
org_data_df.loc[data_df.index, "data_status"] = "need_more_sc_re"
print(data_df.shape)


#----------------------
print("New concepts that are not covered in SC relation")
data_df = org_data_df[(org_data_df["sc_label"] == 0) & 
                       (org_data_df["re_rel_label"].isin(["customer", "supplier"]))]                                             
print(data_df.shape)
org_data_df.loc[data_df.index, "data_status"] = "need_concepts_sc"



Concepts learned well in both SC and RE
(7734, 31)
Need more train data to learn better RE
(1335, 31)
Need more train data to learn better in SC
(6067, 31)
Need more train data to learn better in SC and RE 
(3239, 31)
New concepts that are not covered in SC relation
(1509, 31)


In [16]:
def create_cluster_sentences(sentence_list, n_clusters=50):    
    embeddings = encode_sentences(sentence_list)
    clustering_model = sklearn.cluster.KMeans(n_clusters=n_clusters)
    clustering_model.fit(embeddings)
    cluster_labels = clustering_model.labels_

    return cluster_labels

# Create an Excel writer object

def write_clusers_to_spreadsheet(data, cluster_labels, out_file):
    writer = pd.ExcelWriter(out_file)
    data.loc[:, "cluster_labels"] = cluster_labels
    data.to_excel(writer, sheet_name="all_clusters", index=False)

    # Get unique cluster labels (excluding -1, which represents noise)
    unique_clusters = np.unique(cluster_labels[cluster_labels != -1])
    for cluster_label in unique_clusters:
        # Filter data for the current cluster label
        cluster_data = data[data["cluster_labels"] == cluster_label]

        # Write the cluster data to a separate sheet
        sheet_name = f"{cluster_label}"
        cluster_data.to_excel(writer, sheet_name=sheet_name, index=False)

    # Save the Excel file
    writer.save()



In [31]:
data_df = org_data_df[(org_data_df["sc_label"] == 1) & 
                       (org_data_df["sc_score"] > 0.95) &
                       (org_data_df["re_score"] < 0.95) &              
                       (org_data_df["re_rel_label"].isin(["customer", "supplier"]))]                                             
print(data_df.shape)


(1335, 29)


In [32]:
# call

sentence_list = data_df["sentence"].to_list() 

sentence_list = [["Represent the finance sentence for clustering: ", sent] for sent in sentence_list]

cluster_labels = create_cluster_sentences(sentence_list, n_clusters=25)

out_file = "./test_pipeline_data/huge_set_deduped_data/clusters_pos/more_pos_data_for_re.xlsx"

write_clusers_to_spreadsheet(data_df, cluster_labels, out_file)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[:, "cluster_labels"] = cluster_labels
  writer.save()


In [25]:

# data_df[data_df["cluster_labels"] == 0].cc_class.value_counts(dropna=False)

# group by cluster labels and print the value counts of cc_class column

# for _, cluster_group in data_df.groupby("cluster_labels"):
#     print(_)
#     print(cluster_group.cc_class.value_counts()[:3])
#     print("--------------------------------------------------")
          



#### Find simillar sentences

<!-- {'all_0': {'dir': '/notebooks/inferess-relation-extraction/data/raw/llm_relations_other_from_label_0_v2_3.xlsx'},
 'all_1': {'dir': '/notebooks/inferess-relation-extraction/data/raw/llm_relations_all_label_1_v2_3.xlsx'},
 'all_other': {'dir': '/notebooks/inferess-relation-extraction/data/raw/llm_relations_other_relation_v2_3.xlsx'},
 'huge_1': {'dir': '/notebooks/inferess-relation-extraction/data/raw/huge_train_llm_aligned_v2_3_0_1300.xlsx'},
 'huge_1_complex': {'dir': '/notebooks/inferess-relation-extraction/data/raw/huge_train_complex_sents_llm_v2_3.xlsx'}} -->

In [40]:
data1 = pd.read_excel("/notebooks/inferess-relation-extraction/data/raw/llm_relations_all_label_1_v2_3.xlsx")
data1["file"] = "all_1" 
#data1 = data1[data1["agree"]]
data1 = data1[["file", "index", "sentence", "sme_relations", "concept_class", "agreement_relation", "license_relation"]]

data2 = pd.read_excel("/notebooks/inferess-relation-extraction/data/raw/huge_train_llm_aligned_v2_3_0_1300.xlsx")
data2["file"] = "huge_1" 
data2 = data2[["file", "index", "sentence", "sme_relations", "concept_class", "agreement_relation", "license_relation"]]

data3 = pd.read_excel("/notebooks/inferess-relation-extraction/data/raw/huge_train_complex_sents_llm_v2_3.xlsx")
data3["file"] = "huge_complex" 
data3 = data3[["file", "index", "sentence", "sme_relations", "concept_class", "agreement_relation", "license_relation"]]

pos_train_data = pd.concat([data1, data2, data3], axis=0)
pos_train_data.reset_index(drop=True, inplace=True)
data1, data2, data3 = None, None, None

In [50]:
# looks like we have only filtered CS data for agreement and license relations

pos_train_data[((pos_train_data["concept_class"].isin(["agreement_and_partnership", "services agreement" ,"licensing_and_ip"])))
                                |  (pos_train_data["agreement_relation"] == "customer_supplier")
                                | (pos_train_data["license_relation"].notna())].shape

(840, 7)

In [54]:
#pos_train_data[pos_train_data.agreement_relation.notna()].concept_class.value_counts(dropna=False)

#pos_train_data[pos_train_data.license_relation.notna()].concept_class.value_counts(dropna=False)

#pos_train_data[pos_train_data.agreement_relation.isna()].concept_class.value_counts(dropna=False)

In [5]:
# create sentence embeddings for all the sentences in the data
org_sentence_list = pos_train_data["sentence"].to_list()
sentence_list = [["Represent the finance sentence for retrival: ", sent] for sent in org_sentence_list]

sentence_embeddings =  encode_sentences(sentence_list)


In [15]:
# given new sentences, find the top k similar sentences from the existing data

def find_similar_sentences_from_data(sent_embd, sentence_embeddings, sentence_list, k=5):
    # calculate cosine similarity between the current sentence and all the sentences
    cosine_scores = sklearn.metrics.pairwise.cosine_similarity(sent_embd, sentence_embeddings)[0]
    print(cosine_scores.shape)
    # get the top k similar sentences
    top_k_sentence_indices = cosine_scores.argsort()[-k:][::-1]
    # get the scores 
    top_k_sentence_scores = cosine_scores[top_k_sentence_indices]

    # get the actual sentences
    top_k_sentences = [sentence_list[idx] for idx in top_k_sentence_indices]


    return top_k_sentences, top_k_sentence_indices, top_k_sentence_scores

In [19]:
query_sentence = "For example, IONIS PHARMACEUTICALS Inc licensed IONIS FXIRx to Bayer to develop and commercialize IONIS FXIRx for the prevention of thrombosis"

query_sentence_e = [["Represent the finance sentence for retrival: ", query_sentence]]
query_sentence_e = encode_sentences(query_sentence_e)

top_k_sentences,top_k_sentence_indices, top_k_sentence_scores = \
    (find_similar_sentences_from_data(query_sentence_e, sentence_embeddings, org_sentence_list, k=5))



print("------------          Query  ---------------")
print( "\n".join([query_sentence[i:i+100] for i in range(0, len(query_sentence), 100 )]     ))

print("\n------------          Matching  ---------------")

for topk in range(len(top_k_sentences)):
    sent = top_k_sentences[topk]
    score = top_k_sentence_scores[topk]
    print("score :" + str(score))     
    print( "\n".join([sent[i:i+100] for i in range(0, len(sent), 100 )] ))

    print("--------------------------------------")



(4005,)
------------          Query  ---------------
For example, IONIS PHARMACEUTICALS Inc licensed IONIS FXIRx to Bayer to develop and commercialize IO
NIS FXIRx for the prevention of thrombosis

------------          Matching  ---------------
score :0.9125923
For example, SCYNEXIS Inc currently have a development license and supply agreement with R-Pharm, pu
rsuant to which SCYNEXIS Inc license to R-Pharm rights to develop and commercialize SCY-078 in the f
ield of human health in Russia and certain smaller non-core markets, and if SCY-078 receives marketi
ng approval, SCYNEXIS Inc may enter into additional sales and marketing arrangements with third part
ies for international sales
--------------------------------------
score :0.9113324
In May 2015, IONIS PHARMACEUTICALS Inc exclusively licensed IONIS FXIRx to Bayer.
--------------------------------------
score :0.91124916
For example, under BIOSANTE PHARMACEUTICALS Inc license agreement with Jazz Pharmaceuticals, Jazz Ph
armaceuti

#### matching sents in org_train_data

In [60]:
print(org_data_df.data_status.value_counts(dropna=False))

print(org_data_df.columns)

well_trained       7734
need_more_sc       6067
ignore             3706
need_more_sc_re    3239
need_conepts_sc    1509
need_more_re       1335
Name: data_status, dtype: int64
Index(['level_0', 'index', 'accessionNumber', 'filer', 'firstEntity',
       'relationship', 'secondEntity', 'sentence', 'sme_relations',
       'duplicate_sentences', 'sent_size', 'clause_size', 'deduped',
       'sc_score', 'sc_label', 'cc_class', 'cc_class_2', 'cc_class_score',
       'cc_class_2_score', 'spans', 'org_groups', 'aliases', 'num_orgs',
       'filtered_org_groups', 'num_orgs_filter', 're_relations', 're_score',
       're_rel_label', 're_pred_match', 'data_status'],
      dtype='object')


In [101]:
# find the number of sentences that are matching in training data for score > 0.9

def find_matching_sentences_count_from_data(query_df, sentence_embeddings, score_threshold=0.5):

    query_sentence_e = [["Represent the finance sentence for retrival: ", sentence]
                        for sentence in query_df["sentence"].to_list()]
    
    query_sentence_e = encode_sentences(query_sentence_e)

    sent_match_count = []

    for idx in tqdm(range(query_sentence_e.shape[0])):
        # calculate cosine similarity between the current sentence and all the sentences
        cosine_scores = sklearn.metrics.pairwise.cosine_similarity([query_sentence_e[idx]], sentence_embeddings)[0]

        # score greater than threshold
        cosine_scores = cosine_scores[cosine_scores > score_threshold]

        sent_match_count.append(len(cosine_scores))
    
    return sent_match_count



In [104]:
org_data_df.loc[:, "matching_sent_count"] = -1


query_df = org_data_df[org_data_df["data_status"].isin(["need_more_sc", "need_more_re" ,"need_more_sc_re"])] 


query_df.loc[:, "matching_sent_count"] = find_matching_sentences_count_from_data(query_df, sentence_embeddings, score_threshold=0.9)
org_data_df.loc[query_df.index, "matching_sent_count"] = query_df["matching_sent_count"]


100%|██████████| 10641/10641 [04:11<00:00, 42.28it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  query_df.loc[:, "matching_sent_count"] = find_matching_sentences_count_from_data(query_df, sentence_embeddings, score_threshold=0.9)


In [112]:
org_data_df[org_data_df["matching_sent_count"] > 0 ].matching_sent_count.describe()

count    6172.000000
mean        6.520901
std        15.380847
min         1.000000
25%         1.000000
50%         3.000000
75%         6.000000
max       266.000000
Name: matching_sent_count, dtype: float64

In [115]:
org_data_df.columns

Index(['level_0', 'index', 'accessionNumber', 'filer', 'firstEntity',
       'relationship', 'secondEntity', 'sentence', 'sme_relations',
       'duplicate_sentences', 'sent_size', 'clause_size', 'deduped',
       'sc_score', 'sc_label', 'cc_class', 'cc_class_2', 'cc_class_score',
       'cc_class_2_score', 'spans', 'org_groups', 'aliases', 'num_orgs',
       'filtered_org_groups', 'num_orgs_filter', 're_relations', 're_score',
       're_rel_label', 're_pred_match', 'data_status', 'matching_sent_count'],
      dtype='object')

In [114]:
org_data_df.to_excel("./test_pipeline_data/huge_set_deduped_data/huge_train_dedup_sc_cc_re.xlsx", index=False)