### Imports

In [5]:
import json
import pandas as pd
import numpy as np
import os
import collections
import math
from typing import Dict, Any, Text, Tuple
import yaml
import sys
from pathlib import Path
import re

from tqdm import tqdm
import sklearn.cluster


In [6]:
pd.options.mode.chained_assignment = None  # default='warn'

In [7]:
#machine = "local"
machine = "paperspace"

if machine == "local":
    src_dir= Path.cwd().parent    
elif machine == "paperspace":
    src_dir = Path("/notebooks/inferess-relation-extraction/")

sys.path.append(str(src_dir))



In [8]:

from src.matcher.core import SimCSE_Matcher
from InstructorEmbedding import INSTRUCTOR

instructor_model = INSTRUCTOR('hkunlp/instructor-large')

# sent_embd_model = SimCSE_Matcher(
#         model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"
#     )

#sentence_emb_model = "minilm"
sentence_emb_model = "instructor"

def encode_sentences(sentence_list):
    if sentence_emb_model == "minilm":
        embeddings =  sent_embd_model.encode(sentence_list)
        return embeddings.numpy()
    elif sentence_emb_model == "instructor":
        embeddings = instructor_model.encode(sentence_list)
        return embeddings




10/13/2023 12:37:47 - INFO - sentence_transformers.SentenceTransformer -   Load pretrained SentenceTransformer: hkunlp/instructor-large


(…)06e59443b07dc819fb15c7233/.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

(…)3b07dc819fb15c7233/1_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

(…)443b07dc819fb15c7233/2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

(…)84de506e59443b07dc819fb15c7233/README.md:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

(…)de506e59443b07dc819fb15c7233/config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

(…)5c7233/config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

(…)dc819fb15c7233/sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

(…)07dc819fb15c7233/special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

(…)06e59443b07dc819fb15c7233/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

(…)3b07dc819fb15c7233/tokenizer_config.json:   0%|          | 0.00/2.41k [00:00<?, ?B/s]

(…)e506e59443b07dc819fb15c7233/modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer


10/13/2023 12:38:19 - INFO - sentence_transformers.SentenceTransformer -   Use pytorch device: cuda


max_seq_length  512


### Test set from annotated

In [91]:
definitions_data = pd.read_excel("test_pipeline_data/cluster_definitions/all_0_1_definitions_clustered.xlsx")

#definitions_data = definitions_data.rename(columns={"Unnamed: 0": "index"})

definitions_data = definitions_data[['definition', 'sentence', 'sme_relations', 'source_file', 'source_idx']]

definitions_data["join_index"] = definitions_data["source_idx"]
print(definitions_data.columns)

pos_definitions_data = definitions_data[definitions_data["source_file"] == "/notebooks/inferess-relation-extraction/data/raw/llm_relations_all_label_1_v2_3.xlsx"]
neg_definitions_data = definitions_data[definitions_data["source_file"] == "/notebooks/inferess-relation-extraction/data/raw/llm_relations_other_from_label_0_v2_3.xlsx"]



pos_definitions_data.reset_index(inplace=True)
neg_definitions_data.reset_index(inplace=True)


pos_org_data = pd.read_excel("test_pipeline_data/pipeline_train_data/llm_relations_all_label_1_v2_3.xlsx")
neg_org_data = pd.read_excel("test_pipeline_data/pipeline_train_data/llm_relations_other_from_label_0_v2_3.xlsx")

pos_org_data["join_index"] = pos_org_data["index"]
neg_org_data["join_index"] = neg_org_data["index"]



Index(['definition', 'sentence', 'sme_relations', 'source_file', 'source_idx',
       'join_index'],
      dtype='object')


In [92]:

 # join the definitions with the original data, and only use two columns from original data
pos_definitions_data = pos_definitions_data.merge(pos_org_data[["join_index", "concepts", "concept_class"]], on="join_index", how="left")

neg_definitions_data = neg_definitions_data.merge(neg_org_data[["join_index",  "concepts", "concept_class"]], on="join_index", how="left")



In [71]:
pos_definitions_data.columns

Index(['index', 'definition', 'sentence', 'sme_relations', 'source_file',
       'source_idx', 'join_index', 'concepts', 'concept_class'],
      dtype='object')

In [93]:
def get_deduplicate_sentences(data, threshold=0.99):

    # Compute embeddings for each sentence
    sentence_list = data['definition'].to_list()
    embeddings = encode_sentences(sentence_list)

    groups = []
    seen = set()

    # Calculate the dot products on GPU
    dot_products_cpu = np.dot(embeddings, embeddings.T)

    # Create a mask for similarities > threshold
    mask = dot_products_cpu > threshold

    # Create groups based on mask
    groups = []
    seen = set()

    for i in range(mask.shape[0]):
        if i not in seen:
            # Find indices where similarity > threshold
            similar_indices = np.where(mask[i])[0].tolist()

            # Mark these as seen
            seen.update(similar_indices)

            # Append the group
            groups.append(similar_indices)

    # Keep one index from each group
    indices_to_keep = [group[0] for group in groups]
    sentences_to_ignored = [[sentence_list[idx]                                          
                                         if type(sentence_list[idx]) == str 
                                         else sentence_list[idx][1]
                                         for idx in group[1:]]
                            for group in groups ]

    return indices_to_keep, sentences_to_ignored


def remove_duplicates_at_definitions_level(sentences_df):

    sentences_df.loc[:, "deduped"] = False
    sentences_df.loc[:, "dup_definitions"] = '[]'
    sentences_df.loc[:, "simillar_definitions_count"] = 0
    
    
    indices_to_keep, sentences_to_ignored = get_deduplicate_sentences(sentences_df, threshold=0.95)    
    sentences_df.loc[indices_to_keep, "deduped"] = True
    sentences_df.loc[indices_to_keep, "dup_definitions"] = ["\n\n".join(sents) for sents in sentences_to_ignored]
    sentences_df.loc[indices_to_keep, "simillar_definitions_count"] = [len(sents) for sents in sentences_to_ignored]





In [87]:
def create_cluster_definitions(sentence_list):    
    embeddings = encode_sentences(sentence_list)
    clustering_model = sklearn.cluster.KMeans(n_clusters=50)
    clustering_model.fit(embeddings)
    cluster_labels = clustering_model.labels_

    return cluster_labels

# Create an Excel writer object

def write_clusers_to_spreadsheet(data, cluster_labels, out_file):
    writer = pd.ExcelWriter(out_file)
    data.loc[:, "cluster_labels"] = cluster_labels
    data.to_excel(writer, sheet_name="all_clusters", index=False)

    # Get unique cluster labels (excluding -1, which represents noise)
    unique_clusters = np.unique(cluster_labels[cluster_labels != -1])
    for cluster_label in unique_clusters:
        # Filter data for the current cluster label
        cluster_data = data[data["cluster_labels"] == cluster_label]

        # Write the cluster data to a separate sheet
        sheet_name = f"{cluster_label}"
        cluster_data.to_excel(writer, sheet_name=sheet_name, index=False)

    # Save the Excel file
    writer.save()



In [124]:
# # call
# remove_duplicates_at_definitions_level(pos_definitions_data)
# pos_definitions_data.to_excel("/notebooks/data_to_download/cluster_definition/pos_definitions_deduped.xlsx", index =False)

# pos_definitions_data = pos_definitions_data[pos_definitions_data.deduped]

# sentence_list = pos_definitions_data['definition'].to_list()

# cluster_labels = create_cluster_definitions(sentence_list)
# out_file = "/notebooks/data_to_download/cluster_definition/pos_clusters.xlsx"
# write_clusers_to_spreadsheet(pos_definitions_data, cluster_labels, out_file)



  writer.save()


In [94]:
# call
remove_duplicates_at_definitions_level(pos_definitions_data)
pos_definitions_data.to_excel("/notebooks/data_to_download/cluster_definition/pos_definitions_deduped_instruct.xlsx", index =False)

pos_definitions_data = pos_definitions_data[pos_definitions_data.deduped]

sentence_list = pos_definitions_data['definition'].to_list()
sentence_list = [["Represent the finance sentence for clustering: ", sent] for sent in sentence_list]

cluster_labels = create_cluster_definitions(sentence_list)
out_file = "/notebooks/data_to_download/cluster_definition/pos_clusters_instruct.xlsx"
pos_definitions_data = pos_definitions_data[[ 'definition', 'sentence', 'sme_relations', 'source_idx',
                                               'concepts', 'concept_class', 'dup_definitions', 'simillar_definitions_count'
                                               ]]
write_clusers_to_spreadsheet(pos_definitions_data, cluster_labels, out_file)



  writer.save()


In [123]:
# remove_duplicates_at_definitions_level(neg_definitions_data)
# neg_definitions_data.to_excel("/notebooks/data_to_download/cluster_definition/neg_definitions_deduped.xlsx", index =False)

# neg_definitions_data = neg_definitions_data[neg_definitions_data.deduped]

# sentence_list = neg_definitions_data['definition'].to_list()

# cluster_labels = create_cluster_definitions(sentence_list)
# out_file = "/notebooks/data_to_download/cluster_definition/neg_clusters.xlsx"
# write_clusers_to_spreadsheet(neg_definitions_data, cluster_labels, out_file)


  writer.save()


In [95]:
# call

remove_duplicates_at_definitions_level(neg_definitions_data)
neg_definitions_data.to_excel("/notebooks/data_to_download/cluster_definition/neg_definitions_deduped_instruct.xlsx", index =False)

neg_definitions_data = neg_definitions_data[neg_definitions_data.deduped]

sentence_list = neg_definitions_data['definition'].to_list()
sentence_list = [["Represent the finance sentence for clustering: ", sent] for sent in sentence_list]

cluster_labels = create_cluster_definitions(sentence_list)
out_file = "/notebooks/data_to_download/cluster_definition/neg_clusters_instruct.xlsx"
neg_definitions_data = neg_definitions_data[[ 'definition', 'sentence', 'sme_relations', 'source_idx',
                                               'concepts', 'concept_class', 'dup_definitions', 'simillar_definitions_count'
                                               ]]

write_clusers_to_spreadsheet(neg_definitions_data, cluster_labels, out_file)


  writer.save()
