In [4]:
import pandas as pd
import csv, json
# from rag.data_loader import *
import os
import glob
from tqdm import tqdm

def load_data(data_dir, filter_composite, filter_duplicate):
    """
    Load query tuples from the ``||``-delimited files found in ``data_dir``.

    Every ``*.concept`` file and then every ``*.txt`` file directly inside
    ``data_dir`` is read (each group in sorted path order, so the result is
    reproducible). Two line layouts are recognised; lines with any other
    number of ``||``-separated fields are ignored:

    * 2 fields, ``cui||mention``                     -> ``(mention, cui)``
    * 6 fields, fields 3/4/5 = mention/cui/context   -> ``(mention, context, cui)``

    Mentions are stripped and lower-cased; CUIs and contexts are only stripped.
    Lines whose CUI is ``cui-less`` (case-insensitive) are always dropped.

    Parameters
    ----------
    data_dir : str
        a path of data
    filter_composite : bool
        filter composite mentions (CUIs containing ``|`` or ``+``)
    filter_duplicate : bool
        filter duplicate queries (first occurrence is kept)

    Returns
    -------
    data : list of tuple
        ``(mention, cui)`` pairs and/or ``(mention, context, cui)`` triples;
        the CUI is always the last element.
    """
    data = []

    file_types = ("*.concept", "*.txt")
    concept_files = []
    for ft in file_types:
        # glob returns matches in arbitrary order; sort so that the output
        # order (and which duplicate is kept) does not depend on the filesystem.
        concept_files.extend(sorted(glob.glob(os.path.join(data_dir, ft))))

    for concept_file in tqdm(concept_files):
        with open(concept_file, "r", encoding="utf-8") as f:
            for line in f:
                fields = line.split("||")
                if len(fields) == 2:
                    cui = fields[0].strip()
                    record = (fields[1].strip().lower(), cui)
                elif len(fields) == 6:
                    cui = fields[4].strip()
                    record = (fields[3].strip().lower(), fields[5].strip(), cui)
                else:
                    # Unknown layout (blank line, header, malformed row).
                    continue

                if cui.lower() == "cui-less":
                    continue
                # A composite CUI joins several identifiers with "|" or "+".
                if filter_composite and ("|" in cui or "+" in cui):
                    continue
                data.append(record)

    if filter_duplicate:
        # dict.fromkeys keeps the first occurrence and preserves order.
        data = list(dict.fromkeys(data))
    print(f"length of queries = {len(data)}")
    return data


# NCBI DATASET

In [5]:
# Map every concept_id to its domain_id so the domains covered by an
# evaluation dataset can be listed.
concepts_df = pd.read_csv(
    "data/output/concepts.csv", dtype=str
)
concept_dict_to_name = concepts_df.set_index("concept_id")["domain_id"].to_dict()

ncbi_directory = "data/eval_datasets/original_ncbi-disease"
ncbi_data = load_data(ncbi_directory, filter_composite=True, filter_duplicate=True)
domain = []
for item in ncbi_data:
    # load_data yields (mention, cui) or (mention, context, cui); the CUI is
    # always last, so unpack from the end instead of assuming a pair.
    *_, cui = item
    if cui in concept_dict_to_name:
        if concept_dict_to_name[cui] not in domain:
            domain.append(concept_dict_to_name[cui])
print(f"domains in ncbi are  = {domain}")

100%|██████████| 4/4 [00:01<00:00,  3.79it/s]

length of queries = 48714
domains in ncbi are  = ['Measurement', 'Condition', 'Observation', 'Device', 'Procedure']





# BC5CDR-Disease DATASET

In [6]:
# For BC5CDR
# NOTE: the ncbi_* names are reused here although they now hold BC5CDR-disease data.
ncbi_directory = "data/eval_datasets/original_bc5cdr-disease"
ncbi_data = load_data(ncbi_directory, filter_composite=True, filter_duplicate=True)
domain = []
for item in ncbi_data:
    # load_data yields (mention, cui) or (mention, context, cui); the CUI is
    # always last, so unpack from the end instead of assuming a pair.
    *_, cui = item
    if cui in concept_dict_to_name:
        if concept_dict_to_name[cui] not in domain:
            domain.append(concept_dict_to_name[cui])
print(f"domains in bc5cdr-D are  = {domain}")

100%|██████████| 6/6 [00:02<00:00,  2.77it/s]

length of queries = 49096
domains in bc5cdr-D are  = ['Measurement', 'Condition', 'Observation', 'Device', 'Procedure']





# BC5CDR-Chemical DATASET

In [7]:
# ncbi_directory = "/workspace/rag_pipeline/syn_map/eval_dataset/bc5cdr-chemical"
# ncbi_data = load_data(ncbi_directory, filter_composite=True, filter_duplicate=True)
# domain = []
# for item in ncbi_data:
#     _, cui = item
#     if cui in concept_dict_to_name:
#         if concept_dict_to_name[cui] not in domain:
#             domain.append(concept_dict_to_name[cui])
# print(f"domains in bc5cdr-C are  = {domain}")

# MedMention Dataset

In [8]:
# def load_med_mentiondata(
#     data_dir, cui_to_concept_id, load_full_sentence=False, filter_duplicate=True
# ):
#     """
#     Parameters
#     ----------
#     data_dir : str
#         a path of data
#     filter_composite : bool
#         filter composite mentions
#     filter_duplicate : bool
#         filter duplicate queries

#     Returns
#     -------
#     data : np.array
#         mention, cui pairs
#     """
#     data = []
#     file_types = ("*.concept", "*.txt")
#     concept_files = []
#     for ft in file_types:
#         concept_files.extend(glob.glob(os.path.join(data_dir, ft)))

#     for concept_file in tqdm(concept_files):
#         # concept_files = glob.glob(os.path.join(data_dir, "*.concept"))
#         with open(concept_file, mode="r", encoding="utf-8") as f:
#             lines = f.readlines()
#             for line in lines:
#                 line = line.split("||")
#                 if len(line) != 5:
#                     continue
#                 cui = str(line[4]).strip()
#                 if cui in cui_to_concept_id:
#                     cui = cui_to_concept_id[cui]
#                     mention = line[3]
#                     data.append((mention, cui))
#                 else:
#                     continue

#             # Ensures mentions is not empty

#             # print(mentions)
#             # if mentions:
#             #     for mention in mentions:
#             #         print(mention)
#             #         data.append((sentence, mention, cui))

#     if filter_duplicate:
#         data = list(dict.fromkeys(data))
#     print("query size:", len(data))
#     return data


# medmention_dir = "/workspace/rag_pipeline/syn_map/eval_dataset/PMedMention/full"

# ohdsi_to_cui = pd.read_csv(
#     "/workspace/rag_pipeline/data/input/omop_v5.4/CUItoOHDSI.csv", dtype=str
# )
# cui_to_concept_id = ohdsi_to_cui.set_index("CUI")["concept_id"].to_dict()
# medmention_data = load_med_mentiondata(medmention_dir, cui_to_concept_id)
# domain = []
# for item in medmention_data:
#     _, cui = item
#     if cui in concept_dict_to_name:
#         if concept_dict_to_name[cui] not in domain:
#             domain.append(concept_dict_to_name[cui])
# print(f"domains in MedMention are  = {domain}")

# Cometa

In [9]:
# cometa_directory = "/workspace/rag_pipeline/syn_map/eval_dataset/cometa"
# cometa_data = load_data(cometa_directory, filter_composite=True, filter_duplicate=True)
# domain = []
# for item in cometa_data:
#     _, cui = item
#     if cui in concept_dict_to_name:
#         if concept_dict_to_name[cui] not in domain:
#             domain.append(concept_dict_to_name[cui])
# print(f"domains in ncbi are  = {domain}")

# AskaPatient

In [10]:
# askapatient_directory = "/workspace/rag_pipeline/syn_map/eval_dataset/askapatient"
# askapatient_data = load_data(
#     askapatient_directory, filter_composite=True, filter_duplicate=True
# )
# domain = []
# for item in askapatient_data:
#     _, cui = item
#     if cui in concept_dict_to_name:
#         if concept_dict_to_name[cui] not in domain:
#             domain.append(concept_dict_to_name[cui])
# print(f"domains in ncbi are  = {domain}")

In [11]:
# import json

# def load_jsonl(file_path: str) -> list:
#     """
#     Loads a .jsonl file into a list of dictionaries.

#     Parameters:
#         file_path (str): The path to the .jsonl file.

#     Returns:
#         list: A list of dictionaries representing the .jsonl content.
#     """
#     with open(file_path, 'r', encoding='utf-8') as f:
#         return [json.loads(line) for line in f]

# def save_jsonl(file_path: str, data: list):
#     """
#     Saves a list of dictionaries into a .jsonl file.

#     Parameters:
#         file_path (str): The path to the .jsonl file.
#         data (list): A list of dictionaries to be saved.
#     """
#     with open(file_path, 'w', encoding='utf-8') as f:
#         for entry in data:
#             f.write(json.dumps(entry) + '\n')

# def append_jsonl(source_file: str, destination_file: str):
#     """
#     Appends the content of source_file to the end of destination_file, avoiding duplicates.

#     Parameters:
#         source_file (str): The path to the source .jsonl file.
#         destination_file (str): The path to the destination .jsonl file.
#     """
#     source_data = load_jsonl(source_file)
#     destination_data = load_jsonl(destination_file)

#     destination_content = {json.dumps(entry) for entry in destination_data}

#     for entry in source_data:
#         entry_json = json.dumps(entry)
#         if entry_json not in destination_content:
#             destination_data.append(entry)
#             destination_content.add(entry_json)
#     destination_file = '/workspace/rag_pipeline/data/output/concepts_all_v1.jsonl'
#     save_jsonl(destination_file, destination_data)

# # Usage example
# destination_file_path = '/workspace/rag_pipeline/data/output/concepts_all.jsonl'
# source_file_path= '/workspace/rag_pipeline/data/output/medra_concepts.jsonl'
# append_jsonl(source_file_path, destination_file_path)


In [12]:
;;

  ("")("")
  ("")("")
  ("")("")


TypeError: 'str' object is not callable

In [1]:
import json
import pickle

# Path to your JSONL file
jsonl_file_path = "data/output/sapbert_embedding_docs.jsonl"
# Path where the Pickle file will be saved
pkl_file_path = "data/output/sapbert_embedding_docs.pkl"

# Read the JSONL file: one JSON object per line. The encoding is explicit so
# the result does not depend on the platform default, and blank lines (e.g. a
# trailing newline at end of file) are skipped instead of crashing json.loads.
with open(jsonl_file_path, "r", encoding="utf-8") as file:
    data = [json.loads(line) for line in file if line.strip()]

# Serialize the data to a Pickle file
with open(pkl_file_path, "wb") as file:
    pickle.dump(data, file)

In [None]:
from langchain.schema import Document
import json
from tqdm import tqdm
from typing import Iterable


def create_document(data):
    """
    Build a ``Document`` from a dict that nests its fields under ``"kwargs"``.

    Parameters
    ----------
    data : dict
        Mapping whose ``"kwargs"`` entry may hold ``"page_content"`` and
        ``"metadata"``.

    Returns
    -------
    Document or None
        The constructed document. A missing ``page_content`` becomes ``""``
        and a missing ``metadata`` becomes ``{}``. ``None`` is returned (after
        printing the error) when construction fails.
    """
    try:
        # "kwargs" may be absent or explicitly null; treat both as empty.
        kwargs = data.get("kwargs") or {}
        # Default to an empty string: page_content is text, not a mapping.
        page_content = kwargs.get("page_content", "")
        metadata = kwargs.get("metadata", {})

        return Document(page_content=page_content, metadata=metadata)

    except Exception as e:
        print(f"Error loading document: {e}")
        # Signal failure to the caller; it must check for None.
        return None


def load_docs_from_jsonl(file_path) -> list:
    """
    Load, filter and de-duplicate documents from a JSONL file.

    Each non-blank line is parsed as JSON and turned into a ``Document``:
    first via ``Document(**data)`` and, if that raises, via
    ``create_document(data)``. Lines that cannot be converted, or whose
    metadata has no ``vocab`` in the allowed vocabulary set, are skipped.
    Documents are unique on ``(page_content, metadata)``.

    Parameters
    ----------
    file_path : str
        Path of the JSONL file to read.

    Returns
    -------
    list
        Unique documents sorted by lower-cased ``metadata["vocab"]``.
    """
    # Built once instead of on every line; a set gives O(1) membership tests.
    allowed_vocabs = {
        "snomed",
        "loinc",
        "atc",
        "ucum",
        "rxnorm",
        "omop extension",
        "mesh",
        "meddra",
    }
    docs_dict = {}
    with open(file_path, "r", encoding="utf-8") as jsonl_file:
        print("Opening file...")
        for line in tqdm(jsonl_file, desc="Loading Documents"):
            if not line.strip():
                # Tolerate blank lines such as a trailing newline.
                continue
            data = json.loads(line)
            try:
                obj = Document(**data)
            except Exception:
                # The line is not in flat Document form; try the nested
                # {"kwargs": {...}} layout instead.
                obj = create_document(data)
            if obj is None:
                # create_document already reported the failure.
                continue
            vocab = obj.metadata.get("vocab")
            if not isinstance(vocab, str) or vocab.lower() not in allowed_vocabs:
                continue

            # Unique key: page content plus a canonical (sorted-key) JSON dump
            # of the metadata, so key order in the input does not matter.
            key = (obj.page_content, json.dumps(obj.metadata, sort_keys=True))

            # Keep the first occurrence only.
            if key not in docs_dict:
                docs_dict[key] = obj

    # Sort by vocabulary so documents are processed in a stable order.
    sorted_docs = sorted(
        docs_dict.values(), key=lambda doc: doc.metadata["vocab"].lower()
    )
    print(f"Total Unique Documents: {len(sorted_docs)}")
    return sorted_docs


def save_docs_to_jsonl(array: Iterable[Document], file_path: str) -> None:
    """
    Write documents to ``file_path`` in JSONL form.

    The file is overwritten. Each document contributes one line: the string
    returned by its ``json()`` method followed by a newline.
    """
    with open(file_path, "w") as out_handle:
        out_handle.writelines(document.json() + "\n" for document in array)


# Filter and de-duplicate the SapBERT embedding documents, then write the
# result to a separate JSONL file.
save_docs_to_jsonl(
    array=load_docs_from_jsonl(
        file_path="data/output/sapbert_embedding_docs.jsonl"
    ),
    file_path="data/output/sapbert_emb_docs_json.jsonl",
)


In [None]:
# Count total and distinct CUIs in the NCBI-disease test dictionary.
dict_path = "./ncbi-disease/test_dictionary.txt"
with open(dict_path, "r") as f:
    lines = f.readlines()
# The CUI is the first "||"-separated field of each line.
cuis = [entry.split("||")[0] for entry in lines]

print(len(cuis))
print(len(set(cuis)))


In [None]:
# Count total and distinct CUIs in the NCBI-disease train dictionary.
dict_path = "./ncbi-disease/train_dictionary.txt"
with open(dict_path, "r") as f:
    lines = f.readlines()
# NOTE: despite its name, `cuis_test` holds the CUIs of the *train* dictionary.
# The CUI is the first "||"-separated field of each line.
cuis_test = [entry.split("||")[0] for entry in lines]
# Report the list that was just built; printing `cuis` would show stale counts
# left over from whichever cell assigned it last.
print(len(cuis_test))
print(len(set(cuis_test)))

In [None]:
# Count total and distinct CUIs in the BC5CDR-disease test dictionary.
dict_path = "./bc5cdr-disease/test_dictionary.txt"
with open(dict_path, "r") as f:
    lines = f.readlines()
# The CUI is the first "||"-separated field of each line.
cuis = [entry.split("||")[0] for entry in lines]
print(len(cuis))
print(len(set(cuis)))

In [None]:
# Count total and distinct CUIs in the MedMentions reference ontology.
dict_path = "./medmentions/umls2017aa_reference_ont_lower.txt"
with open(dict_path, "r") as f:
    lines = f.readlines()
# The CUI is the first "||"-separated field of each line.
cuis = [entry.split("||")[0] for entry in lines]
print(len(cuis))
print(len(set(cuis)))

In [None]:
# Count total and distinct CUIs in the AskAPatient dictionary.
dict_path = "./askapatient/AskAPatient.dict.txt"
with open(dict_path, "r") as f:
    lines = f.readlines()
# The CUI is the first "||"-separated field of each line.
cuis = [entry.split("||")[0] for entry in lines]
print(len(cuis))
print(len(set(cuis)))

In [None]:
# Count total and distinct CUIs in the AskAPatient dictionary.
# NOTE(review): this cell reads the same file as the AskAPatient dictionary cell before it.
dict_path = "./askapatient/AskAPatient.dict.txt"
with open(dict_path, "r") as f:
    lines = f.readlines()
# The CUI is the first "||"-separated field of each line.
cuis = [entry.split("||")[0] for entry in lines]
print(len(cuis))
print(len(set(cuis)))

In [None]:
# Average number of lines per AskAPatient validation fold (folds 0-9).
len_sum = 0
for i in range(10):
    dict_path = "./askapatient/AskAPatient.fold-" + str(i) + ".validation.txt"
    with open(dict_path, "rb") as f:
        lines = f.readlines()
    cuis = []
    for l in lines:
        # Decode the raw bytes instead of calling str() on them: str(bytes)
        # yields the "b'...'" repr, in which a tab is the two characters
        # backslash + "t", so splitting on a real tab never separated the CUI.
        # errors="replace" keeps the loop tolerant of stray non-UTF-8 bytes.
        l = l.decode("utf-8", errors="replace")
        cui = l.split("\t")[0]
        cuis.append(cui)
    len_sum += len(cuis)
    # print (len(set(cuis)))
print(len_sum / 10)

In [None]:
# Count total and distinct identifiers in the COMETA dictionary.
dict_path = "./cometa/COMETA_id_sf_dictionary.txt"
with open(dict_path, "r") as f:
    lines = f.readlines()
# The identifier is the first "||"-separated field of each line.
cuis = [entry.split("||")[0] for entry in lines]
print(len(cuis))
print(len(set(cuis)))

In [None]:
# npz data
import pandas as pd
import numpy as np

# Open the MIID archive. allow_pickle=True lets NumPy unpickle object arrays,
# which can execute arbitrary code, so only use it on a trusted file.
data = np.load("data/eval_datasets/miid/miid-data.npz", allow_pickle=True)

# Summarise every array stored in the archive.
for array_name in data.files:
    array = data[array_name]
    summary = f"Array Name: {array_name}, Shape: {array.shape}, Dtype: {array.dtype}"
    print(summary)


In [None]:
# Pull the mention and concept arrays out of the loaded archive.
mention_indices, mention_labels, mention_names = (
    data[key] for key in ("mention_indices", "mention_labels", "mention_names")
)
concept_indices, concept_names = data["concept_indices"], data["concept_names"]

# One frame per entity type.
mentions_df = pd.DataFrame(
    dict(
        mention_index=mention_indices,
        mention_label=mention_labels,
        mention_name=mention_names,
    )
)
concept_df = pd.DataFrame(
    dict(concept_index=concept_indices, concept_name=concept_names)
)

# Show a preview, then persist both frames as CSV (header row, no index column).
print("Mentions DataFrame:")
print(mentions_df.head())
for frame, csv_path in (
    (mentions_df, "data/eval_datasets/miid/miid-mentions.csv"),
    (concept_df, "data/eval_datasets/miid/miid-concepts.csv"),
):
    frame.to_csv(csv_path, index=False)


In [None]:
df = pd.read_csv("data/eval_datasets/miid/miid-mentions.csv")

# Queries file: one 'mention_index||mention_label' line per mention.
queries_path = "BIKH_disease_queries.txt"
bike_disease_queries = df[["mention_index", "mention_label"]].astype(str)
bike_disease_queries["combined"] = (
    bike_disease_queries["mention_index"] + "||" + bike_disease_queries["mention_label"]
)
# Write the lines directly. Series.to_csv applies CSV quoting, so any line
# containing a comma or a double quote would be wrapped in quotes and no
# longer be a plain '||'-delimited record.
with open(queries_path, "w", encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in bike_disease_queries["combined"])

# Report the file that was actually written.
print(f"{queries_path} created successfully.")

# Dictionary file: one 'mention_index||mention_name' line per mention.
dictionary_path = "data/eval_datasets/miid/mimic_dictionary.txt"
dictionary = df[["mention_index", "mention_name"]].astype(str)
dictionary["combined"] = dictionary["mention_index"] + "||" + dictionary["mention_name"]
with open(dictionary_path, "w", encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in dictionary["combined"])

print(f"{dictionary_path} created successfully.")

In [1]:
import csv


def generate_files(
    mention_csv_path,
    concept_csv_path,
    output_path_mention_queries,
    output_path_concept,
    skip_header=False,
):
    """
    Turn a mention CSV and a concept CSV into ``||``-delimited text files.

    Parameters
    ----------
    mention_csv_path : str
        CSV whose rows are ``mention_index, mention_label, mention_name``.
    concept_csv_path : str
        CSV whose rows are ``concept_id, label``.
    output_path_mention_queries : str
        Receives one ``concept_id||mention_name`` line for every mention whose
        label appears in the concept CSV (mentions with unknown labels are dropped).
    output_path_concept : str
        Receives one ``concept_id||label`` line for every concept in the
        concept CSV, whether or not a mention refers to it.
    skip_header : bool, optional
        When True, the first row of both CSV files is discarded. The default
        (False) treats every row as data, so a header row in the concept CSV
        ends up in ``output_path_concept``.

    Notes
    -----
    Rows with too few columns (including blank lines) are skipped. If a label
    occurs more than once in the concept CSV, the last concept id wins.
    """
    # Lookup from concept label to concept id.
    concept_id_by_label = {}
    with open(concept_csv_path, mode="r", newline="", encoding="utf-8") as file:
        reader = csv.reader(file)
        if skip_header:
            next(reader, None)
        for row in reader:
            if len(row) < 2:
                # Blank or truncated row: nothing to index.
                continue
            concept_id, label = row[0].strip(), row[1].strip()
            concept_id_by_label[label] = concept_id

    # Resolve each mention's label to a concept id.
    mention_queries = []
    with open(mention_csv_path, mode="r", newline="", encoding="utf-8") as file:
        reader = csv.reader(file)
        if skip_header:
            next(reader, None)
        for row in reader:
            if len(row) < 3:
                continue
            mention_label, mention_name = row[1].strip(), row[2].strip()
            if mention_label in concept_id_by_label:
                concept_id = concept_id_by_label[mention_label]
                mention_queries.append(f"{concept_id}||{mention_name}")

    # Write mention queries to output file
    with open(output_path_mention_queries, "w", encoding="utf-8") as file:
        file.writelines(f"{query}\n" for query in mention_queries)

    # Write every concept id and label to output file
    with open(output_path_concept, "w", encoding="utf-8") as file:
        for label, concept_id in concept_id_by_label.items():
            file.write(f"{concept_id}||{label}\n")


# Example usage: build the MIID query and concept files from the exported CSVs.
generate_files(
    mention_csv_path="data/eval_datasets/miid/miid-mentions.csv",
    concept_csv_path="data/eval_datasets/miid/miid-concepts.csv",
    output_path_mention_queries="data/eval_datasets/miid/miid_queries.txt",
    output_path_concept="data/eval_datasets/miid/miid_concepts.txt",
)


In [1]:
import os
import glob
import argparse
import json


def process_directory(directory):
    """
    Merge the AskAPatient test folds in ``directory`` into two text files.

    Files named ``AskAPatient.fold-<digit>.test.txt`` are read in sorted order.
    Each line is expected to hold at least three tab-separated columns
    (CUI, label, mention text); shorter lines are reported and skipped.

    Outputs, both written into ``directory``:

    * ``combined_queries.txt`` - unique ``cui||mention_text`` lines, in
      first-seen order (``cui-less`` rows included).
    * ``test_dictionary.txt``  - ``cui||label`` lines, one per CUI other than
      ``cui-less`` (compared case-insensitively); a later label for the same
      CUI replaces an earlier one.

    Returns None. If no fold file is found, a message is printed and nothing
    is written. I/O and decoding errors are reported and processing continues.
    """
    # Define the file pattern
    pattern = os.path.join(directory, "AskAPatient.fold-[0-9].test.txt")

    # glob order is arbitrary; sort so the outputs are reproducible.
    files = sorted(glob.glob(pattern))

    if not files:
        print(f"No files found matching the pattern in directory: {directory}")
        return

    cui_label_dict = {}
    # A dict used as an insertion-ordered set: unlike set(), its iteration
    # order does not change between runs.
    combined_queries = {}

    for file_path in files:
        print(f"Processing file: {file_path}")
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                for line_number, line in enumerate(file, 1):
                    # Strip newline characters and split by tab
                    parts = line.strip().split("\t")

                    if len(parts) < 3:
                        print(
                            f"Warning: Line {line_number} in {file_path} does not have at least 3 columns. Skipping."
                        )
                        continue

                    cui, label, mention_text = parts[0], parts[1], parts[2]

                    # Latest label overwrites previous ones for the same CUI.
                    if cui.lower() != "cui-less":
                        cui_label_dict[cui] = label
                    # Duplicates collapse on the full "cui||mention" key.
                    combined_queries[f"{cui}||{mention_text}"] = None

        except (OSError, UnicodeError) as e:
            print(f"Error processing file {file_path}: {e}")

    # Write combined_queries.txt
    combined_queries_path = os.path.join(directory, "combined_queries.txt")
    try:
        with open(combined_queries_path, "w", encoding="utf-8") as cq_file:
            for query in combined_queries:
                cq_file.write(f"{query}\n")
        print(f"Combined queries written to: {combined_queries_path}")
    except OSError as e:
        print(f"Error writing combined_queries.txt: {e}")

    # Write the dictionary to a "||"-separated .txt file
    dict_path = os.path.join(directory, "test_dictionary.txt")
    try:
        with open(dict_path, "w", encoding="utf-8") as dict_file:
            for cui, label in cui_label_dict.items():
                dict_file.write(f"{cui}||{label}\n")
        print(f"CUI to Label dictionary written to: {dict_path}")
    except OSError as e:
        print(f"Error writing test_dictionary.txt: {e}")


# Merge the AskAPatient test folds into combined query and dictionary files.
process_directory(directory="data/eval_datasets/askapatient/train_test")


Processing file: data/eval_datasets/askapatient/train_test/AskAPatient.fold-0.test.txt
Processing file: data/eval_datasets/askapatient/train_test/AskAPatient.fold-1.test.txt
Processing file: data/eval_datasets/askapatient/train_test/AskAPatient.fold-2.test.txt
Processing file: data/eval_datasets/askapatient/train_test/AskAPatient.fold-3.test.txt
Processing file: data/eval_datasets/askapatient/train_test/AskAPatient.fold-4.test.txt
Processing file: data/eval_datasets/askapatient/train_test/AskAPatient.fold-5.test.txt
Processing file: data/eval_datasets/askapatient/train_test/AskAPatient.fold-6.test.txt
Processing file: data/eval_datasets/askapatient/train_test/AskAPatient.fold-7.test.txt
Processing file: data/eval_datasets/askapatient/train_test/AskAPatient.fold-8.test.txt
Processing file: data/eval_datasets/askapatient/train_test/AskAPatient.fold-9.test.txt
Combined queries written to: data/eval_datasets/askapatient/train_test/combined_queries.txt
CUI to Label dictionary written to: da

# Transform mapped dictionary to reference set

In [159]:
import pandas as pd

df = pd.read_csv(
    "data/eval_datasets/Icare_data/CHECK-CHF_DATA_DICTIONARY_update_mapped.csv",
    dtype=str,
)

# Each entry is a "codes||mention||domain" line. Columns are addressed by
# position: itertuples() puts the DataFrame index at row[0], so row[1] is the
# first CSV column.
data = []
for row in df.itertuples():
    domain = row[10].strip() if pd.notna(row[10]) else ""

    # Main entry: mention from column 2, codes from column 9 (plus column 13
    # when present).
    if pd.notna(row[2]) and pd.notna(row[9]):
        mention = row[2].strip()
        label_codes = row[9] if pd.isna(row[13]) else f"{row[9]}|{row[13]}"
        data.append(f"{label_codes}||{mention}||{domain}")

    # Categorical values (column 5) paired positionally with their codes
    # (column 17); zip stops at the shorter of the two lists.
    if pd.notna(row[5]) and pd.notna(row[17]):
        for cat_mention, cat_code in zip(row[5].split("|"), row[17].split("|")):
            data.append(f"{cat_code.strip()}||{cat_mention.strip()}||observation")

    # Unit (column 4, written as-is) with its code (column 20).
    if pd.notna(row[4]) and pd.notna(row[20]):
        data.append(f"{row[20].strip()}||{row[4]}||unit")

    # Visit (column 6), coded by the last "|"-separated entry of column 13.
    if pd.notna(row[6]) and pd.notna(row[13]):
        visit_code = row[13].strip().split("|")[-1]
        data.append(f"{visit_code}||{row[6].strip()}||visit")
print(len(data))

#

590


In [160]:
# Show the last five generated reference-set entries as a quick sanity check.
data[-5:]

['46234437|45885250|45883167||ivabradina: dosage||drug',
 '8576||mg (miligram)| o.i.d| b.i.d||unit',
 '46234437|4162374|4145077||ivabradina: daily dose||drug',
 '46234437|4145077||Ivabradina: dosing frequency||drug',
 '46234437|4162374|4145077||ivabradina total daily dose||drug']

# CHECK-CHF Dictionary


#

In [161]:
# import pandas as pd

# df = pd.read_csv(
#     "data/eval_datasets/Icare_data/updated_giss3+TIME_CHF+CHECK_CHF_mix_mapped.csv",
#     dtype=str,
# )

In [162]:
# # dictionary
# ground_truth = []
# for row in df.itertuples():
#     codes = ""
#     labels = ""
#     if pd.notna(row[7]):
#         labels = row[7]
#     if pd.notna(row[9]):
#         codes = row[9]
#     if pd.notna(row[11]):
#         value = row[11]
#         if "|" in row[11]:
#             value = row[11].replace("|", "||")
#         labels = f"{labels}||{value}"
#     if pd.notna(row[13]):
#         codes = f"{codes}|{row[13]}"
#     if pd.notna(row[15]):
#         value = row[15]
#         if "|" in row[15]:
#             value = row[15].replace("|", "||")
#         labels = f"{labels}||{value}"
#     if pd.notna(row[17]):
#         codes = f"{codes}|{row[17]}"
#     if pd.notna(row[18]):
#         value = row[18]
#         if "|" in row[18]:
#             value = row[18].replace("|", "||")
#         labels = f"{labels}||{value}"
#     if pd.notna(row[20]):
#         codes = f"{codes}|{row[20]}"
#     codes = codes.split("|")
#     labels = labels.split("||")
#     print(codes, labels)
#     max_num = min(len(codes), len(labels))
#     for i in range(max_num):
#         ground_truth.append(f"{codes[i]}||{labels[i]}")

In [163]:
# # data[:5]
# ground_truth[:5]

In [164]:
# ;;;

In [165]:
# Append the entries to the reference-set text file, one per line.
# Mode "a" adds to whatever the file already holds, so re-running this cell
# appends the same entries again.
with open("data/eval_datasets/Icare_data/reference_set_v2.txt", "a") as f:
    f.writelines("%s\n" % entry for entry in data)

In [166]:
# with open(
#     "/workspace/other_tasks/syn_map/eval_dataset/custom_data/custom_dict.txt", "a"
# ) as f:
#     for item in ground_truth:
#         f.write("%s\n" % item)

In [1]:
from rag.utils import load_docs_from_jsonl

# Load the BC5CDR-disease test dictionary documents.
# NOTE(review): `docs` is reassigned to a hard-coded sample list further down in
# this cell, so the result of this load is not what the similarity loop iterates over.
docs = load_docs_from_jsonl(
    "data/eval_datasets/original_bc5cdr-disease/test_dictionary_docs.jsonl"
)


# Token-overlap (Jaccard) similarity between document labels.

def jaccard_similarity(str1, str2):
    """
    Token-level Jaccard similarity of two strings.

    Both strings are split on whitespace into sets of tokens (case-sensitive);
    the score is ``|intersection| / |union|`` of those sets.

    Parameters
    ----------
    str1, str2 : str
        Strings to compare.

    Returns
    -------
    float
        A value in ``[0, 1]``. When neither string contains a token the union
        is empty and ``0.0`` is returned instead of dividing by zero.
    """
    set1 = set(str1.split())
    set2 = set(str2.split())

    union = set1 | set2
    if not union:
        # Two empty token sets: the ratio is undefined, report no overlap.
        return 0.0

    return len(set1 & set2) / len(union)

# Sample input that mimics the structure of `docs`.
docs = [
    {"metadata": {"label": "Hypertension"}},
    {"metadata": {"label": "Pulmonary Hypertension"}},
    {"metadata": {"label": "Systemic Hypertension"}},
    {"metadata": {"label": "Hypertensive Disorder"}},
]

# Score every ordered pair of documents that are not equal to each other, so
# each pair appears in both directions.
similarities = [
    (
        first["metadata"]["label"],
        second["metadata"]["label"],
        jaccard_similarity(first["metadata"]["label"], second["metadata"]["label"]),
    )
    for first in docs
    for second in docs
    if first != second
]

# Keep only the pairs whose similarity exceeds 0.5.
high_similarity_pairs = [triple for triple in similarities if triple[2] > 0.5]

high_similarity_pairs


ModuleNotFoundError: No module named 'rag'