In [20]:
from collections import defaultdict
from difflib import SequenceMatcher

# Define file paths
files = [
    "/workspace/mapping_tool/data/eval_datasets/icare4cvd_reference/references_v3.txt",
    "/workspace/mapping_tool/data/eval_datasets/original_miid/mimic_mention.txt",
    "/workspace/mapping_tool/data/eval_datasets/original_bc5cdr-disease/combined_test_queries.txt",
    "/workspace/mapping_tool/data/eval_datasets/original_ncbi-disease/combined_test_queries.txt",
]

# Dictionary to store CUI and query names from each file
file_queries = defaultdict(set)
all_queries = []
all_queries_unique = set()

# Helper function to calculate similarity
def is_similar(query1, query2, threshold=0.8):
    return SequenceMatcher(None, query1, query2).ratio() >= threshold

# Collect queries from each file
for file in files:
    with open(file, "r") as f:
        lines = f.readlines()
        for line in lines:
            line_parts = str(line).strip().lower().split("||")
            if len(line_parts) >= 2:  # Handle lines with at least 2 parts
                cui = line_parts[0].strip()
                name = line_parts[1].strip()
                query = name
                file_queries[file].add(query)
                all_queries.append(query)
                all_queries_unique.add(query)

# Check for common queries with similarity threshold
common_queries = set()
for query1 in all_queries_unique:
    for query2 in all_queries_unique:
        if query1 != query2 and is_similar(query1, query2):
            common_queries.add(query1)
            common_queries.add(query2)

# Find unique queries for each file
unique_queries = {
    file: queries - common_queries for file, queries in file_queries.items()
}

# Display results
print("\nTotal Queries:")
print(f"{len(all_queries)} total queries")

print(f"\nNumber of Common Queries (with Similarity >= 0.8): {len(common_queries)}")
print("\nUnique Queries per File:")
for file, queries in unique_queries.items():
    print(f"{file}: {len(queries)} unique queries")



Total Queries:
3593 total queries

Number of Common Queries (with Similarity >= 0.8): 1529

Unique Queries per File:
/workspace/mapping_tool/data/eval_datasets/icare4cvd_reference/references_v3.txt: 294 unique queries
/workspace/mapping_tool/data/eval_datasets/original_miid/mimic_mention.txt: 868 unique queries
/workspace/mapping_tool/data/eval_datasets/original_bc5cdr-disease/combined_test_queries.txt: 637 unique queries
/workspace/mapping_tool/data/eval_datasets/original_ncbi-disease/combined_test_queries.txt: 186 unique queries


In [21]:
# now we have four dictionaries and we need to check wether the cuis for queries are referrning to same label or not


In [22]:
# Collect queries from each file
file_queries = defaultdict(set)
all_queries = set()
for file in files:
    with open(file, "r") as f:
        lines = f.readlines()
        for line in lines:
            line_parts = str(line).strip().lower().split("||")
            if len(line_parts) >= 2:  # Handle lines with at least 2 parts
                cui = line_parts[0].strip()
                name = line_parts[1].strip()
                cuis = cui.split("|")
                for cui in cuis:
                    file_queries[file].add(cui)
                    all_queries.add(cui)

print(f"all queries: {len(all_queries)}")
# Find common and unique queries
common_queries = set.intersection(*file_queries.values())
unique_queries = {
    file: queries - common_queries for file, queries in file_queries.items()
}
# show common quueries
print("\nCommon Queries:")
print(f"{len(common_queries)} common queries")

print("\nUnique Queries per File:")
for file, queries in unique_queries.items():
    print(f"{file}: {len(queries)} unique queries")

all queries: 2931

Common Queries:
0 common queries

Unique Queries per File:
/workspace/mapping_tool/data/eval_datasets/icare4cvd_reference/references_v3.txt: 515 unique queries
/workspace/mapping_tool/data/eval_datasets/original_miid/mimic_mention.txt: 1566 unique queries
/workspace/mapping_tool/data/eval_datasets/original_bc5cdr-disease/combined_test_queries.txt: 686 unique queries
/workspace/mapping_tool/data/eval_datasets/original_ncbi-disease/combined_test_queries.txt: 230 unique queries
