In [None]:
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

# Generic terms that are too unspecific to be useful as synonyms.
# Kept as a list, in the original order (note that 'quant' occurs twice).
STOP_WORDS = [
    # generic qualifiers
    'stop', 'start', 'combinations', 'combination', 'various combinations', 'various',
    'left', 'right', 'blood', 'finding', 'finding status', 'status', 'extra',
    'point in time', 'oral', 'product', 'oral product',
    'several', 'types', 'several types', 'random', 'nominal',
    # measurement scale / property abbreviations
    'p time', 'quant', 'qual', 'quantitative', 'qualitative', 'ql', 'qn', 'quant',
    # specimens and broad clinical categories
    'anti', 'antibodies', 'whole blood', 'serum', 'plasma',
    'diseases', 'disorders', 'disorder', 'disease',
    'lab test', 'measurements', 'lab tests', 'meas value', 'measurement',
    'procedure', 'procedures', 'panel', 'ordinal',
    # ordering, surveys and dosing
    'after', 'before', 'survey', 'level', 'levels', 'others', 'other',
    'p dose', 'dose', 'dosage', 'frequency',
]

def add_relational_synonyms(concepts, relations):
    """Append the names of related concepts to ``concept_synonym_name``.

    For every row of ``relations``:

    * ``'Is a'``: the name of ``concept_id_2`` becomes a synonym of
      ``concept_id_1`` (one direction only).
    * trade-name / cross-vocabulary equivalence relations: each concept
      receives the other concept's name (both directions).

    New synonyms are appended to any existing value using the ``";; "``
    separator, in sorted order so repeated runs give identical output.

    Args:
        concepts: DataFrame with ``concept_id``, ``concept_name`` and
            ``concept_synonym_name`` columns. Modified in place.
        relations: DataFrame with ``concept_id_1``, ``concept_id_2`` and
            ``relationship_id`` columns.

    Returns:
        The same ``concepts`` DataFrame, after it has also been written to
        ``hier_concepts.csv``.
    """
    relevant_relations = {'Tradename of', 'RxNorm - SNOMED eq', 'ATC - RxNorm', 'ATC - SNOMED eq', 'Has tradename'}

    # Build the id -> name lookup once instead of scanning the whole frame for
    # every relation row. Missing/empty names are skipped so a NaN can never
    # end up in a synonym set (it would break the join below), and ids that
    # are absent from `concepts` are simply ignored instead of raising.
    name_by_id = {}
    for concept_id, concept_name in zip(concepts['concept_id'], concepts['concept_name']):
        if isinstance(concept_name, str) and concept_name:
            name_by_id.setdefault(concept_id, concept_name)

    concept_synonyms = defaultdict(set)

    triples = zip(relations['concept_id_1'], relations['concept_id_2'], relations['relationship_id'])
    for concept_id_1, concept_id_2, relationship_id in tqdm(
            triples, total=relations.shape[0], desc='Processing relations'):
        if relationship_id == 'Is a':
            parent_name = name_by_id.get(concept_id_2)
            if parent_name is not None:
                concept_synonyms[concept_id_1].add(parent_name)
        elif relationship_id in relevant_relations:
            concept_name_1 = name_by_id.get(concept_id_1)
            concept_name_2 = name_by_id.get(concept_id_2)
            if concept_name_1 is not None and concept_name_2 is not None:
                concept_synonyms[concept_id_1].add(concept_name_2)
                concept_synonyms[concept_id_2].add(concept_name_1)

    def _merge(existing, additions):
        # Rows without new synonyms keep their current value (including NaN).
        if not additions:
            return existing
        joined = ";; ".join(sorted(additions))
        if pd.isna(existing):
            return joined
        return existing + ";; " + joined

    # Update the concepts DataFrame in a single pass over its rows.
    concepts['concept_synonym_name'] = [
        _merge(existing, concept_synonyms.get(concept_id))
        for concept_id, existing in zip(concepts['concept_id'], concepts['concept_synonym_name'])
    ]

    concepts.to_csv("/workspace/rag_pipeline/data/input/omop_v5.4/hier_concepts.csv", index=False)

    return concepts

def load_data():
    """Read the concept and relationship CSVs (all columns as strings) and
    return the concepts enriched by ``add_relational_synonyms``."""
    csv_options = {'dtype': str}
    concept_table = pd.read_csv(
        '/workspace/rag_pipeline/data/gat_bert_data/omop_dir_1/concepts_chv.csv', **csv_options)
    relation_table = pd.read_csv(
        '/workspace/rag_pipeline/data/gat_bert_data/omop_dir_1/relationships.csv', **csv_options)
    return add_relational_synonyms(concept_table, relation_table)

def filter_synonyms(entity, entity_synonyms: set):
    """Return the synonyms that are not listed in ``STOP_WORDS``.

    ``entity`` is accepted for call compatibility but is not used.
    """
    kept = set()
    for candidate in entity_synonyms:
        if candidate in STOP_WORDS:
            continue
        kept.add(candidate)
    return kept

def build_ancestry(is_a_relations):
    """Map every child concept to the list of all of its ancestors.

    Args:
        is_a_relations: DataFrame of 'Is a' rows where ``concept_id_1`` is
            the child and ``concept_id_2`` is the parent.

    Returns:
        dict mapping each child id to its ancestors, nearest first
        (depth-first: a parent, then that parent's ancestors, then the next
        parent). Each ancestor appears once and a concept is never listed as
        its own ancestor, even when the relations contain a cycle.
    """
    if is_a_relations.empty:
        return {}

    # Pass 1: direct parents per child, first-seen order, no repeats.
    direct_parents = {}
    for child, parent in zip(is_a_relations['concept_id_1'], is_a_relations['concept_id_2']):
        parents = direct_parents.setdefault(child, [])
        if parent not in parents:
            parents.append(parent)

    # Pass 2: walk upwards from every child. Doing this after all rows have
    # been read makes the result independent of row order; a single pass only
    # sees grandparents whose rows happened to come first.
    ancestry = {}
    for child, parents in direct_parents.items():
        visited = {child}
        lineage = []
        pending = parents[::-1]  # stack; reversed so the first parent is popped first
        while pending:
            node = pending.pop()
            if node in visited:
                continue
            visited.add(node)
            lineage.append(node)
            pending.extend(direct_parents.get(node, [])[::-1])
        ancestry[child] = lineage
    return ancestry

def main():
    """Entry point of this cell: load the concepts and enrich their synonyms."""
    # The later pipeline stages are still switched off:
    # ancestry = build_ancestry(is_a_relations)
    # create_title_text(concepts, synonyms, ancestry)
    _ = load_data()


if __name__ == '__main__':
    main()


In [None]:
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

# Generic terms that are too unspecific to be useful as synonyms.
# Kept as a list, in the original order (note that 'quant' occurs twice).
STOP_WORDS = [
    'stop', 'start', 'combinations', 'combination', 'various combinations',
    'various', 'left', 'right', 'blood', 'finding',
    'finding status', 'status', 'extra', 'point in time', 'oral',
    'product', 'oral product', 'several', 'types', 'several types',
    'random', 'nominal', 'p time', 'quant', 'qual',
    'quantitative', 'qualitative', 'ql', 'qn', 'quant',
    'anti', 'antibodies', 'whole blood', 'serum', 'plasma',
    'diseases', 'disorders', 'disorder', 'disease', 'lab test',
    'measurements', 'lab tests', 'meas value', 'measurement', 'procedure',
    'procedures', 'panel', 'ordinal', 'after', 'before',
    'survey', 'level', 'levels', 'others', 'other',
    'p dose', 'dose', 'dosage', 'frequency',
]

# Relationship ids whose related concept names are harvested as synonyms.
RELEVANT_RELATIONS = [
    'RxNorm - SNOMED eq',
    'ATC - RxNorm',
    'ATC - SNOMED eq',
    'Has tradename',
    'Is a',
]

def filter_relevant_relations(relations):
    """Keep only the rows whose ``relationship_id`` is in ``RELEVANT_RELATIONS``."""
    is_relevant = relations['relationship_id'].isin(RELEVANT_RELATIONS)
    return relations.loc[is_relevant]

def add_relational_synonyms(concepts, relations, domain_id_priority='Drug', max_synonyms=10):
    """Merge the names of related concepts into ``concept_synonym_name``.

    For every row of ``relations``:

    * ``'Is a'``: the name of ``concept_id_2`` becomes a synonym of
      ``concept_id_1`` (one direction only).
    * a relationship listed in ``RELEVANT_RELATIONS``, or any ``... eq`` /
      ``'Has tradename'`` relationship whose first concept belongs to
      ``domain_id_priority``: each concept receives the other one's name.

    For each concept that gained synonyms, the existing ``';;'``-separated
    synonyms are kept first, followed by the new ones in sorted order.
    Entries are stripped, de-duplicated, filtered against ``STOP_WORDS`` and
    capped at ``max_synonyms`` before being joined with ``';; '``.

    Args:
        concepts: DataFrame with ``concept_id``, ``concept_name``,
            ``concept_synonym_name`` and ``domain_id`` columns. Modified in place.
        relations: DataFrame with ``concept_id_1``, ``concept_id_2`` and
            ``relationship_id`` columns.
        domain_id_priority: domain whose equivalence/trade-name relations are
            used even when the relationship id is not in ``RELEVANT_RELATIONS``.
        max_synonyms: maximum number of synonyms kept per concept; ``None``
            disables the cap.

    Returns:
        The same ``concepts`` DataFrame with the updated synonym column.
    """
    stop_words = set(STOP_WORDS)

    # Plain dictionaries for constant-time lookups. Concepts without a usable
    # name are left out so a NaN never reaches the join below.
    name_by_id = {
        concept_id: concept_name
        for concept_id, concept_name in zip(concepts['concept_id'], concepts['concept_name'])
        if isinstance(concept_name, str) and concept_name
    }
    domain_by_id = dict(zip(concepts['concept_id'], concepts['domain_id']))

    concept_synonyms = defaultdict(set)

    triples = zip(relations['concept_id_1'], relations['concept_id_2'], relations['relationship_id'])
    for concept_id_1, concept_id_2, relationship_id in tqdm(
            triples, total=relations.shape[0], desc='Processing relations'):
        if not isinstance(relationship_id, str):
            continue  # missing relationship id: nothing to classify

        if relationship_id == 'Is a':
            if concept_id_2 in name_by_id:
                concept_synonyms[concept_id_1].add(name_by_id[concept_id_2])
            continue

        is_priority_equivalence = (
            (relationship_id.endswith('eq') or relationship_id == 'Has tradename')
            and domain_by_id.get(concept_id_1) == domain_id_priority
        )
        if is_priority_equivalence or relationship_id in RELEVANT_RELATIONS:
            if concept_id_1 in name_by_id and concept_id_2 in name_by_id:
                concept_synonyms[concept_id_1].add(name_by_id[concept_id_2])
                concept_synonyms[concept_id_2].add(name_by_id[concept_id_1])

    def _merge(existing, additions):
        # Concepts that gained nothing keep their current value (including NaN).
        if not additions:
            return existing
        current = existing.split(';;') if isinstance(existing, str) else []
        merged, seen = [], set()
        for synonym in current + sorted(additions):
            # Stored values are joined with ';; ', so split parts carry a
            # leading space; strip before comparing against anything.
            synonym = synonym.strip()
            if synonym and synonym not in stop_words and synonym not in seen:
                seen.add(synonym)
                merged.append(synonym)
        return ';; '.join(merged[:max_synonyms])

    # Assign by row position/column rather than `.at[concept_id, ...]`: the
    # frame is indexed by row number, so using the id as a label would append
    # new rows instead of updating the matching concept.
    concepts['concept_synonym_name'] = [
        _merge(existing, concept_synonyms.get(concept_id))
        for concept_id, existing in zip(concepts['concept_id'], concepts['concept_synonym_name'])
    ]

    return concepts

def load_data_in_chunks(concepts_path, relations_path, chunk_size=5000):
    """Enrich the concepts with relational synonyms, reading relations chunk-wise.

    The concept table is read in full; the relations file is streamed in
    chunks of ``chunk_size`` rows. After each chunk the current concept table
    is written to a numbered snapshot CSV. Returns the final concept table.
    """
    concepts = pd.read_csv(concepts_path, dtype=str)
    relation_chunks = pd.read_csv(
        relations_path,
        dtype=str,
        chunksize=chunk_size,
        usecols=['concept_id_1', 'concept_id_2', 'relationship_id'],
    )
    for index, chunk in enumerate(relation_chunks):
        concepts = add_relational_synonyms(concepts, filter_relevant_relations(chunk))
        concepts.to_csv(f"/workspace/rag_pipeline/data/gat_bert_data/omop_dir_1/chunks/concept_chunk{index}.csv", index=False)
    return concepts

def filter_synonyms(entity, entity_synonyms: set):
    """Drop every synonym that appears in ``STOP_WORDS``; ``entity`` is unused."""
    def is_informative(synonym):
        return synonym not in STOP_WORDS

    return set(filter(is_informative, entity_synonyms))

def build_ancestry(is_a_relations):
    """Map every child concept to the list of all of its ancestors.

    Args:
        is_a_relations: DataFrame of 'Is a' rows where ``concept_id_1`` is
            the child and ``concept_id_2`` is the parent.

    Returns:
        ``defaultdict(list)`` mapping each child id to its ancestors, nearest
        first (depth-first: a parent, then that parent's ancestors, then the
        next parent). Each ancestor appears once and a concept is never
        listed as its own ancestor, even when the relations contain a cycle.
    """
    ancestry = defaultdict(list)
    if is_a_relations.empty:
        return ancestry

    # Pass 1: direct parents per child, first-seen order, no repeats.
    direct_parents = {}
    for child, parent in zip(is_a_relations['concept_id_1'], is_a_relations['concept_id_2']):
        known = direct_parents.setdefault(child, [])
        if parent not in known:
            known.append(parent)

    # Pass 2: climb from every child only after all rows are known, so the
    # result no longer depends on parents being listed before their children.
    for child, parents in direct_parents.items():
        visited = {child}
        lineage = ancestry[child]
        pending = list(reversed(parents))  # stack; first parent is popped first
        while pending:
            node = pending.pop()
            if node in visited:
                continue
            visited.add(node)
            lineage.append(node)
            pending.extend(reversed(direct_parents.get(node, [])))
    return ancestry

def main():
    """Run the chunked synonym enrichment and write the final concept table."""
    enriched = load_data_in_chunks(
        '/workspace/rag_pipeline/data/gat_bert_data/omop_dir_1/concepts_chv.csv',
        '/workspace/rag_pipeline/data/gat_bert_data/omop_dir_1/filtered_relationships.csv',
    )
    enriched.to_csv(
        "/workspace/rag_pipeline/data/gat_bert_data/omop_dir_1/updated_concepts.csv",
        index=False,
    )


if __name__ == '__main__':
    main()


In [None]:
# def create_title_text(concepts, synonyms, ancestry):
#     # Open a file to write the outputs
#     # with open('/workspace/rag_pipeline/data/documents.txt', 'w') as file:
#     outputs = []
#     for _, concept in concepts.iterrows():
#             concept_id = concept['concept_id']
#             concept_name = concept['concept_name']
#             concept_synonyms = set()
#             if  not pd.isna(concept['concept_synonym_name']):
#                 concept_synonyms = set(filter_synonyms(concept_name, set(concept['concept_synonym_name'].split(';;'))))
#             # Get synonyms and related terms
#             related_terms = synonyms[synonyms['concept_id'] == concept_id]
#             if not related_terms.empty:
#                 for col in ['uml_term', 'term', 'chv_term']:
#                     if col in related_terms.columns:
#                         terms = related_terms[col].dropna().str.split(';;').explode().dropna().unique()
#                         concept_synonyms.update(terms)
#             concept_synonyms.update([concept_name])  # Correctly update the set
#             all_terms = ';;'.join(concept_synonyms)  # Join the updated set into a string
#             meta_data = {"concept_class":concept['concept_class_id'], "domain":concept['domain_id'], "code":concept['concept_code'], "cid":concept['concept_id'],
#                          "vocabulary":concept['vocabulary_id']}
#             # Prepare the title with all ancestor entities
#             ancestor_ids = ancestry.get(concept_id, [])
#             ancestor_names = concepts[concepts['concept_id'].isin(ancestor_ids)]['concept_name'].tolist()
#             title = ';;'.join(ancestor_names) if ancestor_names else concept_name

#             # Prepare the text with concept details
#             text = f"{all_terms}"

#             # Write to file
            
#             text = f"{title}\t{all_terms}\t{meta_data}"
#             outputs.append(text)
#     with open('/workspace/rag_pipeline/data/output/documents.txt', 'w') as file:
#         file.write("title\ttext\tmetadata\n")  # Writing the header
#         file.write("\n".join(outputs))  # Writing all the data

In [None]:
# from Levenshtein import distance as edit_distance
from graph_omop import Omop
# import matplotlib.pyplot as plt
import pandas as pd
# import seaborn as sns
# Directory handed to the Omop taxonomy loader below.
DATA_DIR = '/workspace/rag_pipeline/data/input/omop_v5.4/extension_concepts'
# Plot settings; not referenced in this cell while the plotting imports are commented out.
FIGSIZE = (12,6)
TITLE_ARGS = {'fontsize': 20, 'y': 1.02}
# sns.set(style="whitegrid")
# Build the project's Omop helper in taxonomy mode and load its concepts.
# NOTE(review): what `taxonomy=True` and `load_concepts()` do is defined in graph_omop, not visible here.
omop_taxonomy = Omop(DATA_DIR, taxonomy=True)
omop_taxonomy.load_concepts()
#omop_taxonomy.build_domain_pair_for_training('/Users/komalgilani/Downloads/rag_pipeline_24May/data/input/unit1_concept_pairs_wth_semantic_type.csv',domain_id='unit', add_semantic_type=True)
#omop_taxonomy.circulam_pair_for_training('/Users/komalgilani/Downloads/rag_pipeline_24May/data/input/train.txt', add_semantic_type=True)

In [None]:
import hashlib
from qdrant_client import QdrantClient
from qdrant_client.http.models import Filter, PointStruct,PointIdsList
import json

def hash_content(content):
    """Return the hexadecimal MD5 digest of ``content`` (encoded as UTF-8).

    Used as a compact fingerprint for spotting duplicate content.
    """
    digest = hashlib.md5()
    digest.update(content.encode('utf-8'))
    return digest.hexdigest()

def fetch_all_points(client, collection_name, limit=500):
    """Collect every point of a Qdrant collection by scrolling page by page.

    Each request asks for up to ``limit`` points, including payloads and
    vectors. Scrolling ends when a page comes back empty or when no
    follow-up offset is returned. The running total is printed per page.
    """
    collected = []
    cursor = None  # the first request carries no offset
    more_pages = True
    while more_pages:
        page, cursor = client.scroll(
            collection_name=collection_name,
            limit=limit,
            offset=cursor,
            with_payload=True,
            with_vectors=True,
        )
        if page:
            collected.extend(page)
            print(f"len(all_points): {len(collected)}")
        more_pages = bool(page) and cursor is not None
    return collected

def find_duplicates(points):
    """Return the points whose content and metadata repeat an earlier point.

    The comparison key combines:

    * the page content; when it contains an ``<ENT>...</ENT>`` section, only
      the text inside it, up to the first ``||``, is used;
    * the metadata, serialised as JSON with sorted keys so that equal
      dictionaries give the same key whatever their key order.

    The first point seen for a key is kept; every later point with the same
    key is returned as a duplicate. A missing payload or ``page_content`` is
    treated as empty content instead of raising.

    Args:
        points: iterable of objects with a ``payload`` mapping.

    Returns:
        list of the duplicate points, in input order.
    """
    seen_keys = set()
    duplicates = []
    print(f"Number of Points: {len(points)}")

    for point in points:
        payload = point.payload or {}
        page_content = payload.get('page_content') or ''
        if '<ENT>' in page_content:
            page_content = page_content.split('<ENT>')[1].split('</ENT>')[0].split('||')[0]
        metadata = payload.get('metadata')
        metadata_json = json.dumps(metadata, sort_keys=True)
        unique_key = f"{hash_content(page_content)}-{hash_content(metadata_json)}"

        if unique_key in seen_keys:
            duplicates.append(point)
        else:
            seen_keys.add(unique_key)

    return duplicates

In [2]:
# Connect to the Qdrant service at the `qdrant` host.
client = QdrantClient(url="http://qdrant:6333")
# Collection that is scanned for duplicates below.
collection_name = "SYNONYMS_MAPPING_SAP_ALL"
# NOTE(review): this second collection is only inspected via get_collection here;
# the scan below runs against `collection_name` — confirm that is intended.
collection_name_1 = "SYNONYMS_MAPPING_SAP_ALL_WITHOUT_DOMAIN"
print(client.get_collection(collection_name_1))

limit = 8000  # Points requested per scroll page; adjust based on your memory capacity
total_deleted = 0  # Running count, incremented by the deletion cell further down

# All points (payloads and vectors) are held in memory before duplicates are searched.
points = fetch_all_points(client, collection_name, limit)
print(f"Total points fetched: {len(points)}")
duplicates = find_duplicates(points)
print(f"Total duplicates found: {len(duplicates)}")

len(all_points): 2000000
len(all_points): 2008000
len(all_points): 2016000
len(all_points): 2024000
len(all_points): 2032000
len(all_points): 2040000
len(all_points): 2048000
len(all_points): 2056000
len(all_points): 2064000
len(all_points): 2072000
len(all_points): 2080000
len(all_points): 2088000
len(all_points): 2096000
len(all_points): 2104000
len(all_points): 2112000
len(all_points): 2120000
len(all_points): 2128000
len(all_points): 2136000
len(all_points): 2144000
len(all_points): 2152000
len(all_points): 2160000
len(all_points): 2168000
len(all_points): 2176000
len(all_points): 2184000
len(all_points): 2192000
len(all_points): 2200000
len(all_points): 2208000
len(all_points): 2216000
len(all_points): 2224000
len(all_points): 2232000
len(all_points): 2240000
len(all_points): 2248000
len(all_points): 2256000
len(all_points): 2264000
len(all_points): 2272000
len(all_points): 2280000
len(all_points): 2288000
len(all_points): 2296000
len(all_points): 2304000
len(all_points): 2312000


In [3]:
def delete_duplicates(client, collection_name, duplicates):
    """Remove the given duplicate points from a Qdrant collection.

    The ids of ``duplicates`` are collected; if there are any, their
    "dense_vector" vectors are deleted first (waiting for completion) and
    then the points themselves. A one-line summary is printed either way.
    """
    doomed_ids = []
    for duplicate in duplicates:
        doomed_ids.append(duplicate.id)

    if not doomed_ids:
        print("No duplicates to delete.")
        return

    client.delete_vectors(collection_name, ["dense_vector"], doomed_ids, wait=True)
    client.delete(collection_name=collection_name, points_selector=doomed_ids)
    print(f"Deleted {len(doomed_ids)} duplicate points.")
# Uses `client`, `collection_name`, `duplicates` and `total_deleted` from the earlier dedup cell.
delete_duplicates(client, collection_name, duplicates)
# Counts every point handed to delete_duplicates; it returns nothing to confirm the deletions.
total_deleted += len(duplicates)

print(f"Total duplicates deleted: {total_deleted}")

No duplicates to delete.
Total duplicates deleted: 0


In [5]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Filter, PointStruct,PointIdsList
# Fresh connection to the Qdrant service, then list the collections it currently holds.
client = QdrantClient(url="http://qdrant:6333")
# Assigned for reference only; not used further in this cell.
collection_name_1 = "SYNONYMS_MAPPING_SAP_ALL_WITHOUT_DOMAIN"
print(client.get_collections())

collections=[]


In [6]:
""" 

DOMAIN: CONDITIONS
  

 
    MAX PAIRS = 10:
    Graph loaded from disk.
    Number of nodes in the graph: 2770116
    Total pairs processed: 520964
    Total domain ids processed: 96888
    Total synonyms processed : 520964


MAX PAIRS = 10:  
    Graph loaded from disk.
    Number of nodes in the graph: 2770116
    Total pairs processed: 595960
    Total domain ids processed: 134811
    Total synonyms processed : 595960

DOMAIN: DRUG
Max pairs = 10
    Graph loaded from disk.
    Number of nodes in the graph: 2770116
    Total pairs processed: 3210755
    Total domain ids processed: 2062596
    Total synonyms processed : 3210755
    
DOMAIN: OBSERVATION
MAX PAIRS = 10
Graph loaded from disk.
Number of nodes in the graph: 2770116
Total pairs processed: 642195
Total domain ids processed: 155457
Total synonyms processed : 642195

DOMAIN: PROCEDURE
Max pairs = 10
Graph loaded from disk.
Number of nodes in the graph: 2770116
Total pairs processed: 1006737
Total domain ids processed: 276090
Total synonyms processed : 1006737

DOMAIN: DEVICE
Max pairs = 10
Graph loaded from disk.
Number of nodes in the graph: 2770116
Total pairs processed: 46458
Total domain ids processed: 14775
Total synonyms processed : 46458

DOMAIN: MEAS VALUE
Max pairs = 10
Graph loaded from disk.
Number of nodes in the graph: 2770116
Total pairs processed: 33882
Total domain ids processed: 23903
Total synonyms processed : 33882
DOMAIN: Unit
Max pairs = 10
    
Graph loaded from disk.
Number of nodes in the graph: 2770116
Total pairs processed: 187
Total domain ids processed: 1031
Total synonyms processed : 187
DOMAIN: MEASUREMENT


Adding the concept name as its own synonym: these concepts are treated as synonyms of themselves because their synonyms are not available.
The increase in the number of pairs is due to the addition of these synonyms of the concepts.
DOMAIN: MEAS VALUE
Max pairs = 10
Graph loaded from disk.
Number of nodes in the graph: 2770116
Total pairs processed: 35054
Total domain ids processed: 23903
Total synonyms processed : 35054
DOMAIN: DEVICE
Max pairs = 10
Graph loaded from disk.
Number of nodes in the graph: 2770116
Total pairs processed: 46464
Total domain ids processed: 14775
Total synonyms processed : 46464

DOMAIN: PROCEDURE
Max pairs = 10
Graph loaded from disk.
Number of nodes in the graph: 2770116
Total pairs processed: 1006933
Total domain ids processed: 276090
Total synonyms processed : 1006933

DOMAIN: OBSERVATION
Max pairs = 10
Number of nodes in the graph: 2770116
Total pairs processed: 642951
Total domain ids processed: 155457
Total synonyms processed : 642951

DOMAIN: DRUG
Max pairs = 10
Number of nodes in the graph: 2770116
Total pairs processed: 3224390
Total domain ids processed: 2062596
Total synonyms processed : 3224390

DOMAIN: MEASUREMENT
Max pairs = 10
Number of nodes in the graph: 2770116
Total pairs processed: 596059
Total domain ids processed: 134811
Total synonyms processed : 596059

DOMAIN: CONDITIONS
Max pairs = 10

Number of nodes in the graph: 2770116
Total pairs processed: 521098
Total domain ids processed: 96888
Total synonyms processed : 521098

DOMAIN: Unit
Max pairs = 10
Graph loaded from disk.
Number of nodes in the graph: 2770116
Total pairs processed: 1031
Total domain ids processed: 1031
Total synonyms processed : 1031
"""
# import matplotlib.pyplot as plt

# # Define the data
# domains = ['CONDITIONS', 'MEASUREMENT', 'DRUG', 'OBSERVATION', 'PROCEDURE', 'DEVICE', 'MEAS VALUE']
# max_pairs = [10, 4, 5, 8, 10, 10, 10]
# total_pairs = [520964, 360005, 306293, 519115, 3210755, 642195, 1006737]
# total_domain_ids = [96888, 134811, 2062596, 155457, 276090, 14775, 23903]

# # Create the bar chart
# plt.figure(figsize=(12, 6))
# plt.bar(domains, total_pairs)

# # Add labels and title
# plt.xlabel('Domain')
# plt.ylabel('Number of Pairs')
# plt.title('Number of Pairs for Each Domain')

# # Display the chart
# plt.show()

' \n\nDOMAIN: CONDITIONS\n  \n\n \n    MAX PAIRS = 10:\n    Graph loaded from disk.\n    Number of nodes in the graph: 2770116\n    Total pairs processed: 520964\n    Total domain ids processed: 96888\n    Total synonyms processed : 520964\n\n\nMAX PAIRS = 10:  \n    Graph loaded from disk.\n    Number of nodes in the graph: 2770116\n    Total pairs processed: 595960\n    Total domain ids processed: 134811\n    Total synonyms processed : 595960\n\nDOMAIN: DRUG\nMax pairs = 10\n    Graph loaded from disk.\n    Number of nodes in the graph: 2770116\n    Total pairs processed: 3210755\n    Total domain ids processed: 2062596\n    Total synonyms processed : 3210755\n    \nDOMAIN: OBSERVATION\nMAX PAIRS = 10\nGraph loaded from disk.\nNumber of nodes in the graph: 2770116\nTotal pairs processed: 642195\nTotal domain ids processed: 155457\nTotal synonyms processed : 642195\n\nDOMAIN: PROCEDURE\nMax pairs = 10\nGraph loaded from disk.\nNumber of nodes in the graph: 2770116\nTotal pairs process

In [2]:
# import argparse
# import os
# import pickle
# import networkx as nx
# import pandas as pd
# from tqdm import tqdm

# def merge_graphs(graph_path1, graph_path2, output_path):
#     """
#     Merge two graphs into a main graph.

#     Args:
#     graph_path1 (str): Path to the first graph (pickle file).
#     graph_path2 (str): Path to the second graph (pickle file).
#     output_path (str): Path to save the merged graph (pickle file).
#     """
#     # Load the first graph
#     with open(graph_path1, 'rb') as f:
#         graph1 = pickle.load(f)
    
#     # Load the second graph
#     with open(graph_path2, 'rb') as f:
#         graph2 = pickle.load(f)
    
#     # Merge the two graphs
#     merged_graph = nx.compose(graph1, graph2)
    
#     # Save the merged graph to disk
#     with open(output_path, 'wb') as f:
#         pickle.dump(merged_graph, f)
    
#     print(f"Merged graph saved to {output_path}")

# merge_graphs('/workspace/rag_pipeline/data/input/omop_v5.4/extension_concepts/omop_bi_graph_extension.pkl',
#                  '/workspace/rag_pipeline/data/output/omop_bi_graph_all.pkl', '/workspace/rag_pipeline/data/output/omop_bi_graph_all.pkl')


Merged graph saved to /workspace/rag_pipeline/data/output/omop_bi_graph_all.pkl


concept dir = /workspace/rag_pipeline/data/input/omop_v5.4
graph path = /workspace/rag_pipeline/data/input/omop_v5.4/omop_bi_graph_medra.pkl
['MedDRA']
Total entities=110802
Total entities with synonyms =110802


110802it [00:10, 10076.82it/s]


Entity not in graph = 0
Entity not found in graph for hierarchy check = 1186303
Entity not found in graph for Drugs check = 1120893
Entity not found chv = 0
Graph saved to disk.


In [None]:
import pandas as pd
