### Exploring SNOMED Hierarchies

In [2]:
import pandas as pd
import networkx as nx
from collections import deque

In [3]:
# DAG edge list; Exposure/Outcome are read as strings so ids keep their exact text.
dag_df = pd.read_csv(
    '../data/DAGs_standardized.csv',
    dtype={'Exposure': str, 'Outcome': str},
)

# Athena vocabulary tables (tab-separated); all concept id columns are read as strings
# so they can be compared directly with the ids in dag_df.
concept_df = pd.read_csv(
    '../Standardization/athena_vocabulary/CONCEPT.csv',
    sep='\t',
    dtype={'concept_code': str, 'concept_id': str},
    low_memory=False,
)
concept_relationship_df = pd.read_csv(
    '../Standardization/athena_vocabulary/CONCEPT_RELATIONSHIP.csv',
    sep='\t',
    dtype={'concept_id_1': str, 'concept_id_2': str},
    low_memory=False,
)
concept_ancestor_df = pd.read_csv(
    '../Standardization/athena_vocabulary/CONCEPT_ANCESTOR.csv',
    sep='\t',
    dtype={'ancestor_concept_id': str, 'descendant_concept_id': str},
    low_memory=False,
)

Only select numeric terms that have been coded to SNOMED-CT

In [4]:
# Rows in which both the exposure and the outcome are purely numeric ids
both_numeric = dag_df.Exposure.str.isdigit() & dag_df.Outcome.str.isdigit()
coded_rows = dag_df[both_numeric]

# Every id that appears on either side of such a row
my_terms = set(coded_rows['Exposure']) | set(coded_rows['Outcome'])

In [5]:
# Keep ancestor/descendant pairs in which both concepts are selected terms,
# leaving out self references (rows whose max_levels_of_separation is not above 0).
ancestor_selected = concept_ancestor_df['ancestor_concept_id'].isin(my_terms)
descendant_selected = concept_ancestor_df['descendant_concept_id'].isin(my_terms)
not_self_reference = concept_ancestor_df['max_levels_of_separation'] > 0

filtered_ancestors = concept_ancestor_df[
    ancestor_selected & descendant_selected & not_self_reference
]

# A root term is a selected term that is never the descendant of another selected term
descendants_terms = set(filtered_ancestors['descendant_concept_id'])
root_terms = my_terms.difference(descendants_terms)

In [6]:
# Hierarchy store: one entry per root term, each starting with an empty descendant list
hierarchy = {root: {'descendants': []} for root in root_terms}

# Recursive function to build the hierarchy
def build_hierarchy(term, level=1):
    """
    Build the nested list of selected-term descendants below ``term``.

    Reads the module-level ``filtered_ancestors`` frame (columns
    ``ancestor_concept_id`` and ``descendant_concept_id``). A descendant of
    ``term`` counts as a *direct* child when no other descendant of ``term``
    in that frame is listed as one of its ancestors. Descendants that are
    reachable through another listed descendant are placed deeper in the tree
    by the recursive call.

    The earlier version compared ``max_levels_of_separation`` (which is
    measured from the current term) with a ``level`` counter that grew on every
    recursive step. Children one level below a non-root term were therefore
    never found, and descendants linked only through concepts outside the
    frame were dropped from the hierarchy.

    Parameters:
    - term: concept id whose descendants are collected.
    - level: depth of ``term`` in the tree being built (1 for a root term).
      Kept so existing ``build_hierarchy(root, 1)`` calls keep working; it is
      passed down the recursion but does not influence the result.

    Returns:
    - List of ``{'term': <concept id>, 'descendants': [...]}`` dictionaries,
      one per direct child, in the order the children first appear in
      ``filtered_ancestors``.
    """
    # Ignore self references so a term can never be treated as its own child
    edges = filtered_ancestors[
        filtered_ancestors['ancestor_concept_id'] != filtered_ancestors['descendant_concept_id']
    ]

    # Every listed descendant of the current term, in first-appearance order
    below = edges.loc[
        edges['ancestor_concept_id'] == term, 'descendant_concept_id'
    ].drop_duplicates()
    below_set = set(below)

    # Descendants that also sit below another descendant of this term are not direct children
    indirect = set(
        edges.loc[
            edges['ancestor_concept_id'].isin(below_set)
            & edges['descendant_concept_id'].isin(below_set),
            'descendant_concept_id',
        ]
    )

    # Recurse into each direct child to attach its own subtree
    return [
        {'term': child, 'descendants': build_hierarchy(child, level + 1)}
        for child in below
        if child not in indirect
    ]

# Attach the descendant tree to the entry of every root term, then display the result
for root in root_terms:
    hierarchy[root].update(descendants=build_hierarchy(root, 1))

hierarchy

{'4285288': {'descendants': []},
 '44804450': {'descendants': []},
 '4329041': {'descendants': []},
 '4232457': {'descendants': []},
 '4179955': {'descendants': []},
 '4301936': {'descendants': []},
 '442985': {'descendants': []},
 '37311069': {'descendants': []},
 '4084390': {'descendants': []},
 '4295287': {'descendants': []},
 '4010813': {'descendants': []},
 '4020553': {'descendants': []},
 '4159324': {'descendants': []},
 '4009853': {'descendants': []},
 '443784': {'descendants': [{'term': '4185607', 'descendants': []},
   {'term': '4137275', 'descendants': []}]},
 '4304917': {'descendants': []},
 '255848': {'descendants': []},
 '4208414': {'descendants': []},
 '4309238': {'descendants': []},
 '4007663': {'descendants': []},
 '4196427': {'descendants': []},
 '4156503': {'descendants': []},
 '321588': {'descendants': [{'term': '44784217', 'descendants': []},
   {'term': '4024552',
    'descendants': [{'term': '444031', 'descendants': []},
     {'term': '442310', 'descendants': []}]

In [7]:
# Report how many selected terms exist and how many of them are roots
n_terms, n_roots = len(my_terms), len(hierarchy)
print(n_terms, 'total terms and ', n_roots, 'root terms.')

182 total terms and  129 root terms.


### Calculating distance between each term in the hierarchy

Getting all snomed terms and relationships

In [29]:
# Concepts whose vocabulary is SNOMED
is_snomed = concept_df['vocabulary_id'] == 'SNOMED'
snomed_concepts = concept_df[is_snomed]

# Relationships whose two endpoints are both SNOMED concepts
snomed_ids = snomed_concepts['concept_id']
both_ends_snomed = (
    concept_relationship_df['concept_id_1'].isin(snomed_ids)
    & concept_relationship_df['concept_id_2'].isin(snomed_ids)
)
snomed_relationships = concept_relationship_df[both_ends_snomed]

Determine the edges of the graph: what links are useful between terms?

In [35]:
# relationship_id values that are kept as graph edges; the relationship table is
# filtered against this list in a later cell.
useful_relationship_ids = [
    "Is a",                     # Hierarchical relationship of SNOMED
    "Finding site of",          # Links findings to anatomical sites
    "Interprets of",             # Links specific measurements to conditions
    "Causative agent of",       # Etiological link
    "Has due to",               # Etiology relationships
    "Due to of",                # presumably the reverse direction of "Has due to" -- TODO confirm
    "Asso finding of",          # presumably links an associated finding to its concept -- TODO confirm
    "Pathology of",             # Links to pathological processes
    "Has dir morph",            # Direct morphology
    "Has asso morph",           # Associated morphology
    "Occurs before",            # Temporal relationship
    "Follows",                  # Temporal relationship
    "Has manifestation",        # Links findings to manifestations
    "Has complication"          # Links to complications
]

Double check they all exist

In [36]:
# Print every requested relationship id that does not occur in the relationship table
available_relationship_ids = set(snomed_relationships.relationship_id.unique())
for rel in useful_relationship_ids:
    if rel in available_relationship_ids:
        continue
    print(rel)

Filter relationships

In [38]:
# Keep only the rows whose relationship type is in the useful list
keep_relationship = snomed_relationships['relationship_id'].isin(useful_relationship_ids)
snomed_relationships = snomed_relationships.loc[keep_relationship]

Create network

In [40]:
# Directed graph: one edge concept_id_1 -> concept_id_2 per relationship row,
# carrying the relationship id as the 'relationship_type' edge attribute.
G = nx.DiGraph()

edge_columns = snomed_relationships[['concept_id_1', 'concept_id_2', 'relationship_id']]
G.add_edges_from(
    (source_id, target_id, {'relationship_type': relationship})
    for source_id, target_id, relationship in edge_columns.itertuples(index=False, name=None)
)

In [41]:
# Report the size of the graph built from the filtered relationships
print(f"Number of nodes in G: {G.number_of_nodes()}")
print(f"Number of edges in G: {G.number_of_edges()}")

Number of nodes in G: 462205
Number of edges in G: 1132896


Are all terms in the network?

In [42]:
# Selected terms that do not occur as a node of G
missing_terms = [term for term in my_terms if not G.has_node(term)]

if not missing_terms:
    print("All terms are present in the graph.")
else:
    print(f"Warning: These terms are missing from the graph: {missing_terms}")

All terms are present in the graph.


Defining the algorithm (breadth-first search) with early stopping: the search from a term ends as soon as the distances to all other selected terms have been found.

In [11]:
def shortest_paths_with_early_stopping(graph, selected_terms):
    """
    Compute the shortest path distances between selected terms using early stopping.

    A breadth-first search is started from every selected term and stops as
    soon as all other selected terms have been reached (or the reachable part
    of the graph is exhausted). The search follows ``graph.neighbors``, which
    on a NetworkX ``DiGraph`` yields successors only, so each search measures
    directed distances.

    Results are stored under an unordered ``frozenset({a, b})`` key. When both
    directions are reachable with different lengths, the shorter one is kept.
    Previously the direction processed last overwrote the other, so the stored
    value depended on set iteration order.

    Parameters:
    - graph: NetworkX graph (any object exposing ``neighbors(node)``).
    - selected_terms: Iterable of nodes for which distances are computed.

    Returns:
    - pairwise_distances: Dictionary mapping ``frozenset({a, b})`` to the hop
      count of the shortest path found between the two terms. Pairs with no
      path in either direction are absent.
    """
    selected_terms = set(selected_terms)
    pairwise_distances = {}
    # Each search can find at most every selected term except its own source
    targets_per_source = len(selected_terms) - 1

    for source in selected_terms:
        distances = {}
        # Nodes are marked when enqueued, so each node enters the queue at most once
        visited = {source}
        queue = deque([(source, 0)])

        while queue and len(distances) < targets_per_source:
            current, dist = queue.popleft()

            if current != source and current in selected_terms:
                distances[current] = dist

            for neighbor in graph.neighbors(current):
                if neighbor not in visited:
                    visited.add(neighbor)
                    queue.append((neighbor, dist + 1))

        for target, dist in distances.items():
            pair = frozenset({source, target})
            known = pairwise_distances.get(pair)
            # Keep the shorter of the two directed distances for an unordered pair
            if known is None or dist < known:
                pairwise_distances[pair] = dist

    return pairwise_distances

In [15]:
# Run the early-stopping search on G for every selected term
pairwise_distances = shortest_paths_with_early_stopping(G, my_terms)

In [12]:
# Inspect one concrete pair: find a shortest path and list the relationship on each hop
node_a = '4021291'
node_b = '4268546'

path = nx.shortest_path(G, source=node_a, target=node_b)
print(f"Shortest path between {node_a} and {node_b}: {path}")

# One record per consecutive pair of nodes on the path, with the edge's relationship type
relationships = [
    {
        'from': start,
        'to': end,
        'relationship_type': G.get_edge_data(start, end).get('relationship_type', 'unknown'),
    }
    for start, end in zip(path, path[1:])
]

# Display the relationships
for rel in relationships:
    print(f"{rel['from']} -> {rel['to']} (type: {rel['relationship_type']})")

Shortest path between 4021291 and 4268546: ['4021291', '40642537', '4268546']
4021291 -> 40642537 (type: Has status)
40642537 -> 4268546 (type: Status of)


In [16]:
# Display the distance dictionary (keys are frozensets of two term ids)
pairwise_distances

{frozenset({'4021291', '4268546'}): 2,
 frozenset({'4024552', '4268546'}): 2,
 frozenset({'4268546', '606770'}): 2,
 frozenset({'4268546', '46270002'}): 2,
 frozenset({'4041284', '4268546'}): 2,
 frozenset({'4007663', '4268546'}): 2,
 frozenset({'4046731', '4268546'}): 2,
 frozenset({'4133004', '4268546'}): 2,
 frozenset({'4033232', '4268546'}): 2,
 frozenset({'197672', '4268546'}): 2,
 frozenset({'37309626', '4268546'}): 2,
 frozenset({'4056822', '4268546'}): 2,
 frozenset({'4102202', '4268546'}): 2,
 frozenset({'4070299', '4268546'}): 2,
 frozenset({'4052648', '4268546'}): 2,
 frozenset({'4070767', '4268546'}): 2,
 frozenset({'4268546', '4329847'}): 2,
 frozenset({'4046360', '4268546'}): 2,
 frozenset({'255848', '4268546'}): 2,
 frozenset({'4169009', '4268546'}): 2,
 frozenset({'4268546', '443784'}): 2,
 frozenset({'4079975', '4268546'}): 2,
 frozenset({'4150603', '4268546'}): 2,
 frozenset({'4085730', '4268546'}): 2,
 frozenset({'4200516', '4268546'}): 2,
 frozenset({'4146455', '426

In [None]:
# For every node of G, collect all nodes that have a path to it
ancestors = {node: list(nx.ancestors(G, node)) for node in G.nodes()}

# Flatten to one record per (descendant, ancestor) pair
ancestor_data = [
    {'descendant_concept_id': descendant, 'ancestor_concept_id': ancestor}
    for descendant, ancestor_list in ancestors.items()
    for ancestor in ancestor_list
]

ancestor_df = pd.DataFrame(ancestor_data)

In [None]:
# len(ancestor_df)

Idea is to use Floyd Warshall algorithm to calculate the shortest distance between any pair of nodes.

Running the Floyd-Warshall algorithm on the entire SNOMED CT hierarchy is too computationally expensive. We therefore first prune the hierarchy, keeping only the ancestors and descendants of the DAG terms.

In [14]:
# Look up the concept row with id 4008453
concept_df.loc[concept_df['concept_id'] == '4008453']

Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
312254,4008453,SNOMED CT Concept,Metadata,SNOMED,Model Comp,,138875005,20020131,20991231,


In [15]:
# Rows of the ancestor table that mention concept 4008453 on either side
concept_ancestor_df[
    concept_ancestor_df[['descendant_concept_id', 'ancestor_concept_id']]
    .eq('4008453')
    .any(axis=1)
]

Unnamed: 0,ancestor_concept_id,descendant_concept_id,min_levels_of_separation,max_levels_of_separation


In [16]:
# Rows of the relationship table that mention concept 4008453 on either side
concept_relationship_df[
    concept_relationship_df[['concept_id_1', 'concept_id_2']]
    .eq('4008453')
    .any(axis=1)
]

Unnamed: 0,concept_id_1,concept_id_2,relationship_id,valid_start_date,valid_end_date,invalid_reason
116478,4302764,4008453,Is a,20110731,20991231,
233312,4322976,4008453,Is a,20110731,20991231,
982696,40264357,4008453,Concept replaced by,20130131,20991231,
1064301,40441161,4008453,Concept replaced by,20130131,20991231,
1145020,40546898,4008453,Is a,20110731,20991231,
1167870,40642546,4008453,Is a,20150301,20991231,
2143292,40642538,4008453,Status of,20220128,20991231,
2143293,40642539,4008453,Module of,20220128,20991231,
4847003,4008453,40642538,Has status,20220128,20991231,
4847004,4008453,40642539,Has Module,20220128,20991231,


In [36]:
# Ancestor-table rows in which at least one side is a selected term
touches_selected_term = (
    concept_ancestor_df['descendant_concept_id'].isin(my_terms)
    | concept_ancestor_df['ancestor_concept_id'].isin(my_terms)
)
relevant_relationships = concept_ancestor_df[touches_selected_term]

# Every concept id appearing on either side of those rows
relevant_concepts = (
    set(relevant_relationships['ancestor_concept_id'])
    | set(relevant_relationships['descendant_concept_id'])
)

In [38]:
# Relevant concepts as a percentage of the number of SNOMED concepts
100 * len(relevant_concepts) / len(snomed_concepts)

4.304122713011143

We actually only need about 4.3 percent of the SNOMED concepts (still over 40k).

In [39]:
# Relationships restricted to the relevant concepts on both ends
both_ends_relevant = (
    concept_relationship_df['concept_id_1'].isin(relevant_concepts)
    & concept_relationship_df['concept_id_2'].isin(relevant_concepts)
)
# ADJUST (original author's marker; presumably a reminder to revisit which relationship ids are kept)
is_a_relationship = concept_relationship_df['relationship_id'] == 'Is a'

snomed_relationships = concept_relationship_df[both_ends_relevant & is_a_relationship]

In [40]:
# Rebuild G from the pruned relationships: one edge concept_id_1 -> concept_id_2 per row
G = nx.DiGraph()
edge_pairs = snomed_relationships[['concept_id_1', 'concept_id_2']]
G.add_edges_from(edge_pairs.itertuples(index=False, name=None))

In [41]:
# Report the size of the pruned graph
print(f"Number of nodes in G: {G.number_of_nodes()}")
print(f"Number of edges in G: {G.number_of_edges()}")

Number of nodes in G: 46372
Number of edges in G: 73574


In [42]:
# Selected terms that do not occur as a node of the pruned graph
missing_terms = [term for term in my_terms if term not in G]

message = (
    f"Warning: These terms are missing from the graph: {missing_terms}"
    if missing_terms
    else "All terms are present in the graph."
)
print(message)



In [18]:
# Show the first ten node ids of G
sample_nodes = list(G)[:10]
print("\nSample nodes in G:")
print(sample_nodes)


Sample nodes in G:
['4285430', '4083797', '4271706', '4285434', '4123882', '4285438', '140648', '4300329', '4285442', '134743']


In [14]:
# shortest_paths = dict(nx.floyd_warshall(G))