In [1]:
import os
from collections import deque
from pprint import pprint

import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = [10, 10]

from etl.configuration.target_api_config import TargetAPIConfig
from etl.transform.standard_model.concept_schema import (
    CONCEPT,
    DELIMITER,
    concept_from
)
from etl.transform.standard_model.model import StandardModel
from common.constants import *
from helper import plotly_concept_graph

# Location of the Kids First target API config module.
# Set the KIDS_FIRST_CONFIG environment variable to point at your own checkout;
# the fallback is the original hard-coded path, which only exists on the
# original author's machine.
KIDS_FIRST_CONFIG = os.environ.get(
    'KIDS_FIRST_CONFIG',
    '/Users/singhn4/Projects/kids_first/kf-lib-data-ingest/src/target_apis/kids_first.py'
)

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
# Load the target API configuration and keep a handle on its concept
# relationship graph; `rg` is reused by later cells.
config = TargetAPIConfig(KIDS_FIRST_CONFIG)
rg = config.relationship_graph

# List every concept above and below PARTICIPANT in the relationship graph.
participant_concept = CONCEPT.PARTICIPANT._CONCEPT_NAME
for heading, find_related in (('\nAncestors', nx.ancestors),
                              ('\nDescendants', nx.descendants)):
    print(heading)
    for concept_name in find_related(rg, participant_concept):
        print(concept_name)


Ancestors
CONCEPT|STUDY
CONCEPT|FAMILY
CONCEPT|INVESTIGATOR

Descendants
CONCEPT|DIAGNOSIS
CONCEPT|PHENOTYPE
CONCEPT|BIOSPECIMEN
CONCEPT|GENOMIC_FILE
CONCEPT|OUTCOME


In [3]:
# Three small source tables that overlap on participant / biospecimen IDs.
# P2 has no race in its pedigree row; the only non-null race reachable from
# P2 sits on the sample row for B3, which subject_sample links back to P2.
pedigree_rows = [
    {CONCEPT.FAMILY.ID: 'F1',
     CONCEPT.PARTICIPANT.ID: 'P1',
     CONCEPT.PARTICIPANT.RACE: RACE.ASIAN},
    {CONCEPT.FAMILY.ID: 'F1',
     CONCEPT.PARTICIPANT.ID: 'P2',
     CONCEPT.PARTICIPANT.RACE: None},
]
subject_sample_rows = [
    {CONCEPT.PARTICIPANT.ID: participant_id,
     CONCEPT.BIOSPECIMEN.ID: biospecimen_id}
    for participant_id, biospecimen_id in (('P1', 'B1'),
                                           ('P1', 'B2'),
                                           ('P2', 'B3'))
]
sample_rows = [
    {CONCEPT.BIOSPECIMEN.ID: biospecimen_id,
     CONCEPT.PARTICIPANT.RACE: race}
    for biospecimen_id, race in (('B1', None),
                                 ('B2', None),
                                 ('B3', RACE.WHITE))
]
data = {
    'pedigree': pedigree_rows,
    'subject_sample': subject_sample_rows,
    'sample': sample_rows,
}

# Map each table's source URL to a (config URL, DataFrame) pair, which is
# the structure handed to StandardModel.populate below.
df_dict = {}
for table_name, rows in data.items():
    source_url = f's3://bucket/key/{table_name}.csv'
    config_url = f'file:///study/configs/{table_name}.py'
    df_dict[source_url] = (config_url, pd.DataFrame(rows))

# Build the standard model and keep its concept graph for the cells below.
model = StandardModel()
model.populate(df_dict)
g = model.concept_graph

In [7]:
# Draw the concept graph built above using the plotly helper imported from
# `helper` (notebook_mode=True presumably renders inline - confirm in helper).
# Static matplotlib alternative, left disabled:
# nx.draw_networkx(g.graph, font_size=10)
plotly_concept_graph(g.graph, notebook_mode=True)

In [5]:
def is_neighbor_valid(node_concept, neighbor, relation_graph):
    """
    Decide whether a search for a `node_concept` attribute may step onto
    `neighbor`.

    A neighbor whose concept is not an ancestor of `node_concept` in
    `relation_graph` is always valid. A neighbor with an ancestor concept is
    valid only if no identifier node reachable from it (the neighbor
    included) has concept `node_concept` or one of its descendant concepts.

    Reads the notebook-level concept graph `g` to walk node connections.

    :param node_concept: concept name of the attribute being searched for
    :param neighbor: concept graph node; must expose `.concept` and `.key`
    :param relation_graph: directed graph of concept relationships
    :returns: True if the neighbor may be traversed, False otherwise
    """
    # Valid - the neighbor's concept is not an ancestor of node_concept.
    # nx.ancestors already returns a set, so no copy is needed.
    if neighbor.concept not in nx.ancestors(relation_graph, node_concept):
        return True

    # Concepts the search must not be able to reach through this neighbor:
    # node_concept itself plus all of its descendant concepts.
    restrictions = nx.descendants(relation_graph, node_concept) | {node_concept}

    # Breadth first search outward from the neighbor
    visited = {neighbor.key}
    queue = deque([neighbor])

    while queue:
        # popleft is O(1); list.pop(0) shifts every remaining element
        current = queue.popleft()

        # Reached a restricted concept, so the neighbor is not valid
        if current.concept in restrictions:
            return False

        # Only follow unvisited identifier nodes; attribute nodes are
        # never expanded.
        for node_key in nx.neighbors(g.graph, current.key):
            node = g.get_node(node_key)
            if (node.key not in visited) and node.is_identifier:
                queue.append(node)
                visited.add(node.key)
    return True

# Transformation
def find_attribute_value(start_node, concept_attribute_str, relation_graph):
    """
    Search the concept graph for the value of a concept attribute, starting
    from `start_node`.

    Direct neighbors of the start node are checked first. If none matches,
    a breadth first search runs over the notebook-level concept graph `g`,
    only stepping onto neighbors accepted by `is_neighbor_valid`.

    :param start_node: concept graph node to start from (exposes `.key` and
        `.concept_attribute_pair`); None yields None
    :param concept_attribute_str: concept attribute to look up, e.g.
        CONCEPT.PARTICIPANT.RACE
    :param relation_graph: directed graph of concept relationships, passed
        through to `is_neighbor_valid`
    :returns: the `.value` of the first matching node found, or None if
        there is no start node, the attribute is not indexed in the graph,
        or no matching node is reachable
    """
    # The ID index lookup that produces start_node may come back empty;
    # treat that like any other "not found" instead of raising.
    if start_node is None:
        return None

    # Does graph contain any concept attributes of the target type?
    if concept_attribute_str not in g.attribute_index:
        return None

    # Check directly connected neighbor nodes before searching graph.
    # Value for a given concept attribute is likely to be directly
    # connected to its concept ID node since attributes for a particular
    # concept are typically in the same table as the concept IDs.
    for node_key in nx.neighbors(g.graph, start_node.key):
        neighbor = g.get_node(node_key)
        if neighbor.concept_attribute_pair == concept_attribute_str:
            return neighbor.value

    # The concept being searched for is the same for every neighbor, so
    # resolve it once instead of once per candidate.
    target_concept = concept_from(concept_attribute_str)

    # Keep track of nodes visited and of nodes still to process
    visited = {start_node.key}
    queue = deque([start_node])

    # Search the graph for the attribute value, nearest nodes first
    while queue:
        current = queue.popleft()
        # Trace of the traversal order, shown in the cell output
        print(current.key)

        # Found the node with the value for this concept attr
        if current.concept_attribute_pair == concept_attribute_str:
            return current.value

        for node_key in nx.neighbors(g.graph, current.key):
            neighbor = g.get_node(node_key)
            if (neighbor.key not in visited) and is_neighbor_valid(
                    target_concept, neighbor, relation_graph):
                queue.append(neighbor)
                visited.add(neighbor.key)

    return None

# Look up participant P2's ID node, then search the graph for its race.
participant_id_index = g.id_index[CONCEPT.PARTICIPANT._CONCEPT_NAME]
start_key = f'{CONCEPT.PARTICIPANT.ID}{DELIMITER}P2'
start_node = participant_id_index.get(start_key)

attribute_str = CONCEPT.PARTICIPANT.RACE
value = find_attribute_value(start_node, attribute_str, rg)
print(value)

# is_neighbor_valid('CONCEPT|PARTICIPANT', 
#                   g.get_node('CONCEPT|FAMILY|ID|F1'),
#                   rg)

CONCEPT|PARTICIPANT|ID|P2
CONCEPT|BIOSPECIMEN|ID|B3
CONCEPT|PARTICIPANT|RACE|White
White


In [None]:
# Basic graph: each node mapped to the set of nodes it points to
graph = {'F1': {'P1', 'P2'},
         'P1': {'F1', 'B1', 'B2'},
         'P2': {'F1', 'B3'},
         'B1': {'P1', 'White'},
         'B2': {'P1', 'White'},
         'B3': {'P2', 'White'},
         'White': set()}
# Build graph
G = nx.DiGraph()
for node, neighbors in graph.items():
    # Register the node explicitly so a node with no edges still appears.
    # The previous guard tested the bound method `G.has_node` (always
    # truthy) instead of calling it, so add_node was never reached.
    G.add_node(node)
    for neighbor in neighbors:
        # add_edge creates any endpoint that is not in the graph yet
        G.add_edge(node, neighbor)
nx.draw_networkx(G, font_size=10)

In [None]:
from queue import Queue

def shortest_path_bfs(graph, start, goal):
    """
    Find a shortest path from `start` to `goal` with a breadth first search.

    :param graph: graph object exposing `neighbors(node)`, e.g. a networkx
        graph (for which nx.neighbors(G, n) is the same as G.neighbors(n))
    :param start: node to start from
    :param goal: node to reach
    :returns: list of nodes from `start` to `goal` inclusive (`[start]` when
        they are the same node), or None if `goal` cannot be reached
    """
    # Don't search, start = goal. Return a one-node path so the result is
    # always a list rather than sometimes a bare node.
    if start == goal:
        return [start]

    # Init data structures: expanded nodes and a FIFO of (node, path so far)
    visited, q = set(), Queue()
    q.put((start, [start]))

    # Breadth first search
    while not q.empty():
        current, path = q.get()
        print(path)

        if current == goal:
            return path

        # A node can be queued more than once before it is first expanded
        if current in visited:
            continue

        print(current)
        visited.add(current)
        for n in graph.neighbors(current):
            # Skip nodes that were already expanded; re-queueing them only
            # grows the queue with paths that can never be shortest.
            if n not in visited:
                q.put((n, path + [n]))

    # Queue exhausted without reaching the goal
    return None

def bfs(graph, start, goal):
    """
    Report whether `goal` is reachable from `start` using a breadth first
    search, printing each node as it is dequeued.

    :param graph: graph object exposing `neighbors(node)`, e.g. a networkx
        graph (for which nx.neighbors(G, n) is the same as G.neighbors(n))
    :param start: node to start from
    :param goal: node to look for
    :returns: True if `goal` is reached, False otherwise
    """
    # Mark the source node as visited and enqueue it
    visited = {start}
    queue = deque([start])

    while queue:
        # Dequeue a vertex from the queue and print it.
        # popleft is O(1); list.pop(0) shifts every remaining element.
        current = queue.popleft()
        print(current)

        if current == goal:
            return True

        # Enqueue every adjacent vertex that has not been visited yet,
        # marking it visited so it is queued only once.
        for n in graph.neighbors(current):
            if n not in visited:
                queue.append(n)
                visited.add(n)

    # Every reachable node was examined without finding the goal; return
    # an explicit bool instead of falling through to None.
    return False

In [None]:
# Breadth first search on the toy graph: can 'White' be reached from 'P1'?
start, goal = 'P1', 'White'
bfs(G, start, goal)