<a href="https://colab.research.google.com/github/javier-jaime/Tool-Crib/blob/master/Neo4j/Synthetic_Patient_Data_Knowledge_Graph_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Synthetic Patient Data Knowledge Graph Demo

This demo is based on code from *Graph Data Processing with Cypher* by Ravindranatha Anthapu (2022): https://github.com/PacktPublishing/Cypher-Querying


We will work with the Synthea synthetic patient dataset: https://synthetichealth.github.io/synthea/




## Generate the Data

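You can run Synthea (see the link above) to generate and export the CSV files listed below. This notebook also loads `providers.csv` and `procedures.csv`, which are not described in the list.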

**allergies.csv**   Patient allergy data

**careplans.csv**   Patient care plan data, including goals

**claims.csv**   Patient claim data

**claims_transactions.csv**   Transactions per line item per claim

**conditions.csv**   Patient conditions or diagnoses

**devices.csv**   Patient-affixed permanent and semi-permanent devices

**encounters.csv**   Patient encounter data

**imaging_studies.csv**   Patient imaging metadata

**immunizations.csv**   Patient immunization data

**medications.csv**   Patient medication data

**observations.csv**   Patient observations, including vital signs and lab reports

**organizations.csv**   Provider organizations, including hospitals

**patients.csv**   Patient demographic data

**payer_transitions.csv**   Payer transition data (i.e., changes in health insurance)

## Notebook Preparation

In [1]:
# Install necessary libraries, uncomment if required
!pip install graphdatascience

Collecting graphdatascience
  Downloading graphdatascience-1.8-py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multimethod<2.0,>=1.0 (from graphdatascience)
  Downloading multimethod-1.10-py3-none-any.whl (9.9 kB)
Collecting neo4j<6.0,>=4.4.2 (from graphdatascience)
  Downloading neo4j-5.16.0.tar.gz (197 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m197.8/197.8 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting textdistance<5.0,>=4.0 (from graphdatascience)
  Downloading textdistance-4.6.1-py3-none-any.whl (31 kB)
Building wheels for collected packages: neo4j
  Building wheel for neo4j (pyproject.toml) ... [?25l[?25h

In [2]:
import os
import time
from getpass import getpass

import numpy as np
import pandas as pd
from google.colab import drive
from graphdatascience import GraphDataScience

In [3]:
# Connect to the Neo4j Sandbox database.
#
# Connection settings are read from environment variables so the password is
# not stored in the notebook:
#   NEO4J_URI      - bolt URI of the database (defaults to the sandbox address)
#   NEO4J_USER     - user name (defaults to 'neo4j')
#   NEO4J_PASSWORD - password; when unset, it is requested interactively
uri  = os.environ.get('NEO4J_URI', 'bolt://3.86.197.152:7687')
user = os.environ.get('NEO4J_USER', 'neo4j')
pwd  = os.environ.get('NEO4J_PASSWORD') or getpass('Neo4j password: ')

# Create the Python GDS client object used by every later cell to run Cypher
gds = GraphDataScience(uri, auth=(user, pwd), database="neo4j")

### Load the Data

In [6]:
# Base location of the CSV files. The load cells append a file name to `path`
# and pass the result to Cypher as the $url parameter.
# Enable exactly one option and keep the others commented out.

# Option 1: read the CSV files directly from GitHub
path = (
    'https://raw.githubusercontent.com/'
    'PacktPublishing/Cypher-Querying/main/data/csv/'
)

# Option 2: mount Google Drive and read the CSV files from there
# drive.mount('/content/drive')
# path = 'drive/MyDrive/Colab Notebooks/EGP_Data/'

# Option 3: upload the CSV files to the temporary session storage
# path = ''

# NOTE(review): the load cells use LOAD CSV, which presumably resolves the URL
# on the Neo4j server rather than in this Colab session - confirm that options
# 2 and 3 are reachable from the database before relying on them.

In [None]:
# Uniqueness constraints on the keys the patient load merges on
# (Patient.id, ZipCode.zip, Race.type, Ethnicity.type).
# Each statement uses IF NOT EXISTS, so the cell can be run again.
patient_constraints = (
    "CREATE CONSTRAINT patient_id IF NOT EXISTS FOR (n:Patient) REQUIRE n.id IS UNIQUE",
    "CREATE CONSTRAINT zipcode_id IF NOT EXISTS FOR (n:ZipCode) REQUIRE n.zip IS UNIQUE",
    "CREATE CONSTRAINT race_id IF NOT EXISTS FOR (n:Race) REQUIRE n.type IS UNIQUE",
    "CREATE CONSTRAINT eth_id IF NOT EXISTS FOR (n:Ethnicity) REQUIRE n.type IS UNIQUE",
)

for constraint_statement in patient_constraints:
    gds.run_cypher(constraint_statement)

In [None]:
# Import patients.csv.
#
# For every CSV row the query:
#   * merges a Patient node keyed by the row's Id and sets its name, SSN,
#     marital status, city/county, driver's licence, birth date and a
#     latitude/longitude point;
#   * merges Race and Ethnicity nodes and links the patient to them;
#   * merges a ZipCode node and links it, only when the row has a ZIP value.
# Rows are committed in batches of 1000.
patients_query = """
LOAD CSV WITH HEADERS from $url as row
CALL {
  WITH row
  MERGE(p:Patient {id: row.Id})
  SET
  p.marital = row.MARITAL,
  p.ssn = row.SSN,
  p.firstName = row.FIRST,
  p.lastName = row.LAST,
  p.suffix = row.SUFFIX,
  p.prefix = row.PREFIX,
  p.city = row.CITY,
  p.county = row.COUNTY,
  p.location = point({latitude:toFloat(row.LAT),
                      longitude:toFloat(row.LON)}),
  p.drivers=row.DRIVERS,
  p.birthDate=date(row.BIRTHDATE)
  WITH row,p
  MERGE (r:Race {type: row.RACE})
  MERGE (p)-[:HAS_RACE]->(r)
  WITH row,p
  MERGE (e:Ethnicity {type: row.ETHNICITY})
  MERGE (p)-[:HAS_ETHNICITY]->(e)
  WITH row,p
    WHERE row.ZIP IS NOT NULL
  MERGE (z:ZipCode {zip: row.ZIP})
  MERGE (p)-[:HAS_ZIPCODE]->(z)
} IN TRANSACTIONS OF 1000 ROWS
"""

file_name = "patients.csv"
params = dict(url=path + file_name)

gds.run_cypher(patients_query, params)

In [8]:
# Schema for the encounter load: a plain (non-unique) index on Encounter.id and
# uniqueness constraints on SNOMED_CT.code, Provider.id and Organization.id.
# Each statement uses IF NOT EXISTS, so the cell can be run again.
encounter_schema_statements = (
    "CREATE INDEX encounter_id IF NOT EXISTS FOR (n:Encounter) ON n.id",
    "CREATE CONSTRAINT snomed_id IF NOT EXISTS FOR (n:SNOMED_CT) REQUIRE n.code IS UNIQUE",
    "CREATE CONSTRAINT provider_id IF NOT EXISTS FOR (n:Provider) REQUIRE n.id IS UNIQUE",
    "CREATE CONSTRAINT organization_id IF NOT EXISTS FOR (n:Organization) REQUIRE n.id IS UNIQUE",
)

for schema_statement in encounter_schema_statements:
    gds.run_cypher(schema_statement)

In [10]:
# Import encounters.csv.
#
# For every CSV row the query:
#   * merges an Encounter node keyed by the row's Id, sets its start date and
#     description and marks it isEnd = false;
#   * sets the `end` property when STOP is non-empty;
#   * links the encounter to a SNOMED_CT node (OF_TYPE) when CODE is non-empty;
#   * calls apoc.create.setLabels to give the node the labels 'Encounter' and
#     the row's ENCOUNTERCLASS (the APOC procedure must be available on the
#     server);
#   * links the Patient (HAS_ENCOUNTER), the Provider (HAS_PROVIDER) and, when
#     ORGANIZATION is non-empty, the Organization (HAS_ORGANIZATION).
# Rows are committed in batches of 1000.
encounters_query = """
LOAD CSV WITH HEADERS from $url as row
CALL {
    WITH row
    MERGE(e:Encounter {id: row.Id})
    SET
    e.date=datetime(row.START),
    e.description=row.DESCRIPTION,
    e.isEnd = false
    FOREACH (ignore in CASE WHEN row.STOP IS NOT NULL AND row.STOP <> '' THEN [1] ELSE [] END |
      SET e.end=datetime(row.STOP)
    )
    FOREACH (ignore in CASE WHEN row.CODE IS NOT NULL AND row.CODE <> '' THEN [1] ELSE [] END |
      MERGE(s:SNOMED_CT {code:row.CODE})
      MERGE(e)-[:OF_TYPE]->(s)
    )
    WITH row,e
    CALL apoc.create.setLabels( e, [ 'Encounter', row.ENCOUNTERCLASS ] ) YIELD node
    WITH row,e
    MERGE(p:Patient {id: row.PATIENT})
    MERGE (p)-[:HAS_ENCOUNTER]->(e)
    WITH row,e
    MERGE (provider:Provider {id:row.PROVIDER})
    MERGE(e)-[:HAS_PROVIDER]->(provider)
    FOREACH (ignore in CASE WHEN row.ORGANIZATION IS NOT
    NULL AND row.ORGANIZATION <> '' THEN [1] ELSE [] END |
      MERGE (o:Organization {id: row.ORGANIZATION})
      MERGE (e)-[:HAS_ORGANIZATION]->(o))
} IN TRANSACTIONS OF 1000 ROWS
"""

file_name = "encounters.csv"
params = dict(url=path + file_name)

gds.run_cypher(encounters_query, params)

In [11]:
# Uniqueness constraint on Specialty.name, the key the provider load merges on.
specialty_constraint = "CREATE CONSTRAINT specialty_id IF NOT EXISTS FOR (n:Specialty) REQUIRE n.name IS UNIQUE"

gds.run_cypher(specialty_constraint)

In [12]:
# Import providers.csv.
#
# For every CSV row the query:
#   * merges a Provider node keyed by the row's Id and sets its name, gender,
#     address, state and a latitude/longitude point;
#   * merges the Organization named in the row and links it (BELONGS_TO);
#   * merges a Specialty node from the SPECIALITY column and links it
#     (HAS_SPECIALTY);
#   * merges a ZipCode node and links it, only when the row has a ZIP value.
# Rows are committed in batches of 1000.
providers_query = """
LOAD CSV WITH HEADERS from $url as row
CALL {
    WITH row
    MERGE (p:Provider {id: row.Id})
    SET p.name=row.NAME,
    p.gender=row.GENDER,
    p.address = row.ADDRESS,
    p.state = row.STATE,
    p.location = point({latitude:toFloat(row.LAT),
                        longitude:toFloat(row.LON)})
    WITH row,p
    MERGE (o:Organization {id: row.ORGANIZATION})
    MERGE(p)-[:BELONGS_TO]->(o)
    WITH row,p
    MERGE (s:Specialty {name: row.SPECIALITY})
    MERGE (p)-[:HAS_SPECIALTY]->(s)
    WITH row,p
      WHERE row.ZIP IS NOT NULL
    MERGE (z:ZipCode {zip: row.ZIP})
    MERGE (p)-[:HAS_ZIPCODE]->(z)
} IN TRANSACTIONS OF 1000 ROWS
"""

file_name = "providers.csv"
params = dict(url=path + file_name)

gds.run_cypher(providers_query, params)

In [13]:
# Import organizations.csv.
#
# For every CSV row the query merges an Organization node keyed by the row's
# Id, sets its name, address, state and a latitude/longitude point, and - only
# when the row has a ZIP value - merges a ZipCode node and links it
# (HAS_ZIPCODE). Rows are committed in batches of 1000.
organizations_query = """
 LOAD CSV WITH HEADERS from $url as row
CALL {
  WITH row
    MERGE (o:Organization {id: row.Id})
    SET o.name=row.NAME,
    o.address = row.ADDRESS,
    o.state = row.STATE,
    o.location = point({latitude:toFloat(row.LAT),
                        longitude:toFloat(row.LON)})
    WITH row,o
      WHERE row.ZIP IS NOT NULL
    MERGE (z:ZipCode {zip: row.ZIP})
    MERGE (o)-[:HAS_ZIPCODE]->(z)
} IN TRANSACTIONS OF 1000 ROWS
"""

file_name = "organizations.csv"
params = dict(url=path + file_name)

gds.run_cypher(organizations_query, params)

In [14]:
# Load medication data from CSV file.
#
# For every CSV row the query:
#   * merges the Patient and a Drug node keyed by the row's CODE, and sets the
#     drug description;
#   * merges the start Encounter (id = ENCOUNTER, isEnd = false) and links it
#     to the drug (HAS_DRUG) and to the patient (HAS_ENCOUNTER);
#   * when REASONCODE is non-empty, merges the SNOMED_CT node for that reason
#     code, labels it Diagnosis and links it from the encounter (HAS_DIAGNOSIS);
#   * when STOP is non-empty, creates an end Encounter (isEnd = true) dated at
#     STOP and links it to the patient, the drug and the start encounter.
# Rows are committed in batches of 1000.
#
# Fixes compared with the previous version of this query:
#   * the Diagnosis node is keyed by REASONCODE; it used to be keyed by CODE,
#     which is the drug code used for the Drug node above;
#   * `d` is carried through the WITH clause; it used to be dropped there, so
#     CREATE (pe)-[:HAS_DRUG]->(d) attached a new, unlabeled node instead of
#     the Drug.
# NOTE(review): the end-encounter part uses CREATE, so running this cell more
# than once against the same database adds the end encounters again.

file_name = 'medications.csv'
params = {'url': path + file_name}

gds.run_cypher("""
LOAD CSV WITH HEADERS from $url as row
CALL {
    WITH row
    MERGE (p:Patient {id:row.PATIENT})
    MERGE (d:Drug {code:row.CODE})
    SET d.description=row.DESCRIPTION
    MERGE (ps:Encounter {id:row.ENCOUNTER, isEnd: false})
    MERGE (ps)-[:HAS_DRUG]->(d)
    MERGE (p)-[:HAS_ENCOUNTER]->(ps)
    FOREACH( ignore in CASE WHEN row.REASONCODE IS NOT NULL AND
    row.REASONCODE <> '' THEN [1] ELSE [] END |
      MERGE(s:SNOMED_CT {code:row.REASONCODE})
      SET s:Diagnosis, s.description = row.REASONDESCRIPTION
      MERGE (ps)-[:HAS_DIAGNOSIS]->(s)
    )
    WITH row,ps,p,d
      WHERE row.STOP IS NOT NULL and row.STOP <> ''
    CREATE (pe:Encounter {id:row.ENCOUNTER, date:datetime(row.STOP)})
    SET pe.isEnd=true
    CREATE (p)-[:HAS_ENCOUNTER]->(pe)
    CREATE (pe)-[:HAS_DRUG]->(d)
    CREATE (ps)-[:HAS_END]->(pe)
} IN TRANSACTIONS OF 1000 ROWS
""", params)

In [15]:
# Import conditions.csv.
#
# For every CSV row whose patient already exists (MATCH, not MERGE) the query:
#   * merges the SNOMED_CT node for the row's CODE, sets its description and
#     adds the Condition label;
#   * merges the start Encounter (id = ENCOUNTER, isEnd = false), setting its
#     date from START only when the node is newly created, and links it to the
#     patient (HAS_ENCOUNTER) and the condition (HAS_CONDITION);
#   * when STOP is non-empty, merges an end Encounter keyed by the encounter id
#     and the STOP datetime, marks it isEnd = true and links it to the patient,
#     the condition and the start encounter (HAS_END).
# Rows are committed in batches of 1000.
conditions_query = """
LOAD CSV WITH HEADERS from $url as row
CALL {
    WITH row
    MATCH (p:Patient {id:row.PATIENT})
    MERGE (c:SNOMED_CT {code:row.CODE})
    SET c.description=row.DESCRIPTION, c:Condition
    MERGE (cs:Encounter {id:row.ENCOUNTER, isEnd: false})
      ON CREATE
      SET cs.date=datetime(row.START)
    MERGE (p)-[:HAS_ENCOUNTER]->(cs)
    MERGE (cs)-[:HAS_CONDITION]->(c)
    WITH p,c,cs,row
      WHERE row.STOP IS NOT NULL and row.STOP <> ''
    MERGE (ce:Encounter {id:row.ENCOUNTER,
                         date:datetime(row.STOP)})
    SET ce.isEnd=true
    MERGE (p)-[:HAS_ENCOUNTER]->(ce)
    MERGE (ce)-[:HAS_CONDITION]->(c)
    MERGE (cs)-[:HAS_END]->(ce)
} IN TRANSACTIONS OF 1000 ROWS
"""

file_name = "conditions.csv"
params = dict(url=path + file_name)

gds.run_cypher(conditions_query, params)

In [16]:
# Import procedures.csv.
#
# For every CSV row whose patient already exists (MATCH, not MERGE) the query
# merges the SNOMED_CT node for the row's CODE, sets its description and adds
# the Procedure label, then merges the Encounter (id = ENCOUNTER,
# isEnd = false; date taken from START only on creation) and links it to the
# patient (HAS_ENCOUNTER) and the procedure (HAS_PROCEDURE).
# Rows are committed in batches of 1000.
procedures_query = """
LOAD CSV WITH HEADERS from $url as row
CALL {
    WITH row
    MATCH (p:Patient {id:row.PATIENT})
    MERGE (c:SNOMED_CT {code:row.CODE})
    SET c.description=row.DESCRIPTION, c:Procedure
    MERGE (cs:Encounter {id:row.ENCOUNTER, isEnd: false})
      ON CREATE
      SET cs.date=datetime(row.START)
    MERGE (p)-[:HAS_ENCOUNTER]->(cs)
    MERGE (cs)-[:HAS_PROCEDURE]->(c)
} IN TRANSACTIONS OF 1000 ROWS
"""

file_name = "procedures.csv"
params = dict(url=path + file_name)

gds.run_cypher(procedures_query, params)

In [17]:
# Import allergies.csv.
#
# For every CSV row whose patient already exists (MATCH, not MERGE) the query:
#   * merges the SNOMED_CT node for the row's CODE, sets its description and
#     adds the Allergy label;
#   * merges the start Encounter (id = ENCOUNTER, isEnd = false; date taken
#     from START only on creation) and links it to the patient (HAS_ENCOUNTER)
#     and the allergy (ALLERGY_STARTED);
#   * when STOP is non-empty, merges an end Encounter keyed by the encounter id
#     and the STOP datetime, marks it isEnd = true and links it to the patient,
#     the allergy (ALLERGY_ENDED) and the start encounter (HAS_END).
# Rows are committed in batches of 1000.
allergies_query = """
LOAD CSV WITH HEADERS from $url as row
CALL {
    WITH row
    MATCH (p:Patient {id:row.PATIENT})
    MERGE (c:SNOMED_CT {code:row.CODE})
    SET c.description=row.DESCRIPTION, c:Allergy
    MERGE (cs:Encounter {id:row.ENCOUNTER, isEnd: false})
      ON CREATE
      SET cs.date=datetime(row.START)
    MERGE (p)-[:HAS_ENCOUNTER]->(cs)
    MERGE (cs)-[:ALLERGY_STARTED]->(c)
    WITH p,c,cs,row
      WHERE row.STOP IS NOT NULL and row.STOP <> ''
    MERGE (ce:Encounter {id:row.ENCOUNTER,
                         date:datetime(row.STOP)})
    SET ce.isEnd=true
    MERGE (p)-[:HAS_ENCOUNTER]->(ce)
    MERGE (ce)-[:ALLERGY_ENDED]->(c)
    MERGE (cs)-[:HAS_END]->(ce)
} IN TRANSACTIONS OF 1000 ROWS
"""

file_name = "allergies.csv"
params = dict(url=path + file_name)

gds.run_cypher(allergies_query, params)

In [18]:
# Import careplans.csv.
#
# For every CSV row whose patient already exists (MATCH, not MERGE) the query:
#   * merges a CarePlan node whose `code` property holds the row's Id;
#   * merges the SNOMED_CT node for the row's CODE, sets its description, adds
#     the Care label and links it from the care plan (HAS_CARE_TYPE);
#   * merges the start Encounter (id = ENCOUNTER, isEnd = false; date taken
#     from START only on creation) and links it to the care type
#     (HAS_CARE_TYPE), the patient (HAS_ENCOUNTER) and the care plan
#     (CARE_PLAN_START);
#   * when STOP is non-empty, creates an end Encounter dated at STOP, stores
#     the row's CODE on it, marks it isEnd = true and links it to the patient,
#     the care plan (CARE_PLAN_END) and the start encounter (HAS_END).
# Rows are committed in batches of 1000.
careplans_query = """
LOAD CSV WITH HEADERS from $url as row
CALL {
    WITH row
    MATCH (p:Patient {id:row.PATIENT})
    MERGE (cp:CarePlan {code:row.Id})
    MERGE (c:SNOMED_CT {code:row.CODE})
      SET c.description=row.DESCRIPTION, c:Care
    MERGE (cp)-[:HAS_CARE_TYPE]->(c)
    MERGE (cs:Encounter {id:row.ENCOUNTER, isEnd: false})
      ON CREATE
      SET cs.date=datetime(row.START)
    MERGE (cs)-[:HAS_CARE_TYPE]->(c)
    MERGE (p)-[:HAS_ENCOUNTER]->(cs)
    MERGE (cs)-[:CARE_PLAN_START]->(cp)
    WITH p,cp,cs,row
      WHERE row.STOP IS NOT NULL and row.STOP <> ''
    CREATE (ce:Encounter {id:row.ENCOUNTER, date:datetime(row.STOP)})
      SET ce.code=row.CODE, ce.isEnd=true
    MERGE (p)-[:HAS_ENCOUNTER]->(ce)
    MERGE (ce)-[:CARE_PLAN_END]->(cp)
    MERGE (cs)-[:HAS_END]->(ce)
} IN TRANSACTIONS OF 1000 ROWS
"""

file_name = "careplans.csv"
params = dict(url=path + file_name)

gds.run_cypher(careplans_query, params)