In [None]:
import pandas as pd
from nebulagraph_lite import nebulagraph_let as ng_let
import os, math

# Data loading


In [None]:
#primekg = pd.read_csv("~/scratch-llm/data/PrimeKG_data/raw_data/kg.csv", low_memory=False)
nodes = pd.read_csv("~/scratch-llm/data/PrimeKG/datasets/data/kg/nodes.csv", low_memory=False)
edges = pd.read_csv("~/scratch-llm/data/PrimeKG/datasets/data/kg/edges.csv")

In [2]:
# load NebulaGraph JupyterNotebook extension
# !udocker pull vesoft/nebula-metad:v3
# !udocker create --name=nebula-metad vesoft/nebula-metad:v3
# !udocker setup --execmode=F1 nebula-metad

# !udocker pull vesoft/nebula-graphd:v3
# !udocker create --name=nebula-graphd vesoft/nebula-graphd:v3
# !udocker setup --execmode=F1 nebula-graphd

# !udocker pull vesoft/nebula-storaged:v
# !udocker create --name=nebula-storaged vesoft/nebula-storaged:v3
# !udocker setup --execmode=F1 nebula-storaged

n = ng_let()
n.start() # This takes around 5 mins

[1;3;38;2;47;75;124mMessage: Activating storaged...[0m
[1;3;38;2;102;81;145mResult of `SHOW HOSTS`:[0m
[1;3;38;2;47;75;124m    errors:[0m
[1;3;38;2;47;75;124m        code: 0[0m
[1;3;38;2;102;81;145m    results:[0m
[1;3;38;2;47;75;124m        spaceName: [0m
[1;3;38;2;102;81;145m        data:[0m
[1;3;38;2;47;75;124m            meta:[0m
[1;3;38;2;47;75;124m                None, None, None, None, None, None, None[0m
[1;3;38;2;102;81;145m            row:[0m
[1;3;38;2;102;81;145m                127.0.0.1, 9779, ONLINE, 121, PrimeKG:100, PrimeKG_manual:20, basketballplayer:1, PrimeKG:100, PrimeKG_manual:20, basketballplayer:1, 3.8.0[0m
[1;3;38;2;160;81;149m        columns:[0m
[1;3;38;2;160;81;149m            Host, Port, Status, Leader count, Leader distribution, Partition distribution, Version[0m
[1;3;38;2;212;80;135m        errors:[0m
[1;3;38;2;47;75;124m            code: 0[0m
[1;3;38;2;249;93;106m        latencyInUs: 1004[0m
[1;3;38;2;168;255;159mInfo: loadi

In [None]:
%load_ext ngql

In [3]:
%reload_ext ngql

In [4]:
%ngql --address 127.0.0.1 --port 9669 --user root --password nebula

[1;3;38;2;0;135;107m[OK] Connection Pool Created[0m


Unnamed: 0,Name
0,PrimeKG
1,PrimeKG_manual
2,basketballplayer


In [None]:
# Create a new Graph space (graph) for the PrimeKG
%ngql CREATE SPACE IF NOT EXISTS PrimeKG (vid_type = INT64);
%ngql USE PrimeKG;

# Create the graph from NebulaGraph directly

## Nodes

In [None]:
print(nodes['node_type'].unique()) #types of nodes

In [None]:
# save the data for each 'node_type' in a separate file
for node_type in nodes['node_type'].unique():
    sanitized_node_type = node_type.replace('/', '_')
    output_path = os.path.join('~/scratch-llm/data/PrimeKG_data/sub_data/', 'node_'+ sanitized_node_type + '.csv')
    nodes[nodes['node_type'] == node_type].to_csv(output_path, index=False)

In [None]:
# Create Tags (node_type) for each node in the PrimeKG
%ngql CREATE TAG IF NOT EXISTS anatomy(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS gene_protein(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS drug(node_name string, node_source string, node_id string, description string, half_life string, indication string, mechanism_of_action string, protein_binding string, pharmacodynamics string, state string, atc_1 string, atc_2 string, atc_3 string, atc_4 string, category string, group string, pathway string, molecular_weight string, tpsa string, clogp string);
%ngql CREATE TAG IF NOT EXISTS disease(node_name string, node_source string, mondo_id int, mondo_name string, group_id_bert string, group_name_bert string, mondo_definition string, umls_description string, orphanet_definition string, orphanet_prevalence string, orphanet_epidemiology string, orphanet_clinical_description string, orphanet_management_and_treatment string, mayo_symptoms string, mayo_causes string, mayo_risk_factors string, mayo_complications string, mayo_prevention string, mayo_see_doc string);
%ngql CREATE TAG IF NOT EXISTS pathway(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS biological_process(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS effect_phenotype(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS molecular_function(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS cellular_component(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS exposure(node_name string, node_source string, node_id string);

In [None]:
%ngql DESCRIBE TAG drug;

In [None]:
# Load each node source data into the corresponding Tag (working fine)
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_gene_protein.csv --tag gene_protein --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_biological_process.csv --tag biological_process --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_effect_phenotype.csv --tag effect_phenotype --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_molecular_function.csv --tag molecular_function --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_cellular_component.csv --tag cellular_component --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_pathway.csv --tag pathway --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_exposure.csv --tag exposure --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_anatomy.csv --tag anatomy --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source


### Extra information for nodes 'drug' and 'disease'

In [None]:
# disease and drug feature files are available as .tab format so they need to be converted to .csv
# many formatting issues on both files that had to be correcteed manually before merging with the 'node_' dataset
disease_tab = "~/scratch-llm/data/PrimeKG_data/raw_data/disease_features.tab"
drug_tab = "~/scratch-llm/data/PrimeKG_data/raw_data/drug_features.tab"

df = pd.read_csv(disease_tab, delimiter='\t')
disease_csv = "~/scratch-llm/data/PrimeKG_data/sub_data/disease_features.csv"
df.to_csv(disease_csv, index=False)


df = pd.read_csv(drug_tab, delimiter='\t')
drug_csv = "~/scratch-llm/data/PrimeKG_data/sub_data/drug_features.csv"
df.to_csv(drug_csv, index=False) 

In [None]:
# merge extra drug and disease information 
node_drug = pd.read_csv("~/scratch-llm/data/PrimeKG_data/sub_data/node_drug.csv")
features_drug = pd.read_csv("~/scratch-llm/data/PrimeKG_data/sub_data/drug_features.csv")
node_disease = pd.read_csv("~/scratch-llm/data/PrimeKG_data/sub_data/node_disease.csv")
features_disease = pd.read_csv("~/scratch-llm/data/PrimeKG_data/sub_data/disease_features.csv")

# Merge the dataframes on 'node_index' and save to csv
merged_df = pd.merge(node_drug, features_drug, on='node_index')
merged_df.to_csv("~/scratch-llm/data/PrimeKG_data/sub_data/node_merged_drug.csv", index=False)
merged_df = pd.merge(node_disease, features_disease, on='node_index')
merged_df.to_csv("~/scratch-llm/data/PrimeKG_data/sub_data/node_merged_disease.csv", index=False)


In [None]:
#Load the extended drug and disease data into the corresponding Tags
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/merged_disease.csv --tag disease --header --space PrimeKG --vid 0 --props 3:node_name,4:node_source,5:mondo_id,6:mondo_name,7:group_id_bert,8:group_name_bert,9:mondo_definition,10:umls_description,11:orphanet_definition,12:orphanet_prevalence,13:orphanet_epidemiology,14:orphanet_clinical_description,15:orphanet_management_and_treatment,16:mayo_symptoms,17:mayo_causes,18:mayo_risk_factors,19:mayo_complications,20:mayo_prevention,21:mayo_see_doc
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_merged_drug.csv --tag drug --header --space PrimeKG --batch 100 --vid 0 --props 2:node_id,3:node_name,4:node_source,5:description,6:half_life,7:indication,8:mechanism_of_action,9:protein_binding,10:pharmacodynamics,11:state,12:atc_1,13:atc_2,14:atc_3,15:atc_4,16:category,17:group,18:pathway,19:molecular_weight,20:tpsa,21:clogp


## Edges

In [None]:
print(edges['relation'].unique()) # types of edges

In [None]:
# save the data for each 'relation' (edge type) in a separate file
for relation in edges['relation'].unique():
    sanitized_relation = relation.replace('-', '_')
    sanitized_relation = sanitized_relation.replace('\t', '_')
    output_path = os.path.join('~/scratch-llm/data/PrimeKG_data/sub_data/', 'edge_'+ sanitized_relation + '.csv')
    edges[edges['relation'] == relation].to_csv(output_path, index=False)

In [None]:
%ngql CREATE EDGE IF NOT EXISTS protein_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS drug_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS contraindication(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS indication(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS off_label_use(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS drug_drug(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS phenotype_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS phenotype_phenotype(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS disease_phenotype_negative(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS disease_phenotype_positive(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS disease_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS disease_disease(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS drug_effect(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS bioprocess_bioprocess(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS molfunc_molfunc(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS cellcomp_cellcomp(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS molfunc_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS cellcomp_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS bioprocess_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_disease(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_exposure(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_bioprocess(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_molfunc(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_cellcomp(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS pathway_pathway(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS pathway_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS anatomy_anatomy(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS anatomy_protein_present(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS anatomy_protein_absent(display_relation string);

In [None]:
# for each edge type, load the data accordingly, this takes like 20mins from new
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_protein_protein.csv --space PrimeKG --header --edge protein_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_drug_protein.csv --space PrimeKG --header --edge drug_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_contraindication.csv --space PrimeKG --header --edge contraindication --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_indication.csv --space PrimeKG --header --edge indication --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_off_label_use.csv --space PrimeKG --header --edge off_label_use --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_drug_drug.csv --space PrimeKG --header --edge drug_drug --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_phenotype_protein.csv --space PrimeKG --header --edge phenotype_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_phenotype_phenotype.csv --space PrimeKG --header --edge phenotype_phenotype --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_phenotype_negative.csv --space PrimeKG --header --edge disease_phenotype_negative --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_phenotype_positive.csv --space PrimeKG --header --edge disease_phenotype_positive --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_protein.csv --space PrimeKG --header --edge disease_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_disease.csv --space PrimeKG --header --edge disease_disease --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_drug_effect.csv --space PrimeKG --header --edge drug_effect --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_bioprocess_bioprocess.csv --space PrimeKG --header --edge bioprocess_bioprocess --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_molfunc_molfunc.csv --space PrimeKG --header --edge molfunc_molfunc --src 2 --dst 3 --props 1:display_relation


In [None]:
# load the edge data in two batches
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_cellcomp_cellcomp.csv --space PrimeKG --header --edge cellcomp_cellcomp --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_molfunc_protein.csv --space PrimeKG --header --edge molfunc_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_cellcomp_protein.csv --space PrimeKG --header --edge cellcomp_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_bioprocess_protein.csv --space PrimeKG --header --edge bioprocess_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_protein.csv --space PrimeKG --header --edge exposure_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_disease.csv --space PrimeKG --header --edge exposure_disease --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_exposure.csv --space PrimeKG --header --edge exposure_exposure --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_bioprocess.csv --space PrimeKG --header --edge exposure_bioprocess --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_molfunc.csv --space PrimeKG --header --edge exposure_molfunc --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_cellcomp.csv --space PrimeKG --header --edge exposure_cellcomp --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_pathway_pathway.csv --space PrimeKG --header --edge pathway_pathway --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_pathway_protein.csv --space PrimeKG --header --edge pathway_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_anatomy_anatomy.csv --space PrimeKG --header --edge anatomy_anatomy --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_anatomy_protein_present.csv --space PrimeKG --batch 500 --header --edge anatomy_protein_present --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_anatomy_protein_absent.csv --space PrimeKG --header --edge anatomy_protein_absent --src 2 --dst 3 --props 1:display_relation


## Checks

In [None]:
%ngql DESCRIBE TAG drug;

In [None]:
#small check-up, with extra drug and disease information for nodes
%ngql GET SUBGRAPH WITH PROP 1 STEPS FROM 14045 OUT drug_protein YIELD VERTICES AS nodes, EDGES AS relationships;

# Load directly to NebulaPropertyGraphStore

In [5]:
from llama_index.core.schema import NodeRelationship, TextNode, IndexNode, RelatedNodeInfo
from llama_index.graph_stores.nebula import NebulaPropertyGraphStore
from llama_index.core import (
    StorageContext,
)

define files to use:

In [6]:
data_dir = '~/scratch-llm/data/PrimeKG_data/sub_data/'

# List of node types and their corresponding CSV files
node_files = {
    'node_gene_protein.csv',
    'node_merged_drug.csv', # manually corrected and generated from "extra information section"
    'node_effect_phenotype.csv',
    'node_merged_disease.csv', # manually corrected and generated from "extra information section"
    'node_biological_process.csv',
    'node_molecular_function.csv',
    'node_cellular_component.csv',
    'node_exposure.csv',
    'node_pathway.csv',
    'node_anatomy.csv'
}

edge_files ={
    'edge_protein_protein.csv',
    'edge_drug_protein.csv',
    'edge_contraindication.csv',
    'edge_indication.csv',
    'edge_off_label_use.csv',
    'edge_drug_drug.csv', # around 40 mins
    'edge_phenotype_protein.csv',
    'edge_phenotype_phenotype.csv',
    'edge_disease_phenotype_negative.csv',
    'edge_disease_phenotype_positive.csv',
    'edge_disease_protein.csv',
    'edge_disease_disease.csv',
    'edge_drug_effect.csv',
    'edge_bioprocess_bioprocess.csv',
    'edge_molfunc_molfunc.csv',
    'edge_cellcomp_cellcomp.csv',
    'edge_molfunc_protein.csv',
    'edge_cellcomp_protein.csv',
    'edge_bioprocess_protein.csv',
    'edge_exposure_protein.csv',
    'edge_exposure_disease.csv',
    'edge_exposure_exposure.csv',
    'edge_exposure_bioprocess.csv',
    'edge_exposure_molfunc.csv',
    'edge_exposure_cellcomp.csv',
    'edge_pathway_pathway.csv',
    'edge_pathway_protein.csv',
    'edge_anatomy_anatomy.csv',
    'edge_anatomy_protein_present.csv', #around 90 mins
    'edge_anatomy_protein_absent.csv'
}

## Nodes

In [None]:
# create TextNodes
def create_text_nodes(file_path):
    df = pd.read_csv(file_path)
    text_nodes = []
    for _, row in df.iterrows():
        id = row['node_index']
        if len(row) > 5: # longer text info (drug/disease)
            metadata_columns = ['node_index', 'node_id', 'node_type', 'node_name', 
                            'node_source', 'mondo_id', 'mondo_name', 'group_id_bert', 
                            'group_name_bert', 'orphanet_prevalence', 'orphanet_epidemiology']
            text_columns = ['mondo_definition', 'umls_description', 'orphanet_definition', 
                        'orphanet_clinical_description', 'orphanet_management_and_treatment', 
                        'mayo_symptoms', 'mayo_causes', 'mayo_risk_factors', 'mayo_complications',
                        'mayo_prevention', 'mayo_see_doc','description', 'indication', 'mechanism_of_action',
                        'half_life', 'protein_binding', 'pharmacodynamics', 'state', 'atc_1', 'atc_2', 'atc_3', 
                        'atc_4', 'category', 'group', 'pathway', 'molecular_weight', 'tpsa', 'clogp'] 
            available_metadata_columns = [col for col in metadata_columns if col in df.columns]
            available_text_columns = [col for col in text_columns if col in df.columns] 

            # Extract metadata, not including empty entries
            metadata = {
                col: row[col] for col in available_metadata_columns
                if col in row and not (isinstance(row[col], float) and math.isnan(row[col]))
            }

            # Extract and concatenate text values, not including empty entries
            text_entries = [str(row[col]) for col in available_text_columns if pd.notna(row[col])]
            text = "\n".join(text_entries) 

            text_node = TextNode(id_= str(id), text=text, metadata=metadata)
            text_nodes.append(text_node)

        else: # no text data, all other files
            metadata = row.drop(['node_index']).to_dict()
            text_node = TextNode(id_= str(id), metadata=metadata)
            text_nodes.append(text_node)

    return text_nodes


In [128]:
# generate all nodes from the node files
all_nodes = []
for file in node_files:
    nodes = create_text_nodes(data_dir+file)
    all_nodes.extend(nodes)


In [None]:
all_nodes[1]

'Any osteogenesis imperfecta in which the cause of the disease is a mutation in the BMP1 gene.\nautosomal dominant collagen disease resulting from defective biosynthesis of collagen type I and characterized by brittle, osteoporotic, and easily fractured bones; may also present with blue sclerae, loose joints, and imperfect dentin formation.'

## Edges

In [134]:
# create all edges
from tqdm.notebook import tqdm

def create_all_edges(file_path, all_nodes):
    node_dict = {node.id_: node for node in all_nodes} #speed up search
    df = pd.read_csv(os.path.join(file_path))

    for _, row in tqdm(df.iterrows(), total=len(df), leave=False, desc=f"Processing {file_path}"):
        x_index = str(row['x_index'])

        if x_index in node_dict:
            node = node_dict[x_index]
            if NodeRelationship.CHILD not in node.relationships:
                node.relationships[NodeRelationship.CHILD] = []
            y_index = str(row['y_index'])
            metadata = row.drop(['x_index', 'y_index']).to_dict()
            
            # check that the relationship is not there already
            if not any(isinstance(r, RelatedNodeInfo) and r.node_id == y_index for r in node.relationships[NodeRelationship.CHILD]):
                new_relationship = RelatedNodeInfo(node_id=y_index, metadata=metadata)
                node.relationships[NodeRelationship.CHILD].append(new_relationship)

In [135]:
# all relationships for all nodes, takes around 120 mins
for file in tqdm(edge_files, desc='Procesing edge files'):
    create_all_edges(data_dir+file, all_nodes)

Procesing edge files:   0%|          | 0/30 [00:00<?, ?it/s]

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_molfunc.csv:   0%|          | 0/90 [00:00<?,…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_drug_drug.csv:   0%|          | 0/2672628 [00:00<?, ?…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_molfunc_protein.csv:   0%|          | 0/139060 [00:00…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_phenotype_phenotype.csv:   0%|          | 0/37472 [00…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_cellcomp_protein.csv:   0%|          | 0/166804 [00:0…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_anatomy_protein_present.csv:   0%|          | 0/30364…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_off_label_use.csv:   0%|          | 0/5144 [00:00<?, …

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_disease.csv:   0%|          | 0/4608 [00:00<…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_exposure.csv:   0%|          | 0/4140 [00:00…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_cellcomp.csv:   0%|          | 0/20 [00:00<?…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_drug_effect.csv:   0%|          | 0/129568 [00:00<?, …

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_bioprocess_protein.csv:   0%|          | 0/289610 [00…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_drug_protein.csv:   0%|          | 0/51306 [00:00<?, …

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_molfunc_molfunc.csv:   0%|          | 0/27148 [00:00<…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_indication.csv:   0%|          | 0/18542 [00:00<?, ?i…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_cellcomp_cellcomp.csv:   0%|          | 0/9690 [00:00…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_protein_protein.csv:   0%|          | 0/642150 [00:00…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_bioprocess_bioprocess.csv:   0%|          | 0/105772 …

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_bioprocess.csv:   0%|          | 0/3250 [00:…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_disease.csv:   0%|          | 0/62914 [00:00<…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_phenotype_negative.csv:   0%|          | 0/23…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_anatomy_protein_absent.csv:   0%|          | 0/39774 …

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_phenotype_protein.csv:   0%|          | 0/6660 [00:00…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_protein.csv:   0%|          | 0/161052 [00:00…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_protein.csv:   0%|          | 0/2424 [00:00<…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_contraindication.csv:   0%|          | 0/61060 [00:00…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_phenotype_positive.csv:   0%|          | 0/30…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_pathway_protein.csv:   0%|          | 0/85292 [00:00<…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_anatomy_anatomy.csv:   0%|          | 0/28064 [00:00<…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_pathway_pathway.csv:   0%|          | 0/5070 [00:00<?…

In [None]:
# save `all_nodes` with pickle
import pickle
with open(os.path.expanduser('~/scratch-llm/storage/all_nodes.pkl'), 'wb') as f:
    pickle.dump(all_nodes, f)

In [139]:
# load from pickle
import pickle
with open(os.path.expanduser('~/scratch-llm/storage/all_nodes.pkl'), 'rb') as f:
    nodes = pickle.load(f)
    

In [172]:
nodes[3]

TextNode(id_='27158', embedding=None, metadata={'node_index': 27158, 'node_id': '13924_12592_14672_13460_12591_12536_30861_8146_8148_32846_13459_44329_14544_9805_49223_9804_14086_8147_13515_14029_12581_19019', 'node_type': 'disease', 'node_name': 'osteogenesis imperfecta', 'node_source': 'MONDO_grouped', 'mondo_id': 13924, 'mondo_name': 'osteogenesis imperfecta type 13', 'group_id_bert': '13924_12592_14672_13460_12591_12536_30861_8146_8148_32846_13459_44329_14544_9805_49223_9804_14086_8147_13515_14029_12581_19019', 'group_name_bert': 'osteogenesis imperfecta'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text='Any osteogenesis imperfecta in which the cause of the disease is a mutation in the BMP1 gene.\nA group of usually autosomal dominant inherited disorders characterized by defective synthesis of collagen type I resulting in defective collagen formation. It is characterized by brittle 

check ups that edges are correctly generated:

In [17]:
%ngql USE PrimeKG;

In [173]:
%ngql GET SUBGRAPH WITH PROP 1 STEPS FROM 27158 OUT disease_disease YIELD VERTICES AS nodes, EDGES AS relationships;

Unnamed: 0,nodes,relationships
0,"[(27158 :disease{group_id_bert: ""13924_12592_14672_13460_12591_12536_30861_8146_8148_32846_13459_44329_14544_9805_49223_9804_14086_8147_13515_14029_12581_19019"", group_name_bert: ""osteogenesis imperfecta"", mayo_causes: __NULL__, mayo_complications: __NULL__, mayo_prevention: __NULL__, mayo_risk_factors: __NULL__, mayo_see_doc: __NULL__, mayo_symptoms: __NULL__, mondo_definition: ""Osteogenesis imperfecta (OI) comprises a heterogeneous group of genetic disorders characterized by increased bone fragility, low bone mass, and susceptibility to bone fractures with variable severity."", mondo_id: 19019, mondo_name: ""osteogenesis imperfecta"", node_name: ""osteogenesis imperfecta"", node_source: ""MONDO_grouped"", orphanet_clinical_description: ""Age at diagnosis depends on the severity of the disease. Five clinically distinct types of OI have been identified. The most clinically relevant characteristic of all types of OI is bone fragility, which manifests as multiple spontaneous fractures. Osteogenesis imperfecta type II is lethal, type III is severe, types IV and V are moderate, and type I is mild (see these terms). Type I is nondeforming with normal height or mild short stature, blue sclera, and no dentinogenesis imperfecta (DI; see this term). Patients with type II present multiple rib and long bone fractures at birth, marked deformities, broad long bones, low density on skull X-rays, and dark sclera. The main signs of type III include very short stature, a triangular face, severe scoliosis, grayish sclera, and DI. Patients with type IV have moderately short stature, mild to moderate scoliosis, grayish or white sclera, and DI. Type V is characterized by mild to moderate short stature, dislocation of the radial head, mineralized interosseous membranes, hyperplastic callus, white sclera, and no DI. Other genetically different types have been observed (types VI to IX) but they are not clinically different from types II-IV."", orphanet_definition: ""Osteogenesis imperfecta (OI) comprises a heterogeneous group of genetic disorders characterized by increased bone fragility, low bone mass, and susceptibility to bone fractures with variable severity."", orphanet_epidemiology: ""Prevalence is estimated at between 1/10000 and 1/20000."", orphanet_management_and_treatment: ""Management should be multidisciplinary involving experienced medical, orthopedic, physiotherapy and rehabilitation specialists. Bisphosphonates with potent antiresorptive properties are now considered as the standard of care for severe forms but do not constitute a cure. Prevention of vitamin D and calcium deficiency is essential throughout life. Surgical management is essential for the correction of bone and spinal deformities and the prevention of long bone fractures (centro-medullary osteosynthesis). Early physiotherapy improves autonomy by helping to evaluate any motor deficits, reducing the risk of falls and encouraging patients to take up a sporting activity."", orphanet_prevalence: ""1-5/10000"", umls_description: ""An abnormality of the musculoskeletal system that is present at birth or detected in the neonatal period.""})]","[(27158)-[:disease_disease@0{display_relation: ""parent-child""}]->(37924), (27158)-[:disease_disease@0{display_relation: ""parent-child""}]->(35636), (27158)-[:disease_disease@0{display_relation: ""parent-child""}]->(35449), (27158)-[:disease_disease@0{display_relation: ""parent-child""}]->(32378), (27158)-[:disease_disease@0{display_relation: ""parent-child""}]->(27261), (27158)-[:disease_disease@0{display_relation: ""parent-child""}]->(39331), (27158)-[:disease_disease@0{display_relation: ""parent-child""}]->(27314), (27158)-[:disease_disease@0{display_relation: ""parent-child""}]->(37565)]"
1,"[(32378 :disease{group_id_bert: __NULL__, group_name_bert: __NULL__, mayo_causes: ""Microcephaly usually is the result of abnormal brain development, which can occur in the womb or during infancy. Microcephaly may be genetic. Other causes may include: Craniosynostosis. The premature fusing of the joints between the bony plates that form an infant's skull keeps the brain from growing. Treating craniosynostosis usually means your infant needs surgery to separate the fused bones. If there are no underlying problems in the brain, this surgery allows the brain adequate space to grow and develop. Chromosomal abnormalities. Down syndrome and other conditions may result in microcephaly. Decreased oxygen to the fetal brain . Certain complications of pregnancy or delivery can impair oxygen delivery to the fetal brain. Infections passed to the fetus during pregnancy. These include toxoplasmosis, cytomegalovirus, German measles, chickenpox and Zika virus. Exposure to drugs, alcohol or certain toxic chemicals in the womb. Any of these put your baby at risk of brain abnormalities. Severe malnutrition. Not getting adequate nutrition during pregnancy can affect your baby's development. Uncontrolled phenylketonuria, also known as PKU, in the mother. PKU is a birth defect that hampers the body's ability to break down the amino acid phenylalanine."", mayo_complications: ""Learning your child has microcephaly can raise questions about future pregnancies. Work with your doctor to determine the cause of the microcephaly. If the cause is genetic, you and may want to talk to a genetics counselor about the risk of microcephaly in future pregnancies."", mayo_prevention: __NULL__, mayo_risk_factors: ""Some children with microcephaly are of normal intelligence and development, even though their heads will always be small for their age and sex. But depending on the cause and severity of the microcephaly, complications may include: Developmental delays, such as in speech and movement, Difficulties with coordination and balance, Dwarfism or short stature, Facial distortions, Hyperactivity, Intellectual disabilities, Seizures"", mayo_see_doc: ""When to see a doctor, Chances are your doctor will detect microcephaly at the baby's birth or at a regular well-baby checkup. However, if you think your baby's head is smaller than normal or isn't growing as it should, talk to your doctor."", mayo_symptoms: ""The primary sign of microcephaly is: A head size significantly smaller than that of other children of the same age and sex, Head size is measured as the distance around the top of the child's head . Using standardized growth charts, the measurement is compared with other children's measurements in percentiles. Some children just have small heads, whose measurement falls as low as the first percentile. In children with microcephaly, head size measures significantly below average, possibly even below the first percentile for your baby's age and sex. A child with more severe microcephaly may also have a backward-sloping forehead."", mondo_definition: ""X-linked microcephaly-growth retardation-prognathism-cryptorchidism syndrome is a rare syndromic intellectual disability characterized by hypotonia, microcephaly, severe developmental delay, seizures, intellectual disability, growth retardation, cardiovascular septal defects, cryptorchidism, hypospadias, and dysmorphic features - prominent ears, prognathism, thin upper lip, dental crowding."", mondo_id: 18569, mondo_name: ""X-linked microcephaly-growth retardation-prognathism-cryptorchidism syndrome"", node_name: ""osteogenesis imperfecta-retinopathy-seizures-intellectual disability syndrome"", node_source: ""MONDO"", orphanet_clinical_description: __NULL__, orphanet_definition: ""X-linked microcephaly-growth retardation-prognathism-cryptorchidism syndrome is a rare syndromic intellectual disability characterized by hypotonia, microcephaly, severe developmental delay, seizures, intellectual disability, growth retardation, cardiac septal defects, cryptorchidism, hypospadias, and dysmorphic features - prominent ears, prognathism, thin upper lip, dental crowding."", orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: ""<1/1000000"", umls_description: __NULL__}), (35449 :disease{group_id_bert: __NULL__, group_name_bert: __NULL__, mayo_causes: __NULL__, mayo_complications: __NULL__, mayo_prevention: __NULL__, mayo_risk_factors: __NULL__, mayo_see_doc: __NULL__, mayo_symptoms: __NULL__, mondo_definition: __NULL__, mondo_id: 15503, mondo_name: ""nose and cavum anomaly"", node_name: ""hereditary connective tissue disorder"", node_source: ""MONDO"", orphanet_clinical_description: __NULL__, orphanet_definition: __NULL__, orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: __NULL__, umls_description: ""An abnormality of the nose. ""}), (37924 :disease{group_id_bert: __NULL__, group_name_bert: __NULL__, mayo_causes: ""Doctors aren't sure what causes lymphoma. But it begins when a disease-fighting white blood cell called a lymphocyte develops a genetic mutation. The mutation tells the cell to multiply rapidly, causing many diseased lymphocytes that continue multiplying. The mutation also allows the cells to go on living when other normal cells would die. This causes too many diseased and ineffective lymphocytes in your lymph nodes and causes the lymph nodes, spleen and liver to swell."", mayo_complications: __NULL__, mayo_prevention: __NULL__, mayo_risk_factors: ""Factors that can increase the risk of lymphoma include: Your age. Some types of lymphoma are more common in young adults, while others are most often diagnosed in people over 55. Being male. Males are slightly more likely to develop lymphoma than are females. Having an impaired immune system. Lymphoma is more common in people with immune system diseases or in people who take drugs that suppress their immune system. Developing certain infections. Some infections are associated with an increased risk of lymphoma, including the Epstein-Barr virus and Helicobacter pylori infection."", mayo_see_doc: ""When to see a doctor, Make an appointment with your doctor if you have any persistent signs or symptoms that worry you."", mayo_symptoms: ""Signs and symptoms of lymphoma may include: Painless swelling of lymph nodes in your neck, armpits or groin, Persistent fatigue, Fever, Night sweats, Shortness of breath, Unexplained weight loss, Itchy skin"", mondo_definition: ""Anaplastic large cell lymphoma (ALCL) is a rare and aggressive peripheral T-cell non-Hodgkin lymphoma, belonging to the group of CD30-positive lymphoproliferative disorders, which affects lymph nodes and extranodal sites. It is comprised of two sub-types, based on the expression of a protein called anaplastic lymphoma kinase (ALK): ALK positive and ALK negative ALCL."", mondo_id: 20325, mondo_name: ""anaplastic large cell lymphoma"", node_name: ""primary bone dysplasia with increased bone density"", node_source: ""MONDO_grouped"", orphanet_clinical_description: ""ALCL is characterized by peripheral, mediastinal, or abdominal lymph node involvement. It manifests with the development of painless and enlarged lymph nodes, especially in the neck or armpit (axillary lymph nodes). General symptoms include loss of appetite and fatigue as well as fever, weight loss, and night sweats (B symptoms). Mediastinal involvement manifests as cough, dypsnea and/or edema. ALCL can also extend to extranodal sites such as the bones, bone marrow, subcutaneous tissue, lungs, spleen and liver."", orphanet_definition: ""A rare and aggressive peripheral T-cell non-Hodgkin lymphoma, belonging to the group of CD30-positive lymphoproliferative disorders, which affects lymph nodes and extranodal sites. It is comprised of two sub-types, based on the expression of a protein called anaplastic lymphoma kinase (ALK): ALK positive and ALK negative ALCL."", orphanet_epidemiology: ""ALCL accounts for approximately 3% of adult non-Hodgkin lymphomas and 10% to 20% of childhood lymphomas. Its prevalence is unknown. The ALK positive subtype usually affects children and young adults. The ALK negative subtype is more commonly found in older patients over the age of 40."", orphanet_management_and_treatment: ""Anthracycline-based chemotherapy, such as CHOP (cyclophosphamide, doxorubicin, vincristine and prednisone) or CHOP-like regimens, constitutes the first-line of treatment. It may only be combined with radiotherapy in stage I/II disease. Pediatric patients have distinct protocols similar to protocols used for B-cell lymphomas, with other drugs such methotrexate, etoposide, and cytarabine being used. High-dose chemotherapy followed by autologous stem cell transplantation can also be performed, usually in cases of relapse or as first line treatment in cases with an adverse prognosis. Antibody-drug conjugate therapy (brentuximab velotin) may be given when at least one chemotherapy regimen is unsuccesful."", orphanet_prevalence: ""1-9/100000"", umls_description: __NULL__}), (27261 :disease{group_id_bert: ""10218_13120_9301_13410_10226_10442_20712_14634_100250_30049_7938_60489_13066_14480"", group_name_bert: ""46,XX sex reversal"", mayo_causes: __NULL__, mayo_complications: __NULL__, mayo_prevention: __NULL__, mayo_risk_factors: __NULL__, mayo_see_doc: __NULL__, mayo_symptoms: __NULL__, mondo_definition: __NULL__, mondo_id: 14480, mondo_name: ""46,XY sex reversal 9"", node_name: ""osteogenesis imperfecta with opalescent teeth, blue sclerae and wormian bones but without fractures"", node_source: ""MONDO"", orphanet_clinical_description: __NULL__, orphanet_definition: __NULL__, orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: __NULL__, umls_description: __NULL__}), (35636 :disease{group_id_bert: __NULL__, group_name_bert: __NULL__, mayo_causes: __NULL__, mayo_complications: __NULL__, mayo_prevention: __NULL__, mayo_risk_factors: __NULL__, mayo_see_doc: __NULL__, mayo_symptoms: __NULL__, mondo_definition: ""A developmental disorder of mental health that categorizes specific learning disabilities and developmental disorders affecting coordination."", mondo_id: 592, mondo_name: ""specific developmental disorder"", node_name: ""osteochondrodysplasia"", node_source: ""MONDO"", orphanet_clinical_description: __NULL__, orphanet_definition: __NULL__, orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: __NULL__, umls_description: __NULL__}), (27314 :disease{group_id_bert: ""7204_14573_16085"", group_name_bert: ""Cole-Carpenter syndrome"", mayo_causes: __NULL__, mayo_complications: __NULL__, mayo_prevention: __NULL__, mayo_risk_factors: __NULL__, mayo_see_doc: __NULL__, mayo_symptoms: __NULL__, mondo_definition: ""Cole-Carpenter syndrome is an extremely rare form of bone dysplasia characterized by the features of osteogenesis imperfecta such as bone fragility associated with multiple fractures, bone deformities (metaphyseal irregularities and bowing of the long bones) and blue sclera, in association with growth failure, craniosynostosis, hydrocephalus, ocular proptosis, and distinctive facial features (e.g. frontal bossing, midface hypoplasia, and micrognathia)."", mondo_id: 16085, mondo_name: ""Cole-Carpenter syndrome"", node_name: ""Cole-Carpenter syndrome"", node_source: ""MONDO_grouped"", orphanet_clinical_description: __NULL__, orphanet_definition: ""An extremely rare form of bone dysplasia characterized by the features of osteogenesis imperfecta such as bone fragility associated with multiple fractures, bone deformities (metaphyseal irregularities and bowing of the long bones) and blue sclera, in association with growth failure, craniosynostosis, hydrocephalus, ocular proptosis, and distinctive facial features (e.g. frontal bossing, midface hypoplasia, and micrognathia)."", orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: ""<1/1000000"", umls_description: ""A group of usually autosomal dominant inherited disorders characterized by defective synthesis of collagen type I resulting in defective collagen formation. It is characterized by brittle and easily fractured bones.""}), (39331 :disease{group_id_bert: __NULL__, group_name_bert: __NULL__, mayo_causes: __NULL__, mayo_complications: __NULL__, mayo_prevention: __NULL__, mayo_risk_factors: __NULL__, mayo_see_doc: __NULL__, mayo_symptoms: __NULL__, mondo_definition: __NULL__, mondo_id: 16881, mondo_name: ""partial deletion of chromosome 19"", node_name: ""brittle bone disorder"", node_source: ""MONDO"", orphanet_clinical_description: __NULL__, orphanet_definition: __NULL__, orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: __NULL__, umls_description: __NULL__}), (37565 :disease{group_id_bert: __NULL__, group_name_bert: __NULL__, mayo_causes: ""Delirium occurs when the normal sending and receiving of signals in the brain become impaired. This impairment is most likely caused by a combination of factors that make the brain vulnerable and trigger a malfunction in brain activity. Delirium may have a single cause or more than one cause, such as a combination of a medical condition and drug toxicity. Sometimes no cause can be identified. Possible causes include: Certain medications or drug toxicity, Alcohol or drug intoxication or withdrawal, A medical condition, such as a stroke, heart attack, worsening lung or liver disease, or an injury from a fall, Metabolic imbalances, such as low sodium or low calcium, Severe, chronic or terminal illness, Fever and acute infection, particularly in children, Urinary tract infection, pneumonia or the flu, especially in older adults, Exposure to a toxin, such as carbon monoxide, cyanide or other poisons, Malnutrition or dehydration, Sleep deprivation or severe emotional distress, Pain, Surgery or other medical procedures that include anesthesia, Several medications or combinations of drugs can trigger delirium, including some types of: Pain drugs, Sleep medications, Medications for mood disorders, such as anxiety and depression, Allergy medications, Asthma medications, Steroid medicines called corticosteroids, Parkinson's disease drugs, Drugs for treating spasms or convulsions"", mayo_complications: ""Delirium may last only a few hours or as long as several weeks or months. If issues contributing to delirium are addressed, the recovery time is often shorter. The degree of recovery depends to some extent on the health and mental status before the onset of delirium. People with dementia, for example, may experience a significant overall decline in memory and thinking skills. People in better health are more likely to fully recover. People with other serious, chronic or terminal illnesses may not regain the levels of thinking skills or functioning that they had before the onset of delirium. Delirium in seriously ill people is also more likely to lead to: General decline in health, Poor recovery from surgery, Need for institutional care, Increased risk of death"", mayo_prevention: ""The most successful approach to preventing delirium is to target risk factors that might trigger an episode. Hospital environments present a special challenge frequent room changes, invasive procedures, loud noises, poor lighting, and lack of natural light and sleep can worsen confusion. Evidence indicates that certain strategies promoting good sleep habits, helping the person remain calm and well-oriented, and helping prevent medical problems or other complications can help prevent or reduce the severity of delirium."", mayo_risk_factors: ""Any condition that results in a hospital stay, especially in intensive care or after surgery, increases the risk of delirium, as does being a resident in a nursing home. Delirium is more common in older adults. Examples of other conditions that increase the risk of delirium include: Brain disorders such as dementia, stroke or Parkinson's disease, Previous delirium episodes, Visual or hearing impairment, The presence of multiple medical problems"", mayo_see_doc: ""When to see a doctor, If a relative, friend or someone in your care shows any signs or symptoms of delirium, see a doctor. Your input about the person's symptoms, typical thinking and everyday abilities will be important for a proper diagnosis and for finding the underlying cause. If you notice signs and symptoms of delirium in a person in a hospital or nursing home, report your concerns to the nursing staff or doctor rather than assuming that those problems have been observed. Older people recovering in the hospital or living in a long-term care facility are particularly at risk of delirium."", mayo_symptoms: ""Signs and symptoms of delirium usually begin over a few hours or a few days. They often fluctuate throughout the day, and there may be periods of no symptoms. Symptoms tend to be worse during the night when it's dark and things look less familiar. Primary signs and symptoms include those below. Reduced awareness of the environment, This may result in: An inability to stay focused on a topic or to switch topics, Getting stuck on an idea rather than responding to questions or conversation, Being easily distracted by unimportant things, Being withdrawn, with little or no activity or little response to the environment, Poor thinking skills, This may appear as: Poor memory, particularly of recent events, Disorientation for example, not knowing where you are or who you are, Difficulty speaking or recalling words, Rambling or nonsense speech, Trouble understanding speech, Difficulty reading or writing, Behavior changes, These may include: Seeing things that don't exist, Restlessness, agitation or combative behavior, Calling out, moaning or making other sounds, Being quiet and withdrawn especially in older adults, Slowed movement or lethargy, Disturbed sleep habits, Reversal of night-day sleep-wake cycle, Emotional disturbances, These may appear as: Anxiety, fear or paranoia, Depression, Irritability or anger, A sense of feeling elated, Apathy, Rapid and unpredictable mood shifts, Personality changes, Types of delirium, Experts have identified three types of delirium: Hyperactive delirium. Probably the most easily recognized type, this may include restlessness, agitation, rapid mood changes or hallucinations, and refusal to cooperate with care. Hypoactive delirium. This may include inactivity or reduced motor activity, sluggishness, abnormal drowsiness, or seeming to be in a daze. Mixed delirium. This includes both hyperactive and hypoactive signs and symptoms. The person may quickly switch back and forth from hyperactive to hypoactive states. Delirium and dementia, Dementia and delirium may be particularly difficult to distinguish, and a person may have both. In fact, delirium frequently occurs in people with dementia. But having episodes of delirium does not always mean a person has dementia. So a dementia assessment should not be done during a delirium episode because the results could be misleading. Dementia is the progressive decline of memory and other thinking skills due to the gradual dysfunction and loss of brain cells. The most common cause of dementia is Alzheimer's disease. Some differences between the symptoms of delirium and dementia include: Onset. The onset of delirium occurs within a short time, while dementia usually begins with relatively minor symptoms that gradually worsen over time. Attention. The ability to stay focused or maintain attention is significantly impaired with delirium. A person in the early stages of dementia remains generally alert. Fluctuation. The appearance of delirium symptoms can fluctuate significantly and frequently throughout the day. While people with dementia have better and worse times of day, their memory and thinking skills stay at a fairly constant level during the course of a day."", mondo_definition: ""A disorder characterized by confusion; inattentiveness; disorientation; illusions; hallucinations; agitation; and in some instances autonomic nervous system overactivity. It may result from toxic/metabolic conditions or structural brain lesions. (From Adams et al., Principles of Neurology, 6th ed, pp411-2)"", mondo_id: 45057, mondo_name: ""delirium"", node_name: ""inherited odontologic disease"", node_source: ""MONDO"", orphanet_clinical_description: __NULL__, orphanet_definition: __NULL__, orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: __NULL__, umls_description: ""A life-threatening disorder characterized by delirium, seizures, and neuromuscular changes.""})]","[(32378)-[:disease_disease@0{display_relation: ""parent-child""}]->(27158), (35449)-[:disease_disease@0{display_relation: ""parent-child""}]->(27158), (37924)-[:disease_disease@0{display_relation: ""parent-child""}]->(27158), (27261)-[:disease_disease@0{display_relation: ""parent-child""}]->(27158), (35636)-[:disease_disease@0{display_relation: ""parent-child""}]->(27158), (27314)-[:disease_disease@0{display_relation: ""parent-child""}]->(27158), (39331)-[:disease_disease@0{display_relation: ""parent-child""}]->(27158), (37565)-[:disease_disease@0{display_relation: ""parent-child""}]->(27158)]"


In [166]:
#look for the index from the node that has the node_id 95635
for i, node in enumerate(nodes):
    if node.node_id == '29657':
        print(i)
        break


12225


In [167]:
nodes[12225].relationships # why emptyy??? if on PrimeKG so many relationships??!!

{}

In [69]:
start_node = all_nodes[52029]
related_nodes_info = start_node.relationships.get(NodeRelationship.CHILD, [])

related_nodes = []
edges = []

for relation in related_nodes_info:
    # Find the related node by its node_id
    related_node = next((node for node in all_nodes if node.node_id == relation.node_id), None)
    if related_node:
        related_nodes.append(related_node)
        edges.append((start_node.node_id, related_node.node_id, relation.metadata))

subgraph_nodes = [start_node] + related_nodes  # Include the starting node and its neighbors


## Generate index

In [21]:
%ngql CREATE SPACE IF NOT EXISTS PrimeKG_manual(vid_type=FIXED_STRING(256), partition_num=20, replica_factor=1);

In [20]:
%ngql drop space PrimeKG_manual;

In [11]:
# PropertyGraphIndex requires NebulaPropertyGraphStore
graph_store = NebulaPropertyGraphStore(
    space= "PrimeKG_manual", 
    username = "root",
    password = "nebula",
    url = "nebula://localhost:9669", #changing id to STRING
    props_schema= "`id` STRING,`node_id` STRING, `node_name` STRING, `node_source` STRING, `mondo_id` STRING, `mondo_name` STRING, `group_id_bert` STRING, `group_name_bert` STRING, `mondo_definition` STRING, `umls_description` STRING, `orphanet_definition` STRING, `orphanet_prevalence` STRING, `orphanet_epidemiology` STRING, `orphanet_clinical_description` STRING, `orphanet_management_and_treatment` STRING, `mayo_symptoms` STRING, `mayo_causes` STRING, `mayo_risk_factors` STRING, `mayo_complications` STRING, `mayo_prevention` STRING, `mayo_see_doc` STRING, `display_relation` STRING, `description` STRING, `half_life` STRING, `indication` STRING, `mechanism_of_action` STRING, `protein_binding` STRING, `pharmacodynamics` STRING, `state` STRING, `atc_1` STRING, `atc_2` STRING, `atc_3` STRING, `atc_4` STRING, `category` STRING, `group` STRING, `pathway` STRING, `molecular_weight` STRING, `tpsa` STRING, `clogp` STRING, `_node_content` STRING,`_node_type` STRING,`document_id` STRING,`doc_id` STRING,`ref_doc_id` STRING, `triplet_source_id` STRING"
)

storage_context = StorageContext.from_defaults(property_graph_store=graph_store)

convert TextNode into IndexNode to insert the nodes into the index:

In [24]:
from llama_index.core.vector_stores.simple import SimpleVectorStore
from llama_index.core import PropertyGraphIndex, Settings
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import torch

vec_store = SimpleVectorStore()

In [25]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [26]:
# Create IndexNode objects from TextNode objects
index_nodes = [IndexNode.from_text_node(node, index_id="index_1") for node in subgraph_nodes]
len(index_nodes)


195

In [28]:
# this was intended to be for the bug but not really doing anything
from transformers import AutoTokenizer

# Load tokenizer and set pad_token_id explicitly
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token  # Set pad_token to eos_token if not defined

In [None]:
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.8, "do_sample": True},
    # system_prompt=system_prompt,
    # query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="meta-llama/Llama-3.2-3B",
    model_name="meta-llama/Llama-3.2-3B",
    device_map="auto",
    stopping_ids=[50278, 50279, 50277, 1, 0],
    tokenizer_kwargs={"max_length": 4096, "pad_token_id": tokenizer.pad_token_id},
    # uncomment this if using CUDA to reduce memory usage
    model_kwargs={"torch_dtype": torch.float16}
)

Settings.llm = llm
Settings.chunk_size = 1024
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-m3") 

In [30]:
index = PropertyGraphIndex.from_existing(
    llm=llm,
    vector_store=vec_store,
    property_graph_store=graph_store,
    show_progress=True,
    storage_context=storage_context,
    embed_kg_nodes=True
)

In [31]:
import nest_asyncio
nest_asyncio.apply()

# Ensure that node IDs are strings fitting the vertex ID length limit
for node in index_nodes:
	node.id_ = str(node.id_)

# Insert nodes into the index
index.insert_nodes(index_nodes)

Extracting paths from text:   0%|          | 0/195 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `p

save index to persist storage

In [32]:
index.storage_context.persist(persist_dir="~/scratch-llm/storage/PrimeKG_index_mid/")


In [12]:
nodes = graph_store.get_all_nodes()

In [13]:
nodes[0]

EntityNode(label='entity', embedding=None, properties={'triplet_source_id': '64175', 'id': 64175}, name='2077')