In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nebulagraph_lite import nebulagraph_let as ng_let
import os, math, torch, re, pickle
from tqdm.notebook import tqdm


from llama_index.core.schema import NodeRelationship, TextNode, RelatedNodeInfo
from llama_index.graph_stores.nebula import NebulaPropertyGraphStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

from typing import Sequence, List
from llama_index.core.schema import BaseNode, MetadataMode
from llama_index.core.embeddings.utils import resolve_embed_model
from llama_index.core.settings import Settings

In [2]:
# load NebulaGraph JupyterNotebook extension
# !udocker pull vesoft/nebula-metad:v3
# !udocker create --name=nebula-metad vesoft/nebula-metad:v3
# !udocker setup --execmode=F1 nebula-metad
# !udocker pull vesoft/nebula-graphd:v3
# !udocker create --name=nebula-graphd vesoft/nebula-graphd:v3
# !udocker setup --execmode=F1 nebula-graphd
# !udocker pull vesoft/nebula-storaged:v3
# !udocker create --name=nebula-storaged vesoft/nebula-storaged:v3
# !udocker setup --execmode=F1 nebula-storaged

# Create the nebulagraph-lite launcher and start the local NebulaGraph services.
n = ng_let(in_container=True)
n.start() # This takes around 5 mins


[1;3;38;2;47;75;124mMessage: Activating storaged...[0m
[1;3;38;2;102;81;145mResult of `SHOW HOSTS`:[0m
[1;3;38;2;47;75;124m    errors:[0m
[1;3;38;2;47;75;124m        code: 0[0m
[1;3;38;2;102;81;145m    results:[0m
[1;3;38;2;47;75;124m        spaceName: [0m
[1;3;38;2;102;81;145m        data:[0m
[1;3;38;2;47;75;124m            meta:[0m
[1;3;38;2;47;75;124m                None, None, None, None, None, None, None[0m
[1;3;38;2;102;81;145m            row:[0m
[1;3;38;2;102;81;145m                127.0.0.1, 9779, ONLINE, 201, PrimeKG:100, PrimeKG_nebula:100, basketballplayer:1, PrimeKG:100, PrimeKG_nebula:100, basketballplayer:1, 3.8.0[0m
[1;3;38;2;160;81;149m        columns:[0m
[1;3;38;2;160;81;149m            Host, Port, Status, Leader count, Leader distribution, Partition distribution, Version[0m
[1;3;38;2;212;80;135m        errors:[0m
[1;3;38;2;47;75;124m            code: 0[0m
[1;3;38;2;249;93;106m        latencyInUs: 958[0m
[1;3;38;2;168;255;159mInfo: load

In [3]:
%reload_ext ngql
%ngql --address 127.0.0.1 --port 9669 --user root --password nebula


[1;3;38;2;0;135;107m[OK] Connection Pool Created[0m


Unnamed: 0,Name
0,PrimeKG
1,PrimeKG_nebula
2,basketballplayer


# Create the graph from NebulaGraph directly

In [12]:
%ngql CREATE SPACE IF NOT EXISTS PrimeKG_nebula(vid_type=FIXED_STRING(256));
%ngql USE PrimeKG_nebula;

In [5]:
#primekg = pd.read_csv("~/scratch-llm/data/PrimeKG_data/raw_data/kg.csv", low_memory=False)
# Raw PrimeKG node table: comma-separated, double-quoted fields, backslash as escape character.
nodes_csv_options = dict(sep=',', quotechar='"', escapechar='\\', low_memory=False)
nodes = pd.read_csv("~/scratch-llm/data/PrimeKG_data/raw_data/nodes.csv", **nodes_csv_options)

# Raw PrimeKG edge table, read with the pandas defaults.
edges = pd.read_csv("~/scratch-llm/data/PrimeKG_data/raw_data/edges.csv")

## Nodes

In [None]:
print(nodes['node_type'].unique()) #types of nodes

In [6]:
# Write one CSV per 'node_type'; '/' in a type name is replaced by '_' to get a usable file name.
# groupby(sort=False, dropna=False) visits the types in order of first appearance.
for node_type, type_rows in nodes.groupby('node_type', sort=False, dropna=False):
    sanitized_node_type = node_type.replace('/', '_')
    output_path = os.path.join('~/scratch-llm/data/PrimeKG_data/sub_data/', 'node_'+ sanitized_node_type + '.csv')
    type_rows.to_csv(output_path, index=False)

In [19]:
# Create Tags (node_type) for each node in the PrimeKG
%ngql CREATE TAG IF NOT EXISTS anatomy(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS gene_protein(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS drug(node_name string, node_source string, node_id string, description string, half_life string, indication string, mechanism_of_action string, protein_binding string, pharmacodynamics string, state string, atc_1 string, atc_2 string, atc_3 string, atc_4 string, category string, group string, pathway string, molecular_weight string, tpsa string, clogp string);
%ngql CREATE TAG IF NOT EXISTS disease(node_name string, node_source string, mondo_id int, mondo_name string, group_id_bert string, group_name_bert string, mondo_definition string, umls_description string, orphanet_definition string, orphanet_prevalence string, orphanet_epidemiology string, orphanet_clinical_description string, orphanet_management_and_treatment string, mayo_symptoms string, mayo_causes string, mayo_risk_factors string, mayo_complications string, mayo_prevention string, mayo_see_doc string);
%ngql CREATE TAG IF NOT EXISTS pathway(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS biological_process(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS effect_phenotype(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS molecular_function(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS cellular_component(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS exposure(node_name string, node_source string, node_id string);

In [None]:
# Load each node source data into the corresponding Tag (working fine)
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_gene_protein.csv --tag gene_protein --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_biological_process.csv --tag biological_process --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_effect_phenotype.csv --tag effect_phenotype --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_molecular_function.csv --tag molecular_function --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_cellular_component.csv --tag cellular_component --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_pathway.csv --tag pathway --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_exposure.csv --tag exposure --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_anatomy.csv --tag anatomy --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source


### Extra information for nodes 'drug' and 'disease'

In [None]:
# The disease and drug feature files are distributed as tab-separated .tab files, so convert them to .csv.
# Both files had many formatting issues that were corrected manually before merging with the 'node_' dataset.
disease_tab = "~/scratch-llm/data/PrimeKG_data/raw_data/disease_features.tab"
disease_csv = "~/scratch-llm/data/PrimeKG_data/sub_data/disease_features.csv"
drug_tab = "~/scratch-llm/data/PrimeKG_data/raw_data/drug_features.tab"
drug_csv = "~/scratch-llm/data/PrimeKG_data/sub_data/drug_features.csv"

# Same conversion for both files: read tab-delimited, write comma-delimited without the index column.
for tab_path, csv_path in ((disease_tab, disease_csv), (drug_tab, drug_csv)):
    df = pd.read_csv(tab_path, delimiter='\t')
    df.to_csv(csv_path, index=False)

In [10]:
# merge extra drug and disease information
# Load the per-type node tables and the converted feature tables used by the merge cells below.
# Note: in the visible notebook the disease merge is built from `features_disease2`
# (the manually fixed file), not from `features_disease` loaded here.
node_drug = pd.read_csv("~/scratch-llm/data/PrimeKG_data/sub_data/node_drug.csv")
features_drug = pd.read_csv("~/scratch-llm/data/PrimeKG_data/sub_data/drug_features.csv")
node_disease = pd.read_csv("~/scratch-llm/data/PrimeKG_data/sub_data/node_disease.csv")
features_disease = pd.read_csv("~/scratch-llm/data/PrimeKG_data/sub_data/disease_features.csv")


In [37]:
node_disease = pd.read_csv("~/scratch-llm/data/PrimeKG_data/sub_data/node_disease.csv")


In [35]:
features_disease2 = pd.read_csv("~/scratch-llm/data/knowledge_graph/disease_features_fixed.csv")

In [45]:
# the disease_features has multiple rows with the same 'node_index' so we need to merge them
def merge_column_values(column):
    """Collapse all values of one grouped column into a single string.

    Intended as the aggregation callback of ``groupby(...).agg(...)`` when
    several rows share the same key.

    Args:
        column: pandas Series holding the values of one column for one group.

    Returns:
        str: if every value parses as a number, the distinct numbers sorted
        ascending and joined with ``'_'``. Otherwise the distinct, non-missing,
        non-blank values joined with ``'; '`` in order of first appearance.
    """
    try:
        numeric_column = pd.to_numeric(column, errors='coerce')
        if numeric_column.notna().all():
            # All values are numeric: keep every distinct number, not just the first one.
            return '_'.join(map(str, sorted(set(numeric_column))))
    except (TypeError, ValueError):
        # Values that cannot even be coerced fall through to the text handling below.
        pass

    # dict.fromkeys de-duplicates while keeping first-seen order, so the result is
    # reproducible between runs (iterating a set of strings is not).
    distinct_values = dict.fromkeys(
        value for value in column
        if pd.notna(value) and str(value).strip() != ''
    )
    # str() lets mixed text/number columns be joined instead of raising TypeError.
    return '; '.join(map(str, distinct_values))

merged_features_disease = features_disease2.groupby('node_index', as_index=False).agg(merge_column_values)

In [46]:
# NOTE(review): this joins the *disease* nodes with the merged disease features but writes the
# result to "node_merged_drug2.csv"; a later cell writes the same join to node_merged_disease.csv.
# Looks like a scratch export — confirm the file name is intended.
merged_df = pd.merge(node_disease, merged_features_disease, on='node_index')
merged_df.to_csv("~/scratch-llm/data/node_merged_drug2.csv", index=False)

In [None]:
# Merge the dataframes on 'node_index' and save to csv
# NOTE(review): pd.merge defaults to an inner join, so any node without a matching features row
# is left out of the merged file — confirm that is intended, since these merged files are the
# drug/disease inputs of the node-loading steps further down.
merged_df = pd.merge(node_drug, features_drug, on='node_index')
merged_df.to_csv("~/scratch-llm/data/PrimeKG_data/sub_data/node_merged_drug.csv", index=False)

# Same join for diseases, using the feature rows already collapsed to one row per 'node_index'.
merged_df = pd.merge(node_disease, merged_features_disease, on='node_index')
# merged_df.fillna("", inplace=True) # in case included 'extra' diseases that are only present in features_disease
merged_df.to_csv("~/scratch-llm/data/PrimeKG_data/sub_data/node_merged_disease.csv", index=False)

In [None]:
#Load the extended drug and disease data into the corresponding Tags
# NOTE: formatting problems for node_merged_disease.csv
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_merged_disease.csv --tag disease --header --space PrimeKG --batch 50 --vid 0 --props 3:node_name,4:node_source,5:mondo_id,6:mondo_name,7:group_id_bert,8:group_name_bert,9:mondo_definition,10:umls_description,11:orphanet_definition,12:orphanet_prevalence,13:orphanet_epidemiology,14:orphanet_clinical_description,15:orphanet_management_and_treatment,16:mayo_symptoms,17:mayo_causes,18:mayo_risk_factors,19:mayo_complications,20:mayo_prevention,21:mayo_see_doc


loading diseases without extra info:

In [50]:
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_disease.csv --tag disease --header --space PrimeKG --batch 50 --vid 0 --props 3:node_name,4:node_source

[1;3;38;2;0;120;215m[INFO] Parsed 17080 vertices 'PrimeKG' for tag 'disease' in memory[0m


Loading Vertices:   0%|          | 0/342 [00:00<?, ?it/s]

Loaded 50 of 17080 vertices
Loaded 100 of 17080 vertices
Loaded 150 of 17080 vertices
Loaded 200 of 17080 vertices
Loaded 250 of 17080 vertices
Loaded 300 of 17080 vertices
Loaded 350 of 17080 vertices
Loaded 400 of 17080 vertices
Loaded 450 of 17080 vertices
Loaded 500 of 17080 vertices
Loaded 550 of 17080 vertices
Loaded 600 of 17080 vertices
Loaded 650 of 17080 vertices
Loaded 700 of 17080 vertices
Loaded 750 of 17080 vertices
Loaded 800 of 17080 vertices
Loaded 850 of 17080 vertices
Loaded 900 of 17080 vertices
Loaded 950 of 17080 vertices
Loaded 1000 of 17080 vertices
Loaded 1050 of 17080 vertices
Loaded 1100 of 17080 vertices
Loaded 1150 of 17080 vertices
Loaded 1200 of 17080 vertices
Loaded 1250 of 17080 vertices
Loaded 1300 of 17080 vertices
Loaded 1350 of 17080 vertices
Loaded 1400 of 17080 vertices
Loaded 1450 of 17080 vertices
Loaded 1500 of 17080 vertices
Loaded 1550 of 17080 vertices
Loaded 1600 of 17080 vertices
Loaded 1650 of 17080 vertices
Loaded 1700 of 17080 vertices


In [None]:
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_merged_drug.csv --tag drug --header --space PrimeKG --batch 100 --vid 0 --props 2:node_id,3:node_name,4:node_source,5:description,6:half_life,7:indication,8:mechanism_of_action,9:protein_binding,10:pharmacodynamics,11:state,12:atc_1,13:atc_2,14:atc_3,15:atc_4,16:category,17:group,18:pathway,19:molecular_weight,20:tpsa,21:clogp


## Edges

In [None]:
print(edges['relation'].unique()) # types of edges

In [8]:
# save the data for each 'relation' (edge type) in a separate file
# Every '-' and every whitespace character (space, tab, ...) in a relation name becomes '_', so a
# name such as 'off-label use' maps to 'edge_off_label_use.csv', the file name the load cells expect.
# groupby(sort=False, dropna=False) visits the relations in order of first appearance and avoids
# re-scanning the whole edge table once per relation.
for relation, relation_rows in edges.groupby('relation', sort=False, dropna=False):
    sanitized_relation = re.sub(r'[-\s]', '_', relation)
    output_path = os.path.join('~/scratch-llm/data/PrimeKG_data/sub_data/', 'edge_'+ sanitized_relation + '.csv')
    relation_rows.to_csv(output_path, index=False)

In [52]:
%ngql CREATE EDGE IF NOT EXISTS protein_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS drug_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS contraindication(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS indication(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS off_label_use(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS drug_drug(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS phenotype_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS phenotype_phenotype(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS disease_phenotype_negative(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS disease_phenotype_positive(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS disease_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS disease_disease(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS drug_effect(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS bioprocess_bioprocess(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS molfunc_molfunc(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS cellcomp_cellcomp(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS molfunc_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS cellcomp_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS bioprocess_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_disease(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_exposure(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_bioprocess(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_molfunc(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_cellcomp(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS pathway_pathway(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS pathway_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS anatomy_anatomy(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS anatomy_protein_present(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS anatomy_protein_absent(display_relation string);

In [None]:
# for each edge type, load the data accordingly, this takes like 20mins from new
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_protein_protein.csv --space PrimeKG --batch 100 --header --edge protein_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_drug_protein.csv --space PrimeKG --header --batch 100 --edge drug_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_contraindication.csv --space PrimeKG --header --batch 100 --edge contraindication --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_indication.csv --space PrimeKG --header --batch 100 --edge indication --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_off_label_use.csv --space PrimeKG --header --batch 100 --edge off_label_use --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_drug_drug.csv --space PrimeKG --header --batch 100 --edge drug_drug --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_phenotype_protein.csv --space PrimeKG --header --batch 100 --edge phenotype_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_phenotype_phenotype.csv --space PrimeKG --header --batch 100 --edge phenotype_phenotype --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_phenotype_negative.csv --space PrimeKG --header --batch 100 --edge disease_phenotype_negative --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_phenotype_positive.csv --space PrimeKG --header --batch 100 --edge disease_phenotype_positive --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_protein.csv --space PrimeKG --header --batch 100 --edge disease_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_disease.csv --space PrimeKG --header --batch 100 --edge disease_disease --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_drug_effect.csv --space PrimeKG --header --batch 100 --edge drug_effect --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_bioprocess_bioprocess.csv --space PrimeKG --header --batch 100 --edge bioprocess_bioprocess --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_molfunc_molfunc.csv --space PrimeKG --header --batch 100 --edge molfunc_molfunc --src 2 --dst 3 --props 1:display_relation


In [None]:
# load the edge data in two batches
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_cellcomp_cellcomp.csv --space PrimeKG --header --batch 100 --edge cellcomp_cellcomp --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_molfunc_protein.csv --space PrimeKG --header --batch 100 --edge molfunc_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_cellcomp_protein.csv --space PrimeKG --header --batch 100 --edge cellcomp_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_bioprocess_protein.csv --space PrimeKG --header --batch 100 --edge bioprocess_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_protein.csv --space PrimeKG --header --batch 100 --edge exposure_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_disease.csv --space PrimeKG --header --batch 100 --edge exposure_disease --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_exposure.csv --space PrimeKG --header --batch 100 --edge exposure_exposure --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_bioprocess.csv --space PrimeKG --header --batch 100 --edge exposure_bioprocess --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_molfunc.csv --space PrimeKG --header --batch 100 --edge exposure_molfunc --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_cellcomp.csv --space PrimeKG --header --batch 100 --edge exposure_cellcomp --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_pathway_pathway.csv --space PrimeKG --header --batch 100 --edge pathway_pathway --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_pathway_protein.csv --space PrimeKG --header --batch 100 --edge pathway_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_anatomy_anatomy.csv --space PrimeKG --header --batch 100 --edge anatomy_anatomy --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_anatomy_protein_present.csv --space PrimeKG --batch 100 --header --edge anatomy_protein_present --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_anatomy_protein_absent.csv --space PrimeKG --header --batch 100 --edge anatomy_protein_absent --src 2 --dst 3 --props 1:display_relation


# Load directly to NebulaPropertyGraphStore

define files to use:

In [5]:
data_dir = '~/scratch-llm/data/PrimeKG_data/sub_data/'

# List of node types and their corresponding CSV files
node_files = {
    'node_gene_protein.csv',
    'node_merged_drug.csv', # manually corrected and generated from "extra information section"
    'node_effect_phenotype.csv',
    'node_merged_disease.csv', # manually corrected and generated from "extra information section"
    'node_biological_process.csv',
    'node_molecular_function.csv',
    'node_cellular_component.csv',
    'node_exposure.csv',
    'node_pathway.csv',
    'node_anatomy.csv'
}

edge_files ={
    'edge_protein_protein.csv',
    'edge_drug_protein.csv',
    'edge_contraindication.csv',
    'edge_indication.csv',
    'edge_off_label_use.csv',
    'edge_drug_drug.csv',
    'edge_phenotype_protein.csv',
    'edge_phenotype_phenotype.csv',
    'edge_disease_phenotype_negative.csv',
    'edge_disease_phenotype_positive.csv',
    'edge_disease_protein.csv',
    'edge_disease_disease.csv',
    'edge_drug_effect.csv',
    'edge_bioprocess_bioprocess.csv',
    'edge_molfunc_molfunc.csv',
    'edge_cellcomp_cellcomp.csv',
    'edge_molfunc_protein.csv',
    'edge_cellcomp_protein.csv',
    'edge_bioprocess_protein.csv',
    'edge_exposure_protein.csv',
    'edge_exposure_disease.csv',
    'edge_exposure_exposure.csv',
    'edge_exposure_bioprocess.csv',
    'edge_exposure_molfunc.csv',
    'edge_exposure_cellcomp.csv',
    'edge_pathway_pathway.csv',
    'edge_pathway_protein.csv',
    'edge_anatomy_anatomy.csv',
    'edge_anatomy_protein_present.csv',
    'edge_anatomy_protein_absent.csv'
}

## Nodes

### create TextNodes:

In [None]:
# create TextNodes
def create_text_nodes(file_path):
    """Build one ``TextNode`` per row of a node CSV file.

    Files with more than five columns are treated as "rich" files (drug/disease):
    the known descriptive columns are concatenated into the node text and the known
    identifying columns become metadata. All other files produce nodes without text
    whose metadata is the whole row.

    Args:
        file_path: path of the node CSV; it must contain a 'node_index' column.

    Returns:
        list of TextNode, one per row, with ``id_`` set to the row's 'node_index' as a string.
    """
    metadata_columns = ['node_index', 'node_id', 'node_type', 'node_name',
                        'node_source', 'mondo_id', 'mondo_name', 'group_id_bert',
                        'group_name_bert', 'orphanet_prevalence']

    text_columns = ['mondo_definition', 'umls_description', 'orphanet_definition',
                    'orphanet_clinical_description', 'orphanet_management_and_treatment','orphanet_epidemiology',
                    'mayo_symptoms', 'mayo_causes', 'mayo_risk_factors', 'mayo_complications',
                    'mayo_prevention', 'mayo_see_doc','description', 'indication', 'mechanism_of_action',
                    'half_life', 'protein_binding', 'pharmacodynamics', 'state', 'atc_1', 'atc_2', 'atc_3',
                    'atc_4', 'category', 'group', 'pathway', 'molecular_weight', 'tpsa', 'clogp']

    df = pd.read_csv(file_path)

    # These depend only on the file's columns, so compute them once instead of once per row.
    available_metadata_columns = [col for col in metadata_columns if col in df.columns]
    available_text_columns = [col for col in text_columns if col in df.columns]
    has_text_columns = len(df.columns) > 5  # longer text info (drug/disease)

    text_nodes = []
    for _, row in df.iterrows():
        node_index = str(row['node_index'])

        if has_text_columns:
            # Metadata: skip empty (NaN) entries and store everything as strings.
            metadata = {
                col: str(row[col]) for col in available_metadata_columns
                if not (isinstance(row[col], float) and math.isnan(row[col]))
            }
            # Text: non-empty descriptive values, one per line.
            text = "\n".join(str(row[col]) for col in available_text_columns if pd.notna(row[col]))
            text_nodes.append(TextNode(id_=node_index, text=text, metadata=metadata))
        else:
            # No text data: the whole row becomes string metadata (NaN is stored as 'nan').
            metadata = {k: str(v) for k, v in row.to_dict().items()}
            text_nodes.append(TextNode(id_=node_index, metadata=metadata))

    return text_nodes

# generate all nodes from the node files
# node_files is a set, so sort it to get the same node order on every run. The per-file list has
# its own name so the `nodes` DataFrame loaded at the top of the notebook is not overwritten.
all_nodes = []
for file in sorted(node_files):
    file_text_nodes = create_text_nodes(os.path.join(data_dir, file))
    all_nodes.extend(file_text_nodes)


### create Entity and ChunkNodes:

In [18]:
# Convert TextNode objects to the format expected by NebulaPropertyGraphStore
from llama_index.core.graph_stores.types import EntityNode, ChunkNode
all_entity_nodes = []

def create_entity_nodes(file_path, label):
    """Build one ``ChunkNode`` per row of a node CSV file.

    The known identifying columns present in the file become the node properties
    (empty values skipped, everything stored as strings). The known descriptive
    columns present in the file are joined with single spaces into the node text,
    which is the empty string when the row has no descriptive values.

    Args:
        file_path: path of the node CSV; it must contain a 'node_index' column.
        label: label given to every node created from this file.

    Returns:
        list of ChunkNode, one per row, with ``id_`` set to the row's 'node_index' as a string.
    """
    metadata_columns = ['node_index', 'node_id', 'node_type', 'node_name',
                        'node_source', 'mondo_id', 'mondo_name', 'group_id_bert',
                        'group_name_bert', 'orphanet_prevalence']

    text_columns = ['mondo_definition', 'umls_description', 'orphanet_definition',
                    'orphanet_clinical_description', 'orphanet_management_and_treatment','orphanet_epidemiology',
                    'mayo_symptoms', 'mayo_causes', 'mayo_risk_factors', 'mayo_complications',
                    'mayo_prevention', 'mayo_see_doc','description', 'indication', 'mechanism_of_action',
                    'half_life', 'protein_binding', 'pharmacodynamics', 'state', 'atc_1', 'atc_2', 'atc_3',
                    'atc_4', 'category', 'group', 'pathway', 'molecular_weight', 'tpsa', 'clogp']

    df = pd.read_csv(file_path)

    # These depend only on the file's columns, so compute them once instead of once per row.
    available_metadata_columns = [col for col in metadata_columns if col in df.columns]
    available_text_columns = [col for col in text_columns if col in df.columns]

    nodes = []
    for _, row in df.iterrows():
        # Properties: skip empty (NaN) entries and store everything as strings.
        properties = {
            col: str(row[col]) for col in available_metadata_columns
            if not (isinstance(row[col], float) and math.isnan(row[col]))
        }

        # " ".join() always returns a string ('' when nothing is available), so a single
        # construction path covers rows with and without text.
        text = " ".join(str(row[col]) for col in available_text_columns if pd.notna(row[col]))

        nodes.append(ChunkNode(id_=str(row['node_index']), label=label, properties=properties, text=text))

    return nodes


In [15]:
# Start from an empty list in this cell so that re-running it does not append duplicate nodes.
all_entity_nodes = []
# node_files is a set, so sort it to get the same node order on every run.
for file in sorted(node_files):
    # remove 'node_' and '.csv' and merged_ from the label,
    # e.g. 'node_merged_drug.csv' -> 'drug', 'node_gene_protein.csv' -> 'gene_protein'
    label = file.replace('node_', '').replace('.csv', '').replace('merged_', '')
    # Separate name so the `nodes` DataFrame loaded at the top of the notebook is not overwritten.
    file_entity_nodes = create_entity_nodes(os.path.join(data_dir, file), label)
    all_entity_nodes.extend(file_entity_nodes)
    

In [None]:
#save to pickle
# Written under storage/nodes/ because that is where the graph-store section of this notebook
# reads all_entity_nodes.pkl from; the directory is created if it does not exist yet.
entity_nodes_path = os.path.expanduser('~/scratch-llm/storage/nodes/all_entity_nodes.pkl')
os.makedirs(os.path.dirname(entity_nodes_path), exist_ok=True)
with open(entity_nodes_path, 'wb') as f:
    pickle.dump(all_entity_nodes, f)

## Edges

### append edges to TextNodes:

In [None]:
# create all edges

def create_all_edges(file_path, all_nodes):
    """Attach every edge of one edge CSV to its source node as a CHILD relationship.

    For each row, the node whose id equals the row's 'x_index' receives a
    ``RelatedNodeInfo`` pointing at 'y_index'; the remaining columns of the row are
    stored as the relationship metadata. Rows whose 'x_index' matches no node are
    ignored. The nodes are modified in place and nothing is returned.

    Args:
        file_path: path of the edge CSV (needs 'x_index' and 'y_index' columns).
        all_nodes: nodes to update, looked up by their ``id_``.
    """
    # id -> node lookup so each edge is resolved without scanning the node list
    nodes_by_id = {node.id_: node for node in all_nodes}
    edge_table = pd.read_csv(file_path)

    progress = tqdm(edge_table.iterrows(), total=len(edge_table), leave=False, desc=f"Processing {file_path}")
    for _, edge_row in progress:
        source_id = str(edge_row['x_index'])
        target_id = str(edge_row['y_index'])

        if source_id not in nodes_by_id:
            continue

        # Create the CHILD list on first use, then reuse it for later edges of the same node.
        children = nodes_by_id[source_id].relationships.setdefault(NodeRelationship.CHILD, [])

        edge_metadata = edge_row.drop(['x_index', 'y_index']).to_dict()
        children.append(RelatedNodeInfo(node_id=target_id, metadata=edge_metadata))


In [None]:
# Attach the edges of every edge file to the nodes in `all_nodes`; takes around 20 mins.
edge_file_progress = tqdm(edge_files, desc='Procesing edge files')
for file in edge_file_progress:
    edge_file_path = data_dir + file
    create_all_edges(edge_file_path, all_nodes)

Save the nodes (with their attached relationships) to pickle:

In [None]:
# save `all_nodes` with pickle
# Written under storage/nodes/ because that is where the embedding section of this notebook
# reads all_nodes.pkl from; the directory is created if it does not exist yet.
all_nodes_path = os.path.expanduser('~/scratch-llm/storage/nodes/all_nodes.pkl')
os.makedirs(os.path.dirname(all_nodes_path), exist_ok=True)
with open(all_nodes_path, 'wb') as f:
    pickle.dump(all_nodes, f)

### create Relations for Entity/ChunkNodes:

In [None]:
from llama_index.core.graph_stores.types import Relation

def create_realtions(file_path, label):
    """Build one ``Relation`` per row of an edge CSV file.

    Args:
        file_path: path of the edge CSV; needs 'relation', 'x_index' and 'y_index' columns.
        label: label given to every relation created from this file.

    Returns:
        list of Relation with ``source_id`` / ``target_id`` taken from 'x_index' / 'y_index'
        (as strings) and the remaining columns of the row as ``properties``.
    """
    df = pd.read_csv(file_path)
    relations = []
    for _, row in tqdm(df.iterrows(), total=len(df), leave=False, desc=f"Processing {file_path}"):
        # The endpoints and the relation name are carried by source_id/target_id/label,
        # so only the other columns are kept as properties.
        properties = row.drop(['relation', 'x_index', 'y_index']).to_dict()
        relations.append(
            Relation(
                source_id=str(row['x_index']),
                target_id=str(row['y_index']),
                label=label,
                properties=properties,
            )
        )
    return relations


# Correctly spelled alias; the original (misspelled) name stays available for existing callers.
create_relations = create_realtions



In [None]:
# Collect the relations of every edge file into one flat list.
all_relations = []
for file in tqdm(edge_files, desc='Procesing edge files'): #35 mins
    # Drop the 5-character 'edge_' prefix and the 4-character '.csv' suffix, then use '-' instead of '_'.
    relation_label = file[len('edge_'):-len('.csv')].replace('_', '-')
    relations = create_realtions(data_dir+file, relation_label)
    all_relations.extend(relations)

In [None]:
# Persist `all_relations` so the graph-store section can reload it without rebuilding.
relations_pickle_path = os.path.expanduser('~/scratch-llm/storage/all_relations.pkl')
with open(relations_pickle_path, 'wb') as f:
    pickle.dump(all_relations, f)


# Generate node embeddings manually

Embedding models

In [20]:
# Embedding model identifiers; each one is later passed as `model_name` to HuggingFaceEmbedding.
# This is a set literal, so the order in which the models are processed is not guaranteed.
model_names = {"BAAI/bge-small-en-v1.5",
                "all-mpnet-base-v2",
                "BAAI/bge-m3",
                "all-MiniLM-L6-v2",  
                "all-MiniLM-L12-v2"}

## Embeddings for TextNodes:

In [22]:

def embed_nodes(nodes: Sequence[BaseNode], embed_model=None, use_async=True, show_progress=False) -> Sequence[BaseNode]:
    """Compute an embedding for every node and store it on ``node.embedding``.

    The embedded text is the node's 'node_name' metadata value, followed by
    ``": <content>"`` when the node has non-blank content (newlines in the content
    are collapsed to single spaces).

    Args:
        nodes: nodes to embed; each needs a 'node_name' entry in its metadata.
        embed_model: embedding model (anything ``resolve_embed_model`` accepts);
            ``Settings.embed_model`` is used when None.
        use_async: use the model's async batch API (run to completion before
            returning) instead of the synchronous one.
        show_progress: forwarded to the embedding call.

    Returns:
        The same ``nodes`` sequence, modified in place.

    Raises:
        KeyError: if a node has no 'node_name' metadata entry.
    """
    if embed_model is None:
        embed_model = Settings.embed_model  # Use default embed model from settings

    embed_model = resolve_embed_model(embed_model)

    node_texts = []
    for node in nodes:
        node_text = node.metadata["node_name"]

        # Add text content if it exists
        content = node.get_content(metadata_mode=MetadataMode.NONE)
        if content and content.strip():
            cleaned_content = re.sub(r'\n+', ' ', content)
            node_text += f": {cleaned_content}"

        node_texts.append(node_text)
    print(f"Embedding {len(node_texts)} nodes")

    if use_async:
        # aget_text_embedding_batch is a coroutine function: calling it only creates a coroutine
        # object, which cannot be zipped with the nodes. Run it to completion first.
        # NOTE(review): inside an already-running event loop (e.g. a notebook) some llama-index
        # versions may additionally need nest_asyncio — verify against the installed version.
        from llama_index.core.async_utils import asyncio_run

        embeddings = asyncio_run(
            embed_model.aget_text_embedding_batch(node_texts, show_progress=show_progress)
        )
    else:
        embeddings = embed_model.get_text_embedding_batch(node_texts, show_progress=show_progress)

    for node, embedding in zip(nodes, embeddings):
        node.embedding = embedding

    return nodes



In [None]:
# load all_nodes from pickle
# NOTE: unpickling can execute arbitrary code, so only load pickle files you produced yourself.
with open(os.path.expanduser('~/scratch-llm/storage/nodes/all_nodes.pkl'), 'rb') as f:
    all_nodes = pickle.load(f)

In [None]:
# Embed the same node list once per model and pickle the result after each model.
# embed_nodes returns the list it was given, so `all_nodes_embedded` is `all_nodes` with the
# embeddings of the current model; each iteration overwrites the previous model's embeddings.
for name in model_names:
    print(f"Embedding nodes with model: {name}")
    Settings.embed_model = HuggingFaceEmbedding(model_name=name)
    all_nodes_embedded = embed_nodes(all_nodes, embed_model=Settings.embed_model, use_async=False, show_progress=True)

    embedded_nodes_path = os.path.expanduser(f'~/scratch-llm/storage/nodes/all_nodes_{name}.pkl')
    # Model ids such as "BAAI/bge-m3" contain '/', which makes part of the id a sub-directory of
    # the output path. Create it so open() does not fail after the expensive embedding step.
    os.makedirs(os.path.dirname(embedded_nodes_path), exist_ok=True)
    with open(embedded_nodes_path, 'wb') as f:
        pickle.dump(all_nodes_embedded, f)
    

# Graph store: upsert nodes

Load data and NebulaPropertyGraphStore:

In [None]:
# Reload the pickled ChunkNodes built earlier in this notebook.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you produced yourself.
with open(os.path.expanduser('~/scratch-llm/storage/nodes/all_entity_nodes.pkl'), 'rb') as f:
    all_entity_nodes = pickle.load(f)

#load relations with pickle
with open(os.path.expanduser('~/scratch-llm/storage/all_relations.pkl'), 'rb') as f:
    all_relations = pickle.load(f)

# PropertyGraphIndex requires NebulaPropertyGraphStore
# Connects to the local graphd endpoint and the "PrimeKG" space; `props_schema` declares every
# property as STRING.
# NOTE(review): user name and password are hard-coded — presumably the local default
# credentials; consider reading them from the environment before sharing this notebook.
graph_store = NebulaPropertyGraphStore(
    space= "PrimeKG", 
    username = "root",
    password = "nebula",
    url = "nebula://localhost:9669",
    props_schema= "`node_index` STRING, `node_type` STRING, `node_id` STRING, `node_name` STRING, `node_source` STRING, `mondo_id` STRING, `mondo_name` STRING, `group_id_bert` STRING, `group_name_bert` STRING, `orphanet_prevalence` STRING, `umls_description` STRING, `orphanet_definition` STRING, `orphanet_epidemiology` STRING, `orphanet_clinical_description` STRING, `orphanet_management_and_treatment` STRING, `mayo_symptoms` STRING, `mayo_causes` STRING, `mayo_risk_factors` STRING, `mayo_complications` STRING, `mayo_prevention` STRING, `mayo_see_doc` STRING, `display_relation` STRING, `_node_content` STRING, `_node_type` STRING, `document_id` STRING, `doc_id` STRING, `ref_doc_id` STRING, `triplet_source_id` STRING",
)


upsert Entity/ChunkNodes (nodes) and Relations (edges) into the NebulaPropertyGraphStore:

In [None]:
# Coerce every node's 'node_id' property to a string before upserting.
for entity_node in all_entity_nodes:
    entity_node.properties['node_id'] = str(entity_node.properties['node_id'])

# upsert_nodes has no batch_size parameter, so send the nodes in slices of 1000 (about 4 mins).
node_batch_starts = range(0, len(all_entity_nodes), 1000)
for batch_start in tqdm(node_batch_starts, desc='Upserting nodes'):
    node_batch = all_entity_nodes[batch_start:batch_start + 1000]
    graph_store.upsert_nodes(node_batch)


In [None]:
# Upsert the relations (edges) in slices of 1000; about 70 mins for the full graph.
relation_batch_starts = range(0, len(all_relations), 1000)
for batch_start in tqdm(relation_batch_starts, desc='Upserting edges'):
    relation_batch = all_relations[batch_start:batch_start + 1000]
    graph_store.upsert_relations(relation_batch)
