In [2]:
import pandas as pd
from nebulagraph_lite import nebulagraph_let as ng_let
import os, math

# Data loading


In [None]:
#primekg = pd.read_csv("~/scratch-llm/data/PrimeKG_data/raw_data/kg.csv", low_memory=False)
nodes = pd.read_csv("~/scratch-llm/data/PrimeKG/datasets/data/kg/nodes.csv", low_memory=False)
edges = pd.read_csv("~/scratch-llm/data/PrimeKG/datasets/data/kg/edges.csv")

In [2]:
# load NebulaGraph JupyterNotebook extension
# !udocker pull vesoft/nebula-metad:v3
# !udocker create --name=nebula-metad vesoft/nebula-metad:v3
# !udocker setup --execmode=F1 nebula-metad

# !udocker pull vesoft/nebula-graphd:v3
# !udocker create --name=nebula-graphd vesoft/nebula-graphd:v3
# !udocker setup --execmode=F1 nebula-graphd

# !udocker pull vesoft/nebula-storaged:v
# !udocker create --name=nebula-storaged vesoft/nebula-storaged:v3
# !udocker setup --execmode=F1 nebula-storaged

n = ng_let()
n.start() # This takes around 5 mins

[1;3;38;2;47;75;124mMessage: Activating storaged...[0m
[1;3;38;2;102;81;145mResult of `SHOW HOSTS`:[0m
[1;3;38;2;47;75;124m    errors:[0m
[1;3;38;2;47;75;124m        code: 0[0m
[1;3;38;2;102;81;145m    results:[0m
[1;3;38;2;47;75;124m        spaceName: [0m
[1;3;38;2;102;81;145m        data:[0m
[1;3;38;2;47;75;124m            meta:[0m
[1;3;38;2;47;75;124m                None, None, None, None, None, None, None[0m
[1;3;38;2;102;81;145m            row:[0m
[1;3;38;2;102;81;145m                127.0.0.1, 9779, ONLINE, 121, PrimeKG:100, PrimeKG_manual:20, basketballplayer:1, PrimeKG:100, PrimeKG_manual:20, basketballplayer:1, 3.8.0[0m
[1;3;38;2;160;81;149m        columns:[0m
[1;3;38;2;160;81;149m            Host, Port, Status, Leader count, Leader distribution, Partition distribution, Version[0m
[1;3;38;2;212;80;135m        errors:[0m
[1;3;38;2;47;75;124m            code: 0[0m
[1;3;38;2;249;93;106m        latencyInUs: 1459[0m
[1;3;38;2;168;255;159mInfo: loadi

In [None]:
%load_ext ngql

In [3]:
%reload_ext ngql

In [4]:
%ngql --address 127.0.0.1 --port 9669 --user root --password nebula

[1;3;38;2;0;135;107m[OK] Connection Pool Created[0m


Unnamed: 0,Name
0,PrimeKG
1,PrimeKG_manual
2,basketballplayer


In [None]:
# Create a new Graph space (graph) for the PrimeKG
%ngql CREATE SPACE IF NOT EXISTS PrimeKG (vid_type = INT64);
%ngql USE PrimeKG;

# Create the graph from NebulaGraph directly

## Nodes

In [None]:
print(nodes['node_type'].unique()) #types of nodes

In [None]:
# save the data for each 'node_type' in a separate file
for node_type in nodes['node_type'].unique():
    sanitized_node_type = node_type.replace('/', '_')
    output_path = os.path.join('~/scratch-llm/data/PrimeKG_data/sub_data/', 'node_'+ sanitized_node_type + '.csv')
    nodes[nodes['node_type'] == node_type].to_csv(output_path, index=False)

In [None]:
# Create Tags (node_type) for each node in the PrimeKG
%ngql CREATE TAG IF NOT EXISTS anatomy(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS gene_protein(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS drug(node_name string, node_source string, node_id string, description string, half_life string, indication string, mechanism_of_action string, protein_binding string, pharmacodynamics string, state string, atc_1 string, atc_2 string, atc_3 string, atc_4 string, category string, group string, pathway string, molecular_weight string, tpsa string, clogp string);
%ngql CREATE TAG IF NOT EXISTS disease(node_name string, node_source string, mondo_id int, mondo_name string, group_id_bert string, group_name_bert string, mondo_definition string, umls_description string, orphanet_definition string, orphanet_prevalence string, orphanet_epidemiology string, orphanet_clinical_description string, orphanet_management_and_treatment string, mayo_symptoms string, mayo_causes string, mayo_risk_factors string, mayo_complications string, mayo_prevention string, mayo_see_doc string);
%ngql CREATE TAG IF NOT EXISTS pathway(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS biological_process(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS effect_phenotype(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS molecular_function(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS cellular_component(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS exposure(node_name string, node_source string, node_id string);

In [None]:
%ngql DESCRIBE TAG drug;

In [None]:
# Load each node source data into the corresponding Tag (working fine)
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_gene_protein.csv --tag gene_protein --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_biological_process.csv --tag biological_process --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_effect_phenotype.csv --tag effect_phenotype --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_molecular_function.csv --tag molecular_function --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_cellular_component.csv --tag cellular_component --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_pathway.csv --tag pathway --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_exposure.csv --tag exposure --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_anatomy.csv --tag anatomy --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source


### Extra information for nodes 'drug' and 'disease'

In [None]:
# disease and drug feature files are available as .tab format so they need to be converted to .csv
# many formatting issues on both files that had to be correcteed manually before merging with the 'node_' dataset
disease_tab = "~/scratch-llm/data/PrimeKG_data/raw_data/disease_features.tab"
drug_tab = "~/scratch-llm/data/PrimeKG_data/raw_data/drug_features.tab"

df = pd.read_csv(disease_tab, delimiter='\t')
disease_csv = "~/scratch-llm/data/PrimeKG_data/sub_data/disease_features.csv"
df.to_csv(disease_csv, index=False)


df = pd.read_csv(drug_tab, delimiter='\t')
drug_csv = "~/scratch-llm/data/PrimeKG_data/sub_data/drug_features.csv"
df.to_csv(drug_csv, index=False) 

In [49]:
# merge extra drug and disease information 
node_drug = pd.read_csv("~/scratch-llm/data/PrimeKG_data/sub_data/node_drug.csv")
features_drug = pd.read_csv("~/scratch-llm/data/PrimeKG_data/sub_data/drug_features.csv")
node_disease = pd.read_csv("~/scratch-llm/data/PrimeKG_data/sub_data/node_disease.csv")
features_disease = pd.read_csv("~/scratch-llm/data/PrimeKG_data/sub_data/disease_features.csv")


In [120]:
# the disease_features has multiple rows with the same 'node_index' so we need to merge them
def merge_column_values(column):
    # Check if all values in the column can be converted to float
    try:
        numeric_column = pd.to_numeric(column, errors='coerce')
        if numeric_column.notna().all():  # If all are numbers, return unique values
            return '; '.join(map(str, sorted(set(numeric_column))))  # Keep the first numeric value (assuming they're the same)
    except Exception:
        pass
    
    # For strings, concatenate unique, non-empty values
    return '; '.join(filter(lambda x: pd.notna(x) and str(x).strip() != '', set(column)))

merged_features_disease = features_disease.groupby('node_index', as_index=False).agg(merge_column_values)

In [119]:
# diseases only present in the features_disease file (NOT in the node_disease file)
merged_features_disease[~merged_features_disease['node_index'].isin(node_disease['node_index'])].head(2)


Unnamed: 0,node_index,mondo_id,mondo_name,group_id_bert,group_name_bert,mondo_definition,umls_description,orphanet_definition,orphanet_prevalence,orphanet_epidemiology,orphanet_clinical_description,orphanet_management_and_treatment,mayo_symptoms,mayo_causes,mayo_risk_factors,mayo_complications,mayo_prevention,mayo_see_doc
6558,35428,2816,adrenal cortex disease,,,A disease involving the adrenal cortex.,Pathological processes of the adrenal cortex.,,,,,,,,,,,
6559,35429,21034,genetic alopecia,,,An instance of alopecia that is caused by a modification of the individual's genome.,,,,,,,,,,,,


In [None]:
# Merge the dataframes on 'node_index' and save to csv
merged_df = pd.merge(node_drug, features_drug, on='node_index')
merged_df.to_csv("~/scratch-llm/data/PrimeKG_data/sub_data/node_merged_drug.csv", index=False)


merged_df = pd.merge(node_disease, merged_features_disease, on='node_index', how='left')
# merged_df.fillna("", inplace=True) # in case included 'extra' diseases that are only present in features_disease
merged_df.to_csv("~/scratch-llm/data/PrimeKG_data/sub_data/node_merged_disease.csv", index=False)


In [None]:
#Load the extended drug and disease data into the corresponding Tags
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_merged_disease.csv --tag disease --header --space PrimeKG --vid 0 --props 3:node_name,4:node_source,5:mondo_id,6:mondo_name,7:group_id_bert,8:group_name_bert,9:mondo_definition,10:umls_description,11:orphanet_definition,12:orphanet_prevalence,13:orphanet_epidemiology,14:orphanet_clinical_description,15:orphanet_management_and_treatment,16:mayo_symptoms,17:mayo_causes,18:mayo_risk_factors,19:mayo_complications,20:mayo_prevention,21:mayo_see_doc
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_merged_drug.csv --tag drug --header --space PrimeKG --batch 100 --vid 0 --props 2:node_id,3:node_name,4:node_source,5:description,6:half_life,7:indication,8:mechanism_of_action,9:protein_binding,10:pharmacodynamics,11:state,12:atc_1,13:atc_2,14:atc_3,15:atc_4,16:category,17:group,18:pathway,19:molecular_weight,20:tpsa,21:clogp


## Edges

In [None]:
print(edges['relation'].unique()) # types of edges

In [None]:
# save the data for each 'relation' (edge type) in a separate file
for relation in edges['relation'].unique():
    sanitized_relation = relation.replace('-', '_')
    sanitized_relation = sanitized_relation.replace('\t', '_')
    output_path = os.path.join('~/scratch-llm/data/PrimeKG_data/sub_data/', 'edge_'+ sanitized_relation + '.csv')
    edges[edges['relation'] == relation].to_csv(output_path, index=False)

In [None]:
%ngql CREATE EDGE IF NOT EXISTS protein_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS drug_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS contraindication(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS indication(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS off_label_use(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS drug_drug(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS phenotype_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS phenotype_phenotype(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS disease_phenotype_negative(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS disease_phenotype_positive(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS disease_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS disease_disease(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS drug_effect(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS bioprocess_bioprocess(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS molfunc_molfunc(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS cellcomp_cellcomp(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS molfunc_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS cellcomp_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS bioprocess_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_disease(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_exposure(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_bioprocess(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_molfunc(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_cellcomp(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS pathway_pathway(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS pathway_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS anatomy_anatomy(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS anatomy_protein_present(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS anatomy_protein_absent(display_relation string);

In [None]:
# for each edge type, load the data accordingly, this takes like 20mins from new
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_protein_protein.csv --space PrimeKG --header --edge protein_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_drug_protein.csv --space PrimeKG --header --edge drug_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_contraindication.csv --space PrimeKG --header --edge contraindication --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_indication.csv --space PrimeKG --header --edge indication --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_off_label_use.csv --space PrimeKG --header --edge off_label_use --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_drug_drug.csv --space PrimeKG --header --edge drug_drug --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_phenotype_protein.csv --space PrimeKG --header --edge phenotype_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_phenotype_phenotype.csv --space PrimeKG --header --edge phenotype_phenotype --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_phenotype_negative.csv --space PrimeKG --header --edge disease_phenotype_negative --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_phenotype_positive.csv --space PrimeKG --header --edge disease_phenotype_positive --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_protein.csv --space PrimeKG --header --edge disease_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_disease.csv --space PrimeKG --header --edge disease_disease --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_drug_effect.csv --space PrimeKG --header --edge drug_effect --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_bioprocess_bioprocess.csv --space PrimeKG --header --edge bioprocess_bioprocess --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_molfunc_molfunc.csv --space PrimeKG --header --edge molfunc_molfunc --src 2 --dst 3 --props 1:display_relation


In [None]:
# load the edge data in two batches
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_cellcomp_cellcomp.csv --space PrimeKG --header --edge cellcomp_cellcomp --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_molfunc_protein.csv --space PrimeKG --header --edge molfunc_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_cellcomp_protein.csv --space PrimeKG --header --edge cellcomp_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_bioprocess_protein.csv --space PrimeKG --header --edge bioprocess_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_protein.csv --space PrimeKG --header --edge exposure_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_disease.csv --space PrimeKG --header --edge exposure_disease --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_exposure.csv --space PrimeKG --header --edge exposure_exposure --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_bioprocess.csv --space PrimeKG --header --edge exposure_bioprocess --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_molfunc.csv --space PrimeKG --header --edge exposure_molfunc --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_cellcomp.csv --space PrimeKG --header --edge exposure_cellcomp --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_pathway_pathway.csv --space PrimeKG --header --edge pathway_pathway --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_pathway_protein.csv --space PrimeKG --header --edge pathway_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_anatomy_anatomy.csv --space PrimeKG --header --edge anatomy_anatomy --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_anatomy_protein_present.csv --space PrimeKG --batch 500 --header --edge anatomy_protein_present --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_anatomy_protein_absent.csv --space PrimeKG --header --edge anatomy_protein_absent --src 2 --dst 3 --props 1:display_relation


## Checks

In [None]:
%ngql DESCRIBE TAG drug;

In [None]:
#small check-up, with extra drug and disease information for nodes
%ngql GET SUBGRAPH WITH PROP 1 STEPS FROM 14045 OUT drug_protein YIELD VERTICES AS nodes, EDGES AS relationships;

# Load directly to NebulaPropertyGraphStore

In [5]:
from llama_index.core.schema import NodeRelationship, TextNode, IndexNode, RelatedNodeInfo
from llama_index.graph_stores.nebula import NebulaPropertyGraphStore
from llama_index.core import (
    StorageContext,
)

define files to use:

In [7]:
data_dir = '~/scratch-llm/data/PrimeKG_data/sub_data/'

# List of node types and their corresponding CSV files
node_files = {
    'node_gene_protein.csv',
    'node_merged_drug.csv', # manually corrected and generated from "extra information section"
    'node_effect_phenotype.csv',
    'node_merged_disease.csv', # manually corrected and generated from "extra information section"
    'node_biological_process.csv',
    'node_molecular_function.csv',
    'node_cellular_component.csv',
    'node_exposure.csv',
    'node_pathway.csv',
    'node_anatomy.csv'
}

edge_files ={
    'edge_protein_protein.csv',
    'edge_drug_protein.csv',
    'edge_contraindication.csv',
    'edge_indication.csv',
    'edge_off_label_use.csv',
    'edge_drug_drug.csv', # around 40 mins
    'edge_phenotype_protein.csv',
    'edge_phenotype_phenotype.csv',
    'edge_disease_phenotype_negative.csv',
    'edge_disease_phenotype_positive.csv',
    'edge_disease_protein.csv',
    'edge_disease_disease.csv',
    'edge_drug_effect.csv',
    'edge_bioprocess_bioprocess.csv',
    'edge_molfunc_molfunc.csv',
    'edge_cellcomp_cellcomp.csv',
    'edge_molfunc_protein.csv',
    'edge_cellcomp_protein.csv',
    'edge_bioprocess_protein.csv',
    'edge_exposure_protein.csv',
    'edge_exposure_disease.csv',
    'edge_exposure_exposure.csv',
    'edge_exposure_bioprocess.csv',
    'edge_exposure_molfunc.csv',
    'edge_exposure_cellcomp.csv',
    'edge_pathway_pathway.csv',
    'edge_pathway_protein.csv',
    'edge_anatomy_anatomy.csv',
    'edge_anatomy_protein_present.csv', #around 90 mins
    'edge_anatomy_protein_absent.csv'
}

## Nodes

In [4]:
# create TextNodes
def create_text_nodes(file_path):
    df = pd.read_csv(file_path)
    text_nodes = []
    for _, row in df.iterrows():
        id = row['node_index']
        if len(row) > 5: # longer text info (drug/disease)
            metadata_columns = ['node_index', 'node_id', 'node_type', 'node_name', 
                            'node_source', 'mondo_id', 'mondo_name', 'group_id_bert', 
                            'group_name_bert', 'orphanet_prevalence']
            
            text_columns = ['mondo_definition', 'umls_description', 'orphanet_definition', 
                        'orphanet_clinical_description', 'orphanet_management_and_treatment','orphanet_epidemiology', 
                        'mayo_symptoms', 'mayo_causes', 'mayo_risk_factors', 'mayo_complications',
                        'mayo_prevention', 'mayo_see_doc','description', 'indication', 'mechanism_of_action',
                        'half_life', 'protein_binding', 'pharmacodynamics', 'state', 'atc_1', 'atc_2', 'atc_3', 
                        'atc_4', 'category', 'group', 'pathway', 'molecular_weight', 'tpsa', 'clogp'] 
            available_metadata_columns = [col for col in metadata_columns if col in df.columns]
            available_text_columns = [col for col in text_columns if col in df.columns] 

            # Extract metadata, not including empty entries     
            metadata = {
                col: row[col] for col in available_metadata_columns
                if col in row and not (isinstance(row[col], float) and math.isnan(row[col]))
            }
            metadata = {k: str(v) for k, v in metadata.items()} #ensure entries are strings

            # Extract and concatenate text values, not including empty entries
            text_entries = [str(row[col]) for col in available_text_columns if pd.notna(row[col])]
            text = "\n".join(text_entries)            

            text_node = TextNode(id_= str(id), text=text, metadata=metadata)
            text_nodes.append(text_node)

        else: # no text data, all other files
            metadata = row.to_dict()
            metadata = {k: str(v) for k, v in metadata.items()} #ensure entries are strings
            text_node = TextNode(id_= str(id), metadata=metadata)
            text_nodes.append(text_node)

    return text_nodes


In [5]:
# generate all nodes from the node files
all_nodes = []
for file in node_files:
    nodes = create_text_nodes(data_dir+file)
    all_nodes.extend(nodes)


In [123]:
# find index of id_ = 35769
for i, node in enumerate(all_nodes):
    if node.id_ == '35428':
        print(i)

73846


## Edges

In [6]:
# create all edges
from tqdm.notebook import tqdm

def create_all_edges(file_path, all_nodes):
    node_dict = {node.id_: node for node in all_nodes} #speed up search
    df = pd.read_csv(os.path.join(file_path))

    for _, row in tqdm(df.iterrows(), total=len(df), leave=False, desc=f"Processing {file_path}"):
        x_index = str(row['x_index'])

        if x_index in node_dict:
            node = node_dict[x_index]
            if NodeRelationship.CHILD not in node.relationships:
                node.relationships[NodeRelationship.CHILD] = []
            y_index = str(row['y_index'])
            metadata = row.drop(['x_index', 'y_index']).to_dict()
            
            # check that the relationship is not there already
            if not any(isinstance(r, RelatedNodeInfo) and r.node_id == y_index for r in node.relationships[NodeRelationship.CHILD]):
                new_relationship = RelatedNodeInfo(node_id=y_index, metadata=metadata)
                node.relationships[NodeRelationship.CHILD].append(new_relationship)


In [8]:
# all relationships for all nodes, takes around 120 mins
for file in tqdm(edge_files, desc='Procesing edge files'):
    create_all_edges(data_dir+file, all_nodes)

Procesing edge files:   0%|          | 0/30 [00:00<?, ?it/s]

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_bioprocess_bioprocess.csv:   0%|          | 0/105772 …

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_anatomy_protein_absent.csv:   0%|          | 0/39774 …

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_anatomy_protein_present.csv:   0%|          | 0/30364…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_cellcomp_cellcomp.csv:   0%|          | 0/9690 [00:00…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_bioprocess_protein.csv:   0%|          | 0/289610 [00…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_molfunc_protein.csv:   0%|          | 0/139060 [00:00…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_drug_effect.csv:   0%|          | 0/129568 [00:00<?, …

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_protein_protein.csv:   0%|          | 0/642150 [00:00…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_protein.csv:   0%|          | 0/161052 [00:00…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_indication.csv:   0%|          | 0/18542 [00:00<?, ?i…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_anatomy_anatomy.csv:   0%|          | 0/28064 [00:00<…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_cellcomp.csv:   0%|          | 0/20 [00:00<?…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_cellcomp_protein.csv:   0%|          | 0/166804 [00:0…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_phenotype_protein.csv:   0%|          | 0/6660 [00:00…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_exposure.csv:   0%|          | 0/4140 [00:00…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_drug_drug.csv:   0%|          | 0/2672628 [00:00<?, ?…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_drug_protein.csv:   0%|          | 0/51306 [00:00<?, …

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_molfunc.csv:   0%|          | 0/90 [00:00<?,…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_phenotype_positive.csv:   0%|          | 0/30…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_protein.csv:   0%|          | 0/2424 [00:00<…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_disease.csv:   0%|          | 0/4608 [00:00<…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_off_label_use.csv:   0%|          | 0/5144 [00:00<?, …

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_phenotype_phenotype.csv:   0%|          | 0/37472 [00…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_pathway_protein.csv:   0%|          | 0/85292 [00:00<…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_molfunc_molfunc.csv:   0%|          | 0/27148 [00:00<…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_pathway_pathway.csv:   0%|          | 0/5070 [00:00<?…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_disease.csv:   0%|          | 0/62914 [00:00<…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_contraindication.csv:   0%|          | 0/61060 [00:00…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_phenotype_negative.csv:   0%|          | 0/23…

Processing ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_bioprocess.csv:   0%|          | 0/3250 [00:…

save nodes-relationships to pickle

In [9]:
# save `all_nodes` with pickle
import pickle
with open(os.path.expanduser('~/scratch-llm/storage/all_nodes.pkl'), 'wb') as f:
    pickle.dump(all_nodes, f)

In [6]:
# load from pickle
import pickle
with open(os.path.expanduser('~/scratch-llm/storage/all_nodes.pkl'), 'rb') as f:
    nodes = pickle.load(f)
    

check up that edges are correctly generated:

In [174]:
%ngql USE PrimeKG;

In [177]:
%ngql GET SUBGRAPH WITH PROP 1 STEPS FROM 35446 OUT disease_disease YIELD VERTICES AS nodes, EDGES AS relationships;

Unnamed: 0,nodes,relationships
0,"[(35446 :disease{group_id_bert: __NULL__, group_name_bert: __NULL__, mayo_causes: __NULL__, mayo_complications: __NULL__, mayo_prevention: __NULL__, mayo_risk_factors: __NULL__, mayo_see_doc: __NULL__, mayo_symptoms: __NULL__, mondo_definition: ""A non-neoplastic or neoplastic disorder that affects the testis or the ovary."", mondo_id: 2259, mondo_name: ""gonadal disease"", node_name: ""enuresis"", node_source: ""MONDO"", orphanet_clinical_description: __NULL__, orphanet_definition: __NULL__, orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: __NULL__, umls_description: ""A non-neoplastic or neoplastic disorder that affects the testis or the ovary.""})]","[(35446)-[:disease_disease@0{display_relation: ""parent-child""}]->(27997), (35446)-[:disease_disease@0{display_relation: ""parent-child""}]->(35757)]"
1,"[(27997 :disease{group_id_bert: __NULL__, group_name_bert: __NULL__, mayo_causes: __NULL__, mayo_complications: __NULL__, mayo_prevention: __NULL__, mayo_risk_factors: __NULL__, mayo_see_doc: __NULL__, mayo_symptoms: __NULL__, mondo_definition: ""Combined immunodeficiency with faciooculoskeletal anomalies is an extremely rare combined immunodeficiency disorder characterized by primary immunodeficiency manifesting with repeated bacterial, viral and fungal infections, in association with neurological manifestations (hypotonia, cerebellar ataxia, myoclonic seizures), developmental delay, optic atrophy, facial dysmorphism (high forehead, hypoplastic supraorbital ridges, palpebral edema, hypertelorism, flat nasal bridge, broad nasal root and tip, anteverted nares, thin lower lip overlapped by upper lip, square chin) and skeletal anomalies (short metacarpals/metatarsals with cone-shaped epiphyses, osteopenia)."", mondo_id: 13226, mondo_name: ""combined immunodeficiency with faciooculoskeletal anomalies"", node_name: ""enuresis, nocturnal"", node_source: ""MONDO_grouped"", orphanet_clinical_description: __NULL__, orphanet_definition: ""Combined immunodeficiency with faciooculoskeletal anomalies is an extremely rare combined immunodeficiency disorder characterized by primary immunodeficiency manifesting with repeated bacterial, viral and fungal infections, in association with neurological manifestations (hypotonia, cerebellar ataxia, myoclonic seizures), developmental delay, optic atrophy, facial dysmorphism (high forehead, hypoplastic supraorbital ridges, palpebral edema, hypertelorism, flat nasal bridge, broad nasal root and tip, anteverted nares, thin lower lip overlapped by upper lip, square chin) and skeletal anomalies (short metacarpals/metatarsals with cone-shaped epiphyses, osteopenia)."", orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: ""<1/1000000"", umls_description: ""An extremely rare combined immunodeficiency disorder characterized by primary immunodeficiency manifesting with repeated bacterial, viral and fungal infections, in association with neurological manifestations , developmental delay, optic atrophy, facial dysmorphism and skeletal anomalies""}), (35757 :disease{group_id_bert: __NULL__, group_name_bert: __NULL__, mayo_causes: __NULL__, mayo_complications: __NULL__, mayo_prevention: __NULL__, mayo_risk_factors: __NULL__, mayo_see_doc: __NULL__, mayo_symptoms: __NULL__, mondo_definition: ""A group of disorders that affect a person's ability to learn or process specific types of information which is in contrast to his/her apparent level of intellect."", mondo_id: 4681, mondo_name: ""learning disability"", node_name: ""psychiatric disorder"", node_source: ""MONDO"", orphanet_clinical_description: __NULL__, orphanet_definition: __NULL__, orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: __NULL__, umls_description: ""A change to an individual's judgment, orientation , intellectual functioning, or mood from their baseline.""})]","[(27997)-[:disease_disease@0{display_relation: ""parent-child""}]->(35446), (35757)-[:disease_disease@0{display_relation: ""parent-child""}]->(35446)]"


In [9]:
# find a node with 'node_type' = 'disease'
for i, node in enumerate(nodes):
    if node.metadata.get('node_index') == '27182':
        print(i)
        break


81150


In [149]:
nodes[81155].metadata

{'node_index': '27187',
 'node_id': '8196',
 'node_type': 'disease',
 'node_name': 'parastremmatic dwarfism',
 'node_source': 'MONDO',
 'mondo_id': '13818; 24541',
 'mondo_name': 'trichohepatoenteric syndrome 2; trichohepatoenteric syndrome 1',
 'group_id_bert': '24541_13818',
 'group_name_bert': 'trichohepatoenteric syndrome'}

In [10]:
#small subgraph with text node_index = 27187
start_node = nodes[81155] 
related_nodes_info = start_node.relationships.get(NodeRelationship.CHILD, [])

related_nodes = []
edges = []
for relation in related_nodes_info:
    # Find the related node by its node_id
    related_node = next((node for node in nodes if node.node_id == relation.node_id), None)
    if related_node:
        related_nodes.append(related_node)
        edges.append((start_node.node_id, related_node.node_id, relation.metadata))

subgraph_nodes = [start_node] + related_nodes  # Include the starting node and its neighbors


In [11]:
subgraph_nodes[0].metadata

{'node_index': '27187',
 'node_id': '8196',
 'node_type': 'disease',
 'node_name': 'parastremmatic dwarfism',
 'node_source': 'MONDO',
 'mondo_id': '13818; 24541',
 'mondo_name': 'trichohepatoenteric syndrome 2; trichohepatoenteric syndrome 1',
 'group_id_bert': '24541_13818',
 'group_name_bert': 'trichohepatoenteric syndrome'}

## Generate index

In [152]:
%ngql CREATE SPACE IF NOT EXISTS PrimeKG_manual(vid_type=FIXED_STRING(256), partition_num=20, replica_factor=1);

In [151]:
%ngql drop space PrimeKG_manual;

convert TextNode into IndexNode to insert the nodes into the index:

In [15]:
from llama_index.core.vector_stores.simple import SimpleVectorStore
from llama_index.core import PropertyGraphIndex, Settings
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import torch
from transformers import AutoTokenizer

vec_store = SimpleVectorStore()

In [13]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [14]:
# Create IndexNode objects from TextNode objects
index_nodes = [IndexNode.from_text_node(node, index_id="index_1") for node in subgraph_nodes]


In [17]:
index_nodes[0].metadata

{'node_index': '27187',
 'node_id': '8196',
 'node_type': 'disease',
 'node_name': 'parastremmatic dwarfism',
 'node_source': 'MONDO',
 'mondo_id': '13818; 24541',
 'mondo_name': 'trichohepatoenteric syndrome 2; trichohepatoenteric syndrome 1',
 'group_id_bert': '24541_13818',
 'group_name_bert': 'trichohepatoenteric syndrome'}

In [21]:
# Load tokenizer and set pad_token_id explicitly
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

if tokenizer.eos_token_id is None:
    tokenizer.eos_token_id = tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id)


print(f"Pad Token: {tokenizer.pad_token}")
print(f"Pad Token ID: {tokenizer.pad_token_id}")
print(f"EOS Token: {tokenizer.eos_token}")
print(f"EOS Token ID: {tokenizer.eos_token_id}")

Pad Token: <|end_of_text|>
Pad Token ID: 128001
EOS Token: <|end_of_text|>
EOS Token ID: 128001


In [19]:
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.8, "do_sample": True},
    # system_prompt=system_prompt,
    # query_wrapper_prompt=query_wrapper_prompt,
    tokenizer=tokenizer,
    model_name="meta-llama/Llama-3.2-3B",
    device_map="auto",
    stopping_ids=[50278, 50279, 50277, 1, 0, 128001], # last newly included
    tokenizer_kwargs={"max_length": 4096},
    # uncomment this if using CUDA to reduce memory usage
    model_kwargs={"torch_dtype": torch.float16}
)

Settings.llm = llm
Settings.chunk_size = 1024
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-m3") 

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
del llm  # Clear the model from memory
torch.cuda.empty_cache()  # Clear the GPU cache

In [20]:
# PropertyGraphIndex requires NebulaPropertyGraphStore
graph_store = NebulaPropertyGraphStore(
    space= "PrimeKG_manual", 
    username = "root",
    password = "nebula",
    url = "nebula://localhost:9669",
    props_schema= "`node_index` STRING, `node_type` STRING, `node_id` STRING, `node_name` STRING, `node_source` STRING, `mondo_id` STRING, `mondo_name` STRING, `group_id_bert` STRING, `group_name_bert` STRING, `orphanet_prevalence` STRING, `display_relation` STRING, `_node_content` STRING, `_node_type` STRING, `document_id` STRING, `doc_id` STRING, `ref_doc_id` STRING, `triplet_source_id` STRING",
    overwrite=True
)
storage_context = StorageContext.from_defaults(property_graph_store=graph_store)

index = PropertyGraphIndex.from_existing(
    llm=llm,
    vector_store=vec_store,
    property_graph_store=graph_store,
    show_progress=True,
    storage_context=storage_context,
    embed_kg_nodes=True
)

import nest_asyncio
nest_asyncio.apply()

index.insert_nodes(index_nodes)

Extracting paths from text:   0%|          | 0/11 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Extracting paths from text: 100%|██████████| 11/11 [00:28<00:00,  2.57s/it]
Extracting implicit paths: 100%|██████████| 11/11 [00:00<00:00, 787.45it/s]
G

save index to persist storage

In [32]:
index.storage_context.persist(persist_dir="~/scratch-llm/storage/PrimeKG_index_mid/")
