In [1]:
import pandas as pd
from nebulagraph_lite import nebulagraph_let as ng_let
import os, math

# Data loading


In [None]:
#primekg = pd.read_csv("~/scratch-llm/data/PrimeKG_data/raw_data/kg.csv", low_memory=False)
nodes = pd.read_csv("~/scratch-llm/data/PrimeKG/datasets/data/kg/nodes.csv", low_memory=False)
edges = pd.read_csv("~/scratch-llm/data/PrimeKG/datasets/data/kg/edges.csv")

In [2]:
# load NebulaGraph JupyterNotebook extension
# !udocker pull vesoft/nebula-metad:v3
# !udocker create --name=nebula-metad vesoft/nebula-metad:v3
# !udocker setup --execmode=F1 nebula-metad

# !udocker pull vesoft/nebula-graphd:v3
# !udocker create --name=nebula-graphd vesoft/nebula-graphd:v3
# !udocker setup --execmode=F1 nebula-graphd

# !udocker pull vesoft/nebula-storaged:v
# !udocker create --name=nebula-storaged vesoft/nebula-storaged:v3
# !udocker setup --execmode=F1 nebula-storaged

n = ng_let()
n.start() # This takes around 5 mins

[1;3;38;2;47;75;124mMessage: Activating storaged...[0m
[1;3;38;2;102;81;145mResult of `SHOW HOSTS`:[0m
[1;3;38;2;47;75;124m    errors:[0m
[1;3;38;2;47;75;124m        code: 0[0m
[1;3;38;2;102;81;145m    results:[0m
[1;3;38;2;47;75;124m        spaceName: [0m
[1;3;38;2;102;81;145m        data:[0m
[1;3;38;2;47;75;124m            meta:[0m
[1;3;38;2;47;75;124m                None, None, None, None, None, None, None[0m
[1;3;38;2;102;81;145m            row:[0m
[1;3;38;2;102;81;145m                127.0.0.1, 9779, ONLINE, 121, PrimeKG:100, PrimeKG_manual:20, basketballplayer:1, PrimeKG:100, PrimeKG_manual:20, basketballplayer:1, 3.8.0[0m
[1;3;38;2;160;81;149m        columns:[0m
[1;3;38;2;160;81;149m            Host, Port, Status, Leader count, Leader distribution, Partition distribution, Version[0m
[1;3;38;2;212;80;135m        errors:[0m
[1;3;38;2;47;75;124m            code: 0[0m
[1;3;38;2;249;93;106m        latencyInUs: 1418[0m
[1;3;38;2;168;255;159mInfo: loadi

In [None]:
%load_ext ngql

In [2]:
%reload_ext ngql

In [3]:
%ngql --address 127.0.0.1 --port 9669 --user root --password nebula

[1;3;38;2;0;135;107m[OK] Connection Pool Created[0m


Unnamed: 0,Name
0,PrimeKG
1,PrimeKG_manual
2,basketballplayer


In [None]:
# Create a new Graph space (graph) for the PrimeKG
%ngql CREATE SPACE IF NOT EXISTS PrimeKG (vid_type = INT64);
%ngql USE PrimeKG;

# Create the graph from NebulaGraph directly

## Nodes

In [None]:
print(nodes['node_type'].unique()) #types of nodes

In [None]:
# save the data for each 'node_type' in a separate file
for node_type in nodes['node_type'].unique():
    sanitized_node_type = node_type.replace('/', '_')
    output_path = os.path.join('~/scratch-llm/data/PrimeKG_data/sub_data/', 'node_'+ sanitized_node_type + '.csv')
    nodes[nodes['node_type'] == node_type].to_csv(output_path, index=False)

In [None]:
# Create Tags (node_type) for each node in the PrimeKG
%ngql CREATE TAG IF NOT EXISTS anatomy(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS gene_protein(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS drug(node_name string, node_source string, node_id string, description string, half_life string, indication string, mechanism_of_action string, protein_binding string, pharmacodynamics string, state string, atc_1 string, atc_2 string, atc_3 string, atc_4 string, category string, group string, pathway string, molecular_weight string, tpsa string, clogp string);
%ngql CREATE TAG IF NOT EXISTS disease(node_name string, node_source string, mondo_id int, mondo_name string, group_id_bert string, group_name_bert string, mondo_definition string, umls_description string, orphanet_definition string, orphanet_prevalence string, orphanet_epidemiology string, orphanet_clinical_description string, orphanet_management_and_treatment string, mayo_symptoms string, mayo_causes string, mayo_risk_factors string, mayo_complications string, mayo_prevention string, mayo_see_doc string);
%ngql CREATE TAG IF NOT EXISTS pathway(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS biological_process(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS effect_phenotype(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS molecular_function(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS cellular_component(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS exposure(node_name string, node_source string, node_id string);

In [None]:
%ngql DESCRIBE TAG drug;

In [None]:
# Load each node source data into the corresponding Tag (working fine)
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_gene_protein.csv --tag gene_protein --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_biological_process.csv --tag biological_process --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_effect_phenotype.csv --tag effect_phenotype --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_molecular_function.csv --tag molecular_function --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_cellular_component.csv --tag cellular_component --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_pathway.csv --tag pathway --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_exposure.csv --tag exposure --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_anatomy.csv --tag anatomy --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source


### Extra information for nodes 'drug' and 'disease'

In [None]:
# disease and drug feature files are available as .tab format so they need to be converted to .csv
# many formatting issues on both files that had to be correcteed manually before merging with the 'node_' dataset
disease_tab = "~/scratch-llm/data/PrimeKG_data/raw_data/disease_features.tab"
drug_tab = "~/scratch-llm/data/PrimeKG_data/raw_data/drug_features.tab"

df = pd.read_csv(disease_tab, delimiter='\t')
disease_csv = "~/scratch-llm/data/PrimeKG_data/sub_data/disease_features.csv"
df.to_csv(disease_csv, index=False)


df = pd.read_csv(drug_tab, delimiter='\t')
drug_csv = "~/scratch-llm/data/PrimeKG_data/sub_data/drug_features.csv"
df.to_csv(drug_csv, index=False) 

In [96]:
# merge extra drug and disease information 
node_drug = pd.read_csv("~/scratch-llm/data/PrimeKG_data/sub_data/node_drug.csv")
features_drug = pd.read_csv("~/scratch-llm/data/PrimeKG_data/sub_data/drug_features.csv")
node_disease = pd.read_csv("~/scratch-llm/data/PrimeKG_data/sub_data/node_disease.csv")
features_disease = pd.read_csv("~/scratch-llm/data/PrimeKG_data/sub_data/disease_features.csv")


In [None]:
# the disease_features has multiple rows with the same 'node_index' so we need to merge them
def merge_column_values(column):
    # Check if all values in the column can be converted to float
    try:
        numeric_column = pd.to_numeric(column, errors='coerce')
        if numeric_column.notna().all():  # If all are numbers, return unique values
            return '_'.join(map(str, sorted(set(numeric_column))))  # Keep the first numeric value (assuming they're the same)
    except Exception:
        pass
    
    # For strings, concatenate unique, non-empty values
    return '; '.join(filter(lambda x: pd.notna(x) and str(x).strip() != '', set(column)))

merged_feature_disease = features_disease.groupby('node_index', as_index=False).agg(merge_column_values)

In [None]:
# Merge the dataframes on 'node_index' and save to csv
merged_df = pd.merge(node_drug, features_drug, on='node_index')
merged_df.to_csv("~/scratch-llm/data/PrimeKG_data/sub_data/node_merged_drug.csv", index=False)

merged_df = pd.merge(node_disease, merged_feature_disease, on='node_index', how='left')
merged_df.to_csv("~/scratch-llm/data/PrimeKG_data/sub_data/node_merged_disease.csv", index=False)


In [None]:
#Load the extended drug and disease data into the corresponding Tags
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_merged_disease.csv --tag disease --header --space PrimeKG --vid 0 --props 3:node_name,4:node_source,5:mondo_id,6:mondo_name,7:group_id_bert,8:group_name_bert,9:mondo_definition,10:umls_description,11:orphanet_definition,12:orphanet_prevalence,13:orphanet_epidemiology,14:orphanet_clinical_description,15:orphanet_management_and_treatment,16:mayo_symptoms,17:mayo_causes,18:mayo_risk_factors,19:mayo_complications,20:mayo_prevention,21:mayo_see_doc
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_merged_drug.csv --tag drug --header --space PrimeKG --batch 100 --vid 0 --props 2:node_id,3:node_name,4:node_source,5:description,6:half_life,7:indication,8:mechanism_of_action,9:protein_binding,10:pharmacodynamics,11:state,12:atc_1,13:atc_2,14:atc_3,15:atc_4,16:category,17:group,18:pathway,19:molecular_weight,20:tpsa,21:clogp


## Edges

In [None]:
print(edges['relation'].unique()) # types of edges

In [None]:
# save the data for each 'relation' (edge type) in a separate file
for relation in edges['relation'].unique():
    sanitized_relation = relation.replace('-', '_')
    sanitized_relation = sanitized_relation.replace('\t', '_')
    output_path = os.path.join('~/scratch-llm/data/PrimeKG_data/sub_data/', 'edge_'+ sanitized_relation + '.csv')
    edges[edges['relation'] == relation].to_csv(output_path, index=False)

In [None]:
%ngql CREATE EDGE IF NOT EXISTS protein_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS drug_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS contraindication(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS indication(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS off_label_use(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS drug_drug(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS phenotype_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS phenotype_phenotype(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS disease_phenotype_negative(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS disease_phenotype_positive(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS disease_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS disease_disease(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS drug_effect(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS bioprocess_bioprocess(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS molfunc_molfunc(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS cellcomp_cellcomp(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS molfunc_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS cellcomp_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS bioprocess_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_disease(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_exposure(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_bioprocess(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_molfunc(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_cellcomp(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS pathway_pathway(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS pathway_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS anatomy_anatomy(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS anatomy_protein_present(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS anatomy_protein_absent(display_relation string);

In [None]:
# for each edge type, load the data accordingly, this takes like 20mins from new
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_protein_protein.csv --space PrimeKG --header --edge protein_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_drug_protein.csv --space PrimeKG --header --edge drug_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_contraindication.csv --space PrimeKG --header --edge contraindication --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_indication.csv --space PrimeKG --header --edge indication --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_off_label_use.csv --space PrimeKG --header --edge off_label_use --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_drug_drug.csv --space PrimeKG --header --edge drug_drug --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_phenotype_protein.csv --space PrimeKG --header --edge phenotype_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_phenotype_phenotype.csv --space PrimeKG --header --edge phenotype_phenotype --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_phenotype_negative.csv --space PrimeKG --header --edge disease_phenotype_negative --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_phenotype_positive.csv --space PrimeKG --header --edge disease_phenotype_positive --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_protein.csv --space PrimeKG --header --edge disease_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_disease.csv --space PrimeKG --header --edge disease_disease --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_drug_effect.csv --space PrimeKG --header --edge drug_effect --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_bioprocess_bioprocess.csv --space PrimeKG --header --edge bioprocess_bioprocess --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_molfunc_molfunc.csv --space PrimeKG --header --edge molfunc_molfunc --src 2 --dst 3 --props 1:display_relation


In [None]:
# load the edge data in two batches
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_cellcomp_cellcomp.csv --space PrimeKG --header --edge cellcomp_cellcomp --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_molfunc_protein.csv --space PrimeKG --header --edge molfunc_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_cellcomp_protein.csv --space PrimeKG --header --edge cellcomp_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_bioprocess_protein.csv --space PrimeKG --header --edge bioprocess_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_protein.csv --space PrimeKG --header --edge exposure_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_disease.csv --space PrimeKG --header --edge exposure_disease --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_exposure.csv --space PrimeKG --header --edge exposure_exposure --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_bioprocess.csv --space PrimeKG --header --edge exposure_bioprocess --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_molfunc.csv --space PrimeKG --header --edge exposure_molfunc --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_cellcomp.csv --space PrimeKG --header --edge exposure_cellcomp --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_pathway_pathway.csv --space PrimeKG --header --edge pathway_pathway --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_pathway_protein.csv --space PrimeKG --header --edge pathway_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_anatomy_anatomy.csv --space PrimeKG --header --edge anatomy_anatomy --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_anatomy_protein_present.csv --space PrimeKG --batch 500 --header --edge anatomy_protein_present --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_anatomy_protein_absent.csv --space PrimeKG --header --edge anatomy_protein_absent --src 2 --dst 3 --props 1:display_relation


## Checks

In [None]:
%ngql DESCRIBE TAG drug;

In [None]:
#small check-up, with extra drug and disease information for nodes
%ngql GET SUBGRAPH WITH PROP 1 STEPS FROM 14045 OUT drug_protein YIELD VERTICES AS nodes, EDGES AS relationships;

# Load directly to NebulaPropertyGraphStore

In [4]:
from llama_index.core.schema import NodeRelationship, TextNode, IndexNode, RelatedNodeInfo
from llama_index.graph_stores.nebula import NebulaPropertyGraphStore
from llama_index.core import (
    StorageContext,
)

define files to use:

In [194]:
data_dir = '~/scratch-llm/data/PrimeKG_data/sub_data/'

# List of node types and their corresponding CSV files
node_files = {
    'node_gene_protein.csv',
    'node_merged_drug.csv', # manually corrected and generated from "extra information section"
    'node_effect_phenotype.csv',
    'node_merged_disease.csv', # manually corrected and generated from "extra information section"
    'node_biological_process.csv',
    'node_molecular_function.csv',
    'node_cellular_component.csv',
    'node_exposure.csv',
    'node_pathway.csv',
    'node_anatomy.csv'
}

edge_files ={
    'edge_protein_protein.csv',
    'edge_drug_protein.csv',
    'edge_contraindication.csv',
    'edge_indication.csv',
    'edge_off_label_use.csv',
    # 'edge_drug_drug.csv', # around 40 mins
    'edge_phenotype_protein.csv',
    'edge_phenotype_phenotype.csv',
    'edge_disease_phenotype_negative.csv',
    'edge_disease_phenotype_positive.csv',
    'edge_disease_protein.csv',
    'edge_disease_disease.csv',
    'edge_drug_effect.csv',
    'edge_bioprocess_bioprocess.csv',
    'edge_molfunc_molfunc.csv',
    'edge_cellcomp_cellcomp.csv',
    'edge_molfunc_protein.csv',
    'edge_cellcomp_protein.csv',
    'edge_bioprocess_protein.csv',
    'edge_exposure_protein.csv',
    'edge_exposure_disease.csv',
    'edge_exposure_exposure.csv',
    'edge_exposure_bioprocess.csv',
    'edge_exposure_molfunc.csv',
    'edge_exposure_cellcomp.csv',
    'edge_pathway_pathway.csv',
    'edge_pathway_protein.csv',
    'edge_anatomy_anatomy.csv',
    # 'edge_anatomy_protein_present.csv', #around 90 mins
    'edge_anatomy_protein_absent.csv'
}

## Nodes

In [None]:
# create TextNodes
def create_text_nodes(file_path):
    df = pd.read_csv(file_path)
    text_nodes = []
    for _, row in df.iterrows():
        id = row['node_index']
        if len(row) > 5: # longer text info (drug/disease)
            metadata_columns = ['node_index', 'node_id', 'node_type', 'node_name', 
                            'node_source', 'mondo_id', 'mondo_name', 'group_id_bert', 
                            'group_name_bert', 'orphanet_prevalence', 'orphanet_epidemiology']
            
            text_columns = ['mondo_definition', 'umls_description', 'orphanet_definition', 
                        'orphanet_clinical_description', 'orphanet_management_and_treatment', 
                        'mayo_symptoms', 'mayo_causes', 'mayo_risk_factors', 'mayo_complications',
                        'mayo_prevention', 'mayo_see_doc','description', 'indication', 'mechanism_of_action',
                        'half_life', 'protein_binding', 'pharmacodynamics', 'state', 'atc_1', 'atc_2', 'atc_3', 
                        'atc_4', 'category', 'group', 'pathway', 'molecular_weight', 'tpsa', 'clogp'] 
            available_metadata_columns = [col for col in metadata_columns if col in df.columns]
            available_text_columns = [col for col in text_columns if col in df.columns] 

            # Extract metadata, not including empty entries
            metadata = {
                col: row[col] for col in available_metadata_columns
                if col in row and not (isinstance(row[col], float) and math.isnan(row[col]))
            }

            # Extract and concatenate text values, not including empty entries
            text_entries = [str(row[col]) for col in available_text_columns if pd.notna(row[col])]
            text = "\n".join(text_entries) 

            text_node = TextNode(id_= str(id), text=text, metadata=metadata)
            text_nodes.append(text_node)

        else: # no text data, all other files
            metadata = row.to_dict()
            text_node = TextNode(id_= str(id), metadata=metadata)
            text_nodes.append(text_node)

    return text_nodes


In [174]:
# generate all nodes from the node files
all_nodes = []
for file in node_files:
    nodes = create_text_nodes(data_dir+file)
    all_nodes.extend(nodes)


In [217]:
# find index of id_ = 35769
for i, node in enumerate(all_nodes):
    if node.id_ == '53309':
        print(i)

48158


## Edges

In [195]:
# create all edges
from tqdm.notebook import tqdm

def create_all_edges(file_path, all_nodes):
    node_dict = {node.id_: node for node in all_nodes} #speed up search
    df = pd.read_csv(os.path.join(file_path))

    for _, row in tqdm(df.iterrows(), total=len(df), leave=False, desc=f"Processing {file_path}"):
        x_index = str(row['x_index'])

        if x_index in node_dict:
            node = node_dict[x_index]
            if NodeRelationship.CHILD not in node.relationships:
                node.relationships[NodeRelationship.CHILD] = []
            y_index = str(row['y_index'])
            metadata = row.drop(['x_index', 'y_index']).to_dict()
            
            # check that the relationship is not there already
            if not any(isinstance(r, RelatedNodeInfo) and r.node_id == y_index for r in node.relationships[NodeRelationship.CHILD]):
                new_relationship = RelatedNodeInfo(node_id=y_index, metadata=metadata)
                node.relationships[NodeRelationship.CHILD].append(new_relationship)


In [None]:
# all relationships for all nodes, takes around 120 mins
for file in tqdm(edge_files, desc='Procesing edge files'):
    create_all_edges(data_dir+file, all_nodes)

save nodes-relationships to pickle

In [214]:
# save `all_nodes` with pickle
import pickle
with open(os.path.expanduser('~/scratch-llm/storage/all_nodes.pkl'), 'wb') as f:
    pickle.dump(all_nodes, f)

In [5]:
# load from pickle
import pickle
with open(os.path.expanduser('~/scratch-llm/storage/all_nodes.pkl'), 'rb') as f:
    nodes = pickle.load(f)
    

check up that edges are correctly generated:

In [6]:
%ngql USE PrimeKG;

In [7]:
%ngql GET SUBGRAPH WITH PROP 1 STEPS FROM 35444 OUT disease_disease YIELD VERTICES AS nodes, EDGES AS relationships;

Unnamed: 0,nodes,relationships
0,"[(35444 :disease{group_id_bert: __NULL__, group_name_bert: __NULL__, mayo_causes: __NULL__, mayo_complications: __NULL__, mayo_prevention: __NULL__, mayo_risk_factors: __NULL__, mayo_see_doc: __NULL__, mayo_symptoms: __NULL__, mondo_definition: __NULL__, mondo_id: 15572, mondo_name: ""cerebral malformation due to abnormal neuronal migration"", node_name: ""color vision disorder"", node_source: ""MONDO_grouped"", orphanet_clinical_description: __NULL__, orphanet_definition: __NULL__, orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: __NULL__, umls_description: __NULL__})]","[(35444)-[:disease_disease@0{display_relation: ""parent-child""}]->(98588), (35444)-[:disease_disease@0{display_relation: ""parent-child""}]->(36168), (35444)-[:disease_disease@0{display_relation: ""parent-child""}]->(36169), (35444)-[:disease_disease@0{display_relation: ""parent-child""}]->(94302), (35444)-[:disease_disease@0{display_relation: ""parent-child""}]->(36477), (35444)-[:disease_disease@0{display_relation: ""parent-child""}]->(38527), (35444)-[:disease_disease@0{display_relation: ""parent-child""}]->(35968), (35444)-[:disease_disease@0{display_relation: ""parent-child""}]->(36252), (35444)-[:disease_disease@0{display_relation: ""parent-child""}]->(28357), (35444)-[:disease_disease@0{display_relation: ""parent-child""}]->(94421), (35444)-[:disease_disease@0{display_relation: ""parent-child""}]->(35804), (35444)-[:disease_disease@0{display_relation: ""parent-child""}]->(35805), (35444)-[:disease_disease@0{display_relation: ""parent-child""}]->(29151)]"
1,"[(38527 :disease{group_id_bert: __NULL__, group_name_bert: __NULL__, mayo_causes: ""Often the cause of the cardiomyopathy is unknown. In some people, however, it's the result of another condition or passed on from a parent . Certain health conditions or behaviors that can lead to acquired cardiomyopathy include: Long-term high blood pressure, Heart tissue damage from a heart attack, Long-term rapid heart rate, Heart valve problems, COVID-19 infection, Certain infections, especially those that cause inflammation of the heart, Metabolic disorders, such as obesity, thyroid disease or diabetes, Lack of essential vitamins or minerals in your diet, such as thiamin, Pregnancy complications, Iron buildup in your heart muscle, The growth of tiny lumps of inflammatory cells in any part of your body, including your heart and lungs, The buildup of abnormal proteins in the organs, Connective tissue disorders, Drinking too much alcohol over many years, Use of cocaine, amphetamines or anabolic steroids, Use of some chemotherapy drugs and radiation to treat cancer, Types of cardiomyopathy include: Dilated cardiomyopathy. In this type of cardiomyopathy, the pumping ability of your heart's main pumping chamber the left ventricle becomes enlarged and can't effectively pump blood out of the heart. Although this type can affect people of all ages, it occurs most often in middle-aged people and is more likely to affect men. The most common cause is coronary artery disease or heart attack. However, it can also be caused by genetic defects., , Hypertrophic cardiomyopathy. This type involves abnormal thickening of your heart muscle, which makes it harder for the heart to work. It mostly affects the muscle of your heart's main pumping chamber . Hypertrophic cardiomyopathy can develop at any age, but the condition tends to be more severe if it occurs during childhood. Most people with this type of cardiomyopathy have a family history of the disease. Some genetic mutations have been linked to hypertrophic cardiomyopathy., , Restrictive cardiomyopathy. In this type, the heart muscle becomes stiff and less flexible, so it can't expand and fill with blood between heartbeats. This least common type of cardiomyopathy can occur at any age, but it most often affects older people. Restrictive cardiomyopathy can occur for no known reason, or it can by caused by a disease elsewhere in the body that affects the heart, such as amyloidosis., Arrhythmogenic right ventricular dysplasia. In this rare type of cardiomyopathy, the muscle in the lower right heart chamber is replaced by scar tissue, which can lead to heart rhythm problems. It's often caused by genetic mutations. Unclassified cardiomyopathy. Other types of cardiomyopathy fall into this category."", mayo_complications: ""Cardiomyopathy can lead to serious complications, including: Heart failure. Your heart can't pump enough blood to meet your body's needs. Untreated, heart failure can be life-threatening. Blood clots. Because your heart can't pump effectively, blood clots might form in your heart. If clots enter your bloodstream, they can block the blood flow to other organs, including your heart and brain. Heart valve problems. Because cardiomyopathy causes the heart to enlarge, the heart valves might not close properly. This can cause blood to flow backward in the valve. Cardiac arrest and sudden death. Cardiomyopathy can trigger abnormal heart rhythms that cause fainting or, in some cases, sudden death if your heart stops beating effectively."", mayo_prevention: ""In many cases, you can't prevent cardiomyopathy. Let your doctor know if you have a family history of the condition. You can help reduce your risk of cardiomyopathy and other types of heart disease by living a heart-healthy lifestyle and making lifestyle choices such as: Avoiding the use of alcohol or cocaine, Controlling high blood pressure, high cholesterol and diabetes, Eating a healthy diet, Getting regular exercise, Getting enough sleep, Reducing your stress"", mayo_risk_factors: ""There are a number of things that can increase your risk of cardiomyopathy, including: Family history of cardiomyopathy, heart failure and sudden cardiac arrest, Long-term high blood pressure, Conditions that affect the heart, including a past heart attack, coronary artery disease or an infection in the heart, Obesity, which makes the heart work harder, Long-term alcohol misuse, Illicit drug use, such as cocaine, amphetamines and anabolic steroids, Treatment with certain chemotherapy drugs and radiation for cancer, Many diseases also raise your risk of cardiomyopathy, including: Diabetes, Thyroid disease, Storage of excess iron in the body, Amyloidosis, Sarcoidosis, Connective tissue disorders"", mayo_see_doc: ""When to see a doctor, See your doctor if you have one or more signs or symptoms associated with cardiomyopathy. Call 911 or your local emergency number if you have severe difficulty breathing, fainting or chest pain that lasts for more than a few minutes. Some types of cardiomyopathy can be passed down through families . If you have the condition, your doctor might recommend that your family members be checked."", mayo_symptoms: ""There might be no signs or symptoms in the early stages of cardiomyopathy. But as the condition advances, signs and symptoms usually appear, including: Breathlessness with activity or even at rest, Swelling of the legs, ankles and feet, Bloating of the abdomen due to fluid buildup, Cough while lying down, Difficulty lying flat to sleep, Fatigue, Heartbeats that feel rapid, pounding or fluttering, Chest discomfort or pressure, Dizziness, lightheadedness and fainting, Signs and symptoms tend to get worse unless treated. In some people, the condition worsens quickly; in others, it might not worsen for a long time."", mondo_definition: __NULL__, mondo_id: 16337, mondo_name: ""syndrome associated with dilated cardiomyopathy"", node_name: ""colorblindness, partial"", node_source: ""MONDO"", orphanet_clinical_description: __NULL__, orphanet_definition: __NULL__, orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: __NULL__, umls_description: __NULL__}), (94421 :disease{group_id_bert: __NULL__, group_name_bert: __NULL__, mayo_causes: __NULL__, mayo_complications: __NULL__, mayo_prevention: __NULL__, mayo_risk_factors: __NULL__, mayo_see_doc: __NULL__, mayo_symptoms: __NULL__, mondo_definition: __NULL__, mondo_id: __NULL__, mondo_name: __NULL__, node_name: ""cerebral visual impairment"", node_source: ""MONDO"", orphanet_clinical_description: __NULL__, orphanet_definition: __NULL__, orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: __NULL__, umls_description: __NULL__}), (36252 :disease{group_id_bert: __NULL__, group_name_bert: __NULL__, mayo_causes: __NULL__, mayo_complications: __NULL__, mayo_prevention: __NULL__, mayo_risk_factors: __NULL__, mayo_see_doc: __NULL__, mayo_symptoms: __NULL__, mondo_definition: ""A disease involving the thyroid gland."", mondo_id: 3240, mondo_name: ""thyroid gland disease"", node_name: ""binocular vision disease"", node_source: ""MONDO"", orphanet_clinical_description: __NULL__, orphanet_definition: __NULL__, orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: __NULL__, umls_description: ""Enlargement of the thyroid gland.""}), (35804 :disease{group_id_bert: __NULL__, group_name_bert: __NULL__, mayo_causes: __NULL__, mayo_complications: __NULL__, mayo_prevention: __NULL__, mayo_risk_factors: __NULL__, mayo_see_doc: __NULL__, mayo_symptoms: __NULL__, mondo_definition: ""An agnosia that is a loss of the ability to distinguishing environmental and non-verbal auditory cues including difficulty distinguishing speech from non-speech sounds even though hearing is usually normal."", mondo_id: 667, mondo_name: ""auditory agnosia"", node_name: ""visual agnosia (disease)"", node_source: ""MONDO"", orphanet_clinical_description: __NULL__, orphanet_definition: __NULL__, orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: __NULL__, umls_description: __NULL__}), (36169 :disease{group_id_bert: __NULL__, group_name_bert: __NULL__, mayo_causes: __NULL__, mayo_complications: __NULL__, mayo_prevention: __NULL__, mayo_risk_factors: __NULL__, mayo_see_doc: __NULL__, mayo_symptoms: __NULL__, mondo_definition: ""A disease or disorder that involves the ligament."", mondo_id: 45044, mondo_name: ""ligament disease"", node_name: ""blindness (disorder)"", node_source: ""MONDO"", orphanet_clinical_description: __NULL__, orphanet_definition: __NULL__, orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: __NULL__, umls_description: __NULL__}), (35805 :disease{group_id_bert: __NULL__, group_name_bert: __NULL__, mayo_causes: __NULL__, mayo_complications: __NULL__, mayo_prevention: __NULL__, mayo_risk_factors: __NULL__, mayo_see_doc: __NULL__, mayo_symptoms: __NULL__, mondo_definition: ""Any impairment to the vision."", mondo_id: 21084, mondo_name: ""vision disorder"", node_name: ""perceptual disorders"", node_source: ""MONDO"", orphanet_clinical_description: __NULL__, orphanet_definition: __NULL__, orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: __NULL__, umls_description: ""A visual defect characterized by the inability to see as clearly in bright light as in dim light. The word hemeralopia literally means day blindness. ""}), (28357 :disease{group_id_bert: __NULL__, group_name_bert: __NULL__, mayo_causes: __NULL__, mayo_complications: __NULL__, mayo_prevention: __NULL__, mayo_risk_factors: __NULL__, mayo_see_doc: __NULL__, mayo_symptoms: __NULL__, mondo_definition: ""Majeed syndrome is a rare genetic multisystemic disorder characterized by the triad of chronic recurrent multifocal osteomyelitis, congenital dyserythropoietic anemia, and variable transient inflammatory dermatosis."", mondo_id: 12316, mondo_name: ""Majeed syndrome"", node_name: ""achromatopsia"", node_source: ""MONDO_grouped"", orphanet_clinical_description: ""Onset is generally within the first two years of life although it may present later in childhood (range 1 month to 8 years) as bone pain. Chronic recurrent multifocal osteomyelitis (CRMO) associated with Majeed syndrome is typically more severe than that of non-syndromic CRMO, and is more persistent, with short remissions and more frequent exacerbations. It can be associated with fever, joint pain, delayed bone age, growth failure, short adult stature, and development of flexion contractures. Patients also have a hypochromic, microcytic anemia with dyserythropoiesis present on 11 of 11 bone marrow biopsies. In half of the cases, the anemia is mild with the remainder requiring one or more transfusions for anemia. The inflammatory neutrophilic dermatosis Sweet syndrome has been reported in two patients with Majeed syndrome. Other reported manifestations include failure to thrive, hepatomegaly, neutropenia, and transient cholestatic jaundice. The course is chronic and the syndrome may have a significant impact on quality of life."", orphanet_definition: ""Majeed syndrome is a rare genetic multisystemic disorder characterized by chronic recurrent multifocal osteomyelitis, congenital dyserythropoietic anemia, which may be accompanied by neutrophilic dermatosis."", orphanet_epidemiology: ""The syndrome is extremely rare. Fourteen cases born into consanguineous families, from the Middle East, India and Spain have been reported."", orphanet_management_and_treatment: ""Treatment is empiric. Nonsteroidal anti-inflammatory drugs (NSAIDs) are the main treatment options for non-syndromic CRMO, but are not likely to control bone inflammation in Majeed syndrome patients. Corticosteroids may also be used to control CRMO and inflammatory dermatosis, but have a multitude of side effects that limit their use for long term treatment. Methotrexate with or without pamidronate has been utilized in a few cases with mild to moderate improvement reported. TNF inhibitors were used in 2 children without significant benefit. IL-1 beta inhibition looks more promising with resolution of clinical symptoms, normalization of blood inflammatory markers and normalization of radiologic bone lesions in 4 of 4 patients treated."", orphanet_prevalence: ""<1/1000000"", umls_description: ""An autoinflammatory disease caused by mutations in the lpin2 gene. It is characterized by early-onset chronic recurrent multifocal osteomyelitis, congenital dyserythropoietic anemia and inflammatory dermatosis.""}), (36168 :disease{group_id_bert: __NULL__, group_name_bert: __NULL__, mayo_causes: ""Doctors don't know what causes endometrial cancer. What's known is that something occurs to create changes in the DNA of cells in the endometrium the lining of the uterus. The mutation turns normal, healthy cells into abnormal cells. Healthy cells grow and multiply at a set rate, eventually dying at a set time. Abnormal cells grow and multiply out of control, and they don't die at a set time. The accumulating abnormal cells form a mass . Cancer cells invade nearby tissues and can separate from an initial tumor to spread elsewhere in the body ."", mayo_complications: ""To reduce your risk of endometrial cancer, you may wish to: Talk to your doctor about the risks of hormone therapy after menopause. If you're considering hormone replacement therapy to help control menopause symptoms, talk to your doctor about the risks and benefits. Unless you've undergone a hysterectomy, replacing estrogen alone after menopause may increase your risk of endometrial cancer. Taking a combination of estrogen and progestin can reduce this risk. Hormone therapy carries other risks, so weigh the benefits and risks with your doctor. Consider taking birth control pills. Using oral contraceptives for at least one year may reduce endometrial cancer risk. The risk reduction is thought to last for several years after you stop taking oral contraceptives. Oral contraceptives have side effects, though, so discuss the benefits and risks with your doctor. Maintain a healthy weight. Obesity increases the risk of endometrial cancer, so work to achieve and maintain a healthy weight. If you need to lose weight, increase your physical activity and reduce the number of calories you eat each day."", mayo_prevention: __NULL__, mayo_risk_factors: ""Factors that increase the risk of endometrial cancer include: Changes in the balance of female hormones in the body. The ovaries make two main female hormones estrogen and progesterone. Fluctuations in the balance of these hormones cause changes in the endometrium. A disease or condition that increases the amount of estrogen, but not the level of progesterone, in your body can increase your risk of endometrial cancer. Examples include irregular ovulation patterns, which might happen in polycystic ovary syndrome, obesity and diabetes. Taking hormones after menopause that contain estrogen but not progesterone increases the risk of endometrial cancer. A rare type of ovarian tumor that secretes estrogen also can increase the risk of endometrial cancer., More years of menstruation. Starting menstruation at an early age before age 12 or beginning menopause later increases the risk of endometrial cancer. The more periods you've had, the more exposure your endometrium has had to estrogen. Never having been pregnant. If you've never been pregnant, you have a higher risk of endometrial cancer than someone who has had at least one pregnancy. Older age. As you get older, your risk of endometrial cancer increases. Endometrial cancer occurs most often after menopause. Obesity. Being obese increases your risk of endometrial cancer. This may occur because excess body fat alters your body's balance of hormones. Hormone therapy for breast cancer. Taking the hormone therapy drug tamoxifen for breast cancer can increase the risk of developing endometrial cancer. If you're taking tamoxifen, discuss this risk with your doctor. For most, the benefits of tamoxifen outweigh the small risk of endometrial cancer. An inherited colon cancer syndrome. Lynch syndrome, also called hereditary nonpolyposis colorectal cancer, is a syndrome that increases the risk of colon cancer and other cancers, including endometrial cancer. Lynch syndrome is caused by a gene mutation passed from parents to children. If a family member has been diagnosed with Lynch syndrome, discuss your risk of the genetic syndrome with your doctor. If you've been diagnosed with Lynch syndrome, ask your doctor what cancer screening tests you should undergo."", mayo_see_doc: ""When to see a doctor, Make an appointment with your doctor if you experience any persistent signs or symptoms that worry you."", mayo_symptoms: ""Signs and symptoms of endometrial cancer may include: Vaginal bleeding after menopause, Bleeding between periods, Pelvic pain"", mondo_definition: ""Primary or metastatic malignant neoplasm involving the uterine corpus and/or the cervix."", mondo_id: 2715, mondo_name: ""uterine cancer"", node_name: ""visual cortex disease"", node_source: ""MONDO_grouped"", orphanet_clinical_description: __NULL__, orphanet_definition: __NULL__, orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: __NULL__, umls_description: __NULL__}), (35968 :disease{group_id_bert: __NULL__, group_name_bert: __NULL__, mayo_causes: __NULL__, mayo_complications: __NULL__, mayo_prevention: __NULL__, mayo_risk_factors: __NULL__, mayo_see_doc: __NULL__, mayo_symptoms: __NULL__, mondo_definition: ""A group of rare inherited disorders characterized by a deficiency of enzymes that are involved in metabolic pathways that affect muscles. The disorders are characterized by muscle dysfunction."", mondo_id: 20123, mondo_name: ""metabolic myopathy"", node_name: ""amblyopia (disease)"", node_source: ""MONDO"", orphanet_clinical_description: __NULL__, orphanet_definition: __NULL__, orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: __NULL__, umls_description: ""A group of rare inherited disorders characterized by a deficiency of enzymes that are involved in metabolic pathways that affect muscles. The disorders are characterized by muscle dysfunction.""}), (94302 :disease{group_id_bert: __NULL__, group_name_bert: __NULL__, mayo_causes: __NULL__, mayo_complications: __NULL__, mayo_prevention: __NULL__, mayo_risk_factors: __NULL__, mayo_see_doc: __NULL__, mayo_symptoms: __NULL__, mondo_definition: __NULL__, mondo_id: __NULL__, mondo_name: __NULL__, node_name: ""acquired color blindness"", node_source: ""MONDO"", orphanet_clinical_description: __NULL__, orphanet_definition: __NULL__, orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: __NULL__, umls_description: __NULL__}), (98588 :disease{group_id_bert: __NULL__, group_name_bert: __NULL__, mayo_causes: __NULL__, mayo_complications: __NULL__, mayo_prevention: __NULL__, mayo_risk_factors: __NULL__, mayo_see_doc: __NULL__, mayo_symptoms: __NULL__, mondo_definition: __NULL__, mondo_id: 18763, mondo_name: ""Tubulinopathy-associated dysgyria"", node_name: ""Alice in wonderland syndrome"", node_source: ""MONDO"", orphanet_clinical_description: __NULL__, orphanet_definition: ""A rare genetic central nervous system malformation characterized by dysplasia of the superior cerebellum (especially the vermis), brainstem asymmetry, dysplasia of the basal ganglia, and cortical irregularities with asymmetric abnormalities in gyral size and orientation, as well as varying sulcal depth, but without lissencephaly, pachygyria, or polymicrogyria. Clinically, patients present global developmental delay with motor development usually being more affected that speech. Variable features are abnormal eye movements including oculomotor apraxia, strabismus, seizures, and behavioral problems."", orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: ""<1/1000000"", umls_description: __NULL__}), (29151 :disease{group_id_bert: __NULL__, group_name_bert: __NULL__, mayo_causes: ""As you age, the bones and cartilage that make up your backbone and neck gradually develop wear and tear. These changes can include: Dehydrated disks. Disks act like cushions between the vertebrae of your spine. By the age of 40, most people's spinal disks begin drying out and shrinking, which allows more bone-on-bone contact between the vertebrae. Herniated disks. Age also affects the exterior of your spinal disks. Cracks often appear, leading to bulging disks which sometimes can press on the spinal cord and nerve roots. Bone spurs. Disk degeneration often results in the spine producing extra amounts of bone in a misguided effort to strengthen the spine. These bone spurs can sometimes pinch the spinal cord and nerve roots. Stiff ligaments. Ligaments are cords of tissue that connect bone to bone. Spinal ligaments can stiffen with age, making your neck less flexible."", mayo_complications: ""If your spinal cord or nerve roots become severely compressed as a result of cervical spondylosis, the damage can be permanent."", mayo_prevention: __NULL__, mayo_risk_factors: ""Risk factors for cervical spondylosis include: Age. Cervical spondylosis is a normal part of aging. Occupation. Jobs that involve repetitive neck motions, awkward positioning or a lot of overhead work put extra stress on your neck. Neck injuries. Previous neck injuries appear to increase the risk of cervical spondylosis. Genetic factors. Some individuals in certain families will experience more of these changes over time, while others will not. Smoking. Smoking has been linked to increased neck pain."", mayo_see_doc: ""When to see a doctor, Seek medical attention if you notice a sudden onset of numbness or weakness, or loss of bladder or bowel control."", mayo_symptoms: ""For most people, cervical spondylosis causes no symptoms. When symptoms do occur, they typically include pain and stiffness in the neck. Sometimes, cervical spondylosis results in a narrowing of the space needed by the spinal cord and the nerve roots that pass through the spine to the rest of your body. If the spinal cord or nerve roots become pinched, you might experience: Tingling, numbness and weakness in your arms, hands, legs or feet, Lack of coordination and difficulty walking, Loss of bladder or bowel control"", mondo_definition: __NULL__, mondo_id: 8481, mondo_name: ""spondylosis, cervical"", node_name: ""red color blindness"", node_source: ""MONDO_grouped"", orphanet_clinical_description: __NULL__, orphanet_definition: __NULL__, orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: __NULL__, umls_description: ""Arthritis of the cervical spine.""}), (36477 :disease{group_id_bert: __NULL__, group_name_bert: __NULL__, mayo_causes: __NULL__, mayo_complications: __NULL__, mayo_prevention: __NULL__, mayo_risk_factors: __NULL__, mayo_see_doc: __NULL__, mayo_symptoms: __NULL__, mondo_definition: ""A disease or disorder that involves the mouth mucosa."", mondo_id: 44992, mondo_name: ""mouth mucosa disease"", node_name: ""disease of visual system"", node_source: ""MONDO"", orphanet_clinical_description: __NULL__, orphanet_definition: __NULL__, orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: __NULL__, umls_description: __NULL__})]","[(38527)-[:disease_disease@0{display_relation: ""parent-child""}]->(35444), (94421)-[:disease_disease@0{display_relation: ""parent-child""}]->(35444), (36252)-[:disease_disease@0{display_relation: ""parent-child""}]->(35444), (35804)-[:disease_disease@0{display_relation: ""parent-child""}]->(35444), (36169)-[:disease_disease@0{display_relation: ""parent-child""}]->(35444), (35805)-[:disease_disease@0{display_relation: ""parent-child""}]->(35444), (28357)-[:disease_disease@0{display_relation: ""parent-child""}]->(35444), (36168)-[:disease_disease@0{display_relation: ""parent-child""}]->(35444), (35968)-[:disease_disease@0{display_relation: ""parent-child""}]->(35444), (94302)-[:disease_disease@0{display_relation: ""parent-child""}]->(35444), (98588)-[:disease_disease@0{display_relation: ""parent-child""}]->(35444), (29151)-[:disease_disease@0{display_relation: ""parent-child""}]->(35444), (36477)-[:disease_disease@0{display_relation: ""parent-child""}]->(35444)]"


In [230]:
#look for the index from the node that has the node_id 95635
for i, node in enumerate(nodes):
    if node.node_id == '35443':
        print(i)
        break


118857


In [15]:
nodes[118856].relationships

{<NodeRelationship.CHILD: '5'>: [RelatedNodeInfo(node_id='38164', node_type=None, metadata={'relation': 'disease_disease', 'display_relation': 'parent-child'}, hash=None),
  RelatedNodeInfo(node_id='83541', node_type=None, metadata={'relation': 'disease_disease', 'display_relation': 'parent-child'}, hash=None),
  RelatedNodeInfo(node_id='27628', node_type=None, metadata={'relation': 'disease_disease', 'display_relation': 'parent-child'}, hash=None),
  RelatedNodeInfo(node_id='38203', node_type=None, metadata={'relation': 'disease_disease', 'display_relation': 'parent-child'}, hash=None),
  RelatedNodeInfo(node_id='35762', node_type=None, metadata={'relation': 'disease_disease', 'display_relation': 'parent-child'}, hash=None),
  RelatedNodeInfo(node_id='36523', node_type=None, metadata={'relation': 'disease_disease', 'display_relation': 'parent-child'}, hash=None)]}

In [None]:
start_node = nodes[118856] # 118856 small subgraph 118857 largest subgraph disease
related_nodes_info = start_node.relationships.get(NodeRelationship.CHILD, [])

related_nodes = []
edges = []
for relation in related_nodes_info:
    # Find the related node by its node_id
    related_node = next((node for node in nodes if node.node_id == relation.node_id), None)
    if related_node:
        related_nodes.append(related_node)
        edges.append((start_node.node_id, related_node.node_id, relation.metadata))

subgraph_nodes = [start_node] + related_nodes  # Include the starting node and its neighbors


In [11]:
# save `all_nodes` with pickle
import pickle
with open(os.path.expanduser('~/scratch-llm/storage/subgraph_large.pkl'), 'wb') as f:
    pickle.dump(subgraph_nodes, f)

## Generate index

In [38]:
%ngql CREATE SPACE IF NOT EXISTS PrimeKG_manual(vid_type=FIXED_STRING(256), partition_num=20, replica_factor=1);

In [None]:
%ngql drop space PrimeKG_manual;

In [39]:
# PropertyGraphIndex requires NebulaPropertyGraphStore
graph_store = NebulaPropertyGraphStore(
    space= "PrimeKG_manual", 
    username = "root",
    password = "nebula",
    url = "nebula://localhost:9669", #changing id to STRING
    props_schema= "`node_index` STRING, `id` STRING,`node_id` STRING, `node_name` STRING, `node_source` STRING, `mondo_id` STRING, `mondo_name` STRING, `group_id_bert` STRING, `group_name_bert` STRING, `mondo_definition` STRING, `umls_description` STRING, `orphanet_definition` STRING, `orphanet_prevalence` STRING, `orphanet_epidemiology` STRING, `orphanet_clinical_description` STRING, `orphanet_management_and_treatment` STRING, `mayo_symptoms` STRING, `mayo_causes` STRING, `mayo_risk_factors` STRING, `mayo_complications` STRING, `mayo_prevention` STRING, `mayo_see_doc` STRING, `display_relation` STRING, `description` STRING, `half_life` STRING, `indication` STRING, `mechanism_of_action` STRING, `protein_binding` STRING, `pharmacodynamics` STRING, `state` STRING, `atc_1` STRING, `atc_2` STRING, `atc_3` STRING, `atc_4` STRING, `category` STRING, `group` STRING, `pathway` STRING, `molecular_weight` STRING, `tpsa` STRING, `clogp` STRING, `_node_content` STRING,`_node_type` STRING,`document_id` STRING,`doc_id` STRING,`ref_doc_id` STRING, `triplet_source_id` STRING",
    overwrite=True
)

storage_context = StorageContext.from_defaults(property_graph_store=graph_store)

convert TextNode into IndexNode to insert the nodes into the index:

In [29]:
from llama_index.core.vector_stores.simple import SimpleVectorStore
from llama_index.core import PropertyGraphIndex, Settings
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import torch

vec_store = SimpleVectorStore()

In [19]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [20]:
# Create IndexNode objects from TextNode objects
index_nodes = [IndexNode.from_text_node(node, index_id="index_1") for node in subgraph_nodes]
len(index_nodes)


7

In [21]:
# this was intended to be for the bug but not really doing anything
from transformers import AutoTokenizer

# Load tokenizer and set pad_token_id explicitly
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token  # Set pad_token to eos_token if not defined

In [22]:
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.8, "do_sample": True},
    # system_prompt=system_prompt,
    # query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="meta-llama/Llama-3.2-3B",
    model_name="meta-llama/Llama-3.2-3B",
    device_map="auto",
    stopping_ids=[50278, 50279, 50277, 1, 0],
    tokenizer_kwargs={"max_length": 4096, "pad_token_id": tokenizer.pad_token_id},
    # uncomment this if using CUDA to reduce memory usage
    model_kwargs={"torch_dtype": torch.float16}
)

Settings.llm = llm
Settings.chunk_size = 1024
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-m3") 

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [26]:
index = PropertyGraphIndex.from_existing(
    llm=llm,
    vector_store=vec_store,
    property_graph_store=graph_store,
    show_progress=True,
    storage_context=storage_context,
    embed_kg_nodes=True
)

In [49]:
import nest_asyncio
nest_asyncio.apply()

# Ensure that node IDs are strings fitting the vertex ID length limit
for node in index_nodes:
	node.id_ = str(node.id_)

index.insert_nodes(index_nodes)

Extracting paths from text:   0%|          | 0/7 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Extracting paths from text: 100%|██████████| 7/7 [00:30<00:00,  4.37s/it]
Extracting implicit paths: 100%|██████████| 7/7 [00:00<00:00, 3190.62it/s]


Exception: ('NebulaGraph query failed:', "SemanticError: `Node__': Unknown tag", 'Statement:', 'MATCH (e:Node__) WHERE id(e) in $all_id          RETURN id(e) AS name,                e.Node__.label AS type,                properties(e.Props__) AS properties,                properties(e) AS all_props         ', 'Params:', {'all_id': ['Progressive neurodegenerative disorder', 'Alice', 'Philz', 'Neurodegenerative disorder', 'Characterized by generalized hypotonia at birth', '1197', 'Bob', '1982', 'Anterior horn cell', 'Progressive', 'A rare syndrome including neonatal and infantile hypotonia and failure to thrive', 'Coffee shop', 'Qualitative platelet defect', 'Berkeley', 'Anomaly', 'Hypotonia-cystinuria syndrome', 'Motor neuron disease', '83541', 'Muscle weakness', 'Disease', 'Disorder']})

save index to persist storage

In [32]:
index.storage_context.persist(persist_dir="~/scratch-llm/storage/PrimeKG_index_mid/")
