In [None]:
import os
import re
import time

import pandas as pd
from nebulagraph_lite import nebulagraph_let as ng_let

# Data loading


In [None]:
# Read the node and edge tables; both files sit in the same kg directory.
# Earlier input, kept for reference:
#primekg = pd.read_csv("~/scratch-llm/data/PrimeKG_data/raw_data/kg.csv", low_memory=False)
KG_DIR = "~/scratch-llm/data/PrimeKG/datasets/data/kg"
nodes = pd.read_csv(KG_DIR + "/nodes.csv", low_memory=False)
edges = pd.read_csv(KG_DIR + "/edges.csv")

In [None]:
# Create the nebulagraph_lite helper object and call its start() method.
# Presumably this brings up the local NebulaGraph services that the ngql
# extension connects to in the next cell — TODO confirm.
n = ng_let()
n.start() # Slow step: this takes around 5 minutes

# If start() fails, the three service containers can be prepared manually with udocker:
# udocker ps
# udocker pull vesoft/nebula-metad:v3
# udocker create --name=nebula-metad vesoft/nebula-metad:v3
# udocker setup --execmode=F1 nebula-metad

# udocker pull vesoft/nebula-graphd:v3
# udocker create --name=nebula-graphd vesoft/nebula-graphd:v3
# udocker setup --execmode=F1 nebula-graphd

# udocker pull vesoft/nebula-storaged:v3
# udocker create --name=nebula-storaged vesoft/nebula-storaged:v3
# udocker setup --execmode=F1 nebula-storaged

In [None]:
# Load the ngql IPython extension, then point it at the graph service on 127.0.0.1:9669.
# NOTE(review): user and password are hard-coded; they look like local default
# credentials — confirm, and never commit real credentials in a notebook.
%load_ext ngql
%ngql --address 127.0.0.1 --port 9669 --user root --password nebula

In [None]:
# Create the graph space for PrimeKG (vertex IDs are INT64) unless it already exists,
# then select it for all following statements.
%ngql CREATE SPACE IF NOT EXISTS PrimeKG (vid_type = INT64);
# Pause before USE, presumably so the newly created space has time to become available.
# NOTE(review): confirm that 10 seconds is enough on a fresh deployment.
%ngql :sleep 10;
%ngql USE PrimeKG;

# Nodes

In [None]:
# Show the distinct values of the 'node_type' column (the types of nodes).
node_type_column = nodes['node_type']
print(node_type_column.unique())

In [None]:
# Write one CSV per node type (node_<type>.csv) so each file can be loaded
# into its own NebulaGraph tag by the loader cell further down.
sub_data_dir = os.path.expanduser('~/scratch-llm/data/PrimeKG_data/sub_data/')
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(sub_data_dir, exist_ok=True)

# groupby splits the table in a single pass (instead of one full boolean scan
# per type) and skips rows whose node_type is missing; sort=False keeps the
# types in order of first appearance.
for node_type, type_rows in nodes.groupby('node_type', sort=False):
    # Keep only [0-9A-Za-z_] so the file name stays a plain identifier:
    # 'gene/protein' becomes 'gene_protein', as before.
    sanitized_node_type = re.sub(r'[^0-9A-Za-z_]', '_', str(node_type))
    output_path = os.path.join(sub_data_dir, 'node_' + sanitized_node_type + '.csv')
    type_rows.to_csv(output_path, index=False)

In [None]:
# Create Tags (node_type) for each node in the PrimeKG
%ngql CREATE TAG IF NOT EXISTS anatomy(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS gene_protein(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS drug(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS disease(node_name string, node_source string);
%ngql CREATE TAG IF NOT EXISTS pathway(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS biological_process(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS effect_phenotype(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS molecular_function(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS cellular_component(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS exposure(node_name string, node_source string, node_id string);

In [None]:
# Load every per-type node CSV into the tag of the same name (reported as working fine).
# Column 0 of each file is used as the vertex ID; --header skips the header row.
# NOTE(review): node_id is read from column 2. Confirm this against the CSV
# header — in the published PrimeKG node table that position is presumably
# node_type, with node_id at column 1.
NODE_TAGS_WITH_ID = (
    "gene_protein",
    "biological_process",
    "effect_phenotype",
    "molecular_function",
    "cellular_component",
    "pathway",
    "exposure",
    "drug",
    "anatomy",
)
for node_tag in NODE_TAGS_WITH_ID:
    # Same call IPython makes for a literal `%ng_load ...` line.
    get_ipython().run_line_magic(
        "ng_load",
        f"--source ~/scratch-llm/data/PrimeKG_data/sub_data/node_{node_tag}.csv --tag {node_tag} --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source",
    )

# The disease tag has no node_id property, so only name and source are mapped.
get_ipython().run_line_magic(
    "ng_load",
    "--source ~/scratch-llm/data/PrimeKG_data/sub_data/node_disease.csv --tag disease --header --space PrimeKG --vid 0 --props 3:node_name,4:node_source",
)


# Edges

In [None]:
# Show the distinct values of the 'relation' column (the types of edges).
relation_column = edges['relation']
print(relation_column.unique())

In [None]:
# Write one CSV per relation (edge_<relation>.csv) so each file can be loaded
# into its own NebulaGraph edge type by the loader cells further down.
sub_data_dir = os.path.expanduser('~/scratch-llm/data/PrimeKG_data/sub_data/')
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(sub_data_dir, exist_ok=True)

# groupby splits the table in a single pass (instead of one full boolean scan
# per relation) and skips rows whose relation is missing; sort=False keeps the
# relations in order of first appearance.
for relation, relation_rows in edges.groupby('relation', sort=False):
    # The loader expects identifier-style file names such as
    # edge_off_label_use.csv. Replacing only '/' would leave hyphens or spaces
    # in the name (e.g. a relation spelled "off-label use"), so map every
    # character outside [0-9A-Za-z_] to '_'.
    sanitized_relation = re.sub(r'[^0-9A-Za-z_]', '_', str(relation))
    output_path = os.path.join(sub_data_dir, 'edge_' + sanitized_relation + '.csv')
    relation_rows.to_csv(output_path, index=False)

In [None]:
# Create one edge type per PrimeKG relation. Every edge type has the same
# schema: a single string property, display_relation.
EDGE_TYPES = (
    "protein_protein",
    "drug_protein",
    "contraindication",
    "indication",
    "off_label_use",
    "drug_drug",
    "phenotype_protein",
    "phenotype_phenotype",
    "disease_phenotype_negative",
    "disease_phenotype_positive",
    "disease_protein",
    "disease_disease",
    "drug_effect",
    "bioprocess_bioprocess",
    "molfunc_molfunc",
    "cellcomp_cellcomp",
    "molfunc_protein",
    "cellcomp_protein",
    "bioprocess_protein",
    "exposure_protein",
    "exposure_disease",
    "exposure_exposure",
    "exposure_bioprocess",
    "exposure_molfunc",
    "exposure_cellcomp",
    "pathway_pathway",
    "pathway_protein",
    "anatomy_anatomy",
    "anatomy_protein_present",
    "anatomy_protein_absent",
)
for edge_type in EDGE_TYPES:
    # Same call IPython makes for a literal `%ngql CREATE EDGE ...` line.
    get_ipython().run_line_magic(
        "ngql",
        f"CREATE EDGE IF NOT EXISTS {edge_type}(display_relation string);",
    )

# Schema creation is asynchronous in NebulaGraph: a new edge type is only
# usable after about two heartbeat cycles (20 s with default settings).
# Waiting here keeps "Restart & Run All" from inserting edges before the
# edge types are visible.
time.sleep(20)

In [None]:
# First batch of edge loads: one per-relation CSV into the edge type of the same name.
# Columns 2 and 3 give the source and destination vertex IDs; column 1 is
# stored as display_relation. Loading into a fresh space takes roughly 20 minutes.
FIRST_BATCH_EDGES = (
    "protein_protein",
    "drug_protein",
    "contraindication",
    "indication",
    "off_label_use",
    "drug_drug",
    "phenotype_protein",
    "phenotype_phenotype",
    "disease_phenotype_negative",
    "disease_phenotype_positive",
    "disease_protein",
    "disease_disease",
    "drug_effect",
    "bioprocess_bioprocess",
    "molfunc_molfunc",
)
for edge_name in FIRST_BATCH_EDGES:
    # Same call IPython makes for a literal `%ng_load ...` line.
    get_ipython().run_line_magic(
        "ng_load",
        f"--source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_{edge_name}.csv --space PrimeKG --header --edge {edge_name} --src 2 --dst 3 --props 1:display_relation",
    )


In [None]:
# Second batch of edge loads (the edge data is loaded in two batches).
# Same column mapping as the first batch: src = column 2, dst = column 3,
# display_relation = column 1.
SECOND_BATCH_EDGES = (
    "cellcomp_cellcomp",
    "molfunc_protein",
    "cellcomp_protein",
    "bioprocess_protein",
    "exposure_protein",
    "exposure_disease",
    "exposure_exposure",
    "exposure_bioprocess",
    "exposure_molfunc",
    "exposure_cellcomp",
    "pathway_pathway",
    "pathway_protein",
    "anatomy_anatomy",
    "anatomy_protein_present",
    "anatomy_protein_absent",
)
# Extra loader options for individual edge types; anatomy_protein_present is
# the only one loaded with an explicit batch size.
EXTRA_LOAD_OPTIONS = {"anatomy_protein_present": " --batch 500"}
for edge_name in SECOND_BATCH_EDGES:
    extra_options = EXTRA_LOAD_OPTIONS.get(edge_name, "")
    # Same call IPython makes for a literal `%ng_load ...` line.
    get_ipython().run_line_magic(
        "ng_load",
        f"--source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_{edge_name}.csv --space PrimeKG{extra_options} --header --edge {edge_name} --src 2 --dst 3 --props 1:display_relation",
    )
