In [1]:
import pandas as pd
from nebulagraph_lite import nebulagraph_let as ng_let
import os

# Data loading


In [None]:
#primekg = pd.read_csv("~/scratch-llm/data/PrimeKG_data/raw_data/kg.csv", low_memory=False)
nodes = pd.read_csv("~/scratch-llm/data/PrimeKG/datasets/data/kg/nodes.csv", low_memory=False)
edges = pd.read_csv("~/scratch-llm/data/PrimeKG/datasets/data/kg/edges.csv")

In [None]:
# load NebulaGraph JupyterNotebook extension
# !udocker pull vesoft/nebula-metad:v3
# !udocker create --name=nebula-metad vesoft/nebula-metad:v3
# !udocker setup --execmode=F1 nebula-metad

# !udocker pull vesoft/nebula-graphd:v3
# !udocker create --name=nebula-graphd vesoft/nebula-graphd:v3
# !udocker setup --execmode=F1 nebula-graphd

# !udocker pull vesoft/nebula-storaged:v
# !udocker create --name=nebula-storaged vesoft/nebula-storaged:v3
# !udocker setup --execmode=F1 nebula-storaged

n = ng_let()
n.start() # This takes around 5 mins

In [None]:
%load_ext ngql

In [157]:
%reload_ext ngql

In [158]:
%ngql --address 127.0.0.1 --port 9669 --user root --password nebula

[1;3;38;2;0;135;107m[OK] Connection Pool Created[0m


Unnamed: 0,Name
0,PrimeKG
1,PrimeKG_manual
2,basketballplayer


In [None]:
# Create a new Graph space (graph) for the PrimeKG
%ngql CREATE SPACE IF NOT EXISTS PrimeKG (vid_type = INT64);
%ngql USE PrimeKG;

# Create the graph from NebulaGraph directly

## Nodes

In [None]:
print(nodes['node_type'].unique()) #types of nodes

In [None]:
# save the data for each 'node_type' in a separate file
for node_type in nodes['node_type'].unique():
    sanitized_node_type = node_type.replace('/', '_')
    output_path = os.path.join('~/scratch-llm/data/PrimeKG_data/sub_data/', 'node_'+ sanitized_node_type + '.csv')
    nodes[nodes['node_type'] == node_type].to_csv(output_path, index=False)

In [None]:
# Create Tags (node_type) for each node in the PrimeKG
%ngql CREATE TAG IF NOT EXISTS anatomy(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS gene_protein(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS drug(node_name string, node_source string, node_id string, description string, half_life string, indication string, mechanism_of_action string, protein_binding string, pharmacodynamics string, state string, atc_1 string, atc_2 string, atc_3 string, atc_4 string, category string, group string, pathway string, molecular_weight string, tpsa string, clogp string);
%ngql CREATE TAG IF NOT EXISTS disease(node_name string, node_source string, mondo_id int, mondo_name string, group_id_bert string, group_name_bert string, mondo_definition string, umls_description string, orphanet_definition string, orphanet_prevalence string, orphanet_epidemiology string, orphanet_clinical_description string, orphanet_management_and_treatment string, mayo_symptoms string, mayo_causes string, mayo_risk_factors string, mayo_complications string, mayo_prevention string, mayo_see_doc string);
%ngql CREATE TAG IF NOT EXISTS pathway(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS biological_process(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS effect_phenotype(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS molecular_function(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS cellular_component(node_name string, node_source string, node_id string);
%ngql CREATE TAG IF NOT EXISTS exposure(node_name string, node_source string, node_id string);

In [None]:
%ngql DESCRIBE TAG drug;

In [None]:
# Load each node source data into the corresponding Tag (working fine)
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_gene_protein.csv --tag gene_protein --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_biological_process.csv --tag biological_process --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_effect_phenotype.csv --tag effect_phenotype --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_molecular_function.csv --tag molecular_function --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_cellular_component.csv --tag cellular_component --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_pathway.csv --tag pathway --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_exposure.csv --tag exposure --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_anatomy.csv --tag anatomy --header --space PrimeKG --vid 0 --props 2:node_id,3:node_name,4:node_source


### Extra information for nodes 'drug' and 'disease'

In [None]:
# disease and drug feature files are available as .tab format so they need to be converted to .csv
# many formatting issues on both files that had to be correcteed manually before merging with the 'node_' dataset
disease_tab = "~/scratch-llm/data/PrimeKG_data/raw_data/disease_features.tab"
drug_tab = "~/scratch-llm/data/PrimeKG_data/raw_data/drug_features.tab"

df = pd.read_csv(disease_tab, delimiter='\t')
disease_csv = "~/scratch-llm/data/PrimeKG_data/sub_data/disease_features.csv"
df.to_csv(disease_csv, index=False)


df = pd.read_csv(drug_tab, delimiter='\t')
drug_csv = "~/scratch-llm/data/PrimeKG_data/sub_data/drug_features.csv"
df.to_csv(drug_csv, index=False) 

In [None]:
# merge extra drug and disease information 
node_drug = pd.read_csv("~/scratch-llm/data/PrimeKG_data/sub_data/node_drug.csv")
features_drug = pd.read_csv("~/scratch-llm/data/PrimeKG_data/sub_data/drug_features.csv")
node_disease = pd.read_csv("~/scratch-llm/data/PrimeKG_data/sub_data/node_disease.csv")
features_disease = pd.read_csv("~/scratch-llm/data/PrimeKG_data/sub_data/disease_features.csv")

# Merge the dataframes on 'node_index' and save to csv
merged_df = pd.merge(node_drug, features_drug, on='node_index')
merged_df.to_csv("~/scratch-llm/data/PrimeKG_data/sub_data/node_merged_drug.csv", index=False)
merged_df = pd.merge(node_disease, features_disease, on='node_index')
merged_df.to_csv("~/scratch-llm/data/PrimeKG_data/sub_data/node_merged_disease.csv", index=False)


In [None]:
#Load the extended drug and disease data into the corresponding Tags
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/merged_disease.csv --tag disease --header --space PrimeKG --vid 0 --props 3:node_name,4:node_source,5:mondo_id,6:mondo_name,7:group_id_bert,8:group_name_bert,9:mondo_definition,10:umls_description,11:orphanet_definition,12:orphanet_prevalence,13:orphanet_epidemiology,14:orphanet_clinical_description,15:orphanet_management_and_treatment,16:mayo_symptoms,17:mayo_causes,18:mayo_risk_factors,19:mayo_complications,20:mayo_prevention,21:mayo_see_doc
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/node_merged_drug.csv --tag drug --header --space PrimeKG --batch 100 --vid 0 --props 2:node_id,3:node_name,4:node_source,5:description,6:half_life,7:indication,8:mechanism_of_action,9:protein_binding,10:pharmacodynamics,11:state,12:atc_1,13:atc_2,14:atc_3,15:atc_4,16:category,17:group,18:pathway,19:molecular_weight,20:tpsa,21:clogp


## Edges

In [None]:
print(edges['relation'].unique()) # types of edges

In [None]:
# save the data for each 'relation' (edge type) in a separate file
for relation in edges['relation'].unique():
    sanitized_relation = relation.replace('-', '_')
    sanitized_relation = sanitized_relation.replace('\t', '_')
    output_path = os.path.join('~/scratch-llm/data/PrimeKG_data/sub_data/', 'edge_'+ sanitized_relation + '.csv')
    edges[edges['relation'] == relation].to_csv(output_path, index=False)

In [None]:
%ngql CREATE EDGE IF NOT EXISTS protein_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS drug_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS contraindication(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS indication(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS off_label_use(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS drug_drug(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS phenotype_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS phenotype_phenotype(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS disease_phenotype_negative(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS disease_phenotype_positive(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS disease_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS disease_disease(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS drug_effect(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS bioprocess_bioprocess(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS molfunc_molfunc(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS cellcomp_cellcomp(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS molfunc_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS cellcomp_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS bioprocess_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_disease(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_exposure(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_bioprocess(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_molfunc(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS exposure_cellcomp(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS pathway_pathway(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS pathway_protein(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS anatomy_anatomy(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS anatomy_protein_present(display_relation string);
%ngql CREATE EDGE IF NOT EXISTS anatomy_protein_absent(display_relation string);

In [None]:
# for each edge type, load the data accordingly, this takes like 20mins from new
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_protein_protein.csv --space PrimeKG --header --edge protein_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_drug_protein.csv --space PrimeKG --header --edge drug_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_contraindication.csv --space PrimeKG --header --edge contraindication --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_indication.csv --space PrimeKG --header --edge indication --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_off_label_use.csv --space PrimeKG --header --edge off_label_use --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_drug_drug.csv --space PrimeKG --header --edge drug_drug --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_phenotype_protein.csv --space PrimeKG --header --edge phenotype_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_phenotype_phenotype.csv --space PrimeKG --header --edge phenotype_phenotype --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_phenotype_negative.csv --space PrimeKG --header --edge disease_phenotype_negative --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_phenotype_positive.csv --space PrimeKG --header --edge disease_phenotype_positive --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_protein.csv --space PrimeKG --header --edge disease_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_disease_disease.csv --space PrimeKG --header --edge disease_disease --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_drug_effect.csv --space PrimeKG --header --edge drug_effect --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_bioprocess_bioprocess.csv --space PrimeKG --header --edge bioprocess_bioprocess --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_molfunc_molfunc.csv --space PrimeKG --header --edge molfunc_molfunc --src 2 --dst 3 --props 1:display_relation


In [None]:
# load the edge data in two batches
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_cellcomp_cellcomp.csv --space PrimeKG --header --edge cellcomp_cellcomp --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_molfunc_protein.csv --space PrimeKG --header --edge molfunc_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_cellcomp_protein.csv --space PrimeKG --header --edge cellcomp_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_bioprocess_protein.csv --space PrimeKG --header --edge bioprocess_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_protein.csv --space PrimeKG --header --edge exposure_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_disease.csv --space PrimeKG --header --edge exposure_disease --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_exposure.csv --space PrimeKG --header --edge exposure_exposure --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_bioprocess.csv --space PrimeKG --header --edge exposure_bioprocess --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_molfunc.csv --space PrimeKG --header --edge exposure_molfunc --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_exposure_cellcomp.csv --space PrimeKG --header --edge exposure_cellcomp --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_pathway_pathway.csv --space PrimeKG --header --edge pathway_pathway --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_pathway_protein.csv --space PrimeKG --header --edge pathway_protein --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_anatomy_anatomy.csv --space PrimeKG --header --edge anatomy_anatomy --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_anatomy_protein_present.csv --space PrimeKG --batch 500 --header --edge anatomy_protein_present --src 2 --dst 3 --props 1:display_relation
%ng_load --source ~/scratch-llm/data/PrimeKG_data/sub_data/edge_anatomy_protein_absent.csv --space PrimeKG --header --edge anatomy_protein_absent --src 2 --dst 3 --props 1:display_relation


## Checks

In [None]:
%ngql DESCRIBE TAG drug;

In [None]:
#small check-up, with extra drug and disease information for nodes
%ngql GET SUBGRAPH WITH PROP 1 STEPS FROM 14045 OUT drug_protein YIELD VERTICES AS nodes, EDGES AS relationships;

# Load directly to NebulaPropertyGraphStore

In [2]:
from llama_index.core.schema import NodeRelationship, TextNode, IndexNode, RelatedNodeInfo
from llama_index.graph_stores.nebula import NebulaPropertyGraphStore
from llama_index.core import (
    StorageContext,
)

define files to use:

In [None]:
data_dir = '~/scratch-llm/data/PrimeKG_data/sub_data/'

# List of node types and their corresponding CSV files
node_files = {
    'node_gene_protein.csv',
    'node_merged_drug.csv', # manually corrected and generated from "extra information section"
    'node_effect_phenotype.csv',
    'node_merged_disease.csv', # manually corrected and generated from "extra information section"
    'node_biological_process.csv',
    'node_molecular_function.csv',
    'node_cellular_component.csv',
    'node_exposure.csv',
    'node_pathway.csv',
    'node_anatomy.csv'
}

edge_files ={
    'edge_protein_protein.csv',
    'edge_drug_protein.csv',
    'edge_contraindication.csv',
    'edge_indication.csv',
    'edge_off_label_use.csv',
    'edge_drug_drug.csv', # around 40 mins
    'edge_phenotype_protein.csv',
    'edge_phenotype_phenotype.csv',
    'edge_disease_phenotype_negative.csv',
    'edge_disease_phenotype_positive.csv',
    'edge_disease_protein.csv',
    'edge_disease_disease.csv',
    'edge_drug_effect.csv',
    'edge_bioprocess_bioprocess.csv',
    'edge_molfunc_molfunc.csv',
    'edge_cellcomp_cellcomp.csv',
    'edge_molfunc_protein.csv',
    'edge_cellcomp_protein.csv',
    'edge_bioprocess_protein.csv',
    'edge_exposure_protein.csv',
    'edge_exposure_disease.csv',
    'edge_exposure_exposure.csv',
    'edge_exposure_bioprocess.csv',
    'edge_exposure_molfunc.csv',
    'edge_exposure_cellcomp.csv',
    'edge_pathway_pathway.csv',
    'edge_pathway_protein.csv',
    'edge_anatomy_anatomy.csv',
    'edge_anatomy_protein_present.csv', #around 90 mins
    'edge_anatomy_protein_absent.csv'
}

## Nodes

In [31]:
# create TextNodes
def create_text_nodes(file_path):
    df = pd.read_csv(file_path)
    text_nodes = []
    for _, row in df.iterrows():
        id = row['node_index']
        # include all text as `text` property
        text = row.drop(['node_index']).to_dict()
        text_node = TextNode(id_= str(id), text=str(text), metadata = {'id': id})
        text_nodes.append(text_node)
    return text_nodes


In [151]:
# generate all nodes from the node files
all_nodes = []
for file in node_files:
    nodes = create_text_nodes(data_dir+file)
    all_nodes.extend(nodes)


In [152]:
all_nodes[73]

TextNode(id_='53200', embedding=None, metadata={'id': 53200}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text="{'node_id': 16615, 'node_type': 'molecular_function', 'node_name': 'malate dehydrogenase activity', 'node_source': 'GO'}", mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

## Edges

In [None]:
# create all edges
from tqdm.notebook import tqdm

def create_all_edges(file_path, all_nodes):
    node_dict = {node.id_: node for node in all_nodes} #speed up search
    df = pd.read_csv(os.path.join(file_path))

    for _, row in df.iterrows():
        x_index = str(row['x_index'])

        if x_index in node_dict:
            node = node_dict[x_index]
            if NodeRelationship.CHILD not in node.relationships:
                node.relationships[NodeRelationship.CHILD] = []
            y_index = str(row['y_index'])
            metadata = row.drop(['x_index', 'y_index']).to_dict()
            
            # check that the relationship is not there already
            if not any(isinstance(r, RelatedNodeInfo) and r.node_id == y_index for r in node.relationships[NodeRelationship.CHILD]):
                new_relationship = RelatedNodeInfo(node_id=y_index, metadata=metadata)
                node.relationships[NodeRelationship.CHILD].append(new_relationship)

In [None]:
# all relationships for all nodes, takes  bit less than 2h
for file in tqdm(edge_files, desc='Procesing edge files'):
    create_all_edges(data_dir+file, all_nodes)

check ups that edges are correctly generated:

In [159]:
%ngql USE PrimeKG;

In [160]:
%ngql GET SUBGRAPH WITH PROP 1 STEPS FROM 37552 OUT disease_disease YIELD VERTICES AS nodes, EDGES AS relationships;

Unnamed: 0,nodes,relationships
0,"[(37552 :disease{group_id_bert: __NULL__, group_name_bert: __NULL__, mayo_causes: ""Cancer is caused by changes to the DNA within cells. The DNA inside a cell is packaged into a large number of individual genes, each of which contains a set of instructions telling the cell what functions to perform, as well as how to grow and divide. Errors in the instructions can cause the cell to stop its normal function and may allow a cell to become cancerous. What do gene mutations do?, A gene mutation can instruct a healthy cell to: Allow rapid growth. A gene mutation can tell a cell to grow and divide more rapidly. This creates many new cells that all have that same mutation. Fail to stop uncontrolled cell growth. Normal cells know when to stop growing so that you have just the right number of each type of cell. Cancer cells lose the controls that tell them when to stop growing. A mutation in a tumor suppressor gene allows cancer cells to continue growing and accumulating. Make mistakes when repairing DNA errors. DNA repair genes look for errors in a cell's DNA and make corrections. A mutation in a DNA repair gene may mean that other errors aren't corrected, leading cells to become cancerous. These mutations are the most common ones found in cancer. But many other gene mutations can contribute to causing cancer. What causes gene mutations?, Gene mutations can occur for several reasons, for instance: Gene mutations you're born with. You may be born with a genetic mutation that you inherited from your parents. This type of mutation accounts for a small percentage of cancers. Gene mutations that occur after birth. Most gene mutations occur after you're born and aren't inherited. A number of forces can cause gene mutations, such as smoking, radiation, viruses, cancer-causing chemicals, obesity, hormones, chronic inflammation and a lack of exercise. Gene mutations occur frequently during normal cell growth. However, cells contain a mechanism that recognizes when a mistake occurs and repairs the mistake. Occasionally, a mistake is missed. This could cause a cell to become cancerous. How do gene mutations interact with each other?, The gene mutations you're born with and those that you acquire throughout your life work together to cause cancer. For instance, if you've inherited a genetic mutation that predisposes you to cancer, that doesn't mean you're certain to get cancer. Instead, you may need one or more other gene mutations to cause cancer. Your inherited gene mutation could make you more likely than other people to develop cancer when exposed to a certain cancer-causing substance. It's not clear just how many mutations must accumulate for cancer to form. It's likely that this varies among cancer types."", mayo_complications: ""Cancer and its treatment can cause several complications, including: Pain. Pain can be caused by cancer or by cancer treatment, though not all cancer is painful. Medications and other approaches can effectively treat cancer-related pain. Fatigue. Fatigue in people with cancer has many causes, but it can often be managed. Fatigue associated with chemotherapy or radiation therapy treatments is common, but it's usually temporary. Difficulty breathing. Cancer or cancer treatment may cause a feeling of being short of breath. Treatments may bring relief. Nausea. Certain cancers and cancer treatments can cause nausea. Your doctor can sometimes predict if your treatment is likely to cause nausea. Medications and other treatments may help you prevent or decrease nausea. Diarrhea or constipation. Cancer and cancer treatment can affect your bowels and cause diarrhea or constipation. Weight loss. Cancer and cancer treatment may cause weight loss. Cancer steals food from normal cells and deprives them of nutrients. This is often not affected by how many calories or what kind of food is eaten; it's difficult to treat. In most cases, using artificial nutrition through tubes into the stomach or vein does not help change the weight loss. Chemical changes in your body. Cancer can upset the normal chemical balance in your body and increase your risk of serious complications. Signs and symptoms of chemical imbalances might include excessive thirst, frequent urination, constipation and confusion. Brain and nervous system problems. Cancer can press on nearby nerves and cause pain and loss of function of one part of your body. Cancer that involves the brain can cause headaches and stroke-like signs and symptoms, such as weakness on one side of your body. Unusual immune system reactions to cancer. In some cases the body's immune system may react to the presence of cancer by attacking healthy cells. Called paraneoplastic syndrome, these very rare reactions can lead to a variety of signs and symptoms, such as difficulty walking and seizures. Cancer that spreads. As cancer advances, it may spread to other parts of the body. Where cancer spreads depends on the type of cancer. Cancer that returns. Cancer survivors have a risk of cancer recurrence. Some cancers are more likely to recur than others. Ask your doctor about what you can do to reduce your risk of cancer recurrence. Your doctor may devise a follow-up care plan for you after treatment. This plan may include periodic scans and exams in the months and years after your treatment, to look for cancer recurrence."", mayo_prevention: ""There's no certain way to prevent cancer. But doctors have identified several ways of reducing your cancer risk, such as: Stop smoking. If you smoke, quit. If you don't smoke, don't start. Smoking is linked to several types of cancer not just lung cancer. Stopping now will reduce your risk of cancer in the future. Avoid excessive sun exposure. Harmful ultraviolet rays from the sun can increase your risk of skin cancer. Limit your sun exposure by staying in the shade, wearing protective clothing or applying sunscreen. Eat a healthy diet. Choose a diet rich in fruits and vegetables. Select whole grains and lean proteins. Exercise most days of the week. Regular exercise is linked to a lower risk of cancer. Aim for at least 30 minutes of exercise most days of the week. If you haven't been exercising regularly, start out slowly and work your way up to 30 minutes or longer. Maintain a healthy weight. Being overweight or obese may increase your risk of cancer. Work to achieve and maintain a healthy weight through a combination of a healthy diet and regular exercise. Drink alcohol in moderation, if you choose to drink. If you choose to drink alcohol, limit yourself to one drink a day if you're a woman of any age or a man older than age 65, or two drinks a day if you're a man 65 years old or younger. Schedule cancer screening exams. Talk to your doctor about what types of cancer screening exams are best for you based on your risk factors. Ask your doctor about immunizations. Certain viruses increase your risk of cancer. Immunizations may help prevent those viruses, including hepatitis B, which increases the risk of liver cancer, and human papillomavirus, which increases the risk of cervical cancer and other cancers. Ask your doctor whether immunization against these viruses is appropriate for you."", mayo_risk_factors: ""While doctors have an idea of what may increase your risk of cancer, the majority of cancers occur in people who don't have any known risk factors. Factors known to increase your risk of cancer include: Your age, Cancer can take decades to develop. That's why most people diagnosed with cancer are 65 or older. While it's more common in older adults, cancer isn't exclusively an adult disease cancer can be diagnosed at any age. Your habits, Certain lifestyle choices are known to increase your risk of cancer. Smoking, drinking more than one alcoholic drink a day or two drinks a day, excessive exposure to the sun or frequent blistering sunburns, being obese, and having unsafe sex can contribute to cancer. You can change these habits to lower your risk of cancer though some habits are easier to change than others. Your family history, Only a small portion of cancers are due to an inherited condition. If cancer is common in your family, it's possible that mutations are being passed from one generation to the next. You might be a candidate for genetic testing to see whether you have inherited mutations that might increase your risk of certain cancers. Keep in mind that having an inherited genetic mutation doesn't necessarily mean you'll get cancer. Your health conditions, Some chronic health conditions, such as ulcerative colitis, can markedly increase your risk of developing certain cancers. Talk to your doctor about your risk. Your environment, The environment around you may contain harmful chemicals that can increase your risk of cancer. Even if you don't smoke, you might inhale secondhand smoke if you go where people are smoking or if you live with someone who smokes. Chemicals in your home or workplace, such as asbestos and benzene, also are associated with an increased risk of cancer."", mayo_see_doc: ""When to see a doctor, Make an appointment with your doctor if you have any persistent signs or symptoms that concern you. If you don't have any signs or symptoms, but are worried about your risk of cancer, discuss your concerns with your doctor. Ask about which cancer screening tests and procedures are appropriate for you."", mayo_symptoms: ""Signs and symptoms caused by cancer will vary depending on what part of the body is affected. Some general signs and symptoms associated with, but not specific to, cancer, include: Fatigue, Lump or area of thickening that can be felt under the skin, Weight changes, including unintended loss or gain, Skin changes, such as yellowing, darkening or redness of the skin, sores that won't heal, or changes to existing moles, Changes in bowel or bladder habits, Persistent cough or trouble breathing, Difficulty swallowing, Hoarseness, Persistent indigestion or discomfort after eating, Persistent, unexplained muscle or joint pain, Persistent, unexplained fevers or night sweats, Unexplained bleeding or bruising"", mondo_definition: ""A primary or metastatic malignant neoplasm that affects the oropharynx."", mondo_id: 4608, mondo_name: ""oropharynx cancer"", node_name: ""meconium ileus"", node_source: ""MONDO"", orphanet_clinical_description: __NULL__, orphanet_definition: __NULL__, orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: __NULL__, umls_description: __NULL__})]","[(37552)-[:disease_disease@0{display_relation: ""parent-child""}]->(96276), (37552)-[:disease_disease@0{display_relation: ""parent-child""}]->(27201), (37552)-[:disease_disease@0{display_relation: ""parent-child""}]->(36769)]"
1,"[(27201 :disease{group_id_bert: ""8470_30074"", group_name_bert: ""spondyloepiphyseal dysplasia with punctate corneal dystrophy"", mayo_causes: __NULL__, mayo_complications: __NULL__, mayo_prevention: __NULL__, mayo_risk_factors: __NULL__, mayo_see_doc: __NULL__, mayo_symptoms: __NULL__, mondo_definition: __NULL__, mondo_id: 30074, mondo_name: ""spondylometaphyseal dysplasia with corneal dystrophy"", node_name: ""intestinal obstruction in the newborn due to guanylate cyclase 2C deficiency"", node_source: ""MONDO"", orphanet_clinical_description: __NULL__, orphanet_definition: __NULL__, orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: __NULL__, umls_description: __NULL__}), (96276 :disease{group_id_bert: __NULL__, group_name_bert: __NULL__, mayo_causes: __NULL__, mayo_complications: __NULL__, mayo_prevention: __NULL__, mayo_risk_factors: __NULL__, mayo_see_doc: __NULL__, mayo_symptoms: __NULL__, mondo_definition: ""An adenocarcinoma that arises from the fallopian tube and is characterized by a papillary architectural pattern."", mondo_id: 3535, mondo_name: ""fallopian tube papillary adenocarcinoma"", node_name: ""cystic fibrosis associated meconium ileus"", node_source: ""MONDO"", orphanet_clinical_description: __NULL__, orphanet_definition: __NULL__, orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: __NULL__, umls_description: __NULL__}), (36769 :disease{group_id_bert: __NULL__, group_name_bert: __NULL__, mayo_causes: __NULL__, mayo_complications: __NULL__, mayo_prevention: __NULL__, mayo_risk_factors: __NULL__, mayo_see_doc: __NULL__, mayo_symptoms: __NULL__, mondo_definition: ""An adenocarcinoma that arises from the extrahepatic bile ducts. It is characterized by the presence of signet ring malignant epithelial cells."", mondo_id: 2664, mondo_name: ""extrahepatic bile duct signet ring cell carcinoma"", node_name: ""ileus"", node_source: ""MONDO"", orphanet_clinical_description: __NULL__, orphanet_definition: __NULL__, orphanet_epidemiology: __NULL__, orphanet_management_and_treatment: __NULL__, orphanet_prevalence: __NULL__, umls_description: __NULL__})]","[(27201)-[:disease_disease@0{display_relation: ""parent-child""}]->(37552), (96276)-[:disease_disease@0{display_relation: ""parent-child""}]->(37552), (36769)-[:disease_disease@0{display_relation: ""parent-child""}]->(37552)]"


In [161]:
#look for the index from the node that has the node_id 95635
for i, node in enumerate(all_nodes):
    if node.node_id == '37552':
        print(i)
        break


118129


In [174]:
all_nodes[118130].relationships

{<NodeRelationship.CHILD: '5'>: [RelatedNodeInfo(node_id='37554', node_type=None, metadata={'relation': 'disease_disease', 'display_relation': 'parent-child'}, hash=None),
  RelatedNodeInfo(node_id='28850', node_type=None, metadata={'relation': 'disease_disease', 'display_relation': 'parent-child'}, hash=None),
  RelatedNodeInfo(node_id='38266', node_type=None, metadata={'relation': 'disease_disease', 'display_relation': 'parent-child'}, hash=None),
  RelatedNodeInfo(node_id='38036', node_type=None, metadata={'relation': 'disease_disease', 'display_relation': 'parent-child'}, hash=None),
  RelatedNodeInfo(node_id='36003', node_type=None, metadata={'relation': 'disease_disease', 'display_relation': 'parent-child'}, hash=None),
  RelatedNodeInfo(node_id='14844', node_type=None, metadata={'relation': 'indication', 'display_relation': 'indication'}, hash=None),
  RelatedNodeInfo(node_id='2160', node_type=None, metadata={'relation': 'disease_protein', 'display_relation': 'associated with'},

In [175]:
start_node = all_nodes[118130]
related_nodes_info = start_node.relationships.get(NodeRelationship.CHILD, [])

related_nodes = []
edges = []

for relation in related_nodes_info:
    # Find the related node by its node_id
    related_node = next((node for node in all_nodes if node.node_id == relation.node_id), None)
    if related_node:
        related_nodes.append(related_node)
        edges.append((start_node.node_id, related_node.node_id, relation.metadata))

subgraph_nodes = [start_node] + related_nodes  # Include the starting node and its neighbors


In [176]:
subgraph_nodes

[TextNode(id_='37553', embedding=None, metadata={'id': 37553}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.CHILD: '5'>: [RelatedNodeInfo(node_id='37554', node_type=None, metadata={'relation': 'disease_disease', 'display_relation': 'parent-child'}, hash=None), RelatedNodeInfo(node_id='28850', node_type=None, metadata={'relation': 'disease_disease', 'display_relation': 'parent-child'}, hash=None), RelatedNodeInfo(node_id='38266', node_type=None, metadata={'relation': 'disease_disease', 'display_relation': 'parent-child'}, hash=None), RelatedNodeInfo(node_id='38036', node_type=None, metadata={'relation': 'disease_disease', 'display_relation': 'parent-child'}, hash=None), RelatedNodeInfo(node_id='36003', node_type=None, metadata={'relation': 'disease_disease', 'display_relation': 'parent-child'}, hash=None), RelatedNodeInfo(node_id='14844', node_type=None, metadata={'relation': 'indication', 'display_relation': 'indication'}, hash=None),

## Generate index

In [None]:
%ngql CREATE SPACE IF NOT EXISTS PrimeKG_manual(vid_type=FIXED_STRING(256), partition_num=20, replica_factor=1);

In [None]:
%ngql drop space PrimeKG_manual;

In [177]:
# PropertyGraphIndex requires NebulaPropertyGraphStore
graph_store = NebulaPropertyGraphStore(
    space= "PrimeKG_manual", 
    username = "root",
    password = "nebula",
    url = "nebula://localhost:9669",
    props_schema= "`id` INT,`node_id` STRING, `node_name` STRING, `node_source` STRING, `mondo_id` STRING, `mondo_name` STRING, `group_id_bert` STRING, `group_name_bert` STRING, `mondo_definition` STRING, `umls_description` STRING, `orphanet_definition` STRING, `orphanet_prevalence` STRING, `orphanet_epidemiology` STRING, `orphanet_clinical_description` STRING, `orphanet_management_and_treatment` STRING, `mayo_symptoms` STRING, `mayo_causes` STRING, `mayo_risk_factors` STRING, `mayo_complications` STRING, `mayo_prevention` STRING, `mayo_see_doc` STRING, `display_relation` STRING, `description` STRING, `half_life` STRING, `indication` STRING, `mechanism_of_action` STRING, `protein_binding` STRING, `pharmacodynamics` STRING, `state` STRING, `atc_1` STRING, `atc_2` STRING, `atc_3` STRING, `atc_4` STRING, `category` STRING, `group` STRING, `pathway` STRING, `molecular_weight` STRING, `tpsa` STRING, `clogp` STRING, `_node_content` STRING,`_node_type` STRING,`document_id` STRING,`doc_id` STRING,`ref_doc_id` STRING, `triplet_source_id` STRING"
)

storage_context = StorageContext.from_defaults(property_graph_store=graph_store)

convert TextNode into IndexNode to insert the nodes into the index:

In [178]:
from llama_index.core.vector_stores.simple import SimpleVectorStore
from llama_index.core import PropertyGraphIndex, Settings
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import torch

vec_store = SimpleVectorStore()








In [179]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [180]:
# Create IndexNode objects from TextNode objects
index_nodes = [IndexNode.from_text_node(node, index_id="index_1") for node in subgraph_nodes]
index_nodes


[IndexNode(id_='37553', embedding=None, metadata={'id': 37553}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.CHILD: '5'>: [RelatedNodeInfo(node_id='37554', node_type=None, metadata={'relation': 'disease_disease', 'display_relation': 'parent-child'}, hash=None), RelatedNodeInfo(node_id='28850', node_type=None, metadata={'relation': 'disease_disease', 'display_relation': 'parent-child'}, hash=None), RelatedNodeInfo(node_id='38266', node_type=None, metadata={'relation': 'disease_disease', 'display_relation': 'parent-child'}, hash=None), RelatedNodeInfo(node_id='38036', node_type=None, metadata={'relation': 'disease_disease', 'display_relation': 'parent-child'}, hash=None), RelatedNodeInfo(node_id='36003', node_type=None, metadata={'relation': 'disease_disease', 'display_relation': 'parent-child'}, hash=None), RelatedNodeInfo(node_id='14844', node_type=None, metadata={'relation': 'indication', 'display_relation': 'indication'}, hash=None)

In [181]:
# this was intended to be for the bug but not really doing anything
from transformers import AutoTokenizer

# Load tokenizer and set pad_token_id explicitly
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token  # Set pad_token to eos_token if not defined

In [182]:
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.8, "do_sample": True},
    # system_prompt=system_prompt,
    # query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="meta-llama/Llama-3.2-3B",
    model_name="meta-llama/Llama-3.2-3B",
    device_map="auto",
    stopping_ids=[50278, 50279, 50277, 1, 0],
    tokenizer_kwargs={"max_length": 4096, "pad_token_id": tokenizer.pad_token_id},
    # uncomment this if using CUDA to reduce memory usage
    model_kwargs={"torch_dtype": torch.float16}
)

Settings.llm = llm
Settings.chunk_size = 1024
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-m3") 

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [183]:
index = PropertyGraphIndex.from_existing(
    llm=llm,
    vector_store=vec_store,
    property_graph_store=graph_store,
    show_progress=True,
    storage_context=storage_context,
    embed_kg_nodes=True
)

In [186]:
import nest_asyncio
nest_asyncio.apply()

# Ensure that node IDs are strings fitting the vertex ID length limit
for node in index_nodes:
	node.id_ = str(node.id_)

# Insert nodes into the index
index.insert_nodes(index_nodes)

Extracting paths from text:   0%|          | 0/13 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Extracting paths from text: 100%|██████████| 13/13 [01:08<00:00,  5.26s/it]
E

save index to persist storage

In [187]:
index.storage_context.persist(persist_dir="~/scratch-llm/storage/PrimeKG_index_mid/")
