# Import modules

In [1]:
import os
import importlib
import subprocess
import pandas as pd

try:
    importlib.import_module('pygrpm')
except ImportError:
    subprocess.check_call(["pip", "install", "git+https://github.com/johndef64/GRPM_system.git"])
    
from pygrpm import *

# GET Datasets

In [4]:
### GET Datasets ###
if not os.path.exists('grpm_dataset/grpm_dataset.parquet'):
    get_and_extract('grpm_dataset', record_id='14052302')
    get_and_extract('nutrigenetic_dataset', record_id='14052302')

# LOAD Datasets

In [2]:
### LOAD Datasets ###
pcg_grpm, rna_grpm, pseudo_grpm = grpm_importer()
grpm_nutrigen, grpm_nutrigen_int, grpm_nutrigen_int_gwas = nutrig_importer()

display(grpm_nutrigen_int)

Importing time:  0:00:06.737058
pcg: 776.19 MB
rna: 58.18 MB
pseudo: 1.93 MB
nutrigen dataset: 87.13 MB
nutrigen dataset filtered: 53.62 MB
nutrigen gwas dataset: 20.56 MB


Unnamed: 0,gene,rsid,pmid,mesh,topic,interest_index
0,FTO,rs9972653,32393786,Body Mass Index,General Nutrition,1.00000
1,FTO,rs9972653,32393786,"Diabetes Mellitus, Type 2",General Nutrition,1.00000
2,FTO,rs9972653,32393786,Diet,General Nutrition,1.00000
3,FTO,rs9972653,33128006,Body Mass Index,General Nutrition,1.00000
4,FTO,rs9972653,33128006,Cardiovascular Diseases,General Nutrition,1.00000
...,...,...,...,...,...,...
1171244,FADS1,rs174545,23221573,Lipid Metabolism,Xenobiotics Metabolism,0.01255
1171245,FADS1,rs174545,30120404,Liver,Xenobiotics Metabolism,0.01255
1171246,FADS1,rs174545,30120404,Non-alcoholic Fatty Liver Disease,Xenobiotics Metabolism,0.01255
1171247,FADS1,rs174544,20565855,Lipid Metabolism,Xenobiotics Metabolism,0.01255


# SHOW Stats

In [ ]:
%%time
pcg_grpm_stats = get_stats(pcg_grpm, group_by = 'gene')
display(pcg_grpm_stats)

Computing Stats...


In [None]:
%%time
grpm_nutrigen_stats = get_stats(grpm_nutrigen, group_by = 'gene')
display(grpm_nutrigen_stats)

# QUERY GRPM Dataset 

## MeSH Query Example

In [3]:
## GET MeSH Dataset ##
if not os.path.exists('ref-mesh/MESH_STY_LITVAR1.csv'):
    get_and_extract('ref-mesh', record_id='14052302')
    get_topic_terms()

In [4]:
# LOAD MeSH
grpm_mesh = mesh_importer()
grpm_mesh.head()

GRPM MeSH count: 21705
semantic types: 125


Unnamed: 0,Preferred Label,Semantic Types Label,Class ID,mesh_id,Semantic Types
1,Electronic Health Records,Intellectual Product,http://purl.bioontology.org/ontology/MESH/D057286,D057286,T170
2,Consent Forms,Intellectual Product,http://purl.bioontology.org/ontology/MESH/D032962,D032962,T170
3,Genealogy and Heraldry,Intellectual Product,http://purl.bioontology.org/ontology/MESH/D005789,D005789,T170
4,Publications,Intellectual Product,http://purl.bioontology.org/ontology/MESH/D011642,D011642,T170
5,Pharmaceutical Services,Intellectual Product,http://purl.bioontology.org/ontology/MESH/D010593,D010593,T170


In [5]:
get_stats(grpm_mesh, "Semantic Types Label")

Computing Stats...
runtime:  0:00:00.646001


Unnamed: 0_level_0,Preferred Label,Class ID,mesh_id,Semantic Types
Unnamed: 0_level_1,unique,unique,unique,unique
Semantic Types Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Acquired Abnormality,41,41,41,1
Activity,38,38,38,1
Age Group,15,15,15,1
Amino Acid Sequence,45,45,45,1
"Amino Acid, Peptide, or Protein",4625,4625,4625,1
...,...,...,...,...
Therapeutic or Preventive Procedure,693,693,693,1
Tissue,89,89,89,1
Vertebrate,4,4,4,1
Virus,370,370,370,1


In [8]:
# Random Query Example
mesh_query =  grpm_mesh['Preferred Label'].drop_duplicates().sample(10).to_list()

# Filter and get unique results
result = query_dataset(pcg_grpm, mesh_query, 'mesh')
display(result)

Unnamed: 0,gene,type,rsid,pmid,mesh,qualifier,major
38033,MT-CO3,PCG,rs2853826,34576000,Epigenome,,True
45210,MT-ND3,PCG,rs2853826,34576000,Epigenome,,True
52523,MT-ND4L,PCG,rs2853826,34576000,Epigenome,,True
62152,MT-ND4,PCG,rs2853826,34576000,Epigenome,,True
73836,MT-ND5,PCG,rs2853826,34576000,Epigenome,,True
...,...,...,...,...,...,...,...
16595876,CXCL8,PCG,rs2227532,29105764,Interleukin-1,genetics,False
16595895,CXCL8,PCG,rs4073,29105764,Interleukin-1,genetics,False
16597205,CXCL8,PCG,rs2227306,16719905,Interleukin-1,genetics,False
16597221,CXCL8,PCG,rs4073,16719905,Interleukin-1,genetics,False


## Build MeSH Query [CUDA recommended]
In Colab:: load Runtime with GPU 

In [6]:
import torch
print("Torch version:",torch.__version__)
print("Is CUDA enabled?",torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.randn(1).cuda())
    
# if not CUDA and Windows system: unsitall torch and install "pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118"

Torch version: 2.6.0+cu118
Is CUDA enabled? True
tensor([-0.1710], device='cuda:0')


In [8]:
# LOAD Language Model
MODEL = 'dmis-lab/biobert-v1.1'
model = load_language_model(MODEL)

# Get MeSH embeddings
grpm_meshes = grpm_mesh['Preferred Label'].drop_duplicates().reset_index(drop=True)
mesh_embeddings = extract_embedding(grpm_meshes.to_list(), model)

No sentence-transformers model found with name dmis-lab/biobert-v1.1. Creating a new one with mean pooling.


Batches:   0%|          | 0/679 [00:00<?, ?it/s]

In [15]:
grpm_mesh_embeddings = {"meshes":grpm_meshes, "embeddings":mesh_embeddings}

In [21]:
import pickle
file_path = 'ref-mesh/GrpmMeshEmbeddings_biobert-v1.1.pkl'

# Open the file in write-binary mode to store the pickle
with open(file_path, 'wb') as file:
    # Use pickle to dump the dictionary into the file
    pickle.dump(grpm_mesh_embeddings, file)

Downloading...
From: https://github.com/johndef64/GRPM_system/raw/refs/heads/main/ref-mesh/GrpmMeshEmbeddings_biobert-v1.1.pkl
To: G:\Altri computer\Horizon\horizon_workspace\projects\work\#computer_science\GRPM\GRPM_system\ref-mesh\GrpmMeshEmbeddings_biobert-v1.1.pkl
100%|██████████| 67.1M/67.1M [00:01<00:00, 59.2MB/s]


In [22]:
import pickle
file_path = 'ref-mesh/GrpmMeshEmbeddings_biobert-v1.1.pkl'
url ="https://github.com/johndef64/GRPM_system/raw/refs/heads/main/ref-mesh/GrpmMeshEmbeddings_biobert-v1.1.pkl"
gdown.download(url, file_path, quiet=False)
# Import the mesh_embeddings back from the pickle file
with open(file_path, "rb") as file:
    grpm_mesh_embeddings = pickle.load(file)

grpm_meshes = grpm_mesh_embeddings['meshes']
mesh_embeddings = grpm_mesh_embeddings['embeddings']

Downloading...
From: https://github.com/johndef64/GRPM_system/raw/refs/heads/main/ref-mesh/GrpmMeshEmbeddings_biobert-v1.1.pkl
To: G:\Altri computer\Horizon\horizon_workspace\projects\work\#computer_science\GRPM\GRPM_system\ref-mesh\GrpmMeshEmbeddings_biobert-v1.1.pkl
100%|██████████| 67.1M/67.1M [00:01<00:00, 61.3MB/s]


In [23]:
# User defined Topic Terms
topic_terms_sample = ["diet ketogenic",
                      "diet reducing",
                      "diet sodium-restricted",
                      "diet",
                      "dietary",
                      "dietetics",
                      "dyslipidemias",
                      "eating disorders",
                      "feeding and eating disorde",
                      "food hypersensitivity",
                      "foodborne diseases",
                      "gastrointestinal diseases",
                      "hypercholesterolemia",
                      "hyperglycemia",
                      "hyperlipidemias",
                      "hyperphagia",
                      "hypoglycemia",
                      "hypophagia",
                      "insulin resistance",
                      ]
topic_terms = pd.Series(topic_terms_sample)

In [26]:
# Extract MeSH Query
tab = create_corr_table(topic_terms, grpm_meshes, model, mesh_embeddings)

mesh_query = tab[tab.similarity >= 0.90].list2.to_list()
print('\n\nMeSH Query:', mesh_query)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 19/19 [00:03<00:00,  5.69it/s]



MeSH Query: ['Diet', 'Diet', 'Dyslipidemias', 'Hypersensitivity', 'Gastrointestinal Diseases', 'Hypercholesterolemia', 'Hyperglycemia', 'Hyperlipidemias', 'Hyperphagia', 'Hypoglycemia']





## Execute MeSH Query

In [17]:
# Filter and get unique results
result = query_dataset(pcg_grpm, mesh_query, 'mesh')
display(result)

Unnamed: 0,gene,type,rsid,pmid,mesh,qualifier,major
5434,MT-ND1,PCG,rs200319905,25579139,Hyperlipidemias,genetics,False
5435,MT-ND1,PCG,rs200319905,25579139,Hyperlipidemias,metabolism,False
5436,MT-ND1,PCG,rs200319905,25579139,Hyperlipidemias,pathology,False
13642,MT-ND2,PCG,rs200319905,25579139,Hyperlipidemias,genetics,False
13643,MT-ND2,PCG,rs200319905,25579139,Hyperlipidemias,metabolism,False
...,...,...,...,...,...,...,...
16606654,CXCL8,PCG,rs572157399,20508232,Hypoglycemia,complications,False
16606655,CXCL8,PCG,rs572157399,20508232,Hypoglycemia,diagnosis,False
16606656,CXCL8,PCG,rs572157399,20508232,Hypoglycemia,epidemiology,True
16607143,CXCL8,PCG,rs572157399,21357364,Hyperglycemia,metabolism,False


# QUERY Nutrigenetic Dataset

In [27]:
# Gene Query on Nutrigenetic ds
topic =  grpm_nutrigen_int.topic[0]

print(f'Displaying "{topic}" topic')
# Filter and get unique results
result = query_dataset(grpm_nutrigen_int, [topic], 'topic')
display(result)

Displaying "General Nutrition" topic


Unnamed: 0,gene,rsid,pmid,mesh,topic,interest_index
0,FTO,rs9972653,32393786,Body Mass Index,General Nutrition,1.00000
1,FTO,rs9972653,32393786,"Diabetes Mellitus, Type 2",General Nutrition,1.00000
2,FTO,rs9972653,32393786,Diet,General Nutrition,1.00000
3,FTO,rs9972653,33128006,Body Mass Index,General Nutrition,1.00000
4,FTO,rs9972653,33128006,Cardiovascular Diseases,General Nutrition,1.00000
...,...,...,...,...,...,...
268296,DRD2,rs1076560,31948125,Anxiety,General Nutrition,0.01251
268297,DRD2,rs1076560,22829935,Physical Endurance,General Nutrition,0.01251
268298,DRD2,rs1076560,30729689,"Diabetes Mellitus, Type 2",General Nutrition,0.01251
268299,DRD2,rs1076560,30729689,Metabolic Syndrome,General Nutrition,0.01251


## Gene Query Example

In [None]:
# Gene Query on Nutrigenetic ds
my_genes = (
    'FTO',
    'APOB',
    'G6PD'
)
# Filter and get unique results
result = query_dataset(grpm_nutrigen_int, my_genes, 'gene')
display(result)

In [None]:
# Gene Query on Nutrigenetic-GWAS ds
result = query_dataset(grpm_nutrigen_int_gwas, my_genes, 'GRPM_GENE')
display(result)

# Query by Nutrigenetic Track

In [28]:
topics = list(set(grpm_nutrigen_int.topic))
for t in topics:
    print(f'Topic: {t}, rsID count: {len(grpm_nutrigen_int[grpm_nutrigen_int.topic == t ].rsid.drop_duplicates())} ')

print(f'\nTotal: {len(grpm_nutrigen_int.rsid.drop_duplicates())} ')


Topic: General Nutrition, rsID count: 26456 
Topic: Cardiovascular Health and Lipid Metabolism, rsID count: 41931 
Topic: Diet-induced Oxidative Stress, rsID count: 2559 
Topic: Food Allergies, rsID count: 6289 
Topic: Xenobiotics Metabolism, rsID count: 7159 
Topic: Food Intolerances, rsID count: 5008 
Topic: Eating Behavior and Taste Sensation, rsID count: 4252 
Topic: Obesity, Weight Control and Compulsive Eating, rsID count: 10842 
Topic: Vitamin and Micronutrients Metabolism and Deficiency-Related Diseases, rsID count: 3525 
Topic: Diabetes Mellitus Type II and Metabolic Syndrome, rsID count: 22270 

Total: 63581 


Build a composite query

    \item [1)] Although the paper provides a link to a directory containing\textit{ sample queries}, it would be very helpful to provide a few examples of  particularly interesting queries that can be written with this system.

    \item [R:] \textcolor{violet}{Si riferisce ala directori tests e al notebook tests.ipynb. alcuni esempi di query particolarmente interessanti che possono essere scritte con questo sistema.  
    Una query di interesse sul datase nutrientico complessivo potrebbe essere la ricerca di varianti putatiamente correlate a due diverse condizioni nutrizionali assemblando una query composita:
    .
    esempio: Relationship between nutritional status and the systemic inflammatory response: micronutrients a livello genetico \cite{https://pubmed.ncbi.nlm.nih.gov/30220267/}
    .
    Query composita di questo tipo può essere composta con l'ausilio di BioBERT embedding per slezionare MESH partendo da una descrizione testuale (come proposto nel notebook "test.ipynb"), una funzinoe utile per l'implementazione di un interfaccia utente. 
    . 
    }
    % c'è tempo per un ultimaimplemetaione?