# Import modules

In [1]:
import importlib
import subprocess

try:
    importlib.import_module('pygrpm')
except ImportError:
    subprocess.check_call(["pip", "install", "git+https://github.com/johndef64/GRPM_system.git"])
    
from pygrpm import *

# GET Datasets

In [3]:
### GET Datasets ###
if not os.path.exists('grpm_dataset/grpm_dataset.parquet'):
    get_and_extract('grpm_dataset', record_id='14052302')
    get_and_extract('nutrigenetic_dataset', record_id='14052302')

# LOAD Datasets

In [2]:
### LOAD Datasets ###
pcg_grpm, rna_grpm, pseudo_grpm = grpm_importer()
grpm_nutrigen, grpm_nutrigen_int, grpm_nutrigen_int_gwas = nutrig_importer()

display(grpm_nutrigen_int)

Importing time:  0:00:07.151996
pcg: 776.19 MB
rna: 58.18 MB
pseudo: 1.93 MB
nutrigen dataset: 87.13 MB
nutrigen dataset filtered: 53.62 MB
nutrigen gwas dataset: 20.56 MB


Unnamed: 0,gene,rsid,pmid,mesh,topic,interest_index
0,FTO,rs9972653,32393786,Body Mass Index,General Nutrition,1.00000
1,FTO,rs9972653,32393786,"Diabetes Mellitus, Type 2",General Nutrition,1.00000
2,FTO,rs9972653,32393786,Diet,General Nutrition,1.00000
3,FTO,rs9972653,33128006,Body Mass Index,General Nutrition,1.00000
4,FTO,rs9972653,33128006,Cardiovascular Diseases,General Nutrition,1.00000
...,...,...,...,...,...,...
1171244,FADS1,rs174545,23221573,Lipid Metabolism,Xenobiotics Metabolism,0.01255
1171245,FADS1,rs174545,30120404,Liver,Xenobiotics Metabolism,0.01255
1171246,FADS1,rs174545,30120404,Non-alcoholic Fatty Liver Disease,Xenobiotics Metabolism,0.01255
1171247,FADS1,rs174544,20565855,Lipid Metabolism,Xenobiotics Metabolism,0.01255


# SHOW Stats

In [ ]:
%%time
pcg_grpm_stats = get_stats(pcg_grpm, group_by = 'gene')
display(pcg_grpm_stats)

In [None]:
%%time
grpm_nutrigen_stats = get_stats(grpm_nutrigen, group_by = 'gene')
display(grpm_nutrigen_stats)

# QUERY GRPM Dataset 

## MeSH Query Example

In [2]:
# LOAD MeSH

grpm_mesh= import_grpm_mesh()
grpm_mesh.head()

GRPM MeSH count: 21705


Unnamed: 0,Preferred Label,Class ID,Synonyms,Definitions
0,Electronic Health Records,http://purl.bioontology.org/ontology/MESH/D057286,"Electronic Medical Record|Medical Record, Elec...",Media that facilitate transportability of pert...
1,Consent Forms,http://purl.bioontology.org/ontology/MESH/D032962,Informed Consent Documents|Informed Consent Fo...,Documents describing a medical treatment or re...
2,Genealogy and Heraldry,http://purl.bioontology.org/ontology/MESH/D005789,Geneology and Heraldry|Heraldry and Genealogy|...,"Descent of a person, family, or group from an ..."
3,Publications,http://purl.bioontology.org/ontology/MESH/D011642,Publication,Copies of a work or document distributed to th...
4,Pharmaceutical Services,http://purl.bioontology.org/ontology/MESH/D010593,"Pharmaceutical Service|Services, Pharmaceutic|...",Total pharmaceutical services provided by qual...


In [5]:
# Random Query Example
mesh_query =  grpm_mesh['Preferred Label'].drop_duplicates().sample(10).to_list()

# Filter and get unique results
result = query_dataset(pcg_grpm, mesh_query, 'mesh')
display(result)

Unnamed: 0,gene,type,rsid,pmid,mesh,qualifier,major
178386,KRT23,PCG,rs140407470,21874024,CD28 Antigens,physiology,False
222732,MMEL1,PCG,rs3890745,19898481,CD28 Antigens,genetics,True
361776,HLA-DRA,PCG,rs3135392,33058932,Cephalosporins,adverse effects,True
400814,TNF,PCG,rs1800629,25510954,CD28 Antigens,analysis,False
589282,PTPRC,PCG,rs10919563,19898481,CD28 Antigens,genetics,True
...,...,...,...,...,...,...,...
16415706,TNPO3,PCG,rs10488631,31030958,CD28 Antigens,antagonists & inhibitors,False
16416142,TNPO3,PCG,rs10488631,27092776,CD28 Antigens,genetics,True
16416202,TNPO3,PCG,rs10488631,34017081,CD28 Antigens,genetics,True
16488218,ALOX15,PCG,rs34210653,31301373,Turbinates,metabolism,True


## Build MeSH Query 
[CUDA recommended] - In Colab:: load Runtime with GPU 

In [3]:
# LOAD Language Model
MODEL = 'dmis-lab/biobert-v1.1'
model = load_language_model(MODEL)
file_path = 'ref-mesh/GrpmMeshSynEmbeddings_biobert-v1.1.pkl'

# Get MeSH embeddings
test_cuda()
grpm_mesh_embeddings = get_mesh_embeddings(grpm_mesh, model, file_path, retrain=True, synonyms=True)

grpm_meshes = grpm_mesh_embeddings['meshes']
mesh_embeddings = grpm_mesh_embeddings['embeddings']

##################################################################
file_path = 'ref-mesh/GrpmMeshDefEmbeddings_biobert-v1.1.pkl'
grpm_mesh_embeddings_ = get_mesh_embeddings(grpm_mesh, model, file_path)#, retrain=True, full_lenght=True)

grpm_meshes_ = grpm_mesh_embeddings_['meshes']
mesh_embeddings_ = grpm_mesh_embeddings_['embeddings']

No sentence-transformers model found with name dmis-lab/biobert-v1.1. Creating a new one with mean pooling.


Torch version: 2.6.0+cu118
Is CUDA enabled? True
Is CUDA enabled? True
No embeddings. Generating embeddings...


Batches:   0%|          | 0/679 [00:00<?, ?it/s]

Saving embeddings in ref-mesh/GrpmMeshSynEmbeddings_biobert-v1.1.pkl
Importing pretrained embeddings...
Done


In [15]:
len(grpm_meshes)

21705

In [14]:
# User defined Topic Terms

user_query = "diet ketogenic, diet reducing, diet sodium-restricted, diet, dietary, dietetics, dyslipidemias, eating disorders, feeding and eating disorde, food hypersensitivity, foodborne diseases, gastrointestinal diseases, hypercholesterolemia, hyperglycemia, hyperlipidemias, hyperphagia, hypoglycemia, hypophagia, insulin resistance"  # comma separated list

topic_terms_list = user_query.split(',')
topic_terms = pd.Series(topic_terms_list)

# mesh_query = get_mesh_query(user_query, grpm_meshes, model, mesh_embeddings=mesh_embeddings, threshold=0.90)

# Extract MeSH Query
tab = create_corr_table(topic_terms, grpm_meshes, model, mesh_embeddings_)

threshold = 0.84 # set similarity threshold
mesh_query = tab[tab.similarity >= threshold].list2.to_list()
print('\n\nMeSH Query:', mesh_query)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 19/19 [00:03<00:00,  5.59it/s]



MeSH Query: ['Diet, Carbohydrate-Restricted', 'Hyperlipidemias', 'Feeding and Eating Disorders of Childhood', 'Egg Hypersensitivity', 'Endoscopy, Gastrointestinal', 'Hypercholesterolemia', 'Hypertriglyceridemia', 'Hyperlipidemias', 'Hypoglycemic Agents']





Batches:   0%|          | 0/1 [00:00<?, ?it/s]
100%|██████████| 19/19 [00:03<00:00,  5.67it/s]


MeSH Query: ['Hyperlipidemias', 'Feeding and Eating Disorders of Childhood', 'Egg Hypersensitivity', 'Endoscopy, Gastrointestinal', 'Hypercholesterolemia', 'Hyperlipidemias', 'Hypoglycemic Agents']



MeSH Query: ['Diet', 'Diet', 'Dyslipidemias', 'Hypersensitivity', 'Gastrointestinal Diseases', 'Hypercholesterolemia', 'Hyperglycemia', 'Hyperlipidemias', 'Hyperphagia', 'Hypoglycemia']

## Execute MeSH Query

In [None]:
# Filter and get unique results
result = query_dataset(pcg_grpm, mesh_query, 'mesh')
display(result)

# QUERY Nutrigenetic Dataset

In [None]:
get_stats(grpm_nutrigen_int, 'topic')

In [17]:
# Select Topic
topic = "Vitamin and Micronutrients Metabolism and Deficiency-Related Diseases"

# Filter and get unique results
topic_data = query_dataset(grpm_nutrigen_int, [topic], 'topic')
print(f'Displaying "{topic}" topic')
display(topic_data)

Displaying "Vitamin and Micronutrients Metabolism and Deficiency-Related Diseases" topic


Unnamed: 0,gene,rsid,pmid,mesh,topic,interest_index
967603,VDR,rs987849,34578986,Nutritional Status,Vitamin and Micronutrients Metabolism and Defi...,1.00000
967604,VDR,rs987849,34578986,"Receptors, Calcitriol",Vitamin and Micronutrients Metabolism and Defi...,1.00000
967605,VDR,rs987849,34578986,Vitamin D,Vitamin and Micronutrients Metabolism and Defi...,1.00000
967606,VDR,rs987849,34578986,Vitamin D Deficiency,Vitamin and Micronutrients Metabolism and Defi...,1.00000
967607,VDR,rs987849,34578986,Vitamin D-Binding Protein,Vitamin and Micronutrients Metabolism and Defi...,1.00000
...,...,...,...,...,...,...
1002198,ITPR1,rs121912425,21555639,"Inositol 1,4,5-Trisphosphate Receptors",Vitamin and Micronutrients Metabolism and Defi...,0.01255
1002199,ITPR1,rs121912425,22986007,"Inositol 1,4,5-Trisphosphate Receptors",Vitamin and Micronutrients Metabolism and Defi...,0.01255
1002200,ITPR1,rs121912425,18579805,"Inositol 1,4,5-Trisphosphate Receptors",Vitamin and Micronutrients Metabolism and Defi...,0.01255
1002201,ITPR1,rs121912425,27108798,"Inositol 1,4,5-Trisphosphate Receptors",Vitamin and Micronutrients Metabolism and Defi...,0.01255


In [None]:
# Get Topic Data Stats
stats = get_stats(topic_data, "gene", gi_sort=True)

stats

## 1. Gene Query Example

In [None]:
# Gene Query on Nutrigenetic Topic
my_genes = "VDR, G6PD, GSTP1"

# Filter and get unique results
result = query_dataset(topic_data, my_genes.split(','), 'gene')
display(result)

In [None]:
# Gene Query on Nutrigenetic-GWAS ds
result = query_dataset(grpm_nutrigen_int_gwas, my_genes, 'GRPM_GENE')
display(result)

## 2. Advanced query example

Exploring the genetic determinants of nutritional status involves understanding how genetic variations influence the intake and utilization of micronutrients, impacting nutrient transport, metabolism, and cellular uptake.

"""
Micronutrients such as trace elements and vitamins are important as enzyme cofactors in the metabolism of all cells in the body and therefore key to determining nutritional status. 
"""

Build a composite query:
- Nutritional Status, Mechanisms of Micronutrient Metabolism, and Micronutrient Measurement

In [22]:
from pygrpm import *

# Download and import MeSH Embeddings
grpm_mesh_embeddings = import_mesh_embeddings()

grpm_meshes = grpm_mesh_embeddings['meshes']
mesh_embeddings = grpm_mesh_embeddings['embeddings']

# Define queries using natural language
QUERIES =[
     # 1. **Nutritional Status**:
     ["Measurement of nutritional status", 0.85],
     ["Assess essential micronutrients", 0.84],
     ["Focus on vitamins like vitamin A, D, and B-vitamins.", 0.84],
     ["Include trace minerals such as iron, zinc, and iodine.", 0.87],

    # 2"**Mechanisms of Micronutrient Metabolism**:
     ["cellular processes for micronutrient absorption.", 0.87],
     ["transport and transformation of nutrients.", 0.84],
     ["nutrients storage mechanisms.", 0.84],
     ["cofactors in nutrient utilization.", 0.84],
     ["homeostasis of nutrients.", 0.84],
]


mesh_query = []
for i in range(len(QUERIES)):
    query =  QUERIES[i]
    print("\n",query)
    meshes = get_mesh_query(query[0], grpm_meshes, model, mesh_embeddings=mesh_embeddings, threshold=query[1])
    mesh_query.extend(meshes)

mesh_query

Done

 ['Measurement of nutritional status', 0.85]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Related MeSH: ['Nutritional Support', 'Malnutrition', 'Nutritional Status', 'Lung Volume Measurements', 'Speech Production Measurement', 'Eye Movement Measurements', 'Nutritional Requirements', 'Educational Measurement']

 ['Assess essential micronutrients', 0.84]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Related MeSH: ['Dietary Proteins', 'Micronutrients', 'Malnutrition', 'Vitamin A Deficiency', 'Nutrition Disorders', 'Nutritional Status', 'Nutrients', 'Sports Nutritional Sciences', 'Nutritional Requirements']

 ['Focus on vitamins like vitamin A, D, and B-vitamins.', 0.84]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Related MeSH: ['Vitamin E', 'Cholagogues and Choleretics', 'Anti-Infective Agents, Urinary', 'Anti-Infective Agents, Local', 'Fatty Acids, Omega-3', 'Vitamin B 12', 'Vitamin B Complex', 'Vitamin A', 'Vitamin D', 'Diuretics, Potassium Sparing', 'Vaccines, Virus-Like Particle', 'Amino Acids, Essential', 'Vitamin K 3', 'Vitamin K 2', 'Vitamin B 6', 'Contraceptives, Oral, Hormonal', 'Receptors, OSM-LIF', 'Salivary Proteins and Peptides', 'Retinol-Binding Proteins, Cellular', 'Retinol-Binding Proteins, Plasma', 'Amino Acids, Basic', 'Fatty Acids, Essential', 'Fatty Acids, Omega-6', 'Micronutrients', 'Diet, Vegetarian', 'Intestinal Diseases, Parasitic', 'Vitamin E Deficiency', 'Vitamin K Deficiency', 'Liver Diseases, Parasitic', 'Liver Diseases, Alcoholic', 'Hypervitaminosis A', 'Vitamin B 6 Deficiency', 'Skin Diseases, Bacterial', 'Skin Diseases, Viral', 'Vitamin A Deficiency', 'Vitamin B 12 Deficiency', 'Vitamin D Deficiency', 'Vitamin B Deficiency', 'Hepatitis, Viral, Animal', 'Hepatitis,

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Related MeSH: []

 ['cellular processes for micronutrient absorption.', 0.87]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Related MeSH: ['Glucose Transport Proteins, Facilitative', 'Receptors, OSM-LIF', 'Activation, Metabolic']

 ['transport and transformation of nutrients.', 0.84]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Related MeSH: ['Sulfate Transporters', 'Nitrate Transporters', 'Biodegradation, Environmental', 'Protein Transport', 'Chemoautotrophic Growth']

 ['nutrients storage mechanisms.', 0.84]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Related MeSH: ['Glycine Agents', 'Receptors, GABA-A', 'Receptors, GABA-B', 'R Factors', 'Inhibition, Psychological', 'Conditioning, Classical', 'Starvation', 'Protein Transport', 'Chemoautotrophic Growth', 'Autotrophic Processes', 'Activation, Metabolic', 'Secretory Rate', 'Cell Respiration', 'Osmoregulation', 'Plant Transpiration', 'Germination']

 ['cofactors in nutrient utilization.', 0.84]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Related MeSH: ['Glycine Decarboxylase Complex H-Protein', 'Nitrate Reductases', 'Glycine Decarboxylase Complex', 'Sulfate Transporters', 'Pyruvate Dehydrogenase Complex', 'Pyruvate Decarboxylase', 'Pyruvate Carboxylase', 'Glucose Transport Proteins, Facilitative', 'PII Nitrogen Regulatory Proteins', 'NADH, NADPH Oxidoreductases', 'Ketoglutarate Dehydrogenase Complex', 'Nitrite Reductases', 'Pyruvate Kinase', 'Nitrate Transporters', 'Citrate (si)-Synthase', 'Porphobilinogen Synthase', 'Nitroreductases', 'Hydroxymethylglutaryl-CoA-Reductases, NADP-dependent', 'Carbonyl Reductase (NADPH)', 'Carbamoyl-Phosphate Synthase (Ammonia)', 'Pyruvate Carboxylase Deficiency Disease', 'Chemoautotrophic Growth', 'Autotrophic Processes', 'Catabolite Repression']

 ['homeostasis of nutrients.', 0.84]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Related MeSH: ['Receptors, OSM-LIF', 'Decerebrate State', 'Intracellular Space', 'Inactivation, Metabolic', 'Intestinal Reabsorption', 'Protein Transport', 'Activation, Metabolic', 'Proteostasis', 'Metabolic Flux Analysis', 'Organism Hydration Status', 'Osmoregulation', 'Energy Metabolism', 'Basal Metabolism', 'Catabolite Repression', 'Plasma Volume', 'Homeostasis']


['Nutritional Support',
 'Malnutrition',
 'Nutritional Status',
 'Lung Volume Measurements',
 'Speech Production Measurement',
 'Eye Movement Measurements',
 'Nutritional Requirements',
 'Educational Measurement',
 'Dietary Proteins',
 'Micronutrients',
 'Malnutrition',
 'Vitamin A Deficiency',
 'Nutrition Disorders',
 'Nutritional Status',
 'Nutrients',
 'Sports Nutritional Sciences',
 'Nutritional Requirements',
 'Vitamin E',
 'Cholagogues and Choleretics',
 'Anti-Infective Agents, Urinary',
 'Anti-Infective Agents, Local',
 'Fatty Acids, Omega-3',
 'Vitamin B 12',
 'Vitamin B Complex',
 'Vitamin A',
 'Vitamin D',
 'Diuretics, Potassium Sparing',
 'Vaccines, Virus-Like Particle',
 'Amino Acids, Essential',
 'Vitamin K 3',
 'Vitamin K 2',
 'Vitamin B 6',
 'Contraceptives, Oral, Hormonal',
 'Receptors, OSM-LIF',
 'Salivary Proteins and Peptides',
 'Retinol-Binding Proteins, Cellular',
 'Retinol-Binding Proteins, Plasma',
 'Amino Acids, Basic',
 'Fatty Acids, Essential',
 'Fatty Acids, 

In [24]:
len(mesh_query), len(set(mesh_query))

(124, 105)

In [18]:

# QUERY = "Cardiovascualr disease"
composite_query = []
for query in QUERIES:
    mesh_rankings = get_mesh_rankings(query,
                      grpm_meshes,
                      model, mesh_embeddings=mesh_embeddings)
    threshold = 0.85
    mesh_query = filter_mesh_scores(mesh_rankings, threshold)
    composite_query.extend(mesh_query)

composite_query

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [None]:
composite_query

Get Genes and Variants possibly related to the Query

In [None]:
# Filter and get unique results
result = query_dataset(grpm_nutrigen_int, mesh_query, 'mesh')
display(result)

In [None]:
get_stats(result, "gene")

In [None]:
TITABS = """
Relationship between nutritional status and the systemic inflammatory response: micronutrients

Abstract
Micronutrients such as trace elements and vitamins are important as enzyme cofactors in the metabolism of all cells in the body and therefore key to determining nutritional status. The present systematic review examined the evidence of the impact of the systemic inflammatory response on plasma micronutrient status in acute (surgical) and chronic tissue injury. A literature review using targeted subject headings was carried out. Plasma C-reactive protein was used to classify minor (80 mg/l) inflammation. The literature search produced 2344 publications and plasma vitamin D, zinc and carotenoids were most commonly studied and plasma vitamins K, B2 and B6 were least studied. In acute injury thirteen studies (all prospective) and in chronic injury twenty-four studies (largely retrospective) were included in the review. There was consistent evidence that most common measured micronutrients in the plasma (zinc, selenium, vitamins A, D, E, K, B2, B6, B12, C, lutein, lycopene, α- and β-carotene) were significantly lowered from minor to moderate to major inflammation. The results of the present systematic review indicate that most plasma micronutrients fall as part of the systemic inflammatory response irrespective of acute or chronic injury. Therefore, in the presence of a systemic inflammation, plasma micronutrient concentrations should be interpreted with caution. There are a number of methods applied to adjust plasma micronutrient concentrations to avoid misdiagnosis of deficiency. Alternatively, intracellular measurements appear to obviate the need for such plasma adjustment to assess micronutrient status.

Keywords: CRP C-reactive protein; C-reactive protein; Micronutrient status; Systemic inflammation; Trace elements; vitamins.
"""
# TITABS="Diabetic glucose"
topic_terms=pd.Series([TITABS])

# Extract MeSH Query
tab = create_corr_table(topic_terms, grpm_meshes, model, mesh_embeddings)

threshold = 0.99# set similarity threshold
mesh_query = tab[tab.similarity >= threshold].list2.to_list()
print('\n\nMeSH Query:', mesh_query)
tab

    \item [1)] Although the paper provides a link to a directory containing\textit{ sample queries}, it would be very helpful to provide a few examples of  particularly interesting queries that can be written with this system.

    \item [R:] \textcolor{violet}{Si riferisce ala directori tests e al notebook tests.ipynb. alcuni esempi di query particolarmente interessanti che possono essere scritte con questo sistema.  
    Una query di interesse sul datase nutrientico complessivo potrebbe essere la ricerca di varianti putatiamente correlate a due diverse condizioni nutrizionali assemblando una query composita:
    .
    esempio: Relationship between nutritional status and the systemic inflammatory response: micronutrients a livello genetico \cite{https://pubmed.ncbi.nlm.nih.gov/30220267/}
    .
    Query composita di questo tipo può essere composta con l'ausilio di BioBERT embedding per slezionare MESH partendo da una descrizione testuale (come proposto nel notebook "test.ipynb"), una funzinoe utile per l'implementazione di un interfaccia utente. 
    . 
    }
    % c'è tempo per un ultimaimplemetaione?