# Import modules

[ In Colab:: load Runtime with GPU ] 

In [15]:
import importlib
import subprocess
import torch
print("Torch version:",torch.__version__)
print("Is CUDA enabled?",torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.randn(1).cuda())
    
try:
    importlib.import_module('pygrpm')
except ImportError:
    subprocess.check_call(["pip", "install", "git+https://github.com/johndef64/GRPM_system.git"])
    
from pygrpm import *

# GET Datasets

In [ ]:
### GET Datasets ###
get_and_extract('grpm_dataset', record_id='14052302')
get_and_extract('nutrigenetic_dataset', record_id='14052302')

# LOAD Datasets

In [17]:
### LOAD Datasets ###
pcg_grpm, rna_grpm, pseudo_grpm = grpm_importer()
grpm_nutrigen, grpm_nutrigen_int, grpm_nutrigen_int_gwas = nutrig_importer()

display(grpm_nutrigen_int)

Importing time:  0:00:09.197935
pcg: 776.19 MB
rna: 58.18 MB
pseudo: 1.93 MB
nutrigen dataset: 87.13 MB
nutrigen dataset filtered: 53.62 MB
nutrigen gwas dataset: 20.56 MB


Unnamed: 0,gene,rsid,pmid,mesh,topic,interest_index
0,FTO,rs9972653,32393786,Body Mass Index,General Nutrition,1.00000
1,FTO,rs9972653,32393786,"Diabetes Mellitus, Type 2",General Nutrition,1.00000
2,FTO,rs9972653,32393786,Diet,General Nutrition,1.00000
3,FTO,rs9972653,33128006,Body Mass Index,General Nutrition,1.00000
4,FTO,rs9972653,33128006,Cardiovascular Diseases,General Nutrition,1.00000
...,...,...,...,...,...,...
1171244,FADS1,rs174545,23221573,Lipid Metabolism,Xenobiotics Metabolism,0.01255
1171245,FADS1,rs174545,30120404,Liver,Xenobiotics Metabolism,0.01255
1171246,FADS1,rs174545,30120404,Non-alcoholic Fatty Liver Disease,Xenobiotics Metabolism,0.01255
1171247,FADS1,rs174544,20565855,Lipid Metabolism,Xenobiotics Metabolism,0.01255


# SHOW Stats

In [ ]:
pcg_grpm_stats = get_stats(pcg_grpm, group_by = 'gene')
display(pcg_grpm_stats)

In [ ]:
grpm_nutrigen_stats = get_stats(grpm_nutrigen, group_by = 'gene')
display(grpm_nutrigen_stats)

# QUERY GRPM Dataset 

## MeSH Query Example

In [27]:
## GET MeSH Dataset ##
get_and_extract('ref-mesh', record_id='14052302')
get_topic_terms()

# LOAD MeSH
grpm_mesh = mesh_importer()
grpm_mesh.head()

Downloading ref-mesh.zip from https://zenodo.org/record/14052302/files/ref-mesh.zip?download=1


Downloading...
From: https://zenodo.org/record/14052302/files/ref-mesh.zip?download=1
To: G:\Altri computer\Horizon\horizon_workspace\projects\work\semantics\GRPM\GRPM_github\ref-mesh.zip
100%|██████████| 4.02M/4.02M [00:01<00:00, 3.36MB/s]


Extracting...
ZIP file 'ref-mesh.zip' extracted in 'G:\Altri computer\Horizon\horizon_workspace\projects\work\semantics\GRPM\GRPM_github' successfully.
GRPM MeSH count: 21705
semantic types: 125


Unnamed: 0,Preferred Label,Semantic Types Label,Class ID,mesh_id,Semantic Types
1,Electronic Health Records,Intellectual Product,http://purl.bioontology.org/ontology/MESH/D057286,D057286,T170
2,Consent Forms,Intellectual Product,http://purl.bioontology.org/ontology/MESH/D032962,D032962,T170
3,Genealogy and Heraldry,Intellectual Product,http://purl.bioontology.org/ontology/MESH/D005789,D005789,T170
4,Publications,Intellectual Product,http://purl.bioontology.org/ontology/MESH/D011642,D011642,T170
5,Pharmaceutical Services,Intellectual Product,http://purl.bioontology.org/ontology/MESH/D010593,D010593,T170


In [40]:
# Random Query Example
mesh_query =  grpm_mesh['Preferred Label'].sample(10).to_list()

# Filter and get unique results
result = query_dataset(pcg_grpm, mesh_query, 'mesh')
display(result)

Unnamed: 0,gene,type,rsid,pmid,mesh,qualifier,major
2120,MT-ND1,PCG,rs199476118,9718301,Lysine,genetics,False
5081,MT-ND1,PCG,rs199476125,15505787,Lysine,genetics,False
10336,MT-ND2,PCG,rs199476118,9718301,Lysine,genetics,False
13235,MT-ND2,PCG,rs199476125,15505787,Lysine,genetics,False
80151,MT-ND6,PCG,rs207460002,30550726,Lysine,metabolism,True
...,...,...,...,...,...,...,...
16483181,VDR,PCG,rs745379948,26855179,Lysine,chemistry,False
16483182,VDR,PCG,rs745379948,26855179,Lysine,metabolism,True
16484612,RPA2,PCG,rs748375829,15314062,Lysine,chemistry,False
16552950,MIA3,PCG,rs17465637,29673405,Cyclin-Dependent Kinase Inhibitor p18,blood,False


## Build MeSH Query

In [3]:
# LOAD Language Model
sentence_transformer = load_language_model('dmis-lab/biobert-v1.1')

# Get MeSH embeddings
series2 = grpm_mesh['Preferred Label'].reset_index(drop=True)
mesh_embeddings = extract_embedding(series2.to_list(), sentence_transformer)

GRPM MeSH count: 21705
semantic types: 125


No sentence-transformers model found with name dmis-lab/biobert-v1.1. Creating a new one with mean pooling.


In [24]:
# User defined Topic Terms
topic_terms_sample = ["diet ketogenic",
                      "diet reducing",
                      "diet sodium-restricted",
                      "diet",
                      "dietary",
                      "dietetics",
                      "dyslipidemias",
                      "eating disorders",
                      "feeding and eating disorde",
                      "food hypersensitivity",
                      "foodborne diseases",
                      "gastrointestinal diseases",
                      "hypercholesterolemia",
                      "hyperglycemia",
                      "hyperlipidemias",
                      "hyperphagia",
                      "hypoglycemia",
                      "hypophagia",
                      "insulin resistance",
                      ]
series1 = pd.Series(topic_terms_sample)

In [25]:
# Extract MeSH Query
tab = create_corr_table(series1, series2, sentence_transformer, mesh_embeddings)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 19/19 [00:07<00:00,  2.54it/s]


In [26]:
mesh_query = tab[tab.similarity >= 0.90].list2.to_list()
print('\n\nMeSH Query:', mesh_query)



MeSH Query: ['Diet', 'Diet', 'Dyslipidemias', 'Hypersensitivity', 'Gastrointestinal Diseases', 'Hypercholesterolemia', 'Hyperglycemia', 'Hyperlipidemias', 'Hyperphagia', 'Hypoglycemia']


## Execute MeSH Query

In [12]:
# Filter and get unique results
result = query_dataset(pcg_grpm, mesh_query, 'mesh')
display(result)

Unnamed: 0,gene,type,rsid,pmid,mesh,qualifier,major
5434,MT-ND1,PCG,rs200319905,25579139,Hyperlipidemias,genetics,False
5435,MT-ND1,PCG,rs200319905,25579139,Hyperlipidemias,metabolism,False
5436,MT-ND1,PCG,rs200319905,25579139,Hyperlipidemias,pathology,False
13642,MT-ND2,PCG,rs200319905,25579139,Hyperlipidemias,genetics,False
13643,MT-ND2,PCG,rs200319905,25579139,Hyperlipidemias,metabolism,False
...,...,...,...,...,...,...,...
16606654,CXCL8,PCG,rs572157399,20508232,Hypoglycemia,complications,False
16606655,CXCL8,PCG,rs572157399,20508232,Hypoglycemia,diagnosis,False
16606656,CXCL8,PCG,rs572157399,20508232,Hypoglycemia,epidemiology,True
16607143,CXCL8,PCG,rs572157399,21357364,Hyperglycemia,metabolism,False


# QUERY Nutrigenetic Dataset

In [42]:
# Gene Query on Nutrigenetic ds
topic =  grpm_nutrigen_int.topic[0]

print(f'Displaying "{topic}" topic')
# Filter and get unique results
result = query_dataset(grpm_nutrigen_int, [topic], 'topic')
display(result)

Displaying "General Nutrition" topic


Unnamed: 0,gene,rsid,pmid,mesh,topic,interest_index
0,FTO,rs9972653,32393786,Body Mass Index,General Nutrition,1.00000
1,FTO,rs9972653,32393786,"Diabetes Mellitus, Type 2",General Nutrition,1.00000
2,FTO,rs9972653,32393786,Diet,General Nutrition,1.00000
3,FTO,rs9972653,33128006,Body Mass Index,General Nutrition,1.00000
4,FTO,rs9972653,33128006,Cardiovascular Diseases,General Nutrition,1.00000
...,...,...,...,...,...,...
268296,DRD2,rs1076560,31948125,Anxiety,General Nutrition,0.01251
268297,DRD2,rs1076560,22829935,Physical Endurance,General Nutrition,0.01251
268298,DRD2,rs1076560,30729689,"Diabetes Mellitus, Type 2",General Nutrition,0.01251
268299,DRD2,rs1076560,30729689,Metabolic Syndrome,General Nutrition,0.01251


## Gene Query Example

In [13]:
# Gene Query on Nutrigenetic ds
my_genes = (
    'FTO',
    'APOB',
    'G6PD'
)
# Filter and get unique results
result = query_dataset(grpm_nutrigen_int, my_genes, 'gene')
display(result)

Unnamed: 0,gene,rsid,pmid,mesh,topic,interest_index
0,FTO,rs9972653,32393786,Body Mass Index,General Nutrition,1.000
1,FTO,rs9972653,32393786,"Diabetes Mellitus, Type 2",General Nutrition,1.000
2,FTO,rs9972653,32393786,Diet,General Nutrition,1.000
3,FTO,rs9972653,33128006,Body Mass Index,General Nutrition,1.000
4,FTO,rs9972653,33128006,Cardiovascular Diseases,General Nutrition,1.000
...,...,...,...,...,...,...
1037444,G6PD,rs1050828,22957039,"Anemia, Sickle Cell",Food Intolerances,0.086
1037445,G6PD,rs1050828,32697331,"Anemia, Sickle Cell",Food Intolerances,0.086
1037446,G6PD,rs1050828,31278068,Anemia,Food Intolerances,0.086
1037447,G6PD,rs1050828,29318647,"Anemia, Sickle Cell",Food Intolerances,0.086


In [14]:
# Gene Query on Nutrigenetic-GWAS ds
result = query_dataset(grpm_nutrigen_int_gwas, my_genes, 'GRPM_GENE')
display(result)

Unnamed: 0,GRPM_GENE,GRPM_RSID,GRPM_PMID,GRPM_MESH,GRPM_TOPIC,GRPM_GI,SEMANTIC_SIMILARITY,GWAS_DISEASE/TRAIT,GWAS_MAPPED_TRAIT,GWAS_MAPPED_GENE,GWAS_CONTEXT,GWAS_STRONGEST_SNP-RISK_ALLELE,GWAS_OR-BETA,GWAS_STUDY,GWAS_STUDY_ID
0,FTO,rs9941349,22084931,Obesity,General Nutrition,1.00000,0.909086,Obesity (extreme),obesity,FTO,intron_variant,rs9941349-T,1.480000,Common body mass index-associated variants con...,GCST000426
1,FTO,rs9941349,20442772,Obesity,General Nutrition,1.00000,0.909086,Obesity (extreme),obesity,FTO,intron_variant,rs9941349-T,1.480000,Common body mass index-associated variants con...,GCST000426
2,FTO,rs9941349,24879436,Obesity,General Nutrition,1.00000,0.909086,Obesity (extreme),obesity,FTO,intron_variant,rs9941349-T,1.480000,Common body mass index-associated variants con...,GCST000426
3,FTO,rs9941349,21552555,Obesity,General Nutrition,1.00000,0.909086,Obesity (extreme),obesity,FTO,intron_variant,rs9941349-T,1.480000,Common body mass index-associated variants con...,GCST000426
4,FTO,rs9941349,25014319,Obesity,General Nutrition,1.00000,0.909086,Obesity (extreme),obesity,FTO,intron_variant,rs9941349-T,1.480000,Common body mass index-associated variants con...,GCST000426
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166581,FTO,rs1421085,29540276,Diet,Eating Behavior and Taste Sensation,0.41459,0.948529,Weight,body weight,FTO,intron_variant,rs1421085-C,0.059100,A cross-population atlas of genetic associatio...,GCST90018949
166582,FTO,rs1421085,31358974,Alcohol Drinking,Eating Behavior and Taste Sensation,0.41459,0.913568,Alcohol consumption,alcohol consumption measurement,FTO,intron_variant,rs1421085-T,0.008000,New alcohol-related genes suggest shared genet...,GCST008757
166583,FTO,rs1421085,31358974,Alcohol Drinking,Eating Behavior and Taste Sensation,0.41459,0.903289,Alcohol use disorder,"alcohol use disorder measurement, alcohol depe...",FTO,intron_variant,rs1421085-T,6.690000,Genome-wide association study of alcohol consu...,GCST008259
166584,FTO,rs1421085,31358974,Alcohol Drinking,Eating Behavior and Taste Sensation,0.41459,0.903289,Alcohol use disorder,alcohol use disorder measurement,FTO,intron_variant,rs1421085-T,8.371000,Genetic Underpinnings of the Transition From A...,GCST90301659
