In [15]:
import importlib
import subprocess

try:
    importlib.import_module('pygrpm')
except ImportError:
    subprocess.check_call(["pip", "install", "git+https://github.com/johndef64/GRPM_system.git"])
    
from pygrpm import *

# GET Datasets

In [ ]:
### GET Datasets ###
get_and_extract('grpm_dataset', record_id='14052302')

In [ ]:
get_and_extract('nutrigenetic_dataset', record_id='14052302')


# LOAD Datasets

In [2]:
### LOAD Datasets ###
pcg_grpm, rna_grpm, pseudo_grpm = grpm_importer()
grpm_nutrigen, grpm_nutrigen_int, grpm_nutrigen_int_gwas = nutrig_importer()


Importing time:  0:00:08.916907
pcg: 776.19 MB
rna: 58.18 MB
pseudo: 1.93 MB
nutrigen dataset: 87.13 MB
nutrigen dataset filtered: 53.62 MB
nutrigen gwas dataset: 20.56 MB


# GET Stats

In [ ]:
### GET Stats ###
result = get_stats(pcg_grpm, group_by = 'gene')
display(result)

In [ ]:
result = get_stats(grpm_nutrigen, group_by = 'gene')
display(result)

# QUERY Dataset 

## Build MeSH Query

In [3]:
## GET MeSH Dataset ##
get_and_extract('ref-mesh', record_id='14052302')
get_topic_terms()

Downloading ref-mesh.zip from https://zenodo.org/record/14052302/files/ref-mesh.zip?download=1


Downloading...
From: https://zenodo.org/record/14052302/files/ref-mesh.zip?download=1
To: G:\Altri computer\Horizon\horizon_workspace\projects\work\semantics\GRPM\GRPM_github\ref-mesh.zip
100%|██████████| 4.02M/4.02M [00:00<00:00, 4.99MB/s]


Extracting...
ZIP file 'ref-mesh.zip' extracted in 'G:\Altri computer\Horizon\horizon_workspace\projects\work\semantics\GRPM\GRPM_github' successfully.


In [3]:
# LOAD MeSH
grpm_mesh = mesh_importer()

# LOAD Language Model
sentence_transformer = load_language_model('dmis-lab/biobert-v1.1')

# Get MeSH embeddings
series2 = grpm_mesh['Preferred Label'].reset_index(drop=True)
mesh_embeddings = extract_embedding(series2.to_list(), sentence_transformer)

GRPM MeSH count: 21705
semantic types: 125


No sentence-transformers model found with name dmis-lab/biobert-v1.1. Creating a new one with mean pooling.


In [10]:
# User defined Topic Terms
topic_terms_sample = ["diet ketogenic",
                      "diet reducing",
                      "diet sodium-restricted",
                      "diet",
                      "dietary",
                      "dietetics",
                      "dyslipidemias",
                      "eating disorders",
                      "feeding and eating disorde",
                      "food hypersensitivity",
                      "foodborne diseases",
                      "gastrointestinal diseases",
                      "hypercholesterolemia",
                      "hyperglycemia",
                      "hyperlipidemias",
                      "hyperphagia",
                      "hypoglycemia",
                      "hypophagia",
                      "insulin resistance",
                      ]
series1 = pd.Series(topic_terms_sample)

In [11]:
# Extract MeSH Query
tab = create_corr_table(series1, series2, sentence_transformer, mesh_embeddings)
mesh_query = tab[tab.similarity >= 0.90].list2.to_list()
print('MeSH Query:', mesh_query)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 19/19 [00:05<00:00,  3.41it/s]

MeSH Query: ['Diet', 'Diet', 'Dyslipidemias', 'Hypersensitivity', 'Gastrointestinal Diseases', 'Hypercholesterolemia', 'Hyperglycemia', 'Hyperlipidemias', 'Hyperphagia', 'Hypoglycemia']





# MeSH Query Dataset

In [12]:
# Filter and get unique results
result = query_dataset(pcg_grpm, mesh_query, 'mesh')
display(result)

Unnamed: 0,gene,type,rsid,pmid,mesh,qualifier,major
5434,MT-ND1,PCG,rs200319905,25579139,Hyperlipidemias,genetics,False
5435,MT-ND1,PCG,rs200319905,25579139,Hyperlipidemias,metabolism,False
5436,MT-ND1,PCG,rs200319905,25579139,Hyperlipidemias,pathology,False
13642,MT-ND2,PCG,rs200319905,25579139,Hyperlipidemias,genetics,False
13643,MT-ND2,PCG,rs200319905,25579139,Hyperlipidemias,metabolism,False
...,...,...,...,...,...,...,...
16606654,CXCL8,PCG,rs572157399,20508232,Hypoglycemia,complications,False
16606655,CXCL8,PCG,rs572157399,20508232,Hypoglycemia,diagnosis,False
16606656,CXCL8,PCG,rs572157399,20508232,Hypoglycemia,epidemiology,True
16607143,CXCL8,PCG,rs572157399,21357364,Hyperglycemia,metabolism,False


# Gene Query Dataset

In [13]:
# Gene Query on Nutrigenetic ds
my_genes = (
    'FTO',
    'APOB',
    'G6PD'
)
# Filter and get unique results
result = query_dataset(grpm_nutrigen_int, my_genes, 'gene')
display(result)

Unnamed: 0,gene,rsid,pmid,mesh,topic,interest_index
0,FTO,rs9972653,32393786,Body Mass Index,General Nutrition,1.000
1,FTO,rs9972653,32393786,"Diabetes Mellitus, Type 2",General Nutrition,1.000
2,FTO,rs9972653,32393786,Diet,General Nutrition,1.000
3,FTO,rs9972653,33128006,Body Mass Index,General Nutrition,1.000
4,FTO,rs9972653,33128006,Cardiovascular Diseases,General Nutrition,1.000
...,...,...,...,...,...,...
1037444,G6PD,rs1050828,22957039,"Anemia, Sickle Cell",Food Intolerances,0.086
1037445,G6PD,rs1050828,32697331,"Anemia, Sickle Cell",Food Intolerances,0.086
1037446,G6PD,rs1050828,31278068,Anemia,Food Intolerances,0.086
1037447,G6PD,rs1050828,29318647,"Anemia, Sickle Cell",Food Intolerances,0.086


In [14]:
# Gene Query on Nutrigenetic-GWAS ds
result = query_dataset(grpm_nutrigen_int_gwas, my_genes, 'GRPM_GENE')
display(result)

Unnamed: 0,GRPM_GENE,GRPM_RSID,GRPM_PMID,GRPM_MESH,GRPM_TOPIC,GRPM_GI,SEMANTIC_SIMILARITY,GWAS_DISEASE/TRAIT,GWAS_MAPPED_TRAIT,GWAS_MAPPED_GENE,GWAS_CONTEXT,GWAS_STRONGEST_SNP-RISK_ALLELE,GWAS_OR-BETA,GWAS_STUDY,GWAS_STUDY_ID
0,FTO,rs9941349,22084931,Obesity,General Nutrition,1.00000,0.909086,Obesity (extreme),obesity,FTO,intron_variant,rs9941349-T,1.480000,Common body mass index-associated variants con...,GCST000426
1,FTO,rs9941349,20442772,Obesity,General Nutrition,1.00000,0.909086,Obesity (extreme),obesity,FTO,intron_variant,rs9941349-T,1.480000,Common body mass index-associated variants con...,GCST000426
2,FTO,rs9941349,24879436,Obesity,General Nutrition,1.00000,0.909086,Obesity (extreme),obesity,FTO,intron_variant,rs9941349-T,1.480000,Common body mass index-associated variants con...,GCST000426
3,FTO,rs9941349,21552555,Obesity,General Nutrition,1.00000,0.909086,Obesity (extreme),obesity,FTO,intron_variant,rs9941349-T,1.480000,Common body mass index-associated variants con...,GCST000426
4,FTO,rs9941349,25014319,Obesity,General Nutrition,1.00000,0.909086,Obesity (extreme),obesity,FTO,intron_variant,rs9941349-T,1.480000,Common body mass index-associated variants con...,GCST000426
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166581,FTO,rs1421085,29540276,Diet,Eating Behavior and Taste Sensation,0.41459,0.948529,Weight,body weight,FTO,intron_variant,rs1421085-C,0.059100,A cross-population atlas of genetic associatio...,GCST90018949
166582,FTO,rs1421085,31358974,Alcohol Drinking,Eating Behavior and Taste Sensation,0.41459,0.913568,Alcohol consumption,alcohol consumption measurement,FTO,intron_variant,rs1421085-T,0.008000,New alcohol-related genes suggest shared genet...,GCST008757
166583,FTO,rs1421085,31358974,Alcohol Drinking,Eating Behavior and Taste Sensation,0.41459,0.903289,Alcohol use disorder,"alcohol use disorder measurement, alcohol depe...",FTO,intron_variant,rs1421085-T,6.690000,Genome-wide association study of alcohol consu...,GCST008259
166584,FTO,rs1421085,31358974,Alcohol Drinking,Eating Behavior and Taste Sensation,0.41459,0.903289,Alcohol use disorder,alcohol use disorder measurement,FTO,intron_variant,rs1421085-T,8.371000,Genetic Underpinnings of the Transition From A...,GCST90301659
