# GRPM MeSH Screening 

This notebook is engineered to screen the previously retrieved genetic polymorphism data using selected MeSH terms. It works with MeSH sets that are used as hooks to retrieve subsets of genes and polymorphisms from the "GRPM ds" dataset.

In [None]:
#Only for Google Colab
import os
import sys
import subprocess

# @markdown Run in Colab virtual machine by default

# @markdown to run in google drive set:
import_mydrive = False #@param {type:"boolean"}


if 'google.colab' in sys.modules:
    # Install through the kernel's own interpreter so the packages end up in
    # the environment this notebook is actually running in.
    subprocess.run([sys.executable, "-m", "pip", "install", "nbib", "biopython"])

    if import_mydrive:
        from google.colab import drive
        drive.mount('/content/drive')
        workspace = '/content/drive/MyDrive/grpm_system/'
    else:
        workspace = '/content/grpm_system/'

    # A `cd` run in a child process cannot move the notebook kernel;
    # os.chdir changes the working directory of this process.
    os.makedirs(workspace, exist_ok=True)
    os.chdir(workspace)

current_directory = os.getcwd()
print("Current working directory:", current_directory)

# Import Packages

In [1]:
import os
import io
import glob
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import requests
import zipfile

def simple_bool(message):
    """Ask a yes/no question on stdin and return the answer as a bool.

    The prompt shown is ``message`` followed by " (y/n): ". The reply is
    lower-cased and counts as True only if it is one of
    "y", "yes", "yea" or "sure"; anything else is False.
    """
    affirmative = ("y", "yes", "yea", "sure")
    reply = input(message+" (y/n): ")
    return reply.lower() in affirmative

def get_and_extract(file, dir=None):
    """Download ``<file>.zip`` from Zenodo record 8205724 and unpack it.

    Parameters
    ----------
    file : str
        Archive name without the ``.zip`` extension (e.g. 'grpm_dataset').
    dir : str, optional
        Destination folder for the extracted content. When omitted, the
        working directory at call time is used. (A default of
        ``os.getcwd()`` in the signature would be frozen at definition time
        and ignore any later change of directory.)

    Prints a success message after extraction, or a failure message when the
    server does not answer with HTTP 200. Returns None.
    """
    if dir is None:
        dir = os.getcwd()

    url = "https://zenodo.org/record/8205724/files/"+file+".zip?download=1"
    zip_file_name = file+".zip"
    extracted_folder_name = dir

    # Download the ZIP file; the timeout keeps a stalled connection from
    # hanging the notebook forever (connect timeout, read-inactivity timeout).
    response = requests.get(url, timeout=(30, 300))

    if response.status_code != 200:
        print("Failed to download the ZIP file.")
        return

    # Extract the ZIP contents straight from memory
    with io.BytesIO(response.content) as zip_buffer:
        with zipfile.ZipFile(zip_buffer, 'r') as zip_ref:
            zip_ref.extractall(extracted_folder_name)
    print(f"ZIP file '{zip_file_name}' extracted to '{extracted_folder_name}' successfully.")

# Get requirements

In [None]:
# Optional downloads from the Zenodo repository
# https://zenodo.org/record/8205724  DOI: 10.5281/zenodo.8205724
#
# Each entry pairs an archive name with the question asked before fetching it.
zenodo_archives = (
    ('grpm_dataset', 'Download pre-made GRPM-Dataset from Zenodo? (6.5 minutes in Colab)'),
    ('ref-mesh-archive', 'Download pre-made ref-mesh-archive from Zenodo?'),
)

for archive_name, question in zenodo_archives:
    if not simple_bool(question):
        continue
    timea = datetime.now()
    get_and_extract(archive_name)
    print('Download and extraction time ',datetime.now()-timea)

## Import GRPM dataset (required)

In [2]:
# ---- Load the GRPM report and the GRPM table ------------------------------

# Database selector (tag -> content, as listed by the notebook author):
#   'pcg'    -> protein coding genes
#   'rna'    -> rna genes
#   'pseudo' -> pseudogenes
# The folder that is read is 'grpm_dataset/grpm_db_<tag>'.
db_tag      = 'pcg'

db_name = 'grpm_db_'+ db_tag
db_path = 'grpm_dataset/'+db_name

print('importing GRPM Dataset...')

# The report CSV is transposed so that every row is a gene; the former
# index then becomes the 'gene' column.
report_as_stored = pd.read_csv(db_path+'/GRPM_report.csv',index_col=0)
GRPM_report = report_as_stored.transpose().reset_index().rename(columns={'index':'gene'})
grpm_genes_list = GRPM_report.gene.to_list()

# Read the four working columns of the main table and time the import.
time_load_1 = datetime.now()

columns = ['gene', 'rsid', 'pmids', 'mesh']
grpm_dataset = pd.read_csv(db_path+'/grpm_table_output.csv', usecols=columns)
grpm_dataset['pmids'] = grpm_dataset['pmids'].astype(str)  # PMIDs are handled as strings

time_load_2 = datetime.now()
print('time load:',time_load_2-time_load_1)

importing GRPM Dataset...
time load: 0:01:02.019286


## Subset GRPM Dataset (optional)

In [None]:
subset_grpm = simple_bool('Do you want to use a custom gene list to subset GRPM Dataset?')
if subset_grpm:
    # Candidate gene-list files: every .csv/.tsv in the working directory,
    # sorted so that the index shown in the prompt is stable between runs.
    file_csv = sorted(name for name in os.listdir() if name.endswith((".csv", ".tsv")))

    filenum = input('import your custom gene list (.csv)\nselect file index: \n'+str(pd.Series(file_csv)))
    chosen_file = file_csv[int(filenum)]

    time1 = datetime.now()
    # .tsv files are offered in the list above, so read them tab-separated.
    separator = '\t' if chosen_file.endswith('.tsv') else ','
    subset_table = pd.read_csv(chosen_file, sep=separator)
    column_index = int(input('select column index:\n'+ str(pd.Series(subset_table.columns))))
    # Unique gene symbols of the chosen column, with blanks removed.
    subset_genes = subset_table[subset_table.columns[column_index]].drop_duplicates().str.replace(' ','')

    # subsetting GRPM_report and grpm_dataset
    GRPM_report_subset = GRPM_report[GRPM_report['gene'].isin(subset_genes)]
    grpm_subset = grpm_dataset[grpm_dataset['gene'].isin(subset_genes)]
    # Report the time spent here (since time1), not the dataset load time.
    print("You're using a GRPM Dataset partition\ntime subsetting:",datetime.now()-time1)
    display(GRPM_report_subset)

# Define context
    - gene list
    - survey directory
    - ref-mesh list

## Check available ref-MeSH lists
## Set directory/import data

In [3]:
# Check available refs:
ref_path = "ref-mesh-archive/"  # Replace with the actual ref mesh path

#---------------------------------
#use random mesh list?
random_mesh = False
if random_mesh:
    ref_path = "ref-mesh-archive/random_lists/"
#---------------------------------

# Create a file path pattern to match CSV files
file_pattern = os.path.join(ref_path, "*.csv")

# glob returns matches in arbitrary order; sorting keeps the index shown in
# the selection prompt stable between runs and machines.
csv_files = sorted(glob.glob(file_pattern))
csv_files_name = [os.path.basename(path) for path in csv_files]

pd.set_option('display.max_rows', 100)
print('Available reference mesh lists:')
# dtype=str keeps the .str accessor usable even when no file was found
csv_files_df = pd.Series(csv_files_name, dtype=str)

# Topic tag = file name without the 'ref_mesh_' prefix (regular lists) or
# without the extension only (random lists).
tag_pattern = r'(.*)\.csv' if random_mesh else r'ref_mesh_(.*)\.csv'
csv_file_tag = csv_files_df.str.extract(tag_pattern, expand=False).dropna().reset_index(drop=True)

if csv_file_tag.empty:
    raise FileNotFoundError("No reference MeSH list (.csv) found in '" + ref_path + "'")


#------------------------------------------------------
# define directory folder path:
survey_path = 'grpm_surveys/' # keep default to use root path

# choose ref_mesh.csv tab:
topic_tag   = csv_file_tag[int(input('Select index from available ref-mesh list:\n'+str(csv_file_tag)))]
add         = ''    # additional survey directory tag
#------------------------------------------------------


# (1) Create survey directory:
survey_path = survey_path+'grpm_random/' if random_mesh else survey_path
directory = survey_path + 'grpm_survey_' + db_tag + '_' + topic_tag + add
os.makedirs(directory, exist_ok=True)


# (2) Import Mesh-reference list:
ref_filename = "ref_mesh_" + topic_tag + ".csv" if not random_mesh else topic_tag + ".csv"
ref = pd.read_csv(ref_path + ref_filename, index_col=0)

# Lists without a 'mesh' column carry the terms under 'Preferred Label'.
if 'mesh' not in ref.columns:
    ref = ref.rename(columns={'Preferred Label': 'mesh'})

ref_mesh_n = ref.mesh.nunique()
ref_mesh_list = ref['mesh'].drop_duplicates()

#----------------------------------------------------------
print('\n', ref_mesh_list)

Available reference mesh lists:

 0             Physical Endurance
1              Nutrition Therapy
5               Nutrition Policy
6                 Motor Activity
7          Adipose Tissue, Brown
                 ...            
623    Body Weights and Measures
627                  Body Weight
628      Body Weight Maintenance
629                   Overweight
630                      Obesity
Name: mesh, Length: 243, dtype: object


# Get GRPM dataset Metrics [required]


In [23]:
# GET GRPM whole dataset Metrics
# (A bare `%time` line times an empty statement and always reports 0 ns, so it
# was dropped; both steps below are timed explicitly.)

# Key that identifies one (pmid, mesh) pair, used to count unique pairs per gene.
time_1 = datetime.now()
grpm_dataset['pmidmesh'] = grpm_dataset['pmids']+grpm_dataset['mesh']
print(datetime.now() -time_1)

# Per-gene count / unique / top / freq for every column.
time_1 = datetime.now()
grpm_gene_metrics = grpm_dataset[['gene', 'rsid', 'pmids', 'mesh', 'pmidmesh']].groupby('gene').describe(include='all')
print(datetime.now() -time_1)
grpm_gene_metrics.head()

CPU times: total: 0 ns
Wall time: 0 ns
0:00:03.064639
0:01:09.662249


Unnamed: 0_level_0,rsid,rsid,rsid,rsid,pmids,pmids,pmids,pmids,mesh,mesh,mesh,mesh,pmidmesh,pmidmesh,pmidmesh,pmidmesh
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
gene,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
A1BG,118,4,rs893184,68,118,7,23690342,40,118,75,Hypertension,8,118,94,32279138HIV-1,3
A1CF,362,7,rs10821905,185,362,23,29437585,31,362,137,Humans,22,362,318,30529582Colorectal Neoplasms,4
A2M,1160,21,rs669,676,1160,57,32747830,92,1160,365,Humans,61,1160,887,24011543Amyloid beta-Peptides,8
A2ML1,1065,32,rs863224951,119,1065,13,31009165,575,1065,117,Otitis Media,51,1065,227,31009165Adolescent,23
A3GALT2,19,1,rs376200069,19,19,1,28506304,19,19,11,Adenocarcinoma,3,19,11,28506304Adenocarcinoma,3


# Filter GRPM dataset by MeSH [required]

In [5]:
# Filter the GRPM dataset with the reference MeSH list.
# (A bare `%time` line times an empty statement, so it was dropped; the step is
# timed explicitly with time_1.)

# Preprocessing:
time_1 = datetime.now()
genes = grpm_dataset.gene.drop_duplicates().to_list()

# Subset GRPM ds with ref-mesh: keep rows whose MeSH term is in the reference list
mask_full = grpm_dataset['mesh'].isin(ref_mesh_list)
grpm_match_full = grpm_dataset[mask_full].drop_duplicates()

# save the filtered table in the survey directory
grpm_match_full[['gene', 'rsid', 'pmids', 'mesh']].to_csv(directory+'/grpmx_filtered_output.csv')
print(datetime.now() -time_1)


CPU times: total: 0 ns
Wall time: 0 ns
0:00:01.934377


# Get GRPM Subset Metrics [required]

In [6]:
# Per-gene metrics of the MeSH-filtered subset.
# (A bare `%time` line times an empty statement, so it was dropped; the groupby
# is timed explicitly with time_1.)
grpm_match_full['pmidmesh'] = grpm_match_full['pmids']+grpm_match_full['mesh']
time_1 = datetime.now()
grpm_match_gene_metrics = grpm_match_full[['gene', 'rsid', 'pmids', 'mesh', 'pmidmesh']].groupby('gene').describe(include='all')
print(datetime.now() -time_1)
grpm_match_gene_metrics.head()

Unnamed: 0,gene,rsid,pmids,mesh,pmidmesh
45,MT-ND1,rs111033358,33468709,"Diabetes Mellitus, Type 2","33468709Diabetes Mellitus, Type 2"
51,MT-ND1,rs111033358,33468709,Insulin,33468709Insulin
52,MT-ND1,rs111033358,33468709,Insulin Resistance,33468709Insulin Resistance
1262,MT-ND1,rs1599988,28693754,Weight Gain,28693754Weight Gain
1281,MT-ND1,rs28358585,28693754,Weight Gain,28693754Weight Gain
...,...,...,...,...,...
16608733,EVC,rs2291157,23139751,Health,23139751Health
16608800,EVC,rs6414624,29273463,"Diabetes Mellitus, Type 2","29273463Diabetes Mellitus, Type 2"
16609420,TMC3,rs150843673,29691411,"Diabetes Mellitus, Type 2","29691411Diabetes Mellitus, Type 2"
16609428,TMC3,rs150843673,29691411,Obesity,29691411Obesity


# Other GRPM Subset Metrics [optional]

In [11]:
# PMID statistics per rsid in the filtered subset, then the ten rsids with the
# highest number of distinct PMIDs.
time_1 = datetime.now()
rsid_pmid_pairs = grpm_match_full[['rsid', 'pmids']]
grpm_match_rsid_metrics = rsid_pmid_pairs.groupby('rsid').describe(include='all')
print(datetime.now() -time_1)

grpm_match_rsid_metrics_sort = grpm_match_rsid_metrics.sort_values(by=('pmids','unique'), ascending= False)
top10rsid = grpm_match_rsid_metrics_sort.index[:10].tolist()
top10rsid

0:01:00.944193


In [21]:
# PMID statistics per MeSH term in the filtered subset, then the ten terms with
# the highest number of distinct PMIDs.
time_1 = datetime.now()
mesh_pmid_pairs = grpm_match_full[['pmids', 'mesh']]
grpm_match_mesh_metrics = mesh_pmid_pairs.groupby('mesh').describe(include='all')
print(datetime.now() -time_1)

grpm_match_mesh_metrics_sort = grpm_match_mesh_metrics.sort_values(by=('pmids','unique'), ascending= False)
top10mesh = grpm_match_mesh_metrics_sort.index[:10].tolist()
top10mesh

0:00:00.589811


['Diabetes Mellitus, Type 2',
 'Obesity',
 'Inflammation',
 'Body Mass Index',
 'Hypertension',
 'Insulin',
 'Insulin Resistance',
 'Family Health',
 'Diet',
 'Metabolic Syndrome']

In [ ]:
# Gene / rsid / mesh statistics per PMID in the filtered subset.
time_1 = datetime.now()
per_pmid_columns = grpm_match_full[['gene', 'rsid', 'pmids', 'mesh']]
grpm_match_pmids_metrics = per_pmid_columns.groupby('pmids').describe(include='all')
print(datetime.now() -time_1)
grpm_match_pmids_metrics.head()

0:00:57.947976 x 3
0:01:14.471060 x 4

# Run Survey

In [61]:
from tqdm import tqdm
import random

#---------------------------------------------
# Edit saving options:
checkpoint = 500 # write Report each "n" genes

include_top10 = True # set False for a faster job (skips the per-gene groupby analysis)
save_plot = False  # only if include_top10 = True

run_sample = False # set True just to run a test
sample_size = 10

gene_start = 0 # position in the gene list where a full run starts (0 = whole list)

df_report_complete = pd.DataFrame()
#---------------------------------------------


# Define grpm subset
if 'subset_grpm' in locals() and subset_grpm:
    GRPM_report = GRPM_report_subset
    grpm_dataset = grpm_subset
    # Survey only the genes of the custom partition (grpm_genes_list still
    # holds the genes of the whole dataset).
    gene_pool = GRPM_report.gene.to_list()
else:
    gene_pool = grpm_genes_list

# define gene list
if run_sample:
    genes = random.sample(gene_pool, sample_size)
else:
    genes = gene_pool[gene_start:]

# Rows are collected in plain lists and turned into a DataFrame only when the
# report is written; concatenating inside the loop is quadratic.
report_rows = []   # one dict of metrics per processed gene
report_genes = []  # row labels matching report_rows

time_start = datetime.now()

for gene_position, gene in enumerate(tqdm(genes)):

    time_alpha = datetime.now()
    timestamp = time_alpha.strftime('%Y%m%d%H%M%S')

    # Pre-Selection Metrics ===========
    grpm_gene_metrics_gene = grpm_gene_metrics.loc[gene]
    starting_pmid          =  grpm_gene_metrics_gene.pmids.loc['unique']
    starting_mesh          =  grpm_gene_metrics_gene.mesh.loc['unique']
    lit1_rsid              =  grpm_gene_metrics_gene.rsid.loc['unique']
    starting_pmidmesh      =  grpm_gene_metrics_gene.pmidmesh.loc['unique']

    #  Post-Selection Metrics ===========
    has_match = gene in grpm_match_gene_metrics.index
    if has_match:
        grpm_match_gene_metrics_gene = grpm_match_gene_metrics.loc[gene]
        matching_rsid                = grpm_match_gene_metrics_gene.rsid.loc['unique']
        dropped_rsid                 = lit1_rsid - matching_rsid
        matching_pmids               = grpm_match_gene_metrics_gene.pmids.loc['unique']
        matching_mesh                = grpm_match_gene_metrics_gene.mesh.loc['unique']
        matching_pmidmesh            = grpm_match_gene_metrics_gene.pmidmesh.loc['unique']
    else:
        matching_rsid                = 0
        dropped_rsid                 = lit1_rsid - 0
        matching_pmids               = 0
        matching_mesh                = 0
        matching_pmidmesh            = 0


    #------------------------
    if not include_top10:
        matching_rsid_pmid10  = 'na'
        matching_rsid_pmid100 = 'na'
        top10rsid             = 'na'
        top10mesh             = 'na'
    elif not has_match:
        # Nothing matched for this gene: report empty results directly instead
        # of running groupby.describe on an empty frame (no plot is saved).
        matching_rsid_pmid10  = 0
        matching_rsid_pmid100 = 0
        top10rsid             = []
        top10mesh             = []
    else:
        #=====================================
        # Get gene_grpm (slow step)
        grpm_gene = grpm_match_full.loc[grpm_match_full['gene'] == gene]

        dfmatch_full = grpm_gene
        dfmatch_less = dfmatch_full[['pmids', 'rsid', 'mesh']].drop_duplicates()
        #=====================================

        #=====================================
        ## 1. groupby.describe analysis by [rsid]
        dfmatch_less_rsid = dfmatch_less.groupby('rsid').describe().reset_index()
        dfmatch_less_rsid.columns = dfmatch_less_rsid.columns.to_flat_index()
        # simplify df.groupby.describe, convert Multicolumn to single column
        new_column_names = ['rsid', 'pmid-count', 'pmid-unique','pmid-top','pmid-freq','mesh-count', 'mesh-unique','mesh-top','mesh-freq']
        dfmatch_less_rsid.columns = new_column_names

        ### statistics: rsids supported by more than 10 / more than 100 distinct PMIDs
        matching_rsid_pmid10 = len(dfmatch_less_rsid[dfmatch_less_rsid['pmid-unique']>10])
        matching_rsid_pmid100 = len(dfmatch_less_rsid[dfmatch_less_rsid['pmid-unique']>100])

        ### sorting, top10
        dfmatch_less_rsidless = dfmatch_less_rsid[['rsid','pmid-unique','mesh-unique']]
        dfmatch_less_rsidlesssort = dfmatch_less_rsidless.sort_values(by='pmid-unique', ascending= False).reset_index(drop=True)
        top10rsid = dfmatch_less_rsidlesssort['rsid'][:10].tolist()
        #==================================

        #==================================
        ## 2. groupby.describe analysis by [mesh]
        dfmatch_less_mesh = dfmatch_less.groupby('mesh').describe().reset_index()
        dfmatch_less_mesh.columns = dfmatch_less_mesh.columns.to_flat_index()
        # simplify df.groupby.describe, convert Multicolumn to single column
        new_column_names = ['mesh', 'pmid-count', 'pmid-unique','pmid-top','pmid-freq','rsid-count', 'rsid-unique','rsid-top','rsid-freq']
        dfmatch_less_mesh.columns = new_column_names
        dfmatch_less_mesh_less = dfmatch_less_mesh[['mesh','pmid-unique','rsid-unique']]

        ### add frequency (share of the gene's matching PMIDs per term), top10
        samplepmid_count = len(dfmatch_less.pmids.drop_duplicates())
        dfmatch_less_mesh_less_frq = dfmatch_less_mesh_less.copy()
        mesh_frq = dfmatch_less_mesh_less_frq.loc[:,'pmid-unique'].astype(float)/samplepmid_count
        dfmatch_less_mesh_less_frq.loc[:,'mesh frequency'] = round(mesh_frq,3)#*100
        dfmatch_less_mesh_less_frqsort = dfmatch_less_mesh_less_frq.sort_values(by='pmid-unique',ascending=False).reset_index(drop=True)
        top10mesh = dfmatch_less_mesh_less_frqsort['mesh'][:10].tolist()
        #==================================

        if save_plot:
            # create a scatter plot of the 30 most frequent MeSH terms
            x = dfmatch_less_mesh_less_frqsort['mesh'].head(30)
            y = dfmatch_less_mesh_less_frqsort['pmid-unique'].head(30)
            plt.figure(figsize=(5, 8))
            plt.title('Scatter Plot: '+gene+' pmid-mesh (filtered)', loc='center',pad=10)
            plt.scatter(y, x)
            plt.gca().invert_yaxis()
            plt.tick_params(axis='x', which='both', top=True, bottom=False, labeltop=True, labelbottom=False)
            plt.xlabel('pmid count', position=(0.5, 1.08))
            ax = plt.gca()
            ax.xaxis.set_label_position('top')
            plt.savefig(directory+'/'+gene+'_mesh_plot_'+timestamp+'_filtered.png',dpi=120, bbox_inches = "tight")
            plt.close()


    # Collect REPORT data ==================================

    report = { 'reference_mesh': ref_mesh_n,
               'starting_pmidmesh': starting_pmidmesh,
               'starting_pmid' : starting_pmid,
               'starting_mesh': starting_mesh,
               'starting_rsid': lit1_rsid,
               'matching_pmidmesh': matching_pmidmesh,
               'matching_pmids': matching_pmids,
               'matching_mesh': matching_mesh,
               'matching_rsid': matching_rsid,
               'dropped_rsid': dropped_rsid,
               'matching_mesh_ratio': round((matching_mesh/starting_mesh),3),
               'matching_pmids_ratio': round((matching_pmids/starting_pmid),3),
               'matching_pmidmesh_ratio':  round((matching_pmidmesh/starting_pmidmesh),3),
               'matching_rsid_ratio': round((matching_rsid/lit1_rsid),3),
               'matching_rsid_pmid10': matching_rsid_pmid10,
               'matching_rsid_pmid100': matching_rsid_pmid100,
               'matching_top10mesh':str(top10mesh),
               'matching_top10rsid':str(top10rsid),
               }

    # UPDATE REPORT ------------
    report_rows.append(report)
    report_genes.append(gene)

    # save checkpoint----------------------
    # Written transposed, i.e. in the same layout as the final report, so an
    # interrupted run leaves a file the later cells can still read.
    if gene_position > 1 and gene_position % checkpoint == 0:
        pd.DataFrame(report_rows, index=report_genes).T.to_csv(directory+'/GRPMX_report.csv')
    #==================================


# One row per gene, one column per metric
df_report_complete = pd.DataFrame(report_rows, index=report_genes)

# Save report csv  (saving the transposed version is a code atavism)
df_report_complete.T.to_csv(directory+'/GRPMX_report.csv')

# Update gene values (remove previous gene entry): a gene present twice is
# read back as '<gene>' and '<gene>.1'; keep the later column under the plain name.
GRPMX_report = pd.read_csv(directory+'/GRPMX_report.csv', index_col=0)
time_load_1 = datetime.now()
for gene in grpm_genes_list:
    if gene+'.1' in GRPMX_report.columns:
        GRPMX_report = GRPMX_report.drop(columns = gene)
        GRPMX_report = GRPMX_report.rename(columns={gene+'.1': gene})

print(datetime.now() - time_load_1)
GRPMX_report.to_csv(directory+'/GRPMX_report.csv')

time_finish = datetime.now()
time_batch = time_finish - time_start

# Append the timing of this run to the log; mode 'a' creates the file when missing.
with open('run_time.txt', 'a') as file:
    file.write(topic_tag+':\n\ttime batch: '+str(time_batch)+'\n\truntime/gene: '+str(time_batch/len(genes))+'\n\n')

print('time batch:',time_batch)
print('runtime/gene:', time_batch/len(genes))

100%|██████████| 15519/15519 [10:28<00:00, 24.67it/s]


0:00:00.022980
time batch: 0:10:30.102083
runtime/gene: 0:00:00.040602


In [56]:
# Show the survey report built by the survey cell (one row per gene).
df_report_complete

Unnamed: 0,reference_mesh,starting_pmidmesh,starting_pmid,starting_mesh,starting_rsid,matching_pmidmesh,matching_pmids,matching_mesh,matching_rsid,dropped_rsid,matching_mesh_ratio,matching_pmids_ratio,matching_pmidmesh_ratio,matching_rsid_ratio,matching_rsid_pmid10,matching_rsid_pmid100,matching_top10mesh,matching_top10rsid
MT-ND1,243,3603,269,764,105,51,40,40,27,78,0.052,0.149,0.014,0.257,na,na,na,na
MT-ND2,243,3958,292,795,123,57,46,46,29,94,0.058,0.158,0.014,0.236,na,na,na,na
MT-CO1,243,2308,163,558,97,29,26,26,22,75,0.047,0.160,0.013,0.227,na,na,na,na
MT-CO2,243,1330,94,422,67,21,17,17,14,53,0.040,0.181,0.016,0.209,na,na,na,na
MT-ATP8,243,1802,127,518,77,23,19,19,14,63,0.037,0.150,0.013,0.182,na,na,na,na
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CRISP1,243,22,2,19,2,0,0,0,0,0,0.000,0.000,0.000,0.000,na,na,na,na
TMC3,243,50,4,40,3,3,2,2,1,2,0.050,0.500,0.060,0.333,na,na,na,na
MPIG6B,243,210,16,107,7,3,3,3,4,3,0.028,0.188,0.014,0.571,na,na,na,na
PDP1,243,241,19,179,13,2,2,2,1,12,0.011,0.105,0.008,0.077,na,na,na,na


GRPM-Survey Version 2.0

OB-BMI (full)
time batch: 0:42:36.831920
runtime/gene: 0:00:00.164755

======================================
GRPM-Survey Version 3.0
OB-BMI (partial)
time batch: 0:00:49.547031
runtime/gene: 0:00:00.003193

OB-BMI (full)
time batch: 0:10:30.102083
runtime/gene: 0:00:00.040602

In [None]:
# Preview the saved report with genes as rows. index_col=0 restores the metric
# names as the index, so after the transpose they become the column headers
# instead of appearing as an extra 'Unnamed: 0' data row.
pd.read_csv(directory+'/GRPMX_report.csv', index_col=0).T.head(20)

In [None]:
# Reload the filtered table saved in the survey directory and count the
# distinct genes and MeSH terms it contains.
filtered_csv_path = directory+'/grpmx_filtered_output.csv'
df_read = pd.read_csv(filtered_csv_path, index_col=0)

for label, column in (('genes matching:', 'gene'), ('mesh matching:', 'mesh')):
    print(label, df_read[column].nunique())
print('apply threshold in Analyzer Module')
df_read

# Check results

In [None]:
# Visualize GRPMX_report.csv
# The file stores metrics as rows and genes as columns; transpose it back to
# one row per gene with the gene symbol in a 'gene' column.
GRPMX_report = pd.read_csv(directory+'/GRPMX_report.csv', index_col=0).transpose().reset_index().rename(columns={'index':'gene'})

# Copying the gene list is a convenience only: kernels without a clipboard
# backend (e.g. Colab) raise here, which must not abort the cell.
try:
    GRPMX_report.gene.drop_duplicates().to_clipboard()
except Exception as clipboard_error:
    print('Clipboard not available, gene list not copied:', clipboard_error)
print('Genes matching:',len(GRPMX_report.gene.drop_duplicates()))

# Everything read back from the transposed CSV is text: restore numeric dtypes.
count_columns = ['reference_mesh', 'starting_pmidmesh', 'starting_pmid','starting_mesh','starting_rsid',
                 'matching_pmidmesh', 'matching_pmids', 'matching_mesh','matching_rsid', 'dropped_rsid']
ratio_columns = ['matching_mesh_ratio', 'matching_pmids_ratio','matching_pmidmesh_ratio', 'matching_rsid_ratio']

GRPMX_report[count_columns] = GRPMX_report[count_columns].astype(int)
GRPMX_report[ratio_columns] = GRPMX_report[ratio_columns].astype(float)

columns_to_keep = ['matching_pmids','matching_pmids_ratio','matching_mesh','matching_rsid']
GRPMX_report_less = GRPMX_report[columns_to_keep]

sorting_column = 'matching_pmids'
GRPMX_report_sort = GRPMX_report.sort_values(by=sorting_column, ascending=False)

columns_to_display = ['gene', 'matching_pmidmesh', 'matching_pmids',
                      'matching_mesh', 'matching_rsid', 'dropped_rsid', 'matching_mesh_ratio',
                      'matching_pmids_ratio', 'matching_pmidmesh_ratio',
                      'matching_rsid_ratio']
GRPMX_report_display = GRPMX_report[columns_to_display]
GRPMX_report_display

In [None]:
# Horizontal bar chart: the 40 genes with the most matching PMIDs
GRPMX_report_sort = GRPMX_report.sort_values(by= 'matching_pmids',ascending=False)

top_by_pmids = GRPMX_report_sort.iloc[:40]
x = top_by_pmids.gene
y = top_by_pmids['matching_pmids']

# Figure height grows with the number of bars
fig, ax = plt.subplots(figsize=(5, len(x)*0.2))
ax.set_title('Matching PMIDs in Database', loc='center', pad=10)

ax.barh(x, y)
ax.invert_yaxis()  # highest count on top
# x ticks and x label are drawn on the top edge of the plot
ax.tick_params(axis='x', which='both', top=True, bottom=False, labeltop=True, labelbottom=False)
ax.set_ylabel('genes')
ax.set_xlabel('matching pmid', position=(0.5, 1.08))
ax.xaxis.set_label_position('top')

plt.show()

In [None]:
# Add "interest value" to report
# GRPMX_report_int is the same object as GRPMX_report (no copy is made), so
# the two columns added here are also visible on GRPMX_report.
GRPMX_report_int = GRPMX_report
max_match_pmids = int(GRPMX_report['matching_pmids'].max())

# matching PMIDs scaled by the largest value found in the report
pmids_score = (GRPMX_report_int['matching_pmids'] / max_match_pmids).round(3)
GRPMX_report_int['matching_pmids_score'] = pmids_score

# scaled score weighted by the gene's matching-PMID ratio
weighted_score = GRPMX_report_int['matching_pmids_score'] * GRPMX_report_int['matching_pmids_ratio']
GRPMX_report_int['interest_value'] = weighted_score.round(3)

GRPMX_report_int.set_index('gene').sort_values(by='interest_value')#.T

In [None]:
# Matching PMIDs in Database (normalised score)
# The report has no 'matching_pmids_index' column; the normalised value added
# by the "interest value" cell is stored as 'matching_pmids_score'.
score_column = 'matching_pmids_score'
GRPMX_report_sort = GRPMX_report.sort_values(by= score_column,ascending=False)

x = GRPMX_report_sort.gene.iloc[:100]
y = GRPMX_report_sort[score_column].iloc[:100]
plt.figure(figsize=(5, len(x)*0.2))  # figure height grows with the number of bars
plt.title('Matching PMIDs in Database', loc='center',pad=10)

plt.barh(x,y)
plt.gca().invert_yaxis()  # highest score on top
plt.tick_params(axis='x', which='both', top=True, bottom=False, labeltop=True, labelbottom=False)
plt.ylabel('genes')
plt.xlabel('matching pmid', position=(0.5, 1.08))
ax = plt.gca()
ax.xaxis.set_label_position('top')

plt.show()

# Extra

Simple GRPM Subsetting

In [None]:
# Preview the first rows of the GRPM table before the ad-hoc filtering below.
grpm_dataset.head()

## filter by mesh

In [None]:
# Keep only the rows annotated with one MeSH term and save them to CSV
import time
timea = time.time()

my_mesh = 'Heart Failure'
rows_with_term = grpm_dataset.mesh == my_mesh
filteres_grpm = grpm_dataset[rows_with_term].reset_index(drop=True)

elapsed_minutes = (time.time()-timea)/60
print(elapsed_minutes,'minutes')
filteres_grpm.to_csv('filteres_grpm_heart_fail.csv')
filteres_grpm

In [None]:
# Reload the 'Heart Failure' subset from its CSV file, with PMIDs as strings
filteres_grpm = pd.read_csv('filteres_grpm_heart_fail.csv', index_col=0)
filteres_grpm['pmids'] = filteres_grpm['pmids'].astype('str')
filteres_grpm

In [None]:
# Analyze data with the "groupby.describe" method

## 1. groupby.describe analysis by [gene], ranked by number of distinct PMIDs
filteres_grpm_gene = filteres_grpm.groupby('gene').describe().reset_index()
gene_pmid_stats = filteres_grpm_gene[['gene','pmids']]
gene_pmid_stats.sort_values(by=('pmids', 'unique'), ascending=False).reset_index(drop=True)

## filter by gene

In [None]:
# Keep only the rows of one gene and save them to CSV
import time
timea = time.time()
my_mesh = ref.mesh[0]
my_gene = 'GLA'
rows_of_gene = grpm_dataset.gene == my_gene
filteres_grpm = grpm_dataset[rows_of_gene].reset_index(drop=True)
elapsed_minutes = (time.time()-timea)/60
print(elapsed_minutes,'minutes')

filteres_grpm.to_csv('filteres_grpm.csv')

In [None]:
# Reload the single-gene subset from its CSV file and keep the rows whose MeSH
# term mentions 'Fabry'.
filteres_grpm = pd.read_csv('filteres_grpm.csv', index_col=0)
# na=False: rows without a MeSH term are excluded instead of breaking the mask
filteres_grpm_sub = filteres_grpm[filteres_grpm.mesh.str.contains('Fabry', na=False)].reset_index(drop=True)
# Convert the subset's own PMIDs to str. Taking them from the unfiltered frame
# would align on the re-numbered index and attach PMIDs of unrelated rows.
filteres_grpm_sub['pmids'] = filteres_grpm_sub['pmids'].astype('str')
filteres_grpm_sub

In [None]:
#Analyze data with "groupby.describe" method

## groupby.describe analysis by [pmids] (computed on filteres_grpm)
grouped_by_pmid = filteres_grpm.groupby('pmids')
filteres_grpm_sub_pmids = grouped_by_pmid.describe().reset_index()
filteres_grpm_sub_pmids

In [None]:
## groupby.describe analysis by [rsid] (computed on filteres_grpm)
grouped_by_rsid = filteres_grpm.groupby('rsid')
filteres_grpm_sub_rsid = grouped_by_rsid.describe().reset_index()
filteres_grpm_sub_rsid

In [None]:
## groupby.describe analysis by [mesh] (computed on filteres_grpm)
grouped_by_mesh = filteres_grpm.groupby('mesh')
filteres_grpm_mesh = grouped_by_mesh.describe().reset_index()
filteres_grpm_mesh

In [None]:
# NOTE(review): `filteres_grpm_` is not defined in the visible notebook, so this
# cell raises NameError on a fresh kernel. Presumably one of the describe()
# tables built in the previous cells was meant — TODO confirm which one, or
# delete the cell.
filteres_grpm_[filteres_grpm_.mesh.str.contains('Fabry')]

In [None]:

# NOTE(review): scratch cell. `filteres_grpm_` is not defined in the visible
# notebook (NameError on a fresh kernel).
# Flatten the column index into a flat index of tuples.
filteres_grpm_.columns = filteres_grpm_.columns.to_flat_index()
#new_column_names = ['rsid', 'pmid-count', 'pmid-unique','pmid-top','pmid-freq','mesh-count', 'mesh-unique','mesh-top','mesh-freq']
# NOTE(review): this assigns the frame itself as its column labels; presumably
# `new_column_names` (commented out above) was intended — TODO confirm.
filteres_grpm_.columns = filteres_grpm_
#------------------
filteres_grpm_

In [ ]:
# NOTE(review): scratch cell. `myjson` is not defined in the visible notebook
# (NameError on a fresh kernel) and the `json` import is not used here —
# TODO confirm the intended data source, or delete the cell.
import json
pmid_list =[]

# Copy every element of myjson.elemento[0] into pmid_list.
for  pmid_id  in myjson.elemento[0]:
    pmid_list.append(pmid_id)
