In [1]:
# import pytorch libraries
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import GATConv
from torch_geometric.data import Data
from torch_geometric.deprecation import deprecated

import os
import csv
import copy
import math
import time
import random
import pickle
import argparse
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm

from collections import Counter
from statsmodels.tools.eval_measures import mse
from statsmodels.stats.multitest import fdrcorrection
from decimal import Decimal
from scipy import stats
from scipy.stats import spearmanr, rankdata, norm, fisher_exact
from scipy.spatial.distance import cosine
import matplotlib.pyplot as plt

In [2]:
# Collect the Allen region IDs: they are the column labels of one subject's prediction table.
save_dir = '/project/pi_rachel_melamed_uml_edu/Jianfeng/Allen/src/Pytorch/02162024/ATG_8_or_less/Prediction/300_500_by_subject/'
filename = 'GTEX-1192X_300_500_trainable_LMfromGTEx.csv'
# The first CSV column is consumed as the row index, so it is not part of `keys`.
df = pd.read_csv(f"{save_dir}{filename}", index_col=0)
keys = df.columns

# GWAS datasets analysed here. The three lists below are parallel: position i of each
# list gives the dataset ID, the full trait name and the short label for one trait.
dataset_list = ['ieu-a-1043', 'ieu-a-1044', 'ieu-a-1045', 'ieu-a-1041', 'ieu-a-1042', 
                'ieu-a-1046', 'ieu-a-1047', 'ieu-a-1048', 'ieu-a-1183', 'ieu-b-2', 
                'ieu-a-1085', 'ieu-a-1185', 'ieu-b-41', 'ieu-b-13', 'ieu-b-8', 
                'ieu-b-10', 'ieu-b-14', 'ieu-b-11', 'ieu-b-15', 'ieu-b-16', 
                'ieu-b-9', 'ieu-b-12', 'ieu-b-17', 'ieu-b-43', 'ebi-a-GCST005902', 
                'ieu-b-18', 'ieu-b-7', 'ieu-b-42', 'ebi-a-GCST006250', 'ieu-a-1239']
trait_name_list = ["Amygdala volume", "Caudate volume", "Hippocampus volume", "Intracranial volume", "Nucleus accumbens volume", 
                   "Pallidum volume", "Putamen volume", "Thalamus volume", "ADHD", "Alzheimer's disease", 
                   "Amyotrophic lateral sclerosis", "Autism spectrum disorder", "Bipolar disorder", "Childhood absence epilepsy", "Epilepsy, all documented cases", 
                   "Focal epilepsy, all documented cases", "Focal epilepsy, documented hippocampal sclerosis", "Focal epilepsy, documented lesion negative", "Focal epilepsy, documented lesion other than hippocampal sclerosis", "Generalized epilepsy with tonic-clonic seizures", 
                   "Generalized epilepsy, all documented cases", "Juvenile absence epilepsy", "Juvenile myoclonic epilepsy", "Frontotemporal dementia - TDP subtype", "Major depressive disorder", 
                   "Multiple sclerosis", "Parkinson's disease", "Schizophrenia", "Cognitive function", "Years of schooling"]
# Short labels. The two generalized-epilepsy traits get distinct labels
# ("Generalized EPI 1" / "Generalized EPI 2"), numbered like the focal-epilepsy ones;
# previously both were labelled "Generalized EPI 1" and could not be told apart.
trait_name_abv_list = ["Amygdala volume", "Caudate volume", "Hippocampus volume", "Intracranial volume", "Nucleus accumbens volume", 
                       "Pallidum volume", "Putamen volume", "Thalamus volume", "ADHD", "AD", 
                       "ALS", "ASD", "BD", "CAE", "EPI", 
                       "Focal EPI 1", "Focal EPI 2", "Focal EPI 3", "Focal EPI 4", "Generalized EPI 1", 
                       "Generalized EPI 2", "JAE", "JME", "FTD", "MDD", 
                       "MS", "PD", "Schizophrenia", "Cognitive function", "Years of schooling"]

# Guard the parallel-list invariant so a future edit cannot silently misalign or duplicate labels.
assert len(dataset_list) == len(trait_name_list) == len(trait_name_abv_list), "trait lists are misaligned"
assert len(set(trait_name_abv_list)) == len(trait_name_abv_list), "duplicate trait abbreviation"

# Load the coloc summary for all traits, then keep only rows whose `pass_coloc` flag is True.
save_dir = "/project/pi_rachel_melamed_uml_edu/Jianfeng/Allen/src/R/coloc+MR/coloc_summarized_results/"
# The file name encodes how many traits (datasets) were analysed.
filename = f"coloc_summarized_results_method1_local_across_{len(dataset_list)}_traits.csv"
coloc_summarized_result = pd.read_csv(save_dir+filename)
passed_coloc = coloc_summarized_result['pass_coloc'].eq(True)
coloc_summarized_result_sig = coloc_summarized_result[passed_coloc]

In [9]:
# Build the export frame from the significant coloc rows: fresh 0..n-1 index,
# the fifth column (position 4) renamed to 'tissue_id', and two empty string
# columns inserted right after it, to be filled in later.
tissue_id_source_col = coloc_summarized_result_sig.columns[4]
save_coloc_result = (
    coloc_summarized_result_sig
    .copy()
    .reset_index(drop=True)
    .rename(columns={tissue_id_source_col: 'tissue_id'})
)
for insert_at, new_col in ((5, 'tissue_name'), (6, 'allen_level4_tissue')):
    save_coloc_result.insert(insert_at, new_col, '')
save_coloc_result

Unnamed: 0,variant_id,gene_name,gene_id,rsid,tissue_id,tissue_name,allen_level4_tissue,dataset,trait,proxy_snp,...,mr_se,eqtl_p,eqtl_b,eqtl_se,gwas_p,gwas_b,gwas_se,default_coloc_pp4,corrected_coloc_pp4,pass_coloc
0,chr11_47602338_C_T_b38,MTCH2,ENSG00000109919.9,rs12363232,4012,,,ieu-b-2,Alzheimer's disease,rs12363232,...,0.034252,7.204670e-16,0.440855,0.048545,7.689002e-09,-0.087200,0.015100,0.967790,0.482048,True
1,chr14_30614542_C_A_b38,SCFD1,ENSG00000092108.20,rs229243,4012,,,ieu-a-1085,Amyotrophic lateral sclerosis,rs229243,...,0.012339,1.084650e-08,0.293316,0.048353,9.693040e-07,0.017727,0.003619,0.928886,0.363759,True
2,chr20_46095666_A_G_b38,CD40,ENSG00000101017.13,rs6131010,4012,,,ebi-a-GCST005902,Major depressive disorder,rs6131010,...,0.004567,3.724520e-06,0.294630,0.061240,6.910349e-08,-0.007258,0.001346,0.881252,0.303787,True
3,chr20_46095666_A_G_b38,CD40,ENSG00000101017.13,rs6131010,4012,,,ieu-b-18,Multiple sclerosis,rs6131010,...,0.062094,3.724520e-06,0.294630,0.061240,1.894960e-12,-0.128833,0.018295,0.978778,0.680598,True
4,chr1_160195661_C_T_b38,CASQ1,ENSG00000143318.12,rs10908769,4012,,,ieu-b-18,Multiple sclerosis,rs3747621,...,0.044508,1.076840e-13,0.398391,0.048498,3.438983e-07,-0.090389,0.017732,0.974493,0.650769,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2950,chr7_100308061_A_G_b38,PILRB,ENSG00000121716.20,rs28401739,13005,,,ieu-a-1239,Years of schooling,rs28401739,...,0.002423,6.673790e-40,0.899796,0.048738,1.710015e-08,-0.012300,0.002180,0.929175,0.260802,True
2951,chr1_169289605_G_A_b38,NME7,ENSG00000143156.13,rs10800425,13005,,,ieu-a-1239,Years of schooling,rs10800425,...,0.004632,1.419370e-10,0.375646,0.054362,5.070024e-06,0.007950,0.001740,0.960229,0.629274,True
2952,chr1_74199264_G_C_b38,LRRIQ3,ENSG00000162620.15,rs944795,13005,,,ieu-a-1239,Years of schooling,rs944795,...,0.001834,2.278650e-34,-0.926727,0.057168,3.489796e-13,0.012370,0.001700,0.908106,0.269855,True
2953,chr5_80680569_T_C_b38,ACOT12,ENSG00000172497.8,rs6883090,13005,,,ieu-a-1239,Years of schooling,rs6864512,...,0.005166,6.504630e-06,0.330985,0.070714,4.079998e-06,0.007890,0.001710,0.762063,0.216790,True


In [27]:
# Allen tissue names
# Build `mapped_names`: one structure name per Allen region ID, in the same order as `keys`
# (later code looks names up by the position of an ID inside `keys`).
file_dir = "/project/pi_rachel_melamed_uml_edu/Jianfeng/Allen/src/Pytorch/02162024/MR+coloc/allen_table/"
filename = '2011_Allen_Supplementary_Table2.xls'
# sheet_name=1 selects the second sheet of the workbook (pandas sheet positions are 0-based).
allen_supp2 = pd.read_excel(file_dir+filename, sheet_name=1, engine='xlrd')
# find the sub-matrix with only our keys
keys_list = [int(k) for k in keys.tolist()]
allen_supp2_sub = allen_supp2[allen_supp2['ID'].isin(keys_list)]

# Positional row of every ID, computed once instead of rebuilding a list and calling
# .index() for each key. First occurrence wins, exactly like list.index().
row_of_id = {}
for row_idx, table_id in enumerate(allen_supp2_sub['ID'].tolist()):
    row_of_id.setdefault(table_id, row_idx)

# find the mapped region names for allen regions
mapped_names = []
for allen_id in keys_list:
    if allen_id not in row_of_id:
        raise ValueError(f"Allen region ID {allen_id} not found in {filename}")
    # The name is read from columns 8..14; presumably one column per hierarchy level with
    # a single filled cell per row -- TODO confirm against the spreadsheet.
    candidate_names = allen_supp2_sub.iloc[row_of_id[allen_id], 8:15].dropna().tolist()
    # Exactly one name per ID is required: zero or several would shift every later entry
    # and silently attach wrong names to regions.
    if len(candidate_names) != 1:
        raise ValueError(
            f"Expected exactly one name for Allen region ID {allen_id}, "
            f"found {len(candidate_names)}: {candidate_names}"
        )
    # remove "Left" from the mapped names
    mapped_names.append(candidate_names[0].replace(", Left", ""))

# Allen level 4 tissue name -> the region IDs (strings, as in `keys`) that belong to it.
# The first two groups are taken by position from `keys`: the first 55 IDs, then the next 13.
region_id_strings = keys.tolist()
my_dict = dict([
    ('Cerebral Cortex', region_id_strings[:55]),
    ('Cerebral Nuclei', region_id_strings[55:68]),
    ('Thalamus', ['4395', '4400', '4409', '4417', '4432', '4437', '4440', '4506', '4507']),
    ('Subthalamus', ['4518']),
    ('Hypothalamus', ['4542', '4679', '13005']),
    ('Cerebellum', ['4720', '4722', '4723', '4725', '4738', '4739', '4740', '4741', '4782']),
    ('Midbrain Tegmentum', ['9054', '9067', '9074', '9075']),
    ('Pons', ['9161', '9492']),
    ('Level 3 tissue Myelencephalon', ['9520', '9543', '9561', '9598', '9614', '9677', '9698']),
])

# Fill 'tissue_name' and 'allen_level4_tissue' for every row from its tissue ID.
# Lookups are built once and applied column-wise instead of scanning `keys` and
# writing with .loc row by row.

# Region ID (string) -> region name; `mapped_names` is positionally aligned with `keys`.
name_by_tissue_id = {}
for region_id, region_name in zip(keys.tolist(), mapped_names):
    name_by_tissue_id.setdefault(region_id, region_name)

# Region ID (string) -> level-4 tissue. The first group containing an ID wins, which is
# what the original loop intended (its trailing `continue` was a no-op, not a `break`).
lv4_by_tissue_id = {}
for lv4_tissue, member_ids in my_dict.items():
    for region_id in member_ids:
        lv4_by_tissue_id.setdefault(region_id, lv4_tissue)

# IDs are compared as strings because `keys` holds the IDs as strings.
tissue_id_strings = save_coloc_result['tissue_id'].astype(str)

# An ID missing from `keys` is an error (as before), not a silently empty name.
unknown_ids = sorted(set(tissue_id_strings) - set(name_by_tissue_id))
if unknown_ids:
    raise ValueError(f"tissue_id values not present in keys: {unknown_ids}")

save_coloc_result['tissue_name'] = tissue_id_strings.map(name_by_tissue_id)
# Rows whose ID belongs to no level-4 group keep the empty-string placeholder.
save_coloc_result['allen_level4_tissue'] = tissue_id_strings.map(lv4_by_tissue_id).fillna('')

# Write the annotated significant coloc results to CSV, without the row index.
save_dir = "/project/pi_rachel_melamed_uml_edu/Jianfeng/Allen/src/Pytorch/02162024/MR+coloc/"
coloc_sig_path = save_dir + 'coloc_sig_results.csv'
save_coloc_result.to_csv(coloc_sig_path, index=False)

In [28]:
# Spot check: show the annotated rows for gene RMI1.
save_coloc_result.loc[save_coloc_result['gene_name'].eq('RMI1')]

Unnamed: 0,variant_id,gene_name,gene_id,rsid,tissue_id,tissue_name,allen_level4_tissue,dataset,trait,proxy_snp,...,mr_se,eqtl_p,eqtl_b,eqtl_se,gwas_p,gwas_b,gwas_se,default_coloc_pp4,corrected_coloc_pp4,pass_coloc
1783,chr9_83944298_T_C_b38,RMI1,ENSG00000178966.16,rs4147132,4322,Claustrum,Cerebral Nuclei,ieu-b-9,"Generalized epilepsy, all documented cases",rs4147132,...,0.045161,1.58895e-08,-0.311286,0.051977,5e-06,-0.064428,0.014058,0.852379,0.168617,True


In [13]:
# Display every Allen region ID (the column labels loaded into `keys`) as a plain list.
keys.to_list()

['4012',
 '4013',
 '4014',
 '4023',
 '4024',
 '4030',
 '4031',
 '4039',
 '4045',
 '4048',
 '4051',
 '4060',
 '4074',
 '4079',
 '4087',
 '4088',
 '4098',
 '4099',
 '4106',
 '4107',
 '4113',
 '4114',
 '4120',
 '4121',
 '4135',
 '4136',
 '4142',
 '4143',
 '4149',
 '4150',
 '4151',
 '4158',
 '4160',
 '4166',
 '4178',
 '4186',
 '4187',
 '4193',
 '4194',
 '4200',
 '4201',
 '4214',
 '4223',
 '4224',
 '4230',
 '4244',
 '4245',
 '4251',
 '4254',
 '4255',
 '4256',
 '4257',
 '4258',
 '4270',
 '4273',
 '4280',
 '4281',
 '4282',
 '4288',
 '4291',
 '4296',
 '4322',
 '4329',
 '4342',
 '4351',
 '4360',
 '4367',
 '4379',
 '4395',
 '4400',
 '4409',
 '4417',
 '4432',
 '4437',
 '4440',
 '4506',
 '4507',
 '4518',
 '4542',
 '4679',
 '4720',
 '4722',
 '4723',
 '4725',
 '4738',
 '4739',
 '4740',
 '4741',
 '4782',
 '9054',
 '9067',
 '9074',
 '9075',
 '9161',
 '9492',
 '9520',
 '9543',
 '9561',
 '9598',
 '9614',
 '9677',
 '9698',
 '13005']