In [1]:
from __future__ import print_function
import keras
from keras.models import Sequential, Model, load_model
from keras import backend as K

import tensorflow as tf

import os
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt
import matplotlib.cm as cm

import aparent.visualization as vis

from aparent.predictor import *

import urllib
import urllib.request
import pickle
from time import sleep


Using TensorFlow backend.


In [2]:
#Tissue types to compile data for
#(one name per line; the block is split on whitespace into a list of strings)

tissue_types = """
    Adipose_Subcutaneous
    Adipose_Visceral_Omentum
    Adrenal_Gland
    Artery_Aorta
    Artery_Coronary
    Artery_Tibial
    Brain_Amygdala
    Brain_Anterior_cingulate_cortex_BA24
    Brain_Caudate_basal_ganglia
    Brain_Cerebellar_Hemisphere
    Brain_Cerebellum
    Brain_Cortex
    Brain_Frontal_Cortex_BA9
    Brain_Hippocampus
    Brain_Hypothalamus
    Brain_Nucleus_accumbens_basal_ganglia
    Brain_Putamen_basal_ganglia
    Brain_Spinal_cord_cervical_c-1
    Breast_Mammary_Tissue
    Cells_EBV-transformed_lymphocytes
    Cells_Transformed_fibroblasts
    Colon_Sigmoid
    Colon_Transverse
    Esophagus_Gastroesophageal_Junction
    Esophagus_Mucosa
    Esophagus_Muscularis
    Heart_Atrial_Appendage
    Heart_Left_Ventricle
    Liver
    Lung
    Muscle_Skeletal
    Nerve_Tibial
    Ovary
    Pancreas
    Pituitary
    Prostate
    Skin_Not_Sun_Exposed_Suprapubic
    Skin_Sun_Exposed_Lower_leg
    Small_Intestine_Terminal_Ileum
    Spleen
    Stomach
    Testis
    Thyroid
    Uterus
    Vagina
    Whole_Blood
""".split()


In [3]:

#Suffix placed between the "polyadb_<tissue>" / "polyadb_median" prefix and "_SNPs.csv"
#in the names of the CSV files written and read by the cells below
version_suffix = '_all'


In [15]:
#Compile apaQTL data from GTEx
#
#For every tissue, reads '<tissue>.cis.3aQTL.txt' (tab-separated, with columns
#'SNP', 'transcript', 'beta', 't.stat' and 'p.value'), derives position / gene columns
#from the ids, removes duplicated (gene, SNP, transcript) rows and writes the result to
#'polyadb_<tissue><version_suffix>_SNPs.csv' (tab-separated, index included).

for tissue_type in tissue_types :
    
    print("Processing data for tissue = '" + str(tissue_type) + "'.")
    
    snp_df = pd.read_csv(tissue_type + '.cis.3aQTL.txt', sep='\t')

    #Split every id exactly once (instead of once per derived column).
    #Plain Python splitting is kept on purpose: a malformed id raises here rather than
    #silently turning into NaN and being collapsed by the de-duplication step below.
    snp_fields = [snp.split("_") for snp in snp_df['SNP']]
    transcript_fields = [transcript.split("|") for transcript in snp_df['transcript']]

    #SNP ids are '_'-delimited: field 0 is stored as 'chrom', field 1 as 'start' (kept as a string)
    snp_df['chrom'] = [fields[0] for fields in snp_fields]
    snp_df['start'] = [fields[1] for fields in snp_fields]
    #'end' is the position + 1, stored as a string like 'start'
    snp_df['end'] = [str(int(fields[1]) + 1) for fields in snp_fields]
    #Transcript ids are '|'-delimited: field 1 is stored as 'gene'
    snp_df['gene'] = [fields[1] for fields in transcript_fields]

    snp_df = snp_df[['chrom', 'start', 'end', 'gene', 'SNP', 'transcript', 'beta', 't.stat', 'p.value']]
    snp_df = snp_df.rename(columns={
        'SNP' : 'snp_id',
        'transcript' : 'transcript_id',
        'beta' : 'effect_size',
        't.stat' : 'test_statistic',
        'p.value' : 'p_val',
    })
    
    #Keep only the part of the transcript id before the first '|'
    snp_df['transcript_id'] = [fields[0] for fields in transcript_fields]
    snp_df['snp_transcript_id'] = snp_df['snp_id'] + "_" + snp_df['transcript_id']
    
    #Drop duplicates
    #A stable sort is required here: with the default (non-stable) quicksort the order of rows
    #sharing the same snp_id is arbitrary, so keep='first' would retain an arbitrary duplicate.
    #With mergesort the retained row is always the first one in file order.
    snp_df['row_id'] = snp_df['gene'] + "_" + snp_df['snp_id'] + "_" + snp_df['transcript_id']
    snp_df = snp_df.sort_values(by='snp_id', kind='mergesort').drop_duplicates(subset=['row_id'], keep='first')

    #Save final apaQTL dataframe
    snp_df.to_csv("polyadb_" + tissue_type + version_suffix + "_SNPs.csv", sep='\t')


Processing data for tissue = 'Adipose_Subcutaneous'.
Processing data for tissue = 'Adipose_Visceral_Omentum'.
Processing data for tissue = 'Adrenal_Gland'.
Processing data for tissue = 'Artery_Aorta'.
Processing data for tissue = 'Artery_Coronary'.
Processing data for tissue = 'Artery_Tibial'.
Processing data for tissue = 'Brain_Amygdala'.
Processing data for tissue = 'Brain_Anterior_cingulate_cortex_BA24'.
Processing data for tissue = 'Brain_Caudate_basal_ganglia'.
Processing data for tissue = 'Brain_Cerebellar_Hemisphere'.
Processing data for tissue = 'Brain_Cerebellum'.
Processing data for tissue = 'Brain_Cortex'.
Processing data for tissue = 'Brain_Frontal_Cortex_BA9'.
Processing data for tissue = 'Brain_Hippocampus'.
Processing data for tissue = 'Brain_Hypothalamus'.
Processing data for tissue = 'Brain_Nucleus_accumbens_basal_ganglia'.
Processing data for tissue = 'Brain_Putamen_basal_ganglia'.
Processing data for tissue = 'Brain_Spinal_cord_cervical_c-1'.
Processing data for tiss

In [4]:
#Compute median effect sizes
#
#For every exclusion set, pools the per-tissue SNP tables written by the compile step and,
#for each (SNP, gene, transcript) combination, records the median effect size, the minimum
#p-value and the number of tissue rows it was observed in. The table is written to
#'polyadb_median<version_suffix><exclude_suffix>_SNPs.csv' (tab-separated, index included).

#exclude_tissues[k] lists the tissues skipped in run k; exclude_suffixes[k] is appended
#to the output file name of that run.
exclude_tissues = [[]]
exclude_suffixes = ['']

#exclude_tissues = [[], ['Brain_Cortex', 'Brain_Frontal_Cortex_BA9'], ['Testis']]
#exclude_suffixes = ['', '_Exclude_Brain', '_Exclude_Testis']

for exclude_tissue, exclude_suffix in zip(exclude_tissues, exclude_suffixes) :

    print("Excluding " + str(exclude_tissue) + "...")
    
    #Maps '<snp_id>__<gene>__<transcript_id>' to the values collected across tissues
    snp_dict = {}

    for tissue_type in tissue_types :
        
        if tissue_type in exclude_tissue :
            continue

        print("Processing tissue = '" + tissue_type + "'")

        snp_df = pd.read_csv("polyadb_" + tissue_type + version_suffix + "_SNPs.csv", sep='\t')

        #Walk the needed columns in lockstep; DataFrame.iterrows() would build a
        #full Series object for every row, which is very slow for tables of this size.
        for raw_snp_id, gene, transcript_id, effect_size, p_val in zip(
            snp_df['snp_id'],
            snp_df['gene'],
            snp_df['transcript_id'],
            snp_df['effect_size'],
            snp_df['p_val']
        ) :

            snp_id = raw_snp_id + "__" + gene + "__" + transcript_id

            if snp_id not in snp_dict :
                snp_dict[snp_id] = {
                    'effect_size' : [],
                    'p_val' : [],
                    'snp_id' : raw_snp_id
                }

            snp_dict[snp_id]['effect_size'].append(effect_size)
            snp_dict[snp_id]['p_val'].append(p_val)

    #Collapse the per-tissue lists into one row per extended id (dict order is preserved)
    extended_snp_ids = list(snp_dict)
    snp_ids = [snp_dict[key]['snp_id'] for key in extended_snp_ids]
    effect_sizes = [np.median(snp_dict[key]['effect_size']) for key in extended_snp_ids]
    p_vals = [np.min(snp_dict[key]['p_val']) for key in extended_snp_ids]
    #Number of tissue rows that contributed a value for this id
    n_tissues = [len(snp_dict[key]['effect_size']) for key in extended_snp_ids]

    median_df = pd.DataFrame({
        'extended_snp_id' : extended_snp_ids,
        'snp_id' : snp_ids,
        'effect_size' : effect_sizes,
        'p_val' : p_vals,
        'n_tissues' : n_tissues
    })
    #Fix the column order explicitly rather than relying on dict ordering
    median_df = median_df[['extended_snp_id', 'snp_id', 'effect_size', 'p_val', 'n_tissues']]

    median_df.to_csv("polyadb_median" + version_suffix + exclude_suffix + "_SNPs.csv", sep='\t')


Excluding []...
Processing tissue = 'Adipose_Subcutaneous'
Processing tissue = 'Adipose_Visceral_Omentum'
Processing tissue = 'Adrenal_Gland'
Processing tissue = 'Artery_Aorta'
Processing tissue = 'Artery_Coronary'
Processing tissue = 'Artery_Tibial'
Processing tissue = 'Brain_Amygdala'
Processing tissue = 'Brain_Anterior_cingulate_cortex_BA24'
Processing tissue = 'Brain_Caudate_basal_ganglia'
Processing tissue = 'Brain_Cerebellar_Hemisphere'
Processing tissue = 'Brain_Cerebellum'
Processing tissue = 'Brain_Cortex'
Processing tissue = 'Brain_Frontal_Cortex_BA9'
Processing tissue = 'Brain_Hippocampus'
Processing tissue = 'Brain_Hypothalamus'
Processing tissue = 'Brain_Nucleus_accumbens_basal_ganglia'
Processing tissue = 'Brain_Putamen_basal_ganglia'
Processing tissue = 'Brain_Spinal_cord_cervical_c-1'
Processing tissue = 'Breast_Mammary_Tissue'
Processing tissue = 'Cells_EBV-transformed_lymphocytes'
Processing tissue = 'Cells_Transformed_fibroblasts'
Processing tissue = 'Colon_Sigmoid'


In [5]:
#Display the aggregated table; median_df holds the result of the last exclusion run of the loop above
median_df

Unnamed: 0,extended_snp_id,snp_id,effect_size,p_val,n_tissues
0,chr10_100122640_C_G__HPS1__NM_000195,chr10_100122640_C_G,0.013850,3.724159e-13,8
1,chr10_100126295_C_T__HPS1__NM_000195,chr10_100126295_C_T,0.019626,2.152371e-23,23
2,chr10_100134036_C_T__HPS1__NM_000195,chr10_100134036_C_T,0.021223,1.803418e-23,23
3,chr10_100137700_T_C__HPS1__NM_000195,chr10_100137700_T_C,0.021223,1.803418e-23,23
4,chr10_100141027_T_C__HPS1__NM_000195,chr10_100141027_T_C,-0.020458,4.057859e-26,25
5,chr10_100142068_A_G__HPS1__NM_000195,chr10_100142068_A_G,-0.019053,3.760774e-13,16
6,chr10_100142154_G_A__HPS1__NM_000195,chr10_100142154_G_A,-0.017817,1.636756e-14,17
7,chr10_100143193_C_G__HPS1__NM_000195,chr10_100143193_C_G,-0.020112,4.057859e-26,25
8,chr10_100144782_C_T__HPS1__NM_000195,chr10_100144782_C_T,-0.018361,3.347797e-15,16
9,chr10_100145913_A_G__HPS1__NM_000195,chr10_100145913_A_G,-0.018714,2.064971e-25,24


In [6]:

#Number of distinct values in the 'snp_id' column (missing values, if any, count as one value)
median_df['snp_id'].nunique(dropna=False)


414868

In [7]:
#Number of entries in the tissue_types list
len(tissue_types)

46