In [1]:
from __future__ import print_function
import keras
from keras.models import Sequential, Model, load_model
from keras import backend as K

import tensorflow as tf

import os
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt
import matplotlib.cm as cm

import aparent.visualization as vis

from aparent.predictor import *

import urllib
import urllib.request
import pickle
from time import sleep


Using TensorFlow backend.


In [2]:
#Load PolyADB data: read the processed site table and keep only rows whose site_type is '3_most_exon'
polyadb_csv_path = '../../data/native_data/polyadb_processed_v3.csv'
df = (
    pd.read_csv(polyadb_csv_path, delimiter=',')
    .query("site_type == '3_most_exon'")
    .copy()
    .reset_index(drop=True)
)
#Disabled filter, kept for reference:
#df = df.loc[~df['wide_seq_ext'].str.slice(175 - 70, 175 - 70 + 205).str.contains("AAAAAAA|AAAGAAAA|AAACAAAA|AAAAGAAA|AAAACAAA")].copy().reset_index(drop=True)

#Tab-separated isoform/coordinate table
isoform_df = pd.read_csv("../../data/native_data/polyadb_processed_v3_utr3_isoforms_and_coords.csv", sep='\t')

#Keep only the PolyADB rows whose gene_id also occurs in the isoform table
isoform_gene_ids = isoform_df['gene_id'].values.tolist()
polyadb_df = df.loc[df['gene_id'].isin(isoform_gene_ids)].copy().reset_index(drop=True)

print('len(polyadb_df) = ' + str(len(polyadb_df)))


len(polyadb_df) = 84723


In [3]:

#Columns written to both headerless, tab-separated coordinate files, in this order
bed_columns = ['chrom', 'start', 'end', 'gene', 'gene_id', 'strand']
bed_kwargs = dict(sep='\t', header=False, index=False)

#Window of 150 positions on either side of pas_pos (columns are added to polyadb_df in place)
polyadb_df['start'] = polyadb_df['pas_pos'] - 150
polyadb_df['end'] = polyadb_df['pas_pos'] + 150

polyadb_bed = polyadb_df[bed_columns].copy().reset_index(drop=True)
polyadb_bed.to_csv("polyadb_utr3_only_coordinates.bed", **bed_kwargs)

#The isoform table already carries its own start/end columns
isoform_bed = isoform_df[bed_columns].copy().reset_index(drop=True)
isoform_bed.to_csv("isoform_utr3_only_coordinates.bed", **bed_kwargs)


In [4]:
#Tissue types to compile data for.
#Each name is used verbatim as a file-name prefix for the per-tissue input files
#(e.g. tissue_type + '.cis.3aQTL.txt') and as part of the per-tissue output file names.
#Entries that are commented out are not part of the list and are skipped by every loop over tissue_types.

tissue_types = [
    'Adipose_Subcutaneous',
    'Adipose_Visceral_Omentum',
    'Adrenal_Gland',
    'Artery_Aorta',
    'Artery_Coronary',
    'Artery_Tibial',
    'Brain_Amygdala',
    'Brain_Anterior_cingulate_cortex_BA24',
    'Brain_Caudate_basal_ganglia',
    'Brain_Cerebellar_Hemisphere',
    #'Brain_Cerebellum',
    'Brain_Cortex',
    'Brain_Frontal_Cortex_BA9',
    'Brain_Hippocampus',
    'Brain_Hypothalamus',
    'Brain_Nucleus_accumbens_basal_ganglia',
    'Brain_Putamen_basal_ganglia',
    'Brain_Spinal_cord_cervical_c-1',
    'Breast_Mammary_Tissue',
    'Cells_EBV-transformed_lymphocytes',
    'Cells_Transformed_fibroblasts',
    'Colon_Sigmoid',
    'Colon_Transverse',
    'Esophagus_Gastroesophageal_Junction',
    'Esophagus_Mucosa',
    'Esophagus_Muscularis',
    #'Heart_Atrial_Appendage',
    'Heart_Left_Ventricle',
    'Liver',
    'Lung',
    'Muscle_Skeletal',
    'Nerve_Tibial',
    'Ovary',
    'Pancreas',
    'Pituitary',
    'Prostate',
    'Skin_Not_Sun_Exposed_Suprapubic',
    'Skin_Sun_Exposed_Lower_leg',
    'Small_Intestine_Terminal_Ileum',
    'Spleen',
    'Stomach',
    'Testis',
    'Thyroid',
    'Uterus',
    'Vagina',
    'Whole_Blood'
]


In [6]:

#Suffix inserted into the names of the intermediate .bed files and the per-tissue / merged
#*_SNPs.csv files written by the cells below.
version_suffix = '_utr3_only_iso_nonlead_sizematched_p_thresh'


In [8]:
#Compile apaQTL data from GTEx

#Significance cutoff: the per-tissue loop below keeps only rows with p_val <= p_val_thresh
p_val_thresh = 1e-10

def get_snp_pos(row) :
    """Return the position of the SNP relative to pas_pos, shifted by a fixed offset.

    The result is stored as 'rel_snp_pos' and used as a string index into
    row['wide_seq_ext'] by get_var_seq.

    row: mapping with 'strand', 'snp_pos' and 'pas_pos'.
    '+' strand: snp_pos - pas_pos + 174; any other strand: pas_pos - snp_pos + 175.
    """
    #presumably 175 is the index of pas_pos inside wide_seq_ext - TODO confirm
    on_plus_strand = row['strand'] == '+'
    if not on_plus_strand :
        return row['pas_pos'] - row['snp_pos'] + 175
    return row['snp_pos'] - row['pas_pos'] + 174

def get_snp_ref(row) :
    """Return the reference allele as read on the row's strand.

    row: mapping with 'strand' and 'snp_ref'.
    '+' strand: snp_ref is returned unchanged.
    Any other strand: the complementary base (A<->T, C<->G) is returned;
    an allele that is not exactly one of 'A', 'C', 'G', 'T' yields None.
    """
    if row['strand'] == '+' :
        return row['snp_ref']

    complement = {'A' : 'T', 'C' : 'G', 'G' : 'C', 'T' : 'A'}
    return complement.get(row['snp_ref'])

def get_snp_var(row) :
    """Return the variant allele as read on the row's strand.

    row: mapping with 'strand' and 'snp_var'.
    '+' strand: snp_var is returned unchanged.
    Any other strand: the complementary base (A<->T, C<->G) is returned;
    an allele that is not exactly one of 'A', 'C', 'G', 'T' yields None.
    """
    if row['strand'] == '+' :
        return row['snp_var']

    complement = {'A' : 'T', 'C' : 'G', 'G' : 'C', 'T' : 'A'}
    return complement.get(row['snp_var'])

def get_var_seq(row) :
    """Return row['wide_seq_ext'] with the character at index rel_snp_pos replaced by rel_snp_var.

    row: mapping with 'wide_seq_ext' (string), 'rel_snp_pos' (integer index)
    and 'rel_snp_var' (replacement string).
    """
    ref_seq = row['wide_seq_ext']
    snp_index = row['rel_snp_pos']

    upstream = ref_seq[:snp_index]
    downstream = ref_seq[snp_index + 1:]

    return upstream + row['rel_snp_var'] + downstream

#Per-tissue pipeline: intersect the tissue's cis-3'aQTL SNPs with the PolyADB windows,
#build reference/variant sequences, drop lead SNPs, filter on p-value and load the PDUI table.
for tissue_type in tissue_types :
    
    print("Processing data for tissue = '" + str(tissue_type) + "'.")
    
    snp_df = pd.read_csv(tissue_type + '.cis.3aQTL.txt', sep='\t')

    #The 'SNP' id is split on '_': field 0 -> chr, field 1 -> start; end = start + 1.
    #Fields 2 and 3 are read further down as the ref and var alleles.
    snp_df['chr'] = snp_df['SNP'].apply(lambda x: x.split("_")[0])
    snp_df['start'] = snp_df['SNP'].apply(lambda x: x.split("_")[1])
    snp_df['end'] = snp_df['SNP'].apply(lambda x: str(int(x.split("_")[1]) + 1))
    #The gene name is the second '|'-separated field of 'transcript'
    snp_df['gene'] = snp_df['transcript'].apply(lambda x: x.split("|")[1])

    snp_df = snp_df[['chr', 'start', 'end', 'gene', 'SNP', 'transcript', 'beta', 't.stat', 'p.value']]

    #Headerless, tab-separated file consumed by bedtools as the -b input
    snp_df.to_csv(tissue_type + version_suffix + ".cis.3aQTL.coordinates.bed", sep='\t', header=False, index=False)

    #Shell call (IPython '!'): -wa -wb writes the columns of both inputs for every overlap,
    #hence the 6 window columns + 9 SNP columns named when the result is read back.
    !bedtools intersect -a polyadb_utr3_only_coordinates.bed -b {tissue_type + version_suffix}.cis.3aQTL.coordinates.bed -wa -wb > polyadb_{tissue_type + version_suffix}_intersect.bed
    
    #NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in 2.0
    #(replacement: on_bad_lines='skip'); this call only runs on older pandas versions.
    snp_bed_hg19 = pd.read_csv("polyadb_" + tissue_type + version_suffix + "_intersect.bed", sep='\t', error_bad_lines=False, names=['chrom', 'start', 'end', 'gene', 'gene_id', 'strand', 'chr_2', 'snp_pos', 'end_2', 'gene_2', 'snp_id', 'transcript_id', 'effect_size', 'test_statistic', 'p_val'])
    #Keep only overlaps where the window's gene equals the gene taken from the aQTL record
    snp_bed_hg19 = snp_bed_hg19.query("gene == gene_2").copy().reset_index(drop=True)

    snp_bed_hg19 = snp_bed_hg19[['chrom', 'start', 'end', 'gene', 'gene_id', 'strand', 'chr_2', 'snp_pos', 'end_2', 'gene_2', 'snp_id', 'transcript_id', 'effect_size', 'test_statistic', 'p_val']]

    snp_bed_hg19['snp_ref'] = snp_bed_hg19['snp_id'].apply(lambda s: s.split("_")[2])
    snp_bed_hg19['snp_var'] = snp_bed_hg19['snp_id'].apply(lambda s: s.split("_")[3])
    
    polyadb_df_sel = polyadb_df[['wide_seq_ext', 'gene_id', 'pas', 'pas_pos', 'cut_mode', 'site_type', 'rpm']].copy().reset_index(drop=True)

    #Attach the PolyADB sequence and site columns to every overlapping SNP via gene_id
    snp_df = snp_bed_hg19.join(polyadb_df_sel.set_index('gene_id'), on='gene_id', how='inner').copy().reset_index(drop=True)
    
    #Strand-aware SNP index and alleles (see get_snp_pos / get_snp_ref / get_snp_var)
    snp_df['rel_snp_pos'] = snp_df.apply(get_snp_pos, axis=1)
    snp_df['rel_snp_ref'] = snp_df.apply(get_snp_ref, axis=1)
    snp_df['rel_snp_var'] = snp_df.apply(get_snp_var, axis=1)

    #NOTE(review): get_snp_var returns None for alleles other than A/C/G/T on non-'+' strands,
    #in which case the string concatenation in get_var_seq raises TypeError - confirm inputs are SNVs only.
    snp_df['wide_seq_ext_var'] = snp_df.apply(get_var_seq, axis=1)
    
    #Filter by lead SNPs (remove lead SNPs)
    lead_df = pd.read_csv("lead/" + tissue_type + ".lead.3aQTL.txt", sep='\t')
    lead_df['is_lead'] = True
    
    #Left join: rows without a match in the lead table get a null is_lead and are the ones kept
    snp_df = snp_df.join(lead_df[['SNP', 'is_lead']].set_index("SNP"), on='snp_id', how='left')
    snp_df = snp_df.loc[snp_df['is_lead'].isnull()].copy()
    
    #Filter on p-value
    snp_df = snp_df.query("p_val <= " + str(p_val_thresh)).copy()
    
    #Calculate mean PDUIs
    pdui_df = pd.read_csv(tissue_type + "_combined_All_PDUIs_clean.txt", sep='\t')
    #Keep only PDUI rows whose event_id matches a transcript_id of a remaining SNP.
    #NOTE(review): transcript_id can repeat in snp_df (one row per SNP), so this inner join presumably
    #repeats the matching pdui_df row once per SNP - confirm the later drop_duplicates on row_id is what cleans this up.
    pdui_df = pdui_df.join(snp_df[['transcript_id']].set_index('transcript_id'), on='event_id', how='inner').copy().reset_index(drop=True)

    #All columns from the third one onward are treated as per-sample value columns
    gtex_samples = pdui_df.columns.values.tolist()[2:]

    def calc_mean_PDUI(row) :
        """Return the mean of the non-NaN per-sample values of one pdui_df row.

        Only the columns listed in gtex_samples (defined just above in the
        enclosing loop) are read, and they are summed in list order.

        row: mapping from sample column name to a float value (NaN = missing).
        Returns NaN when every sample value is NaN. The previous version raised
        ZeroDivisionError in that case (0. / 0. on Python floats), aborting the
        whole per-tissue loop.
        """
        n_samples = 0.
        PDUI = 0.

        for gtex_sample in gtex_samples :
            sample_value = row[gtex_sample]
            if not np.isnan(sample_value) :
                PDUI += sample_value
                n_samples += 1.

        #No usable sample: the mean is undefined
        if n_samples == 0. :
            return np.nan

        return PDUI / n_samples

    #Mean PDUI per pdui_df row, attached to every SNP row whose transcript_id equals that row's event_id
    pdui_df['mean_PDUI'] = pdui_df.apply(calc_mean_PDUI, axis=1)

    snp_df = snp_df.join(pdui_df[['event_id', 'mean_PDUI']].set_index('event_id'), on='transcript_id', how='inner').copy().reset_index(drop=True)

    #Keep only the part of transcript_id before the first '|'
    snp_df['transcript_id'] = snp_df['transcript_id'].apply(lambda x: x.split("|")[0])
    snp_df['snp_transcript_id'] = snp_df['snp_id'] + "_" + snp_df['transcript_id']
    
    #transcript id -> 3'-most annotated coordinate; -1 means "not seen in the GTF (yet)"
    transcript_end_dict = dict.fromkeys(snp_df['transcript_id'], -1)

    #NOTE(review): absolute, user-specific path; the same file is re-scanned for every tissue.
    with open("/home/jlinder2/Downloads/GRCh37_latest_genomic.gtf", 'rt') as f :
        #Iterate over the file object so the GTF is streamed line by line
        #instead of being loaded completely into memory with readlines()
        for line_raw in f :

            #Split each line once; lines with fewer than 9 tab-separated fields are skipped
            fields = line_raw.strip().split("\t")
            if len(fields) <= 8 :
                continue

            start = fields[3]
            end = fields[4]
            strand = fields[6]

            t_id = ""
            if strand in ['+', '-'] :
                #Second ';'-separated attribute: value between the double quotes, with everything from the first '.' dropped
                t_id = fields[8].split(";")[1].split("\"")[1].split(".")[0]

            if t_id not in transcript_end_dict :
                continue

            #'+' strand tracks the largest end, any other strand the smallest start
            if transcript_end_dict[t_id] == -1 :
                transcript_end_dict[t_id] = int(end) if strand == '+' else int(start)
            elif strand == '+' :
                transcript_end_dict[t_id] = max(transcript_end_dict[t_id], int(end))
            else :
                transcript_end_dict[t_id] = min(transcript_end_dict[t_id], int(start))

    #NOTE(review): transcripts never found in the GTF keep the -1 sentinel, which then flows into snp_distance
    qtl_cut_poses = [transcript_end_dict[tid] for tid in snp_df['transcript_id']]

    snp_df['qtl_cut_pos'] = qtl_cut_poses
    snp_df['snp_distance'] = np.abs(snp_df['qtl_cut_pos'] - snp_df['snp_pos'])

    #Drop duplicates
    snp_df['row_id'] = snp_df['gene_id'] + "_" + snp_df['snp_id'] + "_" + snp_df['transcript_id']
    snp_df = snp_df.sort_values(by='snp_id').drop_duplicates(subset=['row_id'], keep='first')
    
    #Load processed lead aQTL dataframe (for sizematching)
    lead_snp_df = pd.read_csv("polyadb_" + tissue_type + "_utr3_only_iso_lead_SNPs.csv", sep='\t')
    
    #Sort dataframe in ascending order of absolute effect size
    #and keep at most twice as many rows as there are rows in the lead dataframe
    snp_df['abs_effect_size'] = np.abs(snp_df['effect_size'])
    snp_df = snp_df.sort_values(by='abs_effect_size', ascending=True).iloc[:2*len(lead_snp_df)].copy().reset_index(drop=True)

    #Save final apaQTL dataframe
    snp_df.to_csv("polyadb_" + tissue_type + version_suffix + "_SNPs.csv", sep='\t')


Processing data for tissue = 'Adipose_Subcutaneous'.
Processing data for tissue = 'Adipose_Visceral_Omentum'.
Processing data for tissue = 'Adrenal_Gland'.
Processing data for tissue = 'Artery_Aorta'.
Processing data for tissue = 'Artery_Coronary'.
Processing data for tissue = 'Artery_Tibial'.
Processing data for tissue = 'Brain_Amygdala'.
Processing data for tissue = 'Brain_Anterior_cingulate_cortex_BA24'.
Processing data for tissue = 'Brain_Caudate_basal_ganglia'.
Processing data for tissue = 'Brain_Cerebellar_Hemisphere'.
Processing data for tissue = 'Brain_Cortex'.
Processing data for tissue = 'Brain_Frontal_Cortex_BA9'.
Processing data for tissue = 'Brain_Hippocampus'.
Processing data for tissue = 'Brain_Hypothalamus'.
Processing data for tissue = 'Brain_Nucleus_accumbens_basal_ganglia'.
Processing data for tissue = 'Brain_Putamen_basal_ganglia'.
Processing data for tissue = 'Brain_Spinal_cord_cervical_c-1'.
Processing data for tissue = 'Breast_Mammary_Tissue'.
Processing data for

In [10]:
#Compile apaQTL data from GTEx (UTR isoforms)

#Significance cutoff: the per-tissue loop below keeps only rows with p_val <= p_val_thresh
p_val_thresh = 1e-10

def get_snp_pos(row) :
    """Return the position of the SNP relative to the row's start/end interval.

    The result is stored as 'rel_snp_pos' and used as a string index into
    row['seq'] by get_var_seq.

    row: mapping with 'strand', 'snp_pos', 'start' and 'end'.
    '+' strand: snp_pos - start - 1; any other strand: end - snp_pos.
    """
    snp_coord = row['snp_pos']
    if row['strand'] != '+' :
        return row['end'] - snp_coord
    return snp_coord - row['start'] - 1

def get_snp_ref(row) :
    """Return the reference allele as read on the row's strand.

    row: mapping with 'strand' and 'snp_ref'.
    '+' strand: snp_ref is returned unchanged.
    Any other strand: the complementary base (A<->T, C<->G) is returned;
    an allele that is not exactly one of 'A', 'C', 'G', 'T' yields None.
    """
    if row['strand'] == '+' :
        return row['snp_ref']

    for base, partner in (('A', 'T'), ('C', 'G'), ('G', 'C'), ('T', 'A')) :
        if row['snp_ref'] == base :
            return partner

    return None

def get_snp_var(row) :
    """Return the variant allele as read on the row's strand.

    row: mapping with 'strand' and 'snp_var'.
    '+' strand: snp_var is returned unchanged.
    Any other strand: the complementary base (A<->T, C<->G) is returned;
    an allele that is not exactly one of 'A', 'C', 'G', 'T' yields None.
    """
    if row['strand'] == '+' :
        return row['snp_var']

    for base, partner in (('A', 'T'), ('C', 'G'), ('G', 'C'), ('T', 'A')) :
        if row['snp_var'] == base :
            return partner

    return None

def get_var_seq(row) :
    """Return row['seq'] with the character at index rel_snp_pos replaced by rel_snp_var.

    row: mapping with 'seq' (string), 'rel_snp_pos' (integer index)
    and 'rel_snp_var' (replacement string).
    """
    ref_seq = row['seq']
    snp_index = row['rel_snp_pos']

    head = ref_seq[:snp_index]
    tail = ref_seq[snp_index + 1:]

    return head + row['rel_snp_var'] + tail

#Per-tissue pipeline: intersect the tissue's cis-3'aQTL SNPs with the isoform intervals,
#build reference/variant sequences, drop lead SNPs, filter on p-value and load the PDUI table.
for tissue_type in tissue_types :
    
    print("Processing data for tissue = '" + str(tissue_type) + "'.")
    
    snp_df = pd.read_csv(tissue_type + '.cis.3aQTL.txt', sep='\t')

    #The 'SNP' id is split on '_': field 0 -> chr, field 1 -> start; end = start + 1.
    #Fields 2 and 3 are read further down as the ref and var alleles.
    snp_df['chr'] = snp_df['SNP'].apply(lambda x: x.split("_")[0])
    snp_df['start'] = snp_df['SNP'].apply(lambda x: x.split("_")[1])
    snp_df['end'] = snp_df['SNP'].apply(lambda x: str(int(x.split("_")[1]) + 1))
    #The gene name is the second '|'-separated field of 'transcript'
    snp_df['gene'] = snp_df['transcript'].apply(lambda x: x.split("|")[1])

    snp_df = snp_df[['chr', 'start', 'end', 'gene', 'SNP', 'transcript', 'beta', 't.stat', 'p.value']]

    #Headerless, tab-separated file consumed by bedtools as the -b input
    snp_df.to_csv(tissue_type + version_suffix + ".cis.3aQTL.coordinates.bed", sep='\t', header=False, index=False)

    #Shell call (IPython '!'): -wa -wb writes the columns of both inputs for every overlap,
    #hence the 6 interval columns + 9 SNP columns named when the result is read back.
    !bedtools intersect -a isoform_utr3_only_coordinates.bed -b {tissue_type + version_suffix}.cis.3aQTL.coordinates.bed -wa -wb > isoform_{tissue_type + version_suffix}_intersect.bed
    
    #NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in 2.0
    #(replacement: on_bad_lines='skip'); this call only runs on older pandas versions.
    snp_bed_hg19 = pd.read_csv("isoform_" + tissue_type + version_suffix + "_intersect.bed", sep='\t', error_bad_lines=False, names=['chrom', 'start', 'end', 'gene', 'gene_id', 'strand', 'chr_2', 'snp_pos', 'end_2', 'gene_2', 'snp_id', 'transcript_id', 'effect_size', 'test_statistic', 'p_val'])
    #Keep only overlaps where the interval's gene equals the gene taken from the aQTL record
    snp_bed_hg19 = snp_bed_hg19.query("gene == gene_2").copy().reset_index(drop=True)

    snp_bed_hg19 = snp_bed_hg19[['chrom', 'start', 'end', 'gene', 'gene_id', 'strand', 'chr_2', 'snp_pos', 'end_2', 'gene_2', 'snp_id', 'transcript_id', 'effect_size', 'test_statistic', 'p_val']]

    snp_bed_hg19['snp_ref'] = snp_bed_hg19['snp_id'].apply(lambda s: s.split("_")[2])
    snp_bed_hg19['snp_var'] = snp_bed_hg19['snp_id'].apply(lambda s: s.split("_")[3])
    
    isoform_df_sel = isoform_df[['seq', 'gene_id']].copy().reset_index(drop=True)

    #Attach the isoform sequence to every overlapping SNP via gene_id
    snp_df = snp_bed_hg19.join(isoform_df_sel.set_index('gene_id'), on='gene_id', how='inner').copy().reset_index(drop=True)
    
    #Strand-aware SNP index and alleles (see get_snp_pos / get_snp_ref / get_snp_var)
    snp_df['rel_snp_pos'] = snp_df.apply(get_snp_pos, axis=1)
    snp_df['rel_snp_ref'] = snp_df.apply(get_snp_ref, axis=1)
    snp_df['rel_snp_var'] = snp_df.apply(get_snp_var, axis=1)

    #NOTE(review): get_snp_var returns None for alleles other than A/C/G/T on non-'+' strands,
    #in which case the string concatenation in get_var_seq raises TypeError - confirm inputs are SNVs only.
    snp_df['seq_var'] = snp_df.apply(get_var_seq, axis=1)
    
    #Filter by lead SNPs (remove lead SNPs)
    lead_df = pd.read_csv("lead/" + tissue_type + ".lead.3aQTL.txt", sep='\t')
    lead_df['is_lead'] = True
    
    #Left join: rows without a match in the lead table get a null is_lead and are the ones kept
    snp_df = snp_df.join(lead_df[['SNP', 'is_lead']].set_index("SNP"), on='snp_id', how='left')
    snp_df = snp_df.loc[snp_df['is_lead'].isnull()].copy()
    
    #Filter on p-value
    snp_df = snp_df.query("p_val <= " + str(p_val_thresh)).copy()
    
    #Calculate mean PDUIs
    pdui_df = pd.read_csv(tissue_type + "_combined_All_PDUIs_clean.txt", sep='\t')
    #Keep only PDUI rows whose event_id matches a transcript_id of a remaining SNP.
    #NOTE(review): transcript_id can repeat in snp_df (one row per SNP), so this inner join presumably
    #repeats the matching pdui_df row once per SNP - confirm the later drop_duplicates on row_id is what cleans this up.
    pdui_df = pdui_df.join(snp_df[['transcript_id']].set_index('transcript_id'), on='event_id', how='inner').copy().reset_index(drop=True)

    #All columns from the third one onward are treated as per-sample value columns
    gtex_samples = pdui_df.columns.values.tolist()[2:]

    def calc_mean_PDUI(row) :
        """Return the mean of the non-NaN per-sample values of one pdui_df row.

        Only the columns listed in gtex_samples (defined just above in the
        enclosing loop) are read, and they are summed in list order.

        row: mapping from sample column name to a float value (NaN = missing).
        Returns NaN when every sample value is NaN. The previous version raised
        ZeroDivisionError in that case (0. / 0. on Python floats), aborting the
        whole per-tissue loop.
        """
        n_samples = 0.
        PDUI = 0.

        for gtex_sample in gtex_samples :
            sample_value = row[gtex_sample]
            if not np.isnan(sample_value) :
                PDUI += sample_value
                n_samples += 1.

        #No usable sample: the mean is undefined
        if n_samples == 0. :
            return np.nan

        return PDUI / n_samples

    #Mean PDUI per pdui_df row, attached to every SNP row whose transcript_id equals that row's event_id
    pdui_df['mean_PDUI'] = pdui_df.apply(calc_mean_PDUI, axis=1)

    snp_df = snp_df.join(pdui_df[['event_id', 'mean_PDUI']].set_index('event_id'), on='transcript_id', how='inner').copy().reset_index(drop=True)

    #Keep only the part of transcript_id before the first '|'
    snp_df['transcript_id'] = snp_df['transcript_id'].apply(lambda x: x.split("|")[0])
    snp_df['snp_transcript_id'] = snp_df['snp_id'] + "_" + snp_df['transcript_id']
    
    #transcript id -> 3'-most annotated coordinate; -1 means "not seen in the GTF (yet)"
    transcript_end_dict = dict.fromkeys(snp_df['transcript_id'], -1)

    #NOTE(review): absolute, user-specific path; the same file is re-scanned for every tissue.
    with open("/home/jlinder2/Downloads/GRCh37_latest_genomic.gtf", 'rt') as f :
        #Iterate over the file object so the GTF is streamed line by line
        #instead of being loaded completely into memory with readlines()
        for line_raw in f :

            #Split each line once; lines with fewer than 9 tab-separated fields are skipped
            fields = line_raw.strip().split("\t")
            if len(fields) <= 8 :
                continue

            start = fields[3]
            end = fields[4]
            strand = fields[6]

            t_id = ""
            if strand in ['+', '-'] :
                #Second ';'-separated attribute: value between the double quotes, with everything from the first '.' dropped
                t_id = fields[8].split(";")[1].split("\"")[1].split(".")[0]

            if t_id not in transcript_end_dict :
                continue

            #'+' strand tracks the largest end, any other strand the smallest start
            if transcript_end_dict[t_id] == -1 :
                transcript_end_dict[t_id] = int(end) if strand == '+' else int(start)
            elif strand == '+' :
                transcript_end_dict[t_id] = max(transcript_end_dict[t_id], int(end))
            else :
                transcript_end_dict[t_id] = min(transcript_end_dict[t_id], int(start))

    #NOTE(review): transcripts never found in the GTF keep the -1 sentinel, which then flows into snp_distance
    qtl_cut_poses = [transcript_end_dict[tid] for tid in snp_df['transcript_id']]

    snp_df['qtl_cut_pos'] = qtl_cut_poses
    snp_df['snp_distance'] = np.abs(snp_df['qtl_cut_pos'] - snp_df['snp_pos'])

    #Drop duplicates
    snp_df['row_id'] = snp_df['gene_id'] + "_" + snp_df['snp_id'] + "_" + snp_df['transcript_id']
    snp_df = snp_df.sort_values(by='snp_id').drop_duplicates(subset=['row_id'], keep='first')
    
    #Load processed lead aQTL dataframe (for sizematching)
    lead_snp_df = pd.read_csv("isoform_" + tissue_type + "_utr3_only_iso_lead_SNPs.csv", sep='\t')
    
    #Sort dataframe in ascending order of absolute effect size
    #and keep at most twice as many rows as there are rows in the lead dataframe
    snp_df['abs_effect_size'] = np.abs(snp_df['effect_size'])
    snp_df = snp_df.sort_values(by='abs_effect_size', ascending=True).iloc[:2*len(lead_snp_df)].copy().reset_index(drop=True)

    #Save final apaQTL dataframe
    snp_df.to_csv("isoform_" + tissue_type + version_suffix + "_SNPs.csv", sep='\t')


Processing data for tissue = 'Adipose_Subcutaneous'.
Processing data for tissue = 'Adipose_Visceral_Omentum'.
Processing data for tissue = 'Adrenal_Gland'.
Processing data for tissue = 'Artery_Aorta'.
Processing data for tissue = 'Artery_Coronary'.
Processing data for tissue = 'Artery_Tibial'.
Processing data for tissue = 'Brain_Amygdala'.
Processing data for tissue = 'Brain_Anterior_cingulate_cortex_BA24'.
Processing data for tissue = 'Brain_Caudate_basal_ganglia'.
Processing data for tissue = 'Brain_Cerebellar_Hemisphere'.
Processing data for tissue = 'Brain_Cortex'.
Processing data for tissue = 'Brain_Frontal_Cortex_BA9'.
Processing data for tissue = 'Brain_Hippocampus'.
Processing data for tissue = 'Brain_Hypothalamus'.
Processing data for tissue = 'Brain_Nucleus_accumbens_basal_ganglia'.
Processing data for tissue = 'Brain_Putamen_basal_ganglia'.
Processing data for tissue = 'Brain_Spinal_cord_cervical_c-1'.
Processing data for tissue = 'Breast_Mammary_Tissue'.
Processing data for

In [11]:
#Merge tissue dataframes: read every per-tissue PolyADB SNP file, tag its rows
#with the tissue name and stack all of them into a single frame

snp_dfs = []

for tissue_type in tissue_types :

    print("Processing tissue = '" + tissue_type + "'")

    tissue_csv_path = "polyadb_" + tissue_type + version_suffix + "_SNPs.csv"

    tissue_snp_df = pd.read_csv(tissue_csv_path, sep='\t')
    tissue_snp_df['tissue_type'] = tissue_type

    snp_dfs.append(tissue_snp_df)

stacked_df = pd.concat(snp_dfs)
snp_df = stacked_df.copy().reset_index(drop=True)


Processing tissue = 'Adipose_Subcutaneous'
Processing tissue = 'Adipose_Visceral_Omentum'
Processing tissue = 'Adrenal_Gland'
Processing tissue = 'Artery_Aorta'
Processing tissue = 'Artery_Coronary'
Processing tissue = 'Artery_Tibial'
Processing tissue = 'Brain_Amygdala'
Processing tissue = 'Brain_Anterior_cingulate_cortex_BA24'
Processing tissue = 'Brain_Caudate_basal_ganglia'
Processing tissue = 'Brain_Cerebellar_Hemisphere'
Processing tissue = 'Brain_Cortex'
Processing tissue = 'Brain_Frontal_Cortex_BA9'
Processing tissue = 'Brain_Hippocampus'
Processing tissue = 'Brain_Hypothalamus'
Processing tissue = 'Brain_Nucleus_accumbens_basal_ganglia'
Processing tissue = 'Brain_Putamen_basal_ganglia'
Processing tissue = 'Brain_Spinal_cord_cervical_c-1'
Processing tissue = 'Breast_Mammary_Tissue'
Processing tissue = 'Cells_EBV-transformed_lymphocytes'
Processing tissue = 'Cells_Transformed_fibroblasts'
Processing tissue = 'Colon_Sigmoid'
Processing tissue = 'Colon_Transverse'
Processing tissu

In [12]:

#merged_df is a second name for the concatenated per-tissue frame (plain assignment, no copy)
merged_df = snp_df


In [13]:

#For every (snp_id, gene_id, transcript_id) combination, collect the effect size
#found in each per-tissue PolyADB file that contains it
snp_dict = {}

for tissue_type in tissue_types :

    snp_df = pd.read_csv("polyadb_" + tissue_type + version_suffix + "_SNPs.csv", sep='\t')

    for _, row in snp_df.iterrows() :

        extended_id = row['snp_id'] + "__" + row['gene_id'] + "__" + row['transcript_id']

        #setdefault creates the per-id record on first sight, then the value is appended
        snp_dict.setdefault(extended_id, { 'effect_size' : [] })['effect_size'].append(row['effect_size'])

#One output row per id: median effect size and number of collected values
snp_ids = list(snp_dict)
effect_sizes = [np.median(snp_dict[key]['effect_size']) for key in snp_ids]
n_tissues = [len(snp_dict[key]['effect_size']) for key in snp_ids]

median_df = pd.DataFrame({
    'extended_snp_id' : snp_ids,
    'effect_size' : effect_sizes,
    'n_tissues' : n_tissues
})
median_df = median_df[['extended_snp_id', 'effect_size', 'n_tissues']]


In [14]:

#Build the same snp_id / gene_id / transcript_id key that indexes median_df
merged_df['extended_snp_id'] = merged_df['snp_id'] + "__" + merged_df['gene_id'] + "__" + merged_df['transcript_id']

#Attach the median columns; median_df's 'effect_size' overlaps with the per-tissue
#column of the same name and is therefore stored as 'effect_size_median'
median_lookup = median_df.set_index('extended_snp_id')
merged_df = merged_df.join(median_lookup, on="extended_snp_id", how='inner', rsuffix='_median')
merged_df = merged_df.copy().reset_index(drop=True)


In [15]:

#Write the merged PolyADB frame as a tab-separated file (the index is written as the first column)
merged_csv_path = "polyadb_merged" + version_suffix + "_SNPs.csv"
merged_df.to_csv(merged_csv_path, sep='\t')


In [16]:
#Merge tissue dataframes (UTR isoforms): read every per-tissue isoform SNP file,
#tag its rows with the tissue name and stack all of them into a single frame

snp_dfs = []

for tissue_type in tissue_types :

    print("Processing tissue = '" + tissue_type + "'")

    tissue_csv_path = "isoform_" + tissue_type + version_suffix + "_SNPs.csv"

    tissue_snp_df = pd.read_csv(tissue_csv_path, sep='\t')
    tissue_snp_df['tissue_type'] = tissue_type

    snp_dfs.append(tissue_snp_df)

stacked_df = pd.concat(snp_dfs)
snp_df = stacked_df.copy().reset_index(drop=True)


Processing tissue = 'Adipose_Subcutaneous'
Processing tissue = 'Adipose_Visceral_Omentum'
Processing tissue = 'Adrenal_Gland'
Processing tissue = 'Artery_Aorta'
Processing tissue = 'Artery_Coronary'
Processing tissue = 'Artery_Tibial'
Processing tissue = 'Brain_Amygdala'
Processing tissue = 'Brain_Anterior_cingulate_cortex_BA24'
Processing tissue = 'Brain_Caudate_basal_ganglia'
Processing tissue = 'Brain_Cerebellar_Hemisphere'
Processing tissue = 'Brain_Cortex'
Processing tissue = 'Brain_Frontal_Cortex_BA9'
Processing tissue = 'Brain_Hippocampus'
Processing tissue = 'Brain_Hypothalamus'
Processing tissue = 'Brain_Nucleus_accumbens_basal_ganglia'
Processing tissue = 'Brain_Putamen_basal_ganglia'
Processing tissue = 'Brain_Spinal_cord_cervical_c-1'
Processing tissue = 'Breast_Mammary_Tissue'
Processing tissue = 'Cells_EBV-transformed_lymphocytes'
Processing tissue = 'Cells_Transformed_fibroblasts'
Processing tissue = 'Colon_Sigmoid'
Processing tissue = 'Colon_Transverse'
Processing tissu

In [17]:

#merged_df is a second name for the concatenated per-tissue isoform frame (plain assignment, no copy)
merged_df = snp_df


In [18]:

#For every (snp_id, gene_id, transcript_id) combination, collect the effect size
#found in each per-tissue isoform file that contains it
snp_dict = {}

for tissue_type in tissue_types :

    snp_df = pd.read_csv("isoform_" + tissue_type + version_suffix + "_SNPs.csv", sep='\t')

    for _, row in snp_df.iterrows() :

        extended_id = row['snp_id'] + "__" + row['gene_id'] + "__" + row['transcript_id']

        #Create the per-id record the first time the id is seen
        if extended_id not in snp_dict :
            snp_dict[extended_id] = { 'effect_size' : [] }

        snp_dict[extended_id]['effect_size'].append(row['effect_size'])

#One output row per id: median effect size and number of collected values
snp_ids = []
effect_sizes = []
n_tissues = []

for extended_id, record in snp_dict.items() :
    collected = record['effect_size']

    snp_ids.append(extended_id)
    effect_sizes.append(np.median(collected))
    n_tissues.append(len(collected))

median_df = pd.DataFrame({
    'extended_snp_id' : snp_ids,
    'effect_size' : effect_sizes,
    'n_tissues' : n_tissues
})
median_df = median_df[['extended_snp_id', 'effect_size', 'n_tissues']]


In [19]:

#Build the same snp_id / gene_id / transcript_id key that indexes median_df
merged_df['extended_snp_id'] = merged_df['snp_id'] + "__" + merged_df['gene_id'] + "__" + merged_df['transcript_id']

#Attach the median columns; median_df's 'effect_size' overlaps with the per-tissue
#column of the same name and is therefore stored as 'effect_size_median'
median_lookup = median_df.set_index('extended_snp_id')
merged_df = merged_df.join(median_lookup, on="extended_snp_id", how='inner', rsuffix='_median')
merged_df = merged_df.copy().reset_index(drop=True)


In [20]:

#Write the merged isoform frame as a tab-separated file (the index is written as the first column)
merged_csv_path = "isoform_merged" + version_suffix + "_SNPs.csv"
merged_df.to_csv(merged_csv_path, sep='\t')
