In [1]:
import os
import pandas as pd
import numpy as np
import h5py
from collections import Counter
try:
    import cPickle as pickle
except:
    import pickle

import sys
sys.path.append(os.path.abspath('../src'))

from enrichment_helper import *

from statsmodels.sandbox.stats.multicomp import multipletests
from statsmodels.stats.proportion import proportion_confint
import scipy.stats as stats

import sqlite3
with open('../src/ensembl2genename_GRCh38.85.p', 'rb') as reader:
    gid2gn = pickle.load(reader)   
    
  

In [2]:
selected_phenotype = 'AdvancedAMD_2015'
lincs_file = 'CD_signatures/AMD_lincs_l1000.p'
fn_out = 'AMD_targets.csv'
mgi_search_criteria = lambda x: ('age related macular degeneration' in x.lower())
creed_search_criteria = lambda x: 'macular' in x.lower()

In [3]:
## MGI
hg2mg, mg, mg2pheno, pheno, pheno2name = load_mgi('../../../project_wrapup/GSK_project/GWAS_component/data/MGI/')  

df = pd.read_csv('../../../project_wrapup/GSK_project/GWAS_component/data/MGI/MGI_DO.rpt', sep = '\t')

dis2pheno = {}
for d, m in zip(df['DO Disease Name'], df['Mouse MGI ID']):
    if mgi_search_criteria(d) and m != None:
        if m in mg:
            print d, m, m in mg, len(mg2pheno[m])
            dis2pheno[m] = set(mg2pheno[m])
        #else:
            #print d, m, m in mg

age related macular degeneration MGI:98259 True 118
age related macular degeneration MGI:1203290 True 12
age related macular degeneration MGI:106185 True 113
age related macular degeneration 1 MGI:88518 True 22
age related macular degeneration 1 MGI:98935 True 56
age related macular degeneration 12 MGI:1333815 True 55
age related macular degeneration 4 MGI:88385 True 35


In [4]:
with open('./gcomp_drive_genes.p', 'rb') as reader:
    drivers = set([gid2gn[item[0]] for s, item in pickle.load(reader)[selected_phenotype]])

In [5]:
#!/usr/bin/env python
import pandas
import sqlite3

connection = sqlite3.connect("../../data/gwas_g2p/metaxcan_results_p/metaxcan_results_v1.5.db")

query = 'SELECT g.gene_name, m.zscore, m.n_snps_used, m.n_snps_model, p.tag as phenotype, t.tissue as tissue, g.gene ' 
query += ' FROM gene AS g INNER JOIN metaxcan_result AS m ON g.id = m.gene_id' 
query += ' INNER JOIN tissue AS t ON t.id = m.tissue_id  INNER JOIN pheno AS p ON p.id = m.pheno_id'
query += ' WHERE p.tag = \'%s\'' % selected_phenotype

pi = pandas.read_sql_query(query, connection)
pi = pi[pi.tissue != 'DGN_WB']


In [6]:
from scipy.stats import norm 
import statsmodels.stats.multitest as multi

pvals = norm.sf(np.abs(pi.zscore)) * 2
_, res_adjp, _, _ = multi.multipletests(pvals)


In [7]:
sgenes = set(pi.gene_name[res_adjp < 0.05])
print len(sgenes)

145


In [8]:
mgi_res_meta = mgi_match(sgenes, hg2mg, mg2pheno, dis2pheno, pheno, alpha_sig = (0.05) / float(len(dis2pheno)))
mgi_hit_meta = set([g for g, c in mgi_res_meta.items()])
n1 = len(mgi_hit_meta)
n0 = len([g for g in sgenes if g in hg2mg])
print(n1, n0, float(n1) / float(n0), proportion_confint(n1, n0))

(29, 128, 0.2265625, (0.15404384631292495, 0.29908115368707505))


In [9]:
mgi_res_gc = mgi_match(drivers, hg2mg, mg2pheno, dis2pheno, pheno, alpha_sig = (0.05) / float(len(dis2pheno)))
mgi_hit_gc = set([g for g, c in mgi_res_gc.items()])
n1 = len(mgi_hit_gc)
n0 = len([g for g in drivers if g in hg2mg])
print(n1, n0, float(n1) / float(n0), proportion_confint(n1, n0))

(42, 135, 0.3111111111111111, (0.2330178502227221, 0.38920437199950014))


In [10]:
s = ''
for i, g in enumerate(drivers | sgenes):
    if (i % 8) == 0:
        print(s)
        s = ''
    s += '\'%s\', ' % g


'FAM213A', 'C4A', 'STK19', 'TRIM15', 'TMEM229B', 'RSPH4A', 'CKM', 'CHSY3', 
'NELFE', 'BOK', 'MYL2', 'TNNC1', 'TNNC2', 'APOC1', 'RASAL1', 'SFTA2', 
'MICB', 'ADIPOQ', 'PLVAP', 'C7orf61', 'DDX39B', 'MT-ND5', 'MT-ND6', 'DMBT1', 
'TYROBP', 'LHX9', 'PPT2', 'PRRT1', 'THRB', 'PLEKHH1', 'SLC12A5', 'LAD1', 
'SPTLC3', 'GPR108', 'RELB', 'RP5-862P8.2', 'FGA', 'FGB', 'IRX2', 'TCAP', 
'LIPC', 'CETP', 'ARHGEF15', 'SLC38A3', 'SHC3', 'SNTG1', 'ARHGEF19', 'PRRC2A', 
'HLA-DRB1', 'CRB1', 'TSC22D4', 'AGER', 'LDB3', 'RDH5', 'MEPCE', 'CGNL1', 
'CSF3R', 'COL11A2', 'PABPC1L2B-AS1', 'BRD2', 'CXCL2', 'LST1', 'RBM47', 'FKBPL', 
'C1QA', 'NOTCH4', 'MEFV', 'MUC21', 'PLEKHA1', 'LSAMP', 'RNF5', 'TCF19', 
'SCN3A', 'DDAH2', 'CSF2RB', 'NIT2', 'STMN2', 'ADORA1', 'HLA-B', 'CDKN1A', 
'CELF3', 'NMRK2', 'KHDRBS3', 'MYO18B', 'ATF6B', 'NRXN1', 'SLC44A4', 'FAM200A', 
'KLHDC3', 'PKHD1L1', 'SLC9A4', 'NEK7', 'HCG27', 'ABCF1', 'KCNT2', 'PLA2G12A', 
'APOM', 'ZCWPW1', 'TBC1D23', 'PC', 'PILRB', 'SKIV2L', 'HRH2', 'TTN', 
'C4B', 'HLA-DQB

In [11]:
## Lincs 1000
df0 = pd.read_csv('../../../data/CREED/Disease_entire_exp_landmark.tsv.gz', sep = '\t')
df1 = pd.read_csv('../../../data/CREED/Disease_signatures_info_data.tsv', sep = '\t')

ind = [d0 for d0, d1 in zip(df1.id, df1.disease_name) if creed_search_criteria(d1)]
dis = df0[ind].values.T
dis = dis / np.linalg.norm(dis, axis = 1, keepdims=True)
print dis.shape

(1, 978)


In [12]:
with open('CD_signatures/REF_lincs_l1000.p', 'rb') as f:
    res = pickle.load(f)

cd_ref = np.vstack([item[u'chdirLm'] for item in res]) 
cd_ref /= (np.sqrt(np.sum(cd_ref **2, axis = 1)) + 1e-5)[:, np.newaxis]

with open(lincs_file, 'rb') as f:
    res_shrna = pickle.load(f)

In [13]:
lincs_res_meta = lincs_match(sgenes, res_shrna, dis, cd_ref, alpha_sig = 0.05)
lincs_hit_meta = set([g for g, item in lincs_res_meta.items() if item[0] > 0])
n1 = len(lincs_hit_meta)
n0 = len(lincs_res_meta)
print(n1, n0, float(n1) / float(n0), proportion_confint(n1, n0))

(5, 54, 0.09259259259259259, (0.01528173453074011, 0.16990345065444507))


In [14]:
lincs_res_gc = lincs_match(drivers, res_shrna, dis, cd_ref, alpha_sig = 0.05)
lincs_hit_gc = set([g for g, item in lincs_res_gc .items() if item[0] > 0])
n1 = len(lincs_hit_gc)
n0 = len(lincs_res_gc)
print(n1, n0, float(n1) / float(n0), proportion_confint(n1, n0))

(3, 51, 0.058823529411764705, (-0.005752906657967544, 0.12339996548149695))


In [15]:
id2gene = sorted( drivers | sgenes)

In [16]:
mgi_genes = set([g for g in (drivers|sgenes) if g in hg2mg])
mgi_hits = (mgi_hit_gc | mgi_hit_meta)
lincs_genes =  (set(lincs_res_gc) | set(lincs_res_meta))
lincs_hits = (lincs_hit_gc | lincs_hit_meta)

In [17]:
df_res = []
for item in [drivers, sgenes, lincs_hits, lincs_genes, mgi_hits, mgi_genes]:
    vec = [int(g in item) for g in id2gene]
    df_res.append(vec)
df_res = pd.DataFrame(df_res).T

In [18]:
df_res.columns = ['genes_selected_by_GCA+BNs', 'genes_selected_by_metaxcan', 
                  'LINCS_hit', 'genes_found_in_LINCS', 'MGI_hit', 'genes_found_in_MGI']
df_res.index = id2gene

df_res.to_csv(fn_out)