In [1]:
import os
import pandas as pd
import numpy as np
import h5py
from collections import Counter
# Prefer the C-accelerated pickle module when it is available and fall back
# to the pure-Python/standard module otherwise.
try:
    import cPickle as pickle
except ImportError:
    # Only a missing module should trigger the fallback; any other error
    # (e.g. KeyboardInterrupt) must not be silently swallowed.
    import pickle

import sys
sys.path.append(os.path.abspath('../src'))

from enrichment_helper import *

from statsmodels.sandbox.stats.multicomp import multipletests
from statsmodels.stats.proportion import proportion_confint
import scipy.stats as stats

import sqlite3
# Load the pickled lookup table stored next to the helper sources.
# Judging by the file name it maps Ensembl gene ids to gene names
# (GRCh38, release 85) -- TODO confirm against the file's producer.
# NOTE: pickle.load executes arbitrary code; only use with trusted files.
with open('../src/ensembl2genename_GRCh38.85.p', 'rb') as mapping_file:
    gid2gn = pickle.load(mapping_file)
    
  

In [2]:
# --- Per-phenotype configuration -------------------------------------------
# Tag used to select rows for this phenotype in the inputs read below.
selected_phenotype = 'RA_OKADA_TRANS_ETHNIC'
# Pickle with the phenotype-specific LINCS L1000 signatures.
lincs_file = 'CD_signatures/RA_lincs_l1000.p'
# Destination of the final per-gene indicator table.
fn_out = 'RA_targets.csv'


def mgi_search_criteria(x):
    """Return True when the disease name ``x`` mentions 'rheumatoid arthritis' (case-insensitive)."""
    return 'rheumatoid arthritis' in x.lower()


def creed_search_criteria(x):
    """Return True when the disease name ``x`` mentions 'rheumatoid' (case-insensitive)."""
    return 'rheumatoid' in x.lower()

In [3]:
## MGI
# Directory holding the MGI files; used both by load_mgi and for the
# disease-ontology report read below.
mgi_dir = '../../../project_wrapup/GSK_project/GWAS_component/data/MGI/'
hg2mg, mg, mg2pheno, pheno, pheno2name = load_mgi(mgi_dir)

df = pd.read_csv(os.path.join(mgi_dir, 'MGI_DO.rpt'), sep = '\t')

# Collect, for every mouse gene annotated with a matching disease name, the
# set of phenotypes recorded for that gene in mg2pheno.
dis2pheno = {}
for d, m in zip(df['DO Disease Name'], df['Mouse MGI ID']):
    # pandas represents empty cells as NaN (a float), which is neither None
    # nor a string: skip such rows instead of failing inside .lower().
    if pd.isnull(d) or pd.isnull(m):
        continue
    if mgi_search_criteria(d) and m in mg:
        # Single formatted argument so the output is the same under the
        # Python 2 print statement and the Python 3 print function.
        print('%s %s %s %d' % (d, m, m in mg, len(mg2pheno[m])))
        dis2pheno[m] = set(mg2pheno[m])

rheumatoid arthritis MGI:104798 True 221
rheumatoid arthritis MGI:1329019 True 32
rheumatoid arthritis MGI:99180 True 52
rheumatoid arthritis MGI:101900 True 142
rheumatoid arthritis MGI:99613 True 59
rheumatoid arthritis MGI:96560 True 165


In [4]:
with open('./gcomp_drive_genes.p', 'rb') as reader:
    drivers = set([gid2gn[item[0]] for s, item in pickle.load(reader)[selected_phenotype]])

In [5]:
#!/usr/bin/env python
import pandas
import sqlite3

# Fetch every MetaXcan gene/tissue association stored for the selected
# phenotype and keep all tissues except 'DGN_WB'.
connection = sqlite3.connect("../../data/gwas_g2p/metaxcan_results_p/metaxcan_results_v1.5.db")

# The phenotype tag is passed as a bound parameter ('?') rather than being
# interpolated into the SQL text, so quotes in the tag cannot break or alter
# the statement.
query = 'SELECT g.gene_name, m.zscore, m.n_snps_used, m.n_snps_model, p.tag as phenotype, t.tissue as tissue, g.gene '
query += ' FROM gene AS g INNER JOIN metaxcan_result AS m ON g.id = m.gene_id'
query += ' INNER JOIN tissue AS t ON t.id = m.tissue_id  INNER JOIN pheno AS p ON p.id = m.pheno_id'
query += ' WHERE p.tag = ?'

try:
    pi = pandas.read_sql_query(query, connection, params=(selected_phenotype,))
finally:
    # Release the database handle even when the query fails.
    connection.close()

pi = pi[pi.tissue != 'DGN_WB']


In [6]:
from scipy.stats import norm 
import statsmodels.stats.multitest as multi

# Two-sided p-value for each z-score: twice the upper-tail probability of |z|
# under the standard normal distribution.
pvals = 2 * norm.sf(pi['zscore'].abs())
# multipletests returns a 4-tuple; only the adjusted p-values (second entry)
# are kept. The call relies on the library defaults for method and alpha
# (Holm-Sidak at 0.05 per the statsmodels docs -- confirm for the installed version).
res_adjp = multi.multipletests(pvals)[1]


In [7]:
# Names of the genes with at least one association whose adjusted p-value is
# below 0.05 (duplicates across tissues collapse in the set).
significant_rows = res_adjp < 0.05
sgenes = set(pi['gene_name'][significant_rows])
print(len(sgenes))

232


In [8]:
# MGI comparison for the MetaXcan-selected genes.
# The significance level handed to mgi_match is 0.05 divided by the number of
# disease-associated mouse genes collected in dis2pheno.
mgi_alpha = 0.05 / float(len(dis2pheno))
mgi_res_meta = mgi_match(sgenes, hg2mg, mg2pheno, dis2pheno, pheno, alpha_sig=mgi_alpha)
# Every key returned by mgi_match counts as a hit.
mgi_hit_meta = set(mgi_res_meta.keys())
n1 = len(mgi_hit_meta)
# Denominator: selected genes that have an entry in hg2mg.
n0 = sum(1 for gene in sgenes if gene in hg2mg)
# Hit count, testable genes, hit fraction and its confidence interval.
print(n1, n0, n1 / float(n0), proportion_confint(n1, n0))

(57, 206, 0.2766990291262136, (0.21560794636570374, 0.3377901118867234))


In [9]:
# MGI comparison for the driver genes (same procedure as for the
# MetaXcan-selected genes, applied to `drivers`).
# The significance level handed to mgi_match is 0.05 divided by the number of
# disease-associated mouse genes collected in dis2pheno.
mgi_alpha = 0.05 / float(len(dis2pheno))
mgi_res_gc = mgi_match(drivers, hg2mg, mg2pheno, dis2pheno, pheno, alpha_sig=mgi_alpha)
# Every key returned by mgi_match counts as a hit.
mgi_hit_gc = set(mgi_res_gc.keys())
n1 = len(mgi_hit_gc)
# Denominator: driver genes that have an entry in hg2mg.
n0 = sum(1 for gene in drivers if gene in hg2mg)
# Hit count, testable genes, hit fraction and its confidence interval.
print(n1, n0, n1 / float(n0), proportion_confint(n1, n0))

(73, 148, 0.49324324324324326, (0.4126965752141444, 0.5737899112723421))


In [10]:
# Print the union of both gene selections as quoted, comma-separated names,
# eight per row, ready to paste into a Python list.
# Slicing in steps of eight guarantees that the last, possibly shorter, row
# is printed too and that no empty line is emitted before the first row.
all_genes = list(drivers | sgenes)
for start in range(0, len(all_genes), 8):
    print(''.join('\'%s\', ' % g for g in all_genes[start:start + 8]))


'TAP1', 'PGAP3', 'TAP2', 'C6orf25', 'TBX21', 'HIST1H4J', 'HIST1H4H', 'CPE', 
'NELFE', 'ST7L', 'HIPK1', 'MFAP4', 'MYL2', 'SFTA2', 'LEMD2', 'KIF2C', 
'DCN', 'COL11A2', 'C6orf123', 'TYROBP', 'GSDMB', 'CKM', 'SKIV2L', 'MAGI3', 
'DLGAP5', 'PRKG1', 'B3GALT4', 'HLA-DPB1', 'ARHGEF19', 'TRIM26', 'HLA-DQB2', 'ZNRD1', 
'PAPPA', 'PCOLCE', 'CSF3R', 'PRRT1', 'CYP4X1', 'BRD2', 'INPP5B', 'CNN1', 
'ORMDL3', 'LST1', 'FKBPL', 'BAK1', 'C2ORF15', 'ZBTB22', 'DHX16', 'ZSCAN12', 
'TCF19', 'CD3E', 'ZSCAN16', 'LRRN4', 'ATP2A1', 'CTLA4', 'ANO5', 'CDKN1A', 
'ANO1', 'PHF1', 'UHRF1BP1', 'GABBR1', 'AP4B1', 'C6orf10', 'IFIT1', 'SLC9A4', 
'DAXX', 'HCG27', 'PPP1R1B', 'WDR46', 'APOM', 'AFF3', 'PAM', 'RAPH1', 
'HLA-DQB1', 'C4A', 'C4B', 'HRH1', 'C6orf15', 'MRGPRF-AS1', 'PLCB2', 'HLA-DOB', 
'GBP5', 'PPP1R11', 'STEAP1', 'COL4A1', 'GIMAP4', 'PPP1R18', 'BTNL2', 'PKNOX2', 
'SLC39A7', 'ITGAX', 'PPP1R10', 'CDHR1', 'DDR1', 'KCNH8', 'RING1', 'CSNK2B', 
'MELK', 'FLOT1', 'TTC34', 'SORT1', 'NOV', 'MOG', 'LCK', 'CD19', 
'BAG6', 'BLK'

In [12]:
## Lincs 1000
# Disease signatures from the CREED directory: df0 holds the expression
# table (one column per signature id), df1 the signature metadata.
df0 = pd.read_csv('../../../data/CREED/Disease_entire_exp_landmark.tsv.gz', sep='\t')
df1 = pd.read_csv('../../../data/CREED/Disease_signatures_info_data.tsv', sep='\t')

# Ids of the signatures whose disease name satisfies the search criterion.
ind = []
for d0, d1 in zip(df1.id, df1.disease_name):
    if creed_search_criteria(d1):
        ind.append(d0)

# One row per selected signature, each scaled to unit Euclidean length.
selected_signatures = df0[ind].values.T
row_lengths = np.linalg.norm(selected_signatures, axis=1, keepdims=True)
dis = selected_signatures / row_lengths
print(dis.shape)

(7, 978)


In [13]:
# Reference signatures: stack the 'chdirLm' vector of every record into one
# matrix (one row per record).
# NOTE: pickle.load executes arbitrary code; only use with trusted files.
with open('CD_signatures/REF_lincs_l1000.p', 'rb') as ref_handle:
    res = pickle.load(ref_handle)

cd_ref = np.vstack([entry[u'chdirLm'] for entry in res])
# Scale each row in place by its Euclidean length; the 1e-5 offset keeps the
# division finite for all-zero rows.
row_lengths = np.sqrt((cd_ref ** 2).sum(axis=1))
cd_ref /= (row_lengths + 1e-5).reshape(-1, 1)

# Phenotype-specific signatures passed to lincs_match as `res_shrna`.
with open(lincs_file, 'rb') as shrna_handle:
    res_shrna = pickle.load(shrna_handle)

In [14]:
# LINCS comparison for the MetaXcan-selected genes.
lincs_res_meta = lincs_match(sgenes, res_shrna, dis, cd_ref, alpha_sig=0.05)
# A gene is a hit when the first element of its result entry is positive.
lincs_hit_meta = {gene for gene, result in lincs_res_meta.items() if result[0] > 0}
n1 = len(lincs_hit_meta)
# Denominator: every gene for which lincs_match returned an entry.
n0 = len(lincs_res_meta)
# Hit count, evaluated genes, hit fraction and its confidence interval.
print(n1, n0, n1 / float(n0), proportion_confint(n1, n0))

(22, 93, 0.23655913978494625, (0.1501888885343538, 0.3229293910355387))


In [15]:
# LINCS comparison for the driver genes (same procedure as for the
# MetaXcan-selected genes, applied to `drivers`).
lincs_res_gc = lincs_match(drivers, res_shrna, dis, cd_ref, alpha_sig=0.05)
# A gene is a hit when the first element of its result entry is positive.
lincs_hit_gc = {gene for gene, result in lincs_res_gc.items() if result[0] > 0}
n1 = len(lincs_hit_gc)
# Denominator: every gene for which lincs_match returned an entry.
n0 = len(lincs_res_gc)
# Hit count, evaluated genes, hit fraction and its confidence interval.
print(n1, n0, n1 / float(n0), proportion_confint(n1, n0))

(17, 52, 0.3269230769230769, (0.1994256094147095, 0.45442054443144436))


In [16]:
# Alphabetically ordered list of every gene selected by either method; it
# fixes the row order of the final table.
id2gene = sorted(drivers.union(sgenes))

In [17]:
# Pool the results of both selection methods.
# Genes (from either selection) that have an entry in hg2mg.
mgi_genes = {gene for gene in drivers.union(sgenes) if gene in hg2mg}
# Genes flagged as MGI hits by either run.
mgi_hits = mgi_hit_gc.union(mgi_hit_meta)
# Genes for which either LINCS run returned an entry.
lincs_genes = set(lincs_res_gc).union(lincs_res_meta)
# Genes flagged as LINCS hits by either run.
lincs_hits = lincs_hit_gc.union(lincs_hit_meta)

In [18]:
# Build a 0/1 membership table: one row per gene in id2gene, one column per
# gene set, in the order listed here.
membership_sets = [drivers, sgenes, lincs_hits, lincs_genes, mgi_hits, mgi_genes]
indicator_rows = [
    [int(gene in gene_set) for gene in id2gene]
    for gene_set in membership_sets
]
# Rows were built per set, so transpose to get genes along the rows.
df_res = pd.DataFrame(indicator_rows).T

In [19]:
# Label the table and write it out. The column order must match the order in
# which the gene sets were stacked when df_res was built.
column_labels = [
    'genes_selected_by_GCA+BNs',
    'genes_selected_by_metaxcan',
    'LINCS_hit',
    'genes_found_in_LINCS',
    'MGI_hit',
    'genes_found_in_MGI',
]
df_res.index = id2gene
df_res.columns = column_labels

df_res.to_csv(fn_out)