# csaQTL testing in datasets with suggestive trans-eGenes removed

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle, cna
import scipy.stats as st

In [2]:
# Load the table of lead csaQTL loci. The first row is skipped because the
# myeloid csaQTL has no trans-eQTL candidates; the remaining rows are
# renumbered from 0 so they can be addressed by position downstream.
loci_path = "/data/srlab/lrumker/MCSC_Project/cna-qtl/results/loci_table.tsv"
loci = (
    pd.read_csv(loci_path, sep="\t")
    .iloc[1:, :]
    .reset_index(drop=True)
)
loci

Unnamed: 0,celltype,#CHROM,POS,ID,REF,ALT,A1,TEST,OBS_CT,P,...,CHR,MAF,Cell Type,SNP,Effect Allele,Other Allele,rsID,Cyto. Band,Phenotype Annotation,Novel
0,NK,2,111851212,2:111851212:C:T,C,T,T,ADD,935,1.764828e-09,...,2,0.21504,NK,2:111851212:C:T,T,C,rs13025330,2q13,Decrease in CD56bright \% NK cells,No
1,NK,11,128070535,11:128070535:A:G,A,G,G,ADD,935,2.480238e-13,...,11,0.27872,NK,11:128070535:A:G,G,A,rs519062,11q24.3,Increase in NK cells activated by TNF-$\alpha$...,Yes
2,NK,12,10583611,12:10583611:C:T,C,T,C,ADD,935,1.95608e-11,...,12,0.17811,NK,12:10583611:C:T,T,C,rs3003,12p13.2,Increase in NK cells activated by TNF-$\alpha$...,Yes
3,NK,19,16441973,19:16441973:G:A,G,A,A,ADD,935,1.958433e-13,...,19,0.32867,NK,19:16441973:G:A,A,G,rs56133626,19p13.11,Increase in NK cells activated by TNF-$\alpha$...,Yes


In [3]:
def _load_pickle(path):
    """Return the object unpickled from `path`, closing the file afterwards.

    NOTE(review): unpickling can execute arbitrary code, so this must only be
    pointed at trusted, locally generated result files.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# For every lead SNP in `loci`, compare the original csaQTL result with the
# result obtained after the candidate trans-eGenes were removed ("masked"),
# and collect one summary row per locus.
dep_table_columns = ["Celltype", "SNP", "Cis Vargenes", "Expr Cor (R)", "%ile Expr Cor (Rsq)",
                     "Rsq Pheno, Nbhds", "Rsq Pheno, Samples", "GWAS P", "P, Masked"]
records = []
for i_locus in range(loci.shape[0]):
    locus = loci.iloc[i_locus, :]
    celltype = locus['celltype']
    lead_snp = locus['ID']
    gwas_p = '{:.2e}'.format(locus.P)

    # P-value of the lead SNP in the summary statistics of the masked analysis
    masked_locus = pd.read_table('/data/srlab/lrumker/MCSC_Project/cna-qtl/mask_trans_eGenes/'+lead_snp+\
                                 "/GeNA_sumstats.txt")
    masked_p = '{:.2e}'.format(masked_locus.loc[masked_locus.ID==lead_snp].P.values[0])

    # load data (file handles are closed by _load_pickle)
    res = _load_pickle('/data/srlab/lrumker/MCSC_Project/cna-qtl/results/gwas_'+\
                       celltype+"/cna_res_"+lead_snp+'.p')
    res_mask = _load_pickle('/data/srlab/lrumker/MCSC_Project/cna-qtl/mask_trans_eGenes/'+\
                            celltype+'_'+lead_snp+'_mask_trans_eGenes.p')
    vargene_cors = res.vargene_cors
    vargene_cors_mask = res_mask.vargene_cors

    # trans eGenes removed: genes present in the original result but absent
    # from the masked result
    is_dropped = ~vargene_cors['gene'].isin(vargene_cors_mask['gene']).values
    dropped_genes = vargene_cors['gene'].values[is_dropped]

    # expression change in these genes R
    dropped_cors = vargene_cors['cor'].values[is_dropped]

    # percentile of R^2 among all genes: fraction of all variable genes whose
    # squared correlation is strictly smaller than the dropped gene's
    rsq_all_vargenes = vargene_cors['cor'].values**2
    n_vargenes = rsq_all_vargenes.shape[0]
    pctile_rsq = np.around([np.sum(rsq_all_vargenes < r**2)/n_vargenes for r in dropped_cors], 2)

    # sort genes by decreasing percentile
    sel_order = np.argsort(-pctile_rsq)
    dropped_genes_str = ", ".join(dropped_genes[sel_order])
    dropped_cors_str = ", ".join(np.around(dropped_cors[sel_order],3).astype(str))
    # Round before the integer cast: a plain astype(int) truncates, so e.g.
    # 0.29*100 == 28.999999999999996 would be reported as 28.
    pctile_rsq_str = ", ".join(np.rint(pctile_rsq[sel_order]*100).astype(int).astype(str))

    # correlation in pre/post vargene correlations to ncorrs (retained genes).
    # Indexed copies are used so the loaded result objects are not mutated.
    cors_by_gene = vargene_cors.set_index('gene')['cor']
    cors_by_gene_mask = vargene_cors_mask.set_index('gene')['cor']
    Rsq_nbhd = np.around(np.corrcoef(cors_by_gene.loc[cors_by_gene_mask.index],
                                     cors_by_gene_mask)[0,1]**2,2)

    # correlation in pre/post phenotypes, per individual, Rsq
    d = cna.read("/data/srlab/lrumker/datasets/onek1k/pheno/"+celltype+".h5ad")
    d_mc = cna.read("/data/srlab/lrumker/MCSC_Project/cna-qtl/mask_trans_eGenes/masked_"+\
                    celltype+"_"+lead_snp+".h5ad")

    # Per-sample phenotype = first k columns of NAM_sampleXpc weighted by beta.
    # NOTE(review): assumes both objects list samples in the same order — confirm.
    nampcs = d.uns['NAM_sampleXpc'].iloc[:,:res.k]
    pheno = np.dot(res.beta.reshape(1,-1), nampcs.T).reshape(-1,)
    nampcs = d_mc.uns['NAM_sampleXpc'].iloc[:,:res_mask.k]
    pheno_mask = np.dot(nampcs, res_mask.beta)
    Rsq_sample = np.around(np.corrcoef(pheno, pheno_mask)[0,1]**2,2)

    records.append({"Celltype": celltype, "SNP": lead_snp, "Cis Vargenes": dropped_genes_str,
                    "Expr Cor (R)": dropped_cors_str, "%ile Expr Cor (Rsq)": pctile_rsq_str,
                    "Rsq Pheno, Nbhds": Rsq_nbhd, "Rsq Pheno, Samples": Rsq_sample,
                    "GWAS P": gwas_p, "P, Masked": masked_p})

# Build the table once from the collected rows; this avoids quadratic
# re-concatenation and gives each row a unique index label (0..n-1).
eQTL_dep_table = pd.DataFrame(records, columns=dep_table_columns)

In [4]:
# No candidate trans-eQTLs detected for the myeloid csaQTL
# For each locus, look up the pseudobulk eQTL P-value of the lead SNP for every
# listed gene and store them as one comma-separated string, in the same gene order.
eQTL_pvals=[]
for celltype, lead_snp, genes_str in zip(eQTL_dep_table['Celltype'].values,
                                         eQTL_dep_table['SNP'].values,
                                         eQTL_dep_table['Cis Vargenes'].values):
    genes = genes_str.split(", ") # This column label is later replaced with Trans-eGenes

    # Note that not all vargenes passed PEER QC for pseudobulk eQTL testing
    # (named eqtl_res so the CNA result object `res` is not overwritten)
    eqtl_res = pd.read_csv("/data/srlab/lrumker/MCSC_Project/cna-qtl/eqtls/results/trans_eQTL_candidates/pseudobulk/"+
                  celltype+"_"+lead_snp+"_csaQTL_test_eQTLs_pseudobulk_eQTLs.csv", index_col = 0)

    # Keep only rows for the lead SNP and the requested genes.
    is_requested = (eqtl_res['variant'].values==lead_snp) & eqtl_res['gene'].isin(genes).values
    pvals_by_gene = eqtl_res.loc[is_requested,:].set_index('gene')['p.val']

    # reindex (unlike .loc with a list) yields NaN for genes that were not
    # tested instead of raising KeyError; such genes are reported as "nan".
    # It raises if a requested gene has more than one row for this SNP.
    new_pvals = pvals_by_gene.reindex(genes).values
    eQTL_pvals.append(", ".join('{:.2e}'.format(p) for p in new_pvals))

eQTL_dep_table['eQTL P, Pseudobulk'] = eQTL_pvals

In [5]:
# Persist the summary table as a tab-separated file (row index omitted),
# then display it.
res_folder = "/data/srlab/lrumker/MCSC_Project/cna-qtl/results/"
dep_table_path = res_folder + "trans_eQTL_dep_table.tsv"
eQTL_dep_table.to_csv(dep_table_path, sep="\t", index=False)
eQTL_dep_table

Unnamed: 0,Celltype,SNP,Cis Vargenes,Expr Cor (R),%ile Expr Cor (Rsq),"Rsq Pheno, Nbhds","Rsq Pheno, Samples",GWAS P,"P, Masked","eQTL P, Pseudobulk"
0,NK,2:111851212:C:T,GZMK,-0.525,100,1.0,1.0,1.76e-09,6.7e-09,3.14e-06
0,NK,11:128070535:A:G,"PRSS23, TMIGD2, CNFN","-0.204, -0.114, -0.106","100, 98, 97",1.0,0.99,2.48e-13,2.63e-12,"1.80e-08, 5.17e-06, 3.81e-05"
0,NK,12:10583611:C:T,CD52,-0.204,100,0.99,0.95,1.96e-11,4.67e-10,2.42e-05
0,NK,19:16441973:G:A,"PRSS23, SH3BGRL3, CD52, ZNF683, CNFN, ANXA1, C...","-0.28, -0.239, -0.223, -0.244, -0.118, -0.107,...","100, 99, 99, 99, 97, 96, 94",0.99,0.93,1.96e-13,3.53e-11,"3.20e-07, 6.17e-07, 3.63e-05, 3.40e-09, 5.82e-..."


In [6]:
# Format results for exported table
res_folder="/data/srlab/lrumker/MCSC_Project/cna-qtl/results/"

# Read every pre-formatted column as text. Letting pandas parse the P-value
# columns as floats and converting back with str() drops the '{:.2e}'
# formatting (e.g. "6.70e-09" becomes "6.7e-09"); the list-valued columns would
# likewise turn numeric whenever every locus happens to have a single gene.
text_columns = ['Cis Vargenes', 'Expr Cor (R)', '%ile Expr Cor (Rsq)',
                'eQTL P, Pseudobulk', 'GWAS P', 'P, Masked']
eQTL_dep_table = pd.read_csv(res_folder+"trans_eQTL_dep_table.tsv", sep = "\t",
                             dtype={col: str for col in text_columns})

In [7]:
def _italicize_genes(cell):
    r"""Wrap each gene of a comma-separated gene list in LaTeX \textit{...}.

    Gene names are stripped of surrounding whitespace so that the separator
    space does not end up inside the italics (i.e. "\textit{ TMIGD2}").
    Non-string cells (e.g. NaN for an empty list) are returned unchanged.
    """
    if not isinstance(cell, str):
        return cell
    return ", ".join(r"\textit{" + gene.strip() + r"}" for gene in cell.split(","))


# NOTE: not idempotent — re-running this cell wraps the names a second time.
eQTL_dep_table['Cis Vargenes'] = [_italicize_genes(cell)
                                  for cell in eQTL_dep_table['Cis Vargenes'].values]

In [8]:
def _shortstack(cell):
    r"""Stack a ", "-separated list vertically within one LaTeX table cell.

    "a, bb, c" becomes "\shortstack{a \\ bb \\ c}". Cells holding a single
    entry, and non-string cells (NaN or numeric values), are returned unchanged.
    """
    if not isinstance(cell, str):
        return cell
    entries = cell.split(", ")
    if len(entries) == 1:
        return cell
    return r"\shortstack{" + r' \\ '.join(entries) + "}"


# \shortstack{a \\ bb \\ c} for vertical stacking within one table cell
for col in ['%ile Expr Cor (Rsq)', 'Expr Cor (R)', 'Cis Vargenes', 'eQTL P, Pseudobulk']:
    eQTL_dep_table[col] = [_shortstack(cell) for cell in eQTL_dep_table[col].values]

# Rename columns to their publication labels. Renaming by name (rather than by
# position) cannot mislabel a column if the column order ever changes.
eQTL_dep_table = eQTL_dep_table.rename(columns={
    'Celltype': 'Cell Type',
    'SNP': 'CHR:POS',
    'Cis Vargenes': 'Trans-eGenes',
    'Expr Cor (R)': 'Expr Cor (r)',
    '%ile Expr Cor (Rsq)': r'\%tile Expr Cor (r$^{2}$)',
    'Rsq Pheno, Nbhds': 'r$^{2}$ Pheno, Nbhds',
    'Rsq Pheno, Samples': 'r$^{2}$ Pheno, Samples',
})
savenames = list(eQTL_dep_table.columns)

# Keep only the first two ":"-separated fields of the SNP ID (CHR:POS:REF:ALT -> CHR:POS)
eQTL_dep_table['CHR:POS'] = [":".join(snp_id.split(":")[:2])
                             for snp_id in eQTL_dep_table['CHR:POS'].values]

In [9]:
# Keep only the columns that go into the exported table, in display order.
export_columns = [
    'Cell Type',
    'CHR:POS',
    'Trans-eGenes',
    'eQTL P, Pseudobulk',
    'Expr Cor (r)',
    r'\%tile Expr Cor (r$^{2}$)',
    'r$^{2}$ Pheno, Nbhds',
    'r$^{2}$ Pheno, Samples',
    'GWAS P',
    'P, Masked',
]
eQTL_dep_table = eQTL_dep_table[export_columns]
eQTL_dep_table

Unnamed: 0,Cell Type,CHR:POS,Trans-eGenes,"eQTL P, Pseudobulk",Expr Cor (r),\%tile Expr Cor (r$^{2}$),"r$^{2}$ Pheno, Nbhds","r$^{2}$ Pheno, Samples",GWAS P,"P, Masked"
0,NK,2:111851212,\textit{GZMK},3.14e-06,-0.525,100,1.0,1.0,1.76e-09,6.7e-09
1,NK,11:128070535,\shortstack{\textit{PRSS23} \\ \textit{ TMIGD2...,\shortstack{1.80e-08 \\ 5.17e-06 \\ 3.81e-05},\shortstack{-0.204 \\ -0.114 \\ -0.106},\shortstack{100 \\ 98 \\ 97},1.0,0.99,2.48e-13,2.63e-12
2,NK,12:10583611,\textit{CD52},2.42e-05,-0.204,100,0.99,0.95,1.96e-11,4.67e-10
3,NK,19:16441973,\shortstack{\textit{PRSS23} \\ \textit{ SH3BGR...,\shortstack{3.20e-07 \\ 6.17e-07 \\ 3.63e-05 \...,\shortstack{-0.28 \\ -0.239 \\ -0.223 \\ -0.24...,\shortstack{100 \\ 99 \\ 99 \\ 99 \\ 97 \\ 96 ...,0.99,0.93,1.96e-13,3.53e-11


In [10]:
# Show full cell contents when the table is displayed below (no truncation).
pd.set_option('display.max_colwidth', None)

# Render the table as LaTeX; escape=False keeps the \textit / \shortstack / $...$
# markup in the cells and headers verbatim.
table_latex = eQTL_dep_table.to_latex(index = False,  escape=False,
              column_format='p{0.7cm}|p{1.8cm}|p{1.7cm}|p{1.2cm}|p{1cm}|p{1cm}|p{1cm}|p{1cm}|p{1.3cm}|p{1.3cm}')

# add lines between rows: every backslash that ends a line becomes "\ \hline".
# The backslashes are escaped explicitly; the previous "\h" was an invalid
# escape sequence (SyntaxWarning on Python 3.12+) that yielded the same text.
table_latex = table_latex.replace("\\\n", "\\ \\hline\n")
with open('/data/srlab/lrumker/MCSC_Project/cna-qtl/tables/supptable.trans_masked.tex','w') as tf:
    tf.write(table_latex)
eQTL_dep_table

Unnamed: 0,Cell Type,CHR:POS,Trans-eGenes,"eQTL P, Pseudobulk",Expr Cor (r),\%tile Expr Cor (r$^{2}$),"r$^{2}$ Pheno, Nbhds","r$^{2}$ Pheno, Samples",GWAS P,"P, Masked"
0,NK,2:111851212,\textit{GZMK},3.14e-06,-0.525,100,1.0,1.0,1.76e-09,6.7e-09
1,NK,11:128070535,\shortstack{\textit{PRSS23} \\ \textit{ TMIGD2} \\ \textit{ CNFN}},\shortstack{1.80e-08 \\ 5.17e-06 \\ 3.81e-05},\shortstack{-0.204 \\ -0.114 \\ -0.106},\shortstack{100 \\ 98 \\ 97},1.0,0.99,2.48e-13,2.63e-12
2,NK,12:10583611,\textit{CD52},2.42e-05,-0.204,100,0.99,0.95,1.96e-11,4.67e-10
3,NK,19:16441973,\shortstack{\textit{PRSS23} \\ \textit{ SH3BGRL3} \\ \textit{ CD52} \\ \textit{ ZNF683} \\ \textit{ CNFN} \\ \textit{ ANXA1} \\ \textit{ COL13A1}},\shortstack{3.20e-07 \\ 6.17e-07 \\ 3.63e-05 \\ 3.40e-09 \\ 5.82e-07 \\ 2.00e-05 \\ 1.09e-05},\shortstack{-0.28 \\ -0.239 \\ -0.223 \\ -0.244 \\ -0.118 \\ -0.107 \\ -0.082},\shortstack{100 \\ 99 \\ 99 \\ 99 \\ 97 \\ 96 \\ 94},0.99,0.93,1.96e-13,3.53e-11
