In [1]:
import pandas as pd
import torch
import tensorqtl
import numpy as np
from tensorqtl import genotypeio, cis, trans
from sklearn import preprocessing
from sklearn.preprocessing import quantile_transform
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm




In [47]:
genotype_df = pd.read_csv("../../data/trans_qtl_calling/gtex/genotypes_filtered/Vagina.GOBP_TISSUE_DEVELOPMENT.tsv.gz", 
                          sep="\t", index_col='snp')

In [48]:
genotype_df

Unnamed: 0_level_0,GTEX-113JC,GTEX-11DXX,GTEX-11EM3,GTEX-11EMC,GTEX-11GSP,GTEX-11I78,GTEX-11P81,GTEX-11TTK,GTEX-11VI4,GTEX-11XUK,...,GTEX-ZP4G,GTEX-ZQG8,GTEX-ZTPG,GTEX-ZV6S,GTEX-ZVT2,GTEX-ZVT3,GTEX-ZXES,GTEX-ZYVF,GTEX-ZYY3,GTEX-ZZPU
snp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chr1_9919625_C_T_b38,0.531717,0.531717,0.531717,0.531717,0.531717,0.531717,0.531717,-1.536070,0.531717,-1.536070,...,0.531717,0.531717,-1.536070,0.531717,0.531717,0.531717,0.531717,-1.536070,0.531717,0.531717
chr1_9932244_A_C_b38,0.520278,0.520278,0.520278,0.520278,0.520278,0.520278,0.520278,-1.347385,0.520278,-1.347385,...,0.520278,0.520278,-1.347385,0.520278,0.520278,0.520278,0.520278,-1.347385,0.520278,0.520278
chr1_9934281_A_G_b38,0.531717,0.531717,0.531717,0.531717,0.531717,0.531717,0.531717,-1.536070,0.531717,-1.536070,...,0.531717,0.531717,-1.536070,0.531717,0.531717,0.531717,0.531717,-1.536070,0.531717,0.531717
chr1_9940342_T_A_b38,0.509803,0.509803,0.509803,0.509803,0.509803,0.509803,0.509803,-1.368419,0.509803,-1.368419,...,0.509803,0.509803,-1.368419,0.509803,0.509803,0.509803,0.509803,-1.368419,0.509803,0.509803
chr1_9944824_C_T_b38,0.488223,0.488223,0.488223,0.488223,0.488223,0.488223,0.488223,-1.647751,0.488223,-1.647751,...,0.488223,0.488223,-1.647751,0.488223,0.488223,0.488223,0.488223,-1.647751,0.488223,0.488223
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chr20_14358157_G_T_b38,1.024173,-0.438931,-0.438931,1.024173,-0.438931,-0.438931,-1.902035,-1.902035,-0.438931,1.024173,...,-0.438931,1.024173,1.024173,1.024173,1.024173,-0.438931,-0.438931,-0.438931,-1.902035,-0.438931
chr20_14360685_A_T_b38,1.024173,-0.438931,-0.438931,1.024173,-0.438931,-0.438931,-1.902035,-1.902035,-0.438931,1.024173,...,-0.438931,1.024173,1.024173,1.024173,1.024173,-0.438931,-0.438931,-0.438931,-1.902035,-0.438931
chr22_36992814_G_A_b38,1.709592,-0.466252,-0.466252,-0.466252,-0.466252,-0.466252,-0.466252,-0.466252,-0.466252,-0.466252,...,-0.466252,1.709592,-0.466252,-0.466252,-0.466252,-0.466252,-0.466252,1.709592,1.709592,-0.466252
chr22_46543361_T_C_b38,-0.096275,1.401330,-0.096275,-1.593879,-1.593879,1.401330,-0.096275,-0.096275,-1.593879,1.401330,...,-1.593879,-1.593879,-0.096275,-0.096275,-0.096275,-1.593879,1.401330,-1.593879,-0.096275,1.401330


In [5]:
plink_prefix_path = "../../data/trans_qtl_calling/gtex/genotypes_filtered/plink.Vagina.GOBP_TISSUE_DEVELOPMENT"
proportions_loc = "../../data/trans_qtl_calling/gtex/celltype_proportions/proportions-Vagina.txt"
covariates_file = "../../data/trans_qtl_calling/gtex/covariates/Vagina.GOBP_TISSUE_DEVELOPMENT.supervised_surrogate_variables.txt"
other_covariates_file = "

In [3]:
# Load genotypes
pr = genotypeio.PlinkReader(plink_prefix_path)
genotype_df = pr.load_genotypes()
variant_df = pr.bim.set_index('snp')[['chrom', 'pos']]



Mapping files: 100%|██████████| 3/3 [00:00<00:00, 33.45it/s]


In [17]:
# Load covariates
covariates_df = pd.read_csv(covariates_file, sep='\t', index_col=0)

# Filter to the donors for whom we have expression + genotype data
donors_inc = genotype_df.columns.intersection(covariates_df.columns)

# Load phenotypes, filter to the donors included and cell types with median score of at least 0.1
phenotype_df = pd.read_csv(proportions_loc, sep="\t").set_index('cell_type')
phenotype_df = phenotype_df.loc[phenotype_df.median(axis=1) > 0.1, donors_inc]

In [18]:
phenotype_df

Unnamed: 0_level_0,GTEX-113JC,GTEX-11DXX,GTEX-11EM3,GTEX-11EMC,GTEX-11GSP,GTEX-11I78,GTEX-11P81,GTEX-11TTK,GTEX-11VI4,GTEX-11XUK,...,GTEX-ZP4G,GTEX-ZQG8,GTEX-ZTPG,GTEX-ZV6S,GTEX-ZVT2,GTEX-ZVT3,GTEX-ZXES,GTEX-ZYVF,GTEX-ZYY3,GTEX-ZZPU
cell_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Epithelial_cells,0.793,0.9287,0.0413,0.8718,0.0339,1.0438,0.8742,0.9934,0.9706,0.9993,...,0.8869,0.0659,0.8879,0.0806,0.8388,0.1999,0.9556,0.3423,0.8462,1.0362
Keratinocytes,0.3034,0.3138,0.0,0.3832,0.0,0.4367,0.3861,0.404,0.3983,0.4101,...,0.3736,0.0041,0.3334,0.0013,0.3402,0.0407,0.4022,0.0785,0.3192,0.4406


In [19]:
# Quantile normalize the phenotypes
phenotype_df_norm = quantile_transform(phenotype_df, axis=1, output_distribution='normal')
phenotype_df_norm = pd.DataFrame(phenotype_df_norm, index=phenotype_df.index, columns=phenotype_df.columns)



In [21]:
covariates_df

Unnamed: 0_level_0,SV,GTEX-113JC,GTEX-11DXX,GTEX-11EM3,GTEX-11EMC,GTEX-11GSP,GTEX-11I78,GTEX-11P81,GTEX-11TTK,GTEX-11VI4,...,GTEX-ZP4G,GTEX-ZQG8,GTEX-ZTPG,GTEX-ZV6S,GTEX-ZVT2,GTEX-ZVT3,GTEX-ZXES,GTEX-ZYVF,GTEX-ZYY3,GTEX-ZZPU
SNP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chr1_9919625_C_T_b38,SV1,0.038587,-0.039670,0.105410,-0.053653,0.132816,-0.156442,-0.056175,-0.061968,-0.073499,...,-0.052557,0.091624,-0.043430,0.088830,-0.060455,0.131354,-0.095262,0.089424,0.021594,-0.193784
chr1_9919625_C_T_b38,SV2,0.138770,-0.050425,-0.134885,0.059502,0.050024,0.000083,0.082671,0.220010,-0.051829,...,-0.081541,-0.170614,-0.049160,-0.094879,-0.071630,0.121075,-0.029765,0.036436,0.105361,-0.029743
chr1_9919625_C_T_b38,SV3,-0.090111,0.125861,-0.050761,-0.036838,-0.044150,-0.035638,0.021395,-0.058289,0.062501,...,0.014255,-0.043997,-0.041007,-0.035784,0.090860,-0.006237,-0.008421,-0.153538,0.004387,0.027481
chr1_9919625_C_T_b38,SV4,0.028608,0.130073,-0.085788,0.031414,-0.087025,-0.054375,-0.019131,0.071823,-0.020164,...,0.052177,0.076382,-0.047688,0.038705,0.031387,0.049561,-0.195894,-0.028858,-0.024465,-0.039247
chr1_9919625_C_T_b38,SV5,-0.003912,0.140435,-0.089580,0.071391,0.083646,-0.088467,-0.061425,-0.121552,-0.030867,...,0.019518,-0.116345,0.152404,0.002726,0.069022,-0.036733,-0.145103,0.162525,0.006609,-0.061220
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chr22_46572159_T_C_b38,SV17,-0.092581,-0.024967,0.089723,0.060059,0.004937,-0.042914,-0.009192,-0.015913,-0.080479,...,0.027472,-0.038047,0.045054,-0.039964,-0.083676,-0.129791,-0.073323,0.053011,0.057591,-0.251469
chr22_46572159_T_C_b38,SV18,0.033971,0.153969,-0.056824,-0.054989,-0.188374,-0.025801,0.024387,-0.194183,-0.055419,...,0.004834,-0.050135,-0.118679,0.102540,0.115837,-0.070005,0.030836,-0.000598,-0.001644,-0.040801
chr22_46572159_T_C_b38,SV19,-0.097723,0.000990,0.051396,-0.187566,0.067813,0.126685,-0.013566,0.116059,-0.050961,...,0.016721,0.019348,-0.028593,-0.006873,-0.087005,-0.076171,-0.037504,0.054156,-0.054959,-0.066190
chr22_46572159_T_C_b38,SV20,-0.045642,-0.052407,0.075512,0.072122,-0.153264,-0.056789,-0.027060,0.116842,0.047655,...,-0.039039,-0.000896,0.060139,0.026911,0.035450,-0.084432,-0.011720,-0.042957,0.041175,-0.233448


In [None]:
# assert np.all(phenotype_df.columns==covariates_df.index)
# assert covariates_df.index.isin(genotype_df.columns).all()

# Perform cell type proportion QTL calling
trans_df = trans.map_trans(genotype_df, phenotype_df_norm, covariates_df,
                           pval_threshold=1, maf_threshold=0.05,
                           batch_size=20000)

In [25]:
genotype_df

iid,GTEX-113JC,GTEX-11DXX,GTEX-11EM3,GTEX-11EMC,GTEX-11GSP,GTEX-11I78,GTEX-11P81,GTEX-11TTK,GTEX-11VI4,GTEX-11XUK,...,GTEX-ZP4G,GTEX-ZQG8,GTEX-ZTPG,GTEX-ZV6S,GTEX-ZVT2,GTEX-ZVT3,GTEX-ZXES,GTEX-ZYVF,GTEX-ZYY3,GTEX-ZZPU
snp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chr1_9919625_C_T_b38,2,2,2,2,2,2,2,1,2,1,...,2,2,1,2,2,2,2,1,2,2
chr1_9932244_A_C_b38,2,2,2,2,2,2,2,1,2,1,...,2,2,1,2,2,2,2,1,2,2
chr1_9934281_A_G_b38,2,2,2,2,2,2,2,1,2,1,...,2,2,1,2,2,2,2,1,2,2
chr1_9940342_T_A_b38,2,2,2,2,2,2,2,1,2,1,...,2,2,1,2,2,2,2,1,2,2
chr1_9944824_C_T_b38,2,2,2,2,2,2,2,1,2,1,...,2,2,1,2,2,2,2,1,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chr20_14358157_G_T_b38,2,1,1,2,1,1,0,0,1,2,...,1,2,2,2,2,1,1,1,0,1
chr20_14360685_A_T_b38,2,1,1,2,1,1,0,0,1,2,...,1,2,2,2,2,1,1,1,0,1
chr22_36992814_G_A_b38,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,1,0
chr22_46543361_T_C_b38,1,2,1,0,0,2,1,1,0,2,...,0,0,1,1,1,0,2,0,1,2


In [28]:
cov_df.index

Index(['SV1', 'SV2', 'SV3', 'SV4', 'SV5', 'SV6', 'SV7', 'SV8', 'SV9', 'SV10',
       'SV11', 'SV12', 'SV13', 'SV14', 'SV15', 'SV16', 'SV17', 'SV18', 'SV19',
       'SV20', 'SV21'],
      dtype='object', name='SV')

In [30]:
s = 'chr1_9919625_C_T_b38'
geno_df = genotype_df.loc[genotype_df.index == s, :]
cov_df = covariates_df.loc[covariates_df.index == s, :].set_index('SV').T
trans.map_trans(geno_df, phenotype_df_norm, cov_df,
                pval_threshold=1, maf_threshold=0.05,
                batch_size=20000)

trans-QTL mapping
  * 140 samples
  * 2 phenotypes
  * 21 covariates
  * 1 variants
    processing batch 1/1
    elapsed time: 0.00 min
  * 1 variants passed MAF >= 0.05 filtering
done.


Unnamed: 0,variant_id,phenotype_id,pval,b,b_se,af
0,chr1_9919625_C_T_b38,Epithelial_cells,0.340832,0.087145,0.091116,0.871429
1,chr1_9919625_C_T_b38,Keratinocytes,0.017541,0.442265,0.183564,0.871429


In [32]:
qtl_all = []
for s in covariates_df.index.unique().tolist():
    geno_df = genotype_df.loc[genotype_df.index == s, :]
    cov_df = covariates_df.loc[covariates_df.index == s, :].set_index('SV').T
    qtl_all.append(trans.map_trans(geno_df, phenotype_df_norm, cov_df,
                                   pval_threshold=1, maf_threshold=0.05))
ct_qtls = pd.concat(qtl_all)

trans-QTL mapping
  * 140 samples
  * 2 phenotypes
  * 21 covariates
  * 1 variants
    processing batch 1/1
    elapsed time: 0.00 min
  * 1 variants passed MAF >= 0.05 filtering
done.
trans-QTL mapping
  * 140 samples
  * 2 phenotypes
  * 21 covariates
  * 1 variants
    processing batch 1/1
    elapsed time: 0.00 min
  * 1 variants passed MAF >= 0.05 filtering
done.
trans-QTL mapping
  * 140 samples
  * 2 phenotypes
  * 21 covariates
  * 1 variants
    processing batch 1/1
    elapsed time: 0.00 min
  * 1 variants passed MAF >= 0.05 filtering
done.
trans-QTL mapping
  * 140 samples
  * 2 phenotypes
  * 21 covariates
  * 1 variants
    processing batch 1/1
    elapsed time: 0.00 min
  * 1 variants passed MAF >= 0.05 filtering
done.
trans-QTL mapping
  * 140 samples
  * 2 phenotypes
  * 21 covariates
  * 1 variants
    processing batch 1/1
    elapsed time: 0.00 min
  * 1 variants passed MAF >= 0.05 filtering
done.
trans-QTL mapping
  * 140 samples
  * 2 phenotypes
  * 21 covariates
 

In [39]:
trans_df = pd.concat(qtl_all)
trans_df

Unnamed: 0,variant_id,phenotype_id,pval,b,b_se,af
0,chr1_9919625_C_T_b38,Epithelial_cells,0.340832,0.087145,0.091116,0.871429
1,chr1_9919625_C_T_b38,Keratinocytes,0.017541,0.442265,0.183564,0.871429
0,chr1_9932244_A_C_b38,Epithelial_cells,0.364105,0.083052,0.091154,0.870504
1,chr1_9932244_A_C_b38,Keratinocytes,0.020111,0.431131,0.182955,0.870504
0,chr1_9934281_A_G_b38,Epithelial_cells,0.340832,0.087145,0.091116,0.871429
...,...,...,...,...,...,...
1,chr22_36992814_G_A_b38,Keratinocytes,0.156314,0.304195,0.213208,0.115942
0,chr22_46543361_T_C_b38,Epithelial_cells,0.377322,-0.059597,0.067249,0.539568
1,chr22_46543361_T_C_b38,Keratinocytes,0.725437,-0.048548,0.137902,0.539568
0,chr22_46572159_T_C_b38,Epithelial_cells,0.164198,0.113429,0.081028,0.796429
