In [None]:
import pandas as pd
import numpy as np
import tensorqtl
from tensorqtl import genotypeio, cis, trans
import matplotlib.pyplot as plt

# Input locations: PLINK file-set prefix, phenotype BED file, covariates table,
# and a short prefix string for this dataset.
plink_prefix_path = 'swath-ms.01'
expression_bed = 'swath-ms.expression.bed.gz'
covariates_file = 'swath-ms.covariates.txt'
prefix = 'swath-ms'

# Phenotypes: read_phenotype_bed returns two objects, the phenotype table and
# the matching table of phenotype positions.
phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(expression_bed)

# Covariates: tab-separated file, first column used as the index, then transposed.
# NOTE(review): presumably the file is covariates x samples, so rows become
# samples after the transpose -- confirm against the input file.
covariates_df = (
    pd.read_csv(covariates_file, sep='\t', index_col=0)
    .transpose()
)

# Genotypes: load everything through the PLINK reader, and keep a variant
# lookup (chromosome and position) indexed by the 'snp' column of the .bim table.
pr = genotypeio.PlinkReader(plink_prefix_path)
genotype_df = pr.load_genotypes()
variant_df = pr.bim.set_index('snp').loc[:, ['chrom', 'pos']]

In [None]:
# Preview the first rows only. A bare last expression is rendered with the
# notebook's rich table display, instead of printing the frame as plain text.
phenotype_df.head()

### *cis*-QTL: empirical p-values for phenotypes

In [None]:
# cis mapping over all genes, i.e. every phenotype in phenotype_df (no subsetting)
cis_df = cis.map_cis(
    genotype_df,
    variant_df,
    phenotype_df,
    phenotype_pos_df,
    covariates_df,
)

In [None]:
# show the first five rows of the cis results
cis_df.head(n=5)

### *trans*-QTL mapping

In [14]:
# trans mapping of all variants against all phenotypes
# the result is kept small by returning only associations with p-value <= 1e-5;
# maf_threshold=0.05 is the MAF cutoff applied to variants
trans_df = trans.map_trans(
    genotype_df,
    phenotype_df,
    covariates_df,
    batch_size=20000,
    return_sparse=True,
    pval_threshold=1e-5,
    maf_threshold=0.05,
)

trans-QTL mapping
  * 196 samples
  * 598 phenotypes
  * 23 covariates
  * 9766504 variants
    processing batch 489/489
    elapsed time: 0.94 min
  * 6233782 variants passed MAF >= 0.05 filtering
done.


In [10]:
# Drop cis-associations from the trans results.
# The position table is handed to filter_cis as a nested dict keyed by the index
# of phenotype_pos_df (presumably phenotype IDs -- confirm), built by transposing
# the table before calling to_dict().
phenotype_pos_lookup = phenotype_pos_df.T.to_dict()
trans_df = trans.filter_cis(trans_df, phenotype_pos_lookup, variant_df, window=5000000)

In [11]:
# show the first five rows of the cis-filtered trans results
trans_df.head(n=5)

Unnamed: 0,variant_id,phenotype_id,pval,maf
0,rs12131377,ENSG00000035403,9e-06,0.053571
1,rs34260203,ENSG00000159377,5e-06,0.405612
2,rs7411115,ENSG00000197157,3e-06,0.109694
3,rs6605067,ENSG00000186081,1e-06,0.066327
4,rs2839,ENSG00000186081,3e-06,0.068878
