## effect size sharing for multi-gene qtls

say we have a qtl that impacts multiple genes. How does the fractional effect of that qtl on each gene vary? w.r.t for instance distance from qtl to tss? maybe tie in ABC connections here? Maybe build a model that predicts how qtl impact on each gene will be split based on gene baseline expression, distance, ect

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
import os
import ast
import re

import upsetplot as up
from tqdm.auto import tqdm  # for notebooks

# Create new `pandas` methods which use `tqdm` progress
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()

In [4]:
# get outputs from a config file
prefix = '/home/klawren/oak/pcqtls'
import yaml
config_path= f'{prefix}/config/new_clusters/proteincoding_rewrite.yaml'
with open(config_path, 'r') as f:
    config = yaml.safe_load(f)

tissue_id_path = config['tissue_id_path']
clusters_dir = config['clusters_dir']
eqtl_output_dir = config['eqtl_output_dir']
pcqtl_output_dir = config['pcqtl_output_dir']
overlap_output_dir = config['overlap_output_dir']


# load in the tissue ids 
tissue_df = pd.read_csv(f"{prefix}/{tissue_id_path}", header=0)
tissue_ids = list(tissue_df['Tissue'])

# select just 1 tissue id to do for now 
tissue_id = 'Cells_Cultured_fibroblasts' # because only one cell type
tissue_ids = [tissue_id]

### load in data
effect sizes for qtls
overlap dfs
gencode for tss starts

In [5]:
# load nominal p value dfs

# load in e nominal
def load_e_nominal(path):
    e_nominal_df = pd.read_parquet(path)
    e_nominal_df['cluster_id'] = e_nominal_df['phenotype_id'].str.split('_e_').str[0]
    e_nominal_df['egene_id'] = e_nominal_df['phenotype_id'].str.split('_e_').str[1]
    return e_nominal_df

e_nominal_dfs=[]
for chr_id in tqdm(range(1,23)):
    e_nominal_dfs.append(load_e_nominal(f'{prefix}/{eqtl_output_dir}/{tissue_id}/{tissue_id}.v8.cluster_genes.cis_qtl_pairs.chr{chr_id}.parquet'))

e_nominal_df = pd.concat(e_nominal_dfs)

def load_pc_nominal(path):
    pc_nominal_df = pd.read_parquet(path)
    pc_nominal_df['cluster_id'] = pc_nominal_df['phenotype_id'].str.split('_pc').str[0]
    pc_nominal_df['pc_num'] = pc_nominal_df['phenotype_id'].str.split('_pc').str[1]
    return pc_nominal_df

pc_nominal_dfs=[]
for chr_id in tqdm(range(1,23)):
    pc_nominal_dfs.append(load_pc_nominal(f'{prefix}/{pcqtl_output_dir}/{tissue_id}/{tissue_id}.v8.pcs.cis_qtl_pairs.chr{chr_id}.parquet'))
pc_nominal_df = pd.concat(pc_nominal_dfs)

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

In [6]:
overlap_df = pd.read_csv(f'{prefix}/{overlap_output_dir}/{tissue_id}.v8.overlap.txt', sep='\t')

In [7]:
# label cluster-variant pairs
overlap_df['var_cluster'] = overlap_df['lead_variant_id'] + '_' + overlap_df['cluster_id']
pc_nominal_df['var_cluster'] = pc_nominal_df['variant_id'] + '_' + pc_nominal_df['cluster_id']
e_nominal_df['var_cluster'] = e_nominal_df['variant_id'] + '_' + e_nominal_df['cluster_id']

# set e_nominal to be index based on variant-cluster pairs
e_nominal_df_cid = e_nominal_df.set_index('var_cluster')

In [8]:
# load in the susie df, so I have the pip values for the variants as well as the slopes
pc_susie_df = pd.read_csv(f'{prefix}/{pcqtl_output_dir}/{tissue_id}/{tissue_id}.v8.pcs.susie.txt', sep='\t', index_col=0)
pc_susie_df['var_cluster'] = pc_susie_df['variant_id'] + '_' + pc_susie_df['phenotype_id'].str.split('_pc1').str[0]
pc_susie_df['cs_id'] =  pc_susie_df['phenotype_id'].str.split('_pc1').str[0] + '_' + pc_susie_df['cs_id'].astype(str)
pc_susie_df['pc_num'] = pc_susie_df['phenotype_id'].str.split('_pc').str[-1].astype(int)

In [18]:
# get a subset of variants that ended up in credible sets
e_nominal_pcsusie_subset_df = e_nominal_df[e_nominal_df['var_cluster'].isin(pc_susie_df['var_cluster'])]

# I want to get a pip weighted variance for each egene
e_nominal_pcsusie_subset_df['var_egene_cluster'] = e_nominal_pcsusie_subset_df['variant_id'] + '_' + e_nominal_pcsusie_subset_df['phenotype_id']

# merge in the data from the susie finemapping
e_nominal_pcsusie_subset_df = pd.merge(e_nominal_pcsusie_subset_df, pc_susie_df[['cs_id', 'var_cluster', 'pip', 'pc_num']], on='var_cluster')

# pip weighted variance
e_nominal_pcsusie_subset_df['variance'] = e_nominal_pcsusie_subset_df['slope'].apply(np.square) * 100
e_nominal_pcsusie_subset_df['variance_weighted'] = e_nominal_pcsusie_subset_df['variance'] * e_nominal_pcsusie_subset_df['pip'] 

# group by phenotype (egene and cluster)
sum_variance_df = e_nominal_pcsusie_subset_df.groupby('phenotype_id').agg({'variance_weighted':sum, 
                                                        'cluster_id':'first', 
                                                        'variant_id':'first',
                                                        'egene_id':'first', 
                                                        'cs_id':'first'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  e_nominal_pcsusie_subset_df['var_egene_cluster'] = e_nominal_pcsusie_subset_df['variant_id'] + '_' + e_nominal_pcsusie_subset_df['phenotype_id']


In [20]:
e_nominal_pcsusie_subset_df

Unnamed: 0,phenotype_id,variant_id,start_distance,end_distance,af,ma_samples,ma_count,pval_nominal,slope,slope_se,cluster_id,egene_id,var_cluster,var_egene_cluster,cs_id,pip,pc_num,variance,variance_weighted
0,ENSG00000187583.10_ENSG00000187961.13_e_ENSG00...,chr1_966179_G_A_b38,5592,-318,0.838509,129,156,3.469805e-24,0.053869,0.004979,ENSG00000187583.10_ENSG00000187961.13,ENSG00000187961.13,chr1_966179_G_A_b38_ENSG00000187583.10_ENSG000...,chr1_966179_G_A_b38_ENSG00000187583.10_ENSG000...,ENSG00000187583.10_ENSG00000187961.13_1,0.999976,1,0.290188,0.290181
1,ENSG00000187583.10_ENSG00000187961.13_e_ENSG00...,chr1_966179_G_A_b38,5592,-318,0.838509,129,156,2.024204e-23,0.053055,0.005000,ENSG00000187583.10_ENSG00000187961.13,ENSG00000187583.10,chr1_966179_G_A_b38_ENSG00000187583.10_ENSG000...,chr1_966179_G_A_b38_ENSG00000187583.10_ENSG000...,ENSG00000187583.10_ENSG00000187961.13_1,0.999976,1,0.281480,0.281473
2,ENSG00000160072.19_ENSG00000197785.13_e_ENSG00...,chr1_1497758_C_T_b38,25989,-14393,0.010352,10,10,1.051561e-47,-0.218986,0.013205,ENSG00000160072.19_ENSG00000197785.13,ENSG00000160072.19,chr1_1497758_C_T_b38_ENSG00000160072.19_ENSG00...,chr1_1497758_C_T_b38_ENSG00000160072.19_ENSG00...,ENSG00000160072.19_ENSG00000197785.13_1,0.247937,1,4.795481,1.188976
3,ENSG00000160072.19_ENSG00000197785.13_e_ENSG00...,chr1_1497758_C_T_b38,25989,-14393,0.010352,10,10,4.022739e-01,-0.014278,0.017029,ENSG00000160072.19_ENSG00000197785.13,ENSG00000197785.13,chr1_1497758_C_T_b38_ENSG00000160072.19_ENSG00...,chr1_1497758_C_T_b38_ENSG00000160072.19_ENSG00...,ENSG00000160072.19_ENSG00000197785.13_1,0.247937,1,0.020386,0.005054
4,ENSG00000160072.19_ENSG00000197785.13_e_ENSG00...,chr1_1499000_C_A_b38,27231,-13151,0.010352,10,10,1.051561e-47,-0.218986,0.013205,ENSG00000160072.19_ENSG00000197785.13,ENSG00000160072.19,chr1_1499000_C_A_b38_ENSG00000160072.19_ENSG00...,chr1_1499000_C_A_b38_ENSG00000160072.19_ENSG00...,ENSG00000160072.19_ENSG00000197785.13_1,0.247937,1,4.795481,1.188976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32327,ENSG00000025708.13_ENSG00000177989.13_e_ENSG00...,chr22_50535371_C_G_b38,5315,2791,0.021739,20,21,4.031875e-05,-0.053479,0.012885,ENSG00000025708.13_ENSG00000177989.13,ENSG00000177989.13,chr22_50535371_C_G_b38_ENSG00000025708.13_ENSG...,chr22_50535371_C_G_b38_ENSG00000025708.13_ENSG...,ENSG00000025708.13_ENSG00000177989.13_2,0.101840,1,0.286000,0.029126
32328,ENSG00000025708.13_ENSG00000177989.13_e_ENSG00...,chr22_50540263_G_A_b38,10207,7683,0.022774,20,22,9.602827e-04,-0.040343,0.012130,ENSG00000025708.13_ENSG00000177989.13,ENSG00000025708.13,chr22_50540263_G_A_b38_ENSG00000025708.13_ENSG...,chr22_50540263_G_A_b38_ENSG00000025708.13_ENSG...,ENSG00000025708.13_ENSG00000177989.13_2,0.088869,1,0.162755,0.014464
32329,ENSG00000025708.13_ENSG00000177989.13_e_ENSG00...,chr22_50540263_G_A_b38,10207,7683,0.022774,20,22,1.981092e-05,-0.051906,0.012023,ENSG00000025708.13_ENSG00000177989.13,ENSG00000177989.13,chr22_50540263_G_A_b38_ENSG00000025708.13_ENSG...,chr22_50540263_G_A_b38_ENSG00000025708.13_ENSG...,ENSG00000025708.13_ENSG00000177989.13_2,0.088869,1,0.269419,0.023943
32330,ENSG00000025708.13_ENSG00000177989.13_e_ENSG00...,chr22_50541096_C_A_b38,11040,8516,0.021739,20,21,6.250424e-04,-0.044693,0.012966,ENSG00000025708.13_ENSG00000177989.13,ENSG00000025708.13,chr22_50541096_C_A_b38_ENSG00000025708.13_ENSG...,chr22_50541096_C_A_b38_ENSG00000025708.13_ENSG...,ENSG00000025708.13_ENSG00000177989.13_2,0.101840,1,0.199749,0.020342


In [14]:
sum_variance_df

Unnamed: 0_level_0,variance_weighted,cluster_id,variant_id
phenotype_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000001460.17_ENSG00000001461.16_e_ENSG00000001460.17,0.163261,ENSG00000001460.17_ENSG00000001461.16,chr1_24385859_T_A_b38
ENSG00000001460.17_ENSG00000001461.16_e_ENSG00000001461.16,0.243458,ENSG00000001460.17_ENSG00000001461.16,chr1_24385859_T_A_b38
ENSG00000001561.6_ENSG00000112796.9_e_ENSG00000001561.6,0.372927,ENSG00000001561.6_ENSG00000112796.9,chr6_46102670_C_T_b38
ENSG00000001561.6_ENSG00000112796.9_e_ENSG00000112796.9,0.199345,ENSG00000001561.6_ENSG00000112796.9,chr6_46102670_C_T_b38
ENSG00000002726.20_ENSG00000002933.7_ENSG00000055118.14_ENSG00000106565.17_e_ENSG00000002726.20,0.039052,ENSG00000002726.20_ENSG00000002933.7_ENSG00000...,chr7_150780504_T_G_b38
...,...,...,...
ENSG00000274750.2_ENSG00000275713.2_e_ENSG00000274750.2,0.384807,ENSG00000274750.2_ENSG00000275713.2,chr6_26208413_C_G_b38
ENSG00000274750.2_ENSG00000275713.2_e_ENSG00000275713.2,0.022816,ENSG00000274750.2_ENSG00000275713.2,chr6_26208413_C_G_b38
ENSG00000275464.4_ENSG00000280071.3_ENSG00000280433.1_e_ENSG00000275464.4,0.518858,ENSG00000275464.4_ENSG00000280071.3_ENSG000002...,chr21_5089937_T_C_b38
ENSG00000275464.4_ENSG00000280071.3_ENSG00000280433.1_e_ENSG00000280071.3,0.727659,ENSG00000275464.4_ENSG00000280071.3_ENSG000002...,chr21_5089937_T_C_b38


I now have a table with a row for each pcqtl for the the % var explained for each egene in its cluster. I want to further annotate this with relevant info, like the qtl lead variant to egene tss distance

In [11]:
# load in the gene information (start and strand are what I need)
full_gencode=pd.read_csv('/home/klawren/oak/pcqtls/data/references/processed_gencode.v26.GRCh38.genes.gtf', sep='\t', skiprows=range(6), 
            header=None, names=['chr', 'dataset', 'type', 'start','end', '.', 'strand', 'na', 'info'])

full_gencode = full_gencode[full_gencode['type']=='transcript']
full_gencode['transcript_id'] = full_gencode['info'].str.split(';').str[1].str.split('\"').str[-2]

# add in the start and end info
full_gencode['tss_start'] = np.where(full_gencode['strand'] == '+', full_gencode['start'], full_gencode['end'])
full_gencode['gene_end'] = np.where(full_gencode['strand'] == '-', full_gencode['start'], full_gencode['end'])

# filter to just the transcripts that are in the clusters
gene_ids = np.concatenate(overlap_df['cluster_id'].str.split('_'))
gid_gencode = full_gencode.set_index('transcript_id').loc[gene_ids]
gid_gencode = gid_gencode.drop_duplicates()

In [15]:
row = sum_variance_df.iloc[0]

In [19]:
egene_tss = gid_gencode.loc[row['egene_id']]['tss_start']
lead

KeyError: 'egene_id'