### Manhattan plot for an example cluster

including:
* for each egene and cluster pc:
    * nominal p-values
    * lead variant for each independent signal (via SuSiE)
    * credible set (via SuSiE)
* gene annotations
    * genes in cluster
        * variance in gene expression explained by each pc
    * genes not in cluster?

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import matplotlib.patches as patches
from scipy.stats import linregress


import seaborn as sns 
from tqdm.auto import tqdm  # for notebooks
import matplotlib.gridspec as gridspec
import matplotlib.lines as mlines
from matplotlib.lines import Line2D
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.cm import register_cmap


import matplotlib as mpl
#mpl.rcParams['figure.dpi']= 700


In [None]:

# Read the pipeline's output locations from the project config file.
prefix = '/home/klawren/oak/pcqtls'
import yaml
config_path = f'{prefix}/config/pcqtl_proteincoding.yaml'
with open(config_path, 'r') as f:
    config = yaml.safe_load(f)

# Pull out every directory / path entry used later in the notebook.
(
    tissue_id_path,
    eqtl_output_dir,
    pcqtl_output_dir,
    pc_output_dir,
    filtered_expression_output_dir,
    overlap_output_dir,
) = (
    config[key]
    for key in (
        'tissue_id_path',
        'eqtl_output_dir',
        'pcqtl_output_dir',
        'pc_output_dir',
        'filtered_expression_output_dir',
        'overlap_output_dir',
    )
)


# Tissue ids come from the 'Tissue' column of the tissue table.
tissue_df = pd.read_csv(f"{prefix}/{tissue_id_path}", header=0)
tissue_ids = tissue_df['Tissue'].tolist()


Decide which cluster to plot

In [None]:
# Read one overlap table per tissue, in the same order as tissue_ids.
overlap_dfs = []
for tissue_id in tissue_ids:
    overlap_df = pd.read_csv(f'{prefix}/{overlap_output_dir}/{tissue_id}.v8.overlap.txt', sep='\t')
    overlap_dfs.append(overlap_df)

# Stack the tables, keeping the tissue as a regular column.
full_overlap_df = pd.concat(overlap_dfs, keys=tissue_ids, names=['Tissue']).reset_index()
# Number of genes in a cluster = number of '_'-separated ids in cluster_id.
full_overlap_df['n_genes'] = full_overlap_df['cluster_id'].str.split('_').map(len)
# PC number is parsed from the second-to-last '_' field of cs_full_id for
# pcQTL credible sets; every other row gets 0.
is_pc_qtl = full_overlap_df['orig_cs_dataset'] == 'pc_qtl'
pc_label = full_overlap_df['cs_full_id'].str.split('_').str[-2]
full_overlap_df['pc_num'] = np.where(is_pc_qtl, pc_label.str.strip('pc'), 0)
full_overlap_df['pc_num'] = full_overlap_df['pc_num'].astype(int)

In [None]:
coloc_df = pd.read_csv('/home/klawren/oak/pcqtls/data/references/ribeiro_2021_coloc_results.txt', sep='\t')

In [None]:
# Keep the overlap rows whose lead variant also appears in the coloc table.
# The last four characters of lead_variant_id are dropped before matching --
# presumably the '_b38' build suffix seen on variant ids elsewhere in this
# notebook; confirm against the coloc file's variant_id format.
shared_signals = full_overlap_df[full_overlap_df['lead_variant_id'].str[:-4].isin(coloc_df['variant_id'])]
shared_signals

In [None]:
# Keep only the shared signals with num_e_overlap == 0
# (presumably: signals not overlapping any eQTL credible set -- TODO confirm
# the meaning of num_e_overlap against the overlap pipeline).
filtered_shared_signals = shared_signals[shared_signals['num_e_overlap'] == 0]
filtered_shared_signals

In [None]:
# Pick the example signal: the fourth row (position 3) among the filtered
# signals whose cluster has more than two genes, then display all candidates.
row = filtered_shared_signals[filtered_shared_signals['n_genes']> 2].iloc[3]
filtered_shared_signals[filtered_shared_signals['n_genes'] >2]

# Notes on candidate clusters:
# ENSG00000120071.13_ENSG00000176681.14 has really long LD
# ENSG00000026950.16_ENSG00000111801.1 is ok but has lots of signals, seems like LD error
# ENSG00000140474.12_ENSG00000140497.16 is the one I will do


In [None]:
# Unpack the chosen signal: its tissue, the chromosome label with the 'chr'
# prefix removed (kept as a string), and the cluster it belongs to.
tissue_id = row['Tissue']
chr_id = row['lead_variant_id'].split('_')[0].split('chr')[1]
cluster_id = row['cluster_id']

In [None]:
# HLA cluster?
#cluster_id = 'ENSG00000204252.13_ENSG00000204257.14_ENSG00000223865.10_ENSG00000231389.7'
#chr_id = 6
#tissue_id = 'Nerve_Tibial'


In [None]:
# skin cluster 
#tissue_id = 'Skin_Sun_Exposed_Lower_leg'
#chr_id = 17
#cluster_id = 'ENSG00000006059.3_ENSG00000094796.4_ENSG00000108244.16_ENSG00000108417.3_ENSG00000108759.3_ENSG00000126337.13_ENSG00000131737.5_ENSG00000131738.10_ENSG00000171360.3_ENSG00000171396.11_ENSG00000171431.3_ENSG00000186860.4_ENSG00000187272.6_ENSG00000188581.8_ENSG00000196156.4_ENSG00000196859.7_ENSG00000197079.8_ENSG00000198083.9_ENSG00000198090.3_ENSG00000198271.4_ENSG00000198443.6_ENSG00000204873.4_ENSG00000204880.7_ENSG00000204889.10_ENSG00000212657.1_ENSG00000212721.3_ENSG00000212722.7_ENSG00000212724.3_ENSG00000212725.3_ENSG00000212899.2_ENSG00000212900.2_ENSG00000212901.3_ENSG00000213416.3_ENSG00000213417.3_ENSG00000214518.3_ENSG00000221852.4_ENSG00000221880.3_ENSG00000240871.5_ENSG00000241595.2_ENSG00000244537.2'


## Load in the data
* nominal p-values for the control eQTLs and the cluster pcQTLs
* SuSiE credible sets for the control eQTLs and the cluster pcQTLs
* gene annotations

In [None]:
# load nominal p value dfs

def var_pos(df):
    """Return the integer position parsed from each row's ``variant_id``.

    The position is the second ``_``-separated field of the id
    (e.g. ``chr15_74932469_T_A_b38`` -> ``74932469``).
    """
    id_fields = df['variant_id'].str.split('_')
    position_text = id_fields.str.get(1)
    return position_text.astype(int)

# load in e nominal
def load_e_nominal(path):
    """Read an eQTL nominal-association parquet file and annotate it.

    Adds three columns: ``variant_pos`` (parsed from ``variant_id``) and
    ``cluster_id`` / ``egene_id`` (the parts of ``phenotype_id`` before and
    after the ``_e_`` separator).
    """
    nominal = pd.read_parquet(path)
    nominal['variant_pos'] = var_pos(nominal)
    # split once and reuse both halves
    phenotype_parts = nominal['phenotype_id'].str.split('_e_')
    nominal['cluster_id'] = phenotype_parts.str[0]
    nominal['egene_id'] = phenotype_parts.str[1]
    return nominal

# Nominal eQTL associations for the chosen tissue and chromosome.
# NOTE(review): eqtl_output_dir is used without the `prefix` that the tissue,
# overlap and PC paths are joined with -- confirm it is an absolute path.
e_nominal_df = load_e_nominal(f'{eqtl_output_dir}/{tissue_id}/{tissue_id}.v8.cluster_genes.cis_qtl_pairs.chr{chr_id}.parquet')


def load_pc_nominal(path):
    """Read a pcQTL nominal-association parquet file and annotate it.

    ``phenotype_id`` has the form ``<cluster_id>_pc<N>``. Adds:

    * ``variant_pos``  - position parsed from ``variant_id``
    * ``cluster_id``   - everything before the final ``_pc<N>`` suffix
    * ``cluster_size`` - number of ``_``-separated gene ids in the cluster
    * ``pc_id``        - ``N`` as an integer
    """
    pc_nominal_df = pd.read_parquet(path)
    pc_nominal_df['variant_pos'] = var_pos(pc_nominal_df)
    # Split on the last '_pc' rather than slicing off four characters, so
    # two-digit PCs ('_pc10', ...) are handled and the result matches
    # load_pc_susie.
    phenotype_parts = pc_nominal_df['phenotype_id'].str.rsplit('_pc', n=1)
    pc_nominal_df['cluster_id'] = phenotype_parts.str[0]
    # the trailing 'pcN' field is not a gene, hence the -1
    pc_nominal_df['cluster_size'] = pc_nominal_df['phenotype_id'].str.split('_').str.len() - 1
    pc_nominal_df['pc_id'] = phenotype_parts.str[-1].astype(int)
    return pc_nominal_df

# Nominal pcQTL associations for the chosen tissue and chromosome.
# NOTE(review): pcqtl_output_dir is used without the `prefix` that the tissue,
# overlap and PC paths are joined with -- confirm it is an absolute path.
pc_nominal_df = load_pc_nominal(f'{pcqtl_output_dir}/{tissue_id}/{tissue_id}.v8.pcs.cis_qtl_pairs.chr{chr_id}.parquet')
pc_nominal_df.head()

In [None]:
# load susie dfs
def get_lead_var(susie_df):
    """Return, for every row, the lead variant of that row's credible set.

    The lead variant of a credible set (``cs_full_id``) is the ``variant_id``
    of the row with the highest ``pip`` in that set. The result is aligned to
    ``susie_df``'s index.
    """
    # index labels of the max-pip row of each credible set
    top_row_labels = susie_df.groupby('cs_full_id')['pip'].idxmax()
    top_rows = susie_df.loc[top_row_labels]
    # lookup table: credible set id -> lead variant id
    lead_by_cs = top_rows.set_index('cs_full_id')['variant_id']
    return susie_df['cs_full_id'].map(lead_by_cs)

def load_e_susie(path):
    """Read an eQTL SuSiE results table (tab-separated) and annotate it.

    Adds ``cluster_id`` / ``egene_id`` (the parts of ``phenotype_id`` around
    ``_e_``), ``cs_full_id`` (``<phenotype_id>_e_cs<cs_id>``), the credible
    set's ``lead_variant``, an ``is_lead`` flag and ``variant_pos``.
    """
    susie = pd.read_csv(path, sep='\t', index_col=0)
    phenotype = susie['phenotype_id']
    phenotype_parts = phenotype.str.split('_e_')
    susie['cluster_id'] = phenotype_parts.str[0]
    susie['egene_id'] = phenotype_parts.str[1]
    # unique id per (phenotype, credible set)
    susie['cs_full_id'] = phenotype.astype(str) + '_e_cs' + susie['cs_id'].astype(str)
    susie['lead_variant'] = get_lead_var(susie)
    susie['is_lead'] = susie['variant_id'].eq(susie['lead_variant'])
    susie['variant_pos'] = var_pos(susie)
    return susie

# Load the eQTL credible sets and keep only the variants whose id starts with
# the chromosome we're considering.
e_susie_df = load_e_susie(f'{eqtl_output_dir}/{tissue_id}/{tissue_id}.v8.cluster_genes.susie.txt').loc[
    lambda susie: susie['variant_id'].str.split('_').str[0].eq(f'chr{chr_id}')
]

def load_pc_susie(path):
    """Read a pcQTL SuSiE results table (tab-separated) and annotate it.

    Adds ``cluster_id`` (``phenotype_id`` before ``_pc``), ``cs_full_id``
    (``<phenotype_id>_cs<cs_id>``), the credible set's ``lead_variant``, an
    ``is_lead`` flag, ``variant_pos`` and the integer ``pc_id``.
    """
    susie = pd.read_csv(path, sep='\t', index_col=0)
    phenotype = susie['phenotype_id']
    phenotype_parts = phenotype.str.split('_pc')
    susie['cluster_id'] = phenotype_parts.str[0]
    # unique id per (phenotype, credible set)
    susie['cs_full_id'] = phenotype.astype(str) + '_cs' + susie['cs_id'].astype(str)
    susie['lead_variant'] = get_lead_var(susie)
    susie['is_lead'] = susie['variant_id'].eq(susie['lead_variant'])
    susie['variant_pos'] = var_pos(susie)
    susie['pc_id'] = phenotype_parts.str[-1].astype(int)
    return susie

# Load the pcQTL credible sets and keep only the variants whose id starts with
# the chromosome we're considering.
pc_susie_df = load_pc_susie(f'{pcqtl_output_dir}/{tissue_id}/{tissue_id}.v8.pcs.susie.txt').loc[
    lambda susie: susie['variant_id'].str.split('_').str[0].eq(f'chr{chr_id}')
]

pc_susie_df.head()

In [None]:
# Load the per-sample PC values for this tissue.
pc_df = pd.read_csv(f'{prefix}/{pc_output_dir}/{tissue_id}.pcs.bed', sep='\t')
# Sample ids are every column after the first four; taken before any
# annotation columns are appended.
sample_ids = pc_df.columns[4:]
# Annotate each PC with its cluster, its PC number and the cluster size.
pc_df = pc_df.assign(
    cluster_id=lambda df: df['gene_id'].str.split('_pc').str[0],
    pc_id=lambda df: df['gene_id'].str.split('_pc').str[1].astype('float'),
    cluster_size=lambda df: df['cluster_id'].str.split('_').apply(len),
)
# Restrict to the chromosome we're considering.
pc_df = pc_df[pc_df['#chr'] == f'chr{chr_id}']

pc_df.head()

In [None]:
# Load the normalized expression of cluster genes for this tissue.
expression_df = pd.read_csv(f'{prefix}/{filtered_expression_output_dir}/{tissue_id}.v8.normalized_expression.cluster_genes.bed', sep='\t')
# gene_id is '<cluster_id>_e_<egene_id>'; split it into its two parts.
expression_df = expression_df.assign(
    cluster_id=lambda df: df['gene_id'].str.split('_e_').str[0],
    egene_id=lambda df: df['gene_id'].str.split('_e_').str[1],
)
# Restrict to the chromosome we're considering.
expression_df = expression_df[expression_df['#chr'] == f'chr{chr_id}']


expression_df.head()

In [None]:
# load gene annotations (GTF-style table: 6 header lines skipped, no header row)
full_gencode=pd.read_csv('/home/klawren/oak/pcqtls/data/references/processed_gencode.v26.GRCh38.genes.gtf', sep='\t', skiprows=range(6), 
            header=None, names=['chr', 'dataset', 'type', 'start','end', '.', 'strand', 'na', 'info'])

# Keep the 'transcript' rows only. The .copy() makes gencode an independent
# frame, so the column assignments below do not write into a slice of
# full_gencode (which triggers pandas' SettingWithCopyWarning).
gencode = full_gencode[full_gencode['type']=='transcript'].copy()
# The id and name are read from fixed positions (2nd and 4th) of the
# ';'-separated info field, taking the quoted value of each entry.
info_fields = gencode['info'].str.split(';')
gencode['transcript_id'] = info_fields.str[1].str.split('\"').str[-2]
gencode['gene_name'] = info_fields.str[3].str.split('\"').str[-2]
# Strand-aware ends: tss_start is `start` on the '+' strand and `end`
# otherwise; gene_end is `start` on the '-' strand and `end` otherwise.
gencode['tss_start'] = np.where(gencode['strand'] == '+', gencode['start'], gencode['end'])
gencode['gene_end'] = np.where(gencode['strand'] == '-', gencode['start'], gencode['end'])


# filter to just the genes in clusters on the chromosome we're considering
gene_ids = expression_df['egene_id'].unique()
gid_gencode = gencode.set_index('transcript_id').loc[gene_ids]
gid_gencode = gid_gencode.drop_duplicates()

In [None]:
# plot the gene-gene correlations

def plot_cluster_corr(cluster_id, expression_df, sample_ids, gid_gencode=gid_gencode):
    """Plot a lower-triangle heatmap of gene-gene Spearman correlations.

    Parameters
    ----------
    cluster_id : str
        ``_``-joined gene ids of the cluster.
    expression_df : DataFrame
        Expression table with ``cluster_id`` and ``egene_id`` columns and one
        column per sample.
    sample_ids : sequence
        Names of the sample columns of ``expression_df`` to correlate over.
    gid_gencode : DataFrame
        Gene annotation indexed by gene id, with ``start``, ``end`` and
        ``gene_name`` columns. The default is bound when the function is
        defined.

    Genes are ordered along the genome (by ``start``, then ``end``) and
    labelled with their gene names. A new figure is created; nothing is
    returned.
    """
    # annotation rows of the cluster's genes, in genomic order (sorted once and
    # reused for both the row order and the display names)
    gene_ids = cluster_id.split('_')
    cluster_gencode = gid_gencode.loc[gene_ids].sort_values(by=['start', 'end'])
    sorted_gene_ids = cluster_gencode.index.values
    gene_names = cluster_gencode['gene_name']

    # expression of this cluster, rows in the same genomic order
    expression_cluster = expression_df[expression_df['cluster_id'] == cluster_id]
    expression_cluster = expression_cluster.set_index('egene_id').loc[sorted_gene_ids]
    # gene-by-gene Spearman correlation across samples
    corrs = expression_cluster[sample_ids].T.corr(method='spearman')

    # relabel rows/columns from gene ids to gene names
    corrs = corrs.rename(index=gene_names, columns=gene_names)

    fig, ax = plt.subplots(figsize=(7, 6))
    # hide the upper triangle and the diagonal
    mask = np.triu(np.ones_like(corrs))
    # colours matched to the manhattan plots; 'coolwarm' is another good option
    cmap = LinearSegmentedColormap.from_list('mycmap', [(0, '#c4553aff'), (.5, 'white'), (1, '#3e8093ff')])
    sns.heatmap(corrs, mask=mask, cmap=cmap, vmin=-1, vmax=1, ax=ax,
                cbar_kws={'label': 'Spearman Correlation', 'pad': 0},
                xticklabels=True, yticklabels=True)

    # rotate the gene names
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0)

    # no axis labels
    ax.set_ylabel('')
    ax.set_xlabel('')

    # The last column and the first row are fully masked, so hide their tick
    # labels. Done on `ax` directly rather than through pyplot's current axes.
    ax.get_xticklabels()[-1].set_visible(False)
    ax.get_yticklabels()[0].set_visible(False)

    fig.tight_layout()


plot_cluster_corr(cluster_id, expression_df, sample_ids)

In [None]:
# plot the gene-gene correlations for one of the big keratin clusters

# load in gene expression data
skin_expression_df = pd.read_csv(f'{prefix}/{filtered_expression_output_dir}/Skin_Not_Sun_Exposed_Suprapubic.v8.normalized_expression.cluster_genes.bed', sep='\t')
# sample columns are everything after the first four; read before annotating
skin_sample_ids = skin_expression_df.columns[4:]
# gene_id is '<cluster_id>_e_<egene_id>'; split it into its two parts
skin_expression_df = skin_expression_df.assign(
    cluster_id=lambda df: df['gene_id'].str.split('_e_').str[0],
    egene_id=lambda df: df['gene_id'].str.split('_e_').str[1],
)

skin_cluster_id = 'ENSG00000006059.3_ENSG00000108244.16_ENSG00000131737.5_ENSG00000131738.10_ENSG00000167916.4_ENSG00000167920.8_ENSG00000171396.11_ENSG00000171431.3_ENSG00000171446.6_ENSG00000173908.8_ENSG00000180386.7_ENSG00000186393.5_ENSG00000186395.7_ENSG00000186860.4_ENSG00000187242.4_ENSG00000187272.6_ENSG00000188581.8_ENSG00000196156.4_ENSG00000196859.7_ENSG00000198083.9_ENSG00000198090.3_ENSG00000198271.4_ENSG00000198443.6_ENSG00000204873.4_ENSG00000204880.7_ENSG00000204889.10_ENSG00000204897.6_ENSG00000212657.1_ENSG00000212658.1_ENSG00000212659.1_ENSG00000212721.3_ENSG00000212722.7_ENSG00000212724.3_ENSG00000212725.3_ENSG00000212899.2_ENSG00000212900.2_ENSG00000212901.3_ENSG00000213416.3_ENSG00000213417.3_ENSG00000214518.3_ENSG00000221852.4_ENSG00000221880.3_ENSG00000239886.5_ENSG00000240871.5_ENSG00000241595.2_ENSG00000244537.2'

# use the full (all-chromosome) annotation, since this tissue's genes are not
# limited to the chromosome selected above
skin_gencode = gencode.set_index('transcript_id').drop_duplicates()
plot_cluster_corr(skin_cluster_id, skin_expression_df, skin_sample_ids, gid_gencode=skin_gencode)

In [None]:
row.lead_variant_id

In [None]:
# Recompute pc_num (PC number for pcQTL credible sets, 0 otherwise) and show
# the overlap rows for the tissue and cluster we're considering.
from_pc_qtl = full_overlap_df['orig_cs_dataset'] == 'pc_qtl'
pc_field = full_overlap_df['cs_full_id'].str.split('_').str[-2]
full_overlap_df['pc_num'] = np.where(from_pc_qtl, pc_field.str.strip('pc'), 0)
full_overlap_df['pc_num'] = full_overlap_df['pc_num'].astype(int)
in_tissue = full_overlap_df['Tissue'] == tissue_id
in_cluster = full_overlap_df['cluster_id'] == cluster_id
full_overlap_df[in_tissue & in_cluster]

# example lead variants:
# chr15_74932469_T_A_b38
# chr6_26372599_C_T

In [None]:
plot_cluster(cluster_id)

In [None]:
# Position of the chosen signal's lead variant (second '_' field of its id).
# NOTE(review): this rebinds the module-level name `var_pos`, which is also
# the helper function called by the load_* functions; after this cell runs,
# those loaders fail until the helper is redefined. Consider renaming this
# variable here and in the cells that read it.
var_pos = int(row.lead_variant_id.split('_')[1])

In [None]:
# get only variants near the top snp (within 10,000 bp, i.e. 10kb)
# NOTE(review): pc_nominal_set is only assigned in later cells of this
# notebook, so this cell depends on out-of-order execution.
pc_nominal_set['var_position'] = pc_nominal_set['variant_id'].str.split('_').str[1].astype(int)
pc_nominal_set[abs(pc_nominal_set['var_position'] - var_pos) < 10000]

In [None]:
# Compare nominal p-values near the lead variant: the PC that carries the
# GWAS-hit variant (x axis) against each eGene in the cluster (y axis).
# `var_pos` here is the lead-variant position set in an earlier cell.

# nominal associations for the PC that has the GWAS-hit variant;
# .copy() so the column added below is not written into a slice
pc_nominal_set = pc_nominal_df[(pc_nominal_df['cluster_id'] == cluster_id)&((pc_nominal_df['pc_id'] == row.pc_num))].copy()
# get only variants near the top snp (10kb)
pc_nominal_set['var_position'] = pc_nominal_set['variant_id'].str.split('_').str[1].astype(int)
pc_nominal_set = pc_nominal_set[abs(pc_nominal_set['var_position'] - var_pos) < 10000]

pc_susie_set=pc_susie_df[(pc_susie_df['cluster_id'] == cluster_id)&(pc_susie_df['pc_id']==row.pc_num)]

fig, ax = plt.subplots()

for gene_id in cluster_id.split('_'):
    # nominal associations for this egene, restricted to the same 10kb window
    nominal_set = e_nominal_df[e_nominal_df['egene_id'] == gene_id].copy()
    nominal_set['var_position'] = nominal_set['variant_id'].str.split('_').str[1].astype(int)
    nominal_set = nominal_set[abs(nominal_set['var_position'] - var_pos) < 10000]

    # pair up pc and egene p-values variant by variant
    merged_df = pd.merge(pc_nominal_set, nominal_set, on='variant_id', suffixes=('_pc', '_e'))

    # one point per variant, labelled with the gene name
    sns.scatterplot(merged_df, x='pval_nominal_pc', y='pval_nominal_e',
                    s=10,
                    linewidth=0, label=gid_gencode.loc[gene_id]['gene_name'], ax=ax)

# log-scaled, inverted axes so smaller p-values are further from the origin
ax.set(yscale='log', xscale='log')
ax.invert_yaxis()
ax.invert_xaxis()


In [None]:
plot_cluster(cluster_id, x_padding=400000)    

In [None]:
# plot to show how the e gene and pc1 qtl coloc??


## split out types of plots

In [None]:
def plot_genes(ax, cluster_id, scalebar=True, x_padding = 1000000, palette = sns.color_palette()):
    """Draw one arrow per gene of the cluster on ``ax``.

    Each arrow runs from ``tss_start`` to ``gene_end``. Genes are processed in
    genomic order (by ``start``, then ``end``) and stacked onto the lowest row
    whose previous gene ends before this one starts.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on; its limits are set and its spines and ticks hidden.
    cluster_id : str
        ``_``-joined gene ids; looked up in the module-level ``gid_gencode``.
    scalebar : bool
        Currently unused; kept for interface compatibility.
    x_padding : number
        Padding added on both sides of the cluster on the x axis; the arrow
        head length is 2% of this value.
    palette : sequence of colours
        Gene ``i`` (in genomic order) gets ``palette[i]``, wrapping around if
        there are more genes than colours.

    Returns
    -------
    The same ``ax``.
    """
    # annotation rows of the cluster's genes, in genomic order
    gene_ids = cluster_id.split('_')
    cluster_gencode = gid_gencode.loc[gene_ids].sort_values(by=['start', 'end'])

    # rightmost coordinate occupied so far on each row
    level_ends = [0]
    # row height:
    height = .3

    for i in range(len(cluster_gencode)):
        gene = cluster_gencode.iloc[i]
        # lowest row where this gene does not overlap the last gene placed
        level = next((lvl for lvl, end in enumerate(level_ends) if gene['start'] > end), None)
        if level is None:
            # every row is occupied: open a new one. Track the genomic `end`
            # here as well (gene_end is the *start* for minus-strand genes).
            level_ends.append(gene['end'])
            level = len(level_ends) - 1
        else:
            level_ends[level] = gene['end']

        # wrap on the palette's own length so colours stay aligned with the
        # other panels for any palette size
        color = palette[i % len(palette)]
        arrow = patches.FancyArrow(x=gene['tss_start'], y=height*level,
                                   dx=(gene['gene_end'] - gene['tss_start']), dy=0,
                                   width=.1, head_length=.02*x_padding,
                                   edgecolor=color, facecolor=color)
        ax.add_patch(arrow)

    # Set plot limits
    ax.set_ylim(-1*height, height*len(level_ends))
    # padding for the cis window
    ax.set_xlim(cluster_gencode['start'].min() - x_padding, cluster_gencode['end'].max() + x_padding)

    # hide all four spines and both sets of ticks
    for side in ('top', 'right', 'left', 'bottom'):
        ax.spines[side].set_visible(False)
    ax.set_yticks([])
    ax.set_xticks([])

    return ax


def plot_scalebar(ax, scalebar_length = 200000, side='left'):
    """Draw a capped horizontal scale bar with a 'N kb' label on ``ax``.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on; its current x/y limits determine the placement.
    scalebar_length : number
        Bar length in x-axis units; the label shows ``scalebar_length/1000``
        followed by ``kb``.
    side : {'left', 'right'}
        Which end of the x range the bar is anchored to, inset by 5% of the
        bar length.

    Returns
    -------
    The same ``ax``.

    Raises
    ------
    ValueError
        If ``side`` is neither ``'left'`` nor ``'right'``.
    """
    scalebar_label=f'{scalebar_length/1000:.0f} kb'

    x_min, x_max = ax.get_xlim()
    inset = .05*scalebar_length
    if side=='left':
        scalebar_start = x_min + inset
    elif side=='right':
        scalebar_start = x_max - inset - scalebar_length
    else:
        # previously fell through to an unbound-variable error
        raise ValueError(f"side must be 'left' or 'right', got {side!r}")
    scalebar_end = scalebar_start + scalebar_length

    # the bar sits at the vertical middle of the axes
    y_low, y_high = ax.get_ylim()
    y_mid = np.mean((y_low, y_high))
    ax.add_line(mlines.Line2D([scalebar_start, scalebar_end], [y_mid, y_mid], color='k'))

    # end caps, one fifth of the y range tall
    cap_length = (y_high - y_low)/5
    cap_span = [y_mid - cap_length / 2., y_mid + cap_length / 2.]
    for cap_x in (scalebar_start, scalebar_end):
        ax.add_line(mlines.Line2D([cap_x, cap_x], cap_span, color='k'))

    # label centred above the bar
    ax.text(scalebar_start + scalebar_length / 2, y_mid + cap_length, scalebar_label, ha='center')
    return ax

In [None]:
def plot_cs(ax, nominal_set, susie_set, gene_color, use_color_cs=True):
    """Draw one manhattan panel: nominal p-values, credible set, lead variants.

    Three scatter layers are drawn on ``ax`` in this order, all positioned by
    ``variant_pos`` / ``pval_nominal`` from ``nominal_set``:

    1. every variant, in ``gene_color``;
    2. variants whose id is in ``susie_set`` (the credible sets);
    3. variants that are flagged ``is_lead`` in ``susie_set``.

    With ``use_color_cs`` the last two layers use ``gene_color`` (credible set
    as ``*``); otherwise the credible set is black ``.`` and the lead variants
    are ``#B83A4B``. Returns ``ax``.
    """
    variant_ids = nominal_set['variant_id']
    in_credible_set = variant_ids.isin(susie_set['variant_id'])
    lead_ids = susie_set[susie_set['is_lead']]['variant_id']
    is_lead_variant = variant_ids.isin(lead_ids)

    if use_color_cs:
        cs_color, cs_marker, lead_color = gene_color, '*', gene_color
    else:
        cs_color, cs_marker, lead_color = 'black', '.', '#B83A4B'

    # keyword arguments common to all three layers
    common = dict(x='variant_pos', y='pval_nominal', ax=ax)

    # nominal p values
    sns.scatterplot(nominal_set, color=gene_color, s=10, linewidth=0, **common)
    # susie credible sets
    sns.scatterplot(nominal_set[in_credible_set], color=cs_color, marker=cs_marker,
                    s=100, linewidth=.1, **common)
    # susie lead variants
    sns.scatterplot(nominal_set[is_lead_variant], color=lead_color, marker='x',
                    s=100, linewidth=3, **common)

    # drop the top and right outlines
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    ax.set_ylabel('nominal p-value')

    return ax

In [None]:
        
def add_legend(ax, sorted_gene_ids, palette=sns.color_palette(), use_color_cs=True, var_plot=True):
    """Fill ``ax`` with a three-section legend: genes, PCs and markers.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes that holds only the legend; its spines and ticks are hidden.
    sorted_gene_ids : sequence of str
        Gene ids in plotting order; names come from the module-level
        ``gid_gencode``. One PC entry is listed per gene.
    palette : sequence of colours
        Gene ``i`` uses ``palette[i]``; PC ``i+1`` uses
        ``palette[i + len(sorted_gene_ids) + 1]``.
    use_color_cs : bool
        Selects the credible-set marker shown (``*`` if true, ``.`` if false).
    var_plot : bool
        Whether to list the '% variance explained' marker. It is only drawn
        when the variance overlay is plotted, so it is only listed then.
    """
    num_genes = len(sorted_gene_ids)
    # create handles for 'gene' section of legend
    gene_names = gid_gencode.loc[sorted_gene_ids]['gene_name']
    handles_colors_genes = [patches.Patch(color=palette[i], label=gene) for i, gene in enumerate(gene_names)]
    # create handles for 'pcs' section of legend
    handles_colors_pcs = [patches.Patch(color=palette[i+num_genes+1], label=f'pc {i+1}') for i in range(num_genes)]
    # create custom handles for 'marks' section of legend
    handles_marks = [Line2D([0], [0], color='black', marker='.', markersize=4, linestyle='None', label='nominal p-value'),
                    Line2D([0], [0], color='black', marker='*' if use_color_cs else '.', markersize=7, linestyle='None', label='credible set'),
                    Line2D([0], [0], color='#B83A4B', marker='x', markersize=7, linestyle='None', label='lead variant', markeredgewidth=3)]
    # the variance marker only belongs in the legend when that overlay exists
    if var_plot:
        handles_marks.append(Line2D([0], [0], color=palette[0], marker='P', markersize=7, linestyle='None', label='% variance explained', markeredgewidth=0))

    # invisible handles act as section titles
    handles = [Line2D([], [], color='none', label='Genes'), *handles_colors_genes, 
               Line2D([], [], color='none', label='PCs'), *handles_colors_pcs,
               Line2D([], [], color='none', label='Markers'), *handles_marks]

    ax.legend(handles=handles, loc='center left', frameon=False)

    # hide all four spines and both sets of ticks
    for side in ('top', 'right', 'left', 'bottom'):
        ax.spines[side].set_visible(False)
    ax.set_yticks([])
    ax.set_xticks([])


In [None]:
def get_rs(cluster_id):
    """Return the % of each gene's expression variance explained by each PC.

    Uses the module-level ``expression_df``, ``pc_df``, ``gid_gencode`` and
    ``sample_ids``. For every (gene, PC) pair of the cluster the value is
    ``100 * r**2`` from a linear regression of the gene's expression on the PC
    across samples.

    Returns
    -------
    DataFrame
        One row per gene, columns ``pc1 .. pcK`` (K = number of PC rows of the
        cluster, named by their row order in ``pc_df``) plus ``position``, the
        midpoint of the gene's ``start`` and ``end``.
    """
    # the expression and pc values for this cluster
    expression_cluster = expression_df[expression_df['cluster_id'] == cluster_id].reset_index()
    pc_cluster = pc_df[pc_df['cluster_id'] == cluster_id].reset_index()
    # pull gene locations
    cluster_gencode = gid_gencode.loc[expression_cluster['egene_id']]

    # extract each PC's per-sample values once instead of once per gene
    pc_rows = [pc_cluster.loc[j, sample_ids].astype('float') for j in range(len(pc_cluster))]

    rs = np.zeros((len(expression_cluster), len(pc_cluster)))
    for i in range(len(expression_cluster)):
        expression_values = expression_cluster.loc[i, sample_ids].astype('float')
        for j, pc_values in enumerate(pc_rows):
            # squared correlation, as a percentage
            r_value = linregress(pc_values, expression_values).rvalue
            rs[i, j] = (r_value**2)*100

    rs_df = pd.DataFrame(rs, columns=[f'pc{i+1}' for i in range(len(pc_cluster))], index=expression_cluster['egene_id'])

    # Re-select by the annotation's index so rs_df shares it; presumably this
    # is also what gives the index the 'transcript_id' name that plot_pc_var
    # melts on -- confirm before removing.
    rs_df = rs_df.loc[cluster_gencode.index]

    # x position at which the variance is plotted: the gene's midpoint
    rs_df['position'] = (cluster_gencode['start'] + cluster_gencode['end'])/2

    return rs_df

def plot_pc_var(ax, rs_df, pc_nums, gene_ids, gene_palette, line=False):
    """Plot the % expression variance explained by the given PCs, per gene.

    ``rs_df`` is the table returned by ``get_rs``. The rows ``gene_ids`` and
    the columns ``pc_nums`` are selected, reshaped to long form and drawn on
    ``ax`` as a semi-transparent black line per PC plus one ``P`` marker per
    gene coloured from ``gene_palette``. ``line`` is accepted but not used.
    """
    selected = rs_df.loc[gene_ids, [*pc_nums, 'position']]
    # every selected column except the trailing 'position'
    pc_columns = selected.columns[:-1].values

    # long form: one row per (gene, pc) with the value in 'value'
    long_df = selected.reset_index().melt(
        id_vars=['position', 'transcript_id'],
        value_vars=pc_columns,
        var_name='pc_num',
    )

    # keyword arguments shared by the line and the markers
    common = dict(y='value', x='position', ax=ax, legend=False)
    sns.lineplot(long_df, style='pc_num', color='k', alpha=0.5, **common)
    sns.scatterplot(long_df, hue='transcript_id', palette=gene_palette,
                    linewidth=0, s=50, marker='P', **common)

    ax.set_ylabel('% expression \nvariance explained')
    ax.set_ylim((0,100))
    ax.spines['top'].set_visible(False)


In [None]:
def plot_cluster(cluster_id, x_padding = 1000000, figsize=(20, 15), var_plot=True):
    """Draw the full multi-panel figure for one cluster.

    From top to bottom: one eQTL manhattan panel per gene (genomic order), one
    pcQTL manhattan panel per PC (as many PCs as genes), and a gene-arrow
    track with a scale bar; a legend column sits on the right. Data come from
    the module-level ``gid_gencode``, ``e_nominal_df``, ``e_susie_df``,
    ``pc_nominal_df`` and ``pc_susie_df``.

    Parameters
    ----------
    cluster_id : str
        ``_``-joined gene ids of the cluster.
    x_padding : number
        Padding on each side of the cluster on the shared x axis; the scale
        bar is one fifth of it.
    figsize : tuple
        Figure size passed to ``plt.figure``.
    var_plot : bool
        If true, overlay on each pcQTL panel (twin y axis) the % expression
        variance of every gene explained by that PC.
    """
    gene_ids = cluster_id.split('_')
    # annotation rows in genomic order; ids and names are taken from the same
    # sorted frame so they stay aligned
    sorted_gencode = gid_gencode.loc[gene_ids].sort_values(by=['start', 'end'])
    sorted_gene_ids = sorted_gencode.index.values
    # plain list: indexed by panel position below (a label-indexed Series
    # would rely on deprecated positional fallback)
    gene_names = sorted_gencode['gene_name'].tolist()

    fig = plt.figure(figsize=figsize)

    num_genes = len(gene_ids)
    num_manhattan = num_genes*2
    # colors (1st half is genes, 2nd half is pcs)
    palette = sns.diverging_palette(220, 20, n=num_genes*2+1)

    # cols determines the width of the legend vs the plots
    num_cols=10

    # each manhattan panel spans two grid rows, the arrow track one
    gs = gridspec.GridSpec(num_manhattan*2+1, num_cols)
    axes_arrows = plt.subplot(gs[num_manhattan*2, :num_cols-1])
    # legend fills the last column
    axes_legend = plt.subplot(gs[:, num_cols-1])

    # manhattan panels share x with the arrow track and y with the first panel
    axs = []
    for i in range(num_manhattan):
        share_y = axs[0] if axs else None
        panel = plt.subplot(gs[i*2:i*2 + 2, :num_cols-1], sharex=axes_arrows, sharey=share_y)
        if not axs:
            # log scale, inverted, so small p-values are at the top
            panel.set(yscale='log')
            panel.invert_yaxis()
        axs.append(panel)

    # add in the gene arrows
    plot_genes(axes_arrows, cluster_id, x_padding=x_padding, palette = palette)
    axes_arrows.set_title('Genes')

    # add in a scalebar
    plot_scalebar(axes_arrows, scalebar_length = x_padding/5, side='right')

    # eQTL panel for each gene, in genomic order
    for i in range(num_genes):
        nominal_set = e_nominal_df[e_nominal_df['egene_id'] == sorted_gene_ids[i]]
        susie_set=e_susie_df[e_susie_df['egene_id'] == sorted_gene_ids[i]]
        plot_cs(axs[i], nominal_set, susie_set, palette[i], use_color_cs=False)
        axs[i].set_title(f'eQTL for {gene_names[i]}')

    # pcQTL panel for each pc
    for i in range(num_genes):
        nominal_set = pc_nominal_df[(pc_nominal_df['cluster_id'] == cluster_id)&((pc_nominal_df['pc_id'] == i+1))]
        susie_set=pc_susie_df[(pc_susie_df['cluster_id'] == cluster_id)&(pc_susie_df['pc_id']==i+1)]
        plot_cs(axs[i+num_genes], nominal_set, susie_set, palette[num_genes+i+1], use_color_cs=False)
        axs[i+num_genes].set_title(f'pcQTL for pc {i+1}')

    if var_plot:
        rs_df = get_rs(cluster_id)
        # on each pc panel, show the variance explained by only that pc
        for i in range(num_genes):
            # shared x axis, different y axis
            ax_twin = axs[i+num_genes].twinx()
            # pass the genes in genomic order so each marker gets the same
            # palette entry as that gene's arrow and eQTL panel
            plot_pc_var(ax_twin, rs_df, [f'pc{i+1}'], sorted_gene_ids, palette[:num_genes])

    # add in a legend
    add_legend(axes_legend, sorted_gene_ids, palette=palette, use_color_cs=False, var_plot=var_plot)

    # add some spacing
    #plt.subplots_adjust(hspace=2)
    fig.tight_layout()  # otherwise the right y-label is slightly clipped


In [None]:
# Nominal associations and credible sets for PC 1 of the cluster ...
is_cluster_pc1_nominal = (pc_nominal_df['cluster_id'] == cluster_id) & (pc_nominal_df['pc_id'] == 1)
pc_nominal_set = pc_nominal_df[is_cluster_pc1_nominal]
is_cluster_pc1_susie = (pc_susie_df['cluster_id'] == cluster_id) & (pc_susie_df['pc_id'] == 1)
pc_susie_set = pc_susie_df[is_cluster_pc1_susie]

# ... and for the first gene listed in the cluster id.
first_gene_id = cluster_id.split('_')[0]
nominal_set = e_nominal_df[e_nominal_df['egene_id'] == first_gene_id]
susie_set = e_susie_df[e_susie_df['egene_id'] == first_gene_id]

In [None]:
# One panel per gene: pcQTL nominal p-values (x) against that gene's eQTL
# nominal p-values (y), highlighting one pcQTL credible set and its lead
# variant. Uses pc_nominal_set / pc_susie_set from the previous cell.
cluster_gene_ids = cluster_id.split('_')
fig, axes = plt.subplots(len(cluster_gene_ids), figsize=(4, 12))
# plt.subplots returns a bare Axes for a single row; keep indexing uniform
axes = np.atleast_1d(axes)

lead_var = 'chr15_74860799_A_G_b38'
# variants in the credible set led by lead_var
cs_var = pc_susie_set[pc_susie_set['lead_variant'] == lead_var]['variant_id']
gene_names = gid_gencode.loc[cluster_gene_ids]['gene_name']
palette = sns.diverging_palette(220, 20, n=len(gene_names)*2+1)

for i, gene_id in enumerate(cluster_gene_ids):
    gene_ax = axes[i]
    nominal_set = e_nominal_df[e_nominal_df['egene_id'] == gene_id]
    # pair up pc and egene p-values variant by variant
    merged_df = pd.merge(pc_nominal_set, nominal_set, on='variant_id', suffixes=('_pc', '_e'))

    # nominal p values
    sns.scatterplot(merged_df, x='pval_nominal_pc', y='pval_nominal_e', ax=gene_ax,
                    color=palette[i],
                    s=10,
                    linewidth=0)
    # susie credible set
    sns.scatterplot(merged_df[merged_df['variant_id'].isin(cs_var)], x='pval_nominal_pc', y='pval_nominal_e', ax=gene_ax,
                    color='black',
                    marker='.',
                    s=100,
                    linewidth=.1)
    # susie lead variant
    sns.scatterplot(merged_df[merged_df['variant_id'] == lead_var], x='pval_nominal_pc', y='pval_nominal_e', ax=gene_ax,
                    color='#B83A4B',
                    marker='x',
                    s=100,
                    linewidth=3)

    # log-scaled, inverted axes so smaller p-values are further from the origin
    gene_ax.set(yscale='log', xscale='log')
    #gene_ax.set_ylim(gene_ax.get_xlim())
    gene_ax.invert_yaxis()
    gene_ax.invert_xaxis()

    # drop the top and right outlines
    gene_ax.spines['top'].set_visible(False)
    gene_ax.spines['right'].set_visible(False)

    gene_ax.set_xlabel('pcQTL nominal p-value')
    # positional lookup: gene_names is indexed by gene id, not by integer
    gene_ax.set_ylabel(f'{gene_names.iloc[i]} eQTL nominal p-value')

fig.tight_layout()




In [None]:
# lead variant id used above; quoted so the cell displays it instead of
# raising NameError on an undefined name
'chr15_74860799_A_G_b38'

In [None]:
    for i in range(num_genes):
        nominal_set = e_nominal_df[e_nominal_df['egene_id'] == gene_ids[i]]
        susie_set=e_susie_df[e_susie_df['egene_id'] == gene_ids[i]]
        plot_cs( axs[i], nominal_set, susie_set, palette[i], use_color_cs=False)
        axs[i].set_title(f'eQTL for {gene_names[i]}')

In [None]:
expression_df[expression_df['cluster_id'].str.split('_').apply(len) > 3]['cluster_id'].unique()

In [None]:
# pull an example cluster
#cluster_id = expression_df['cluster_id'].loc[369]
#cluster_id = 'ENSG00000142089.15_ENSG00000185201.16_ENSG00000185885.15'
#cluster_id = expression_df[expression_df['cluster_id'].str.split('_').apply(len) > 3]['cluster_id'].unique()[1]


In [None]:
cluster_id

In [None]:
plot_cluster(cluster_id, x_padding=100000)    

In [None]:

# Scratch cell: variance explained by pc1 and pc2 for the current cluster.
fig, ax = plt.subplots()

rs_df = get_rs(cluster_id)


# NOTE(review): this call passes six positional arguments, so
# palette[num_genes+1:] lands in `gene_palette` and palette[:num_genes] in the
# unused `line` parameter of plot_pc_var. `num_genes` is only assigned inside
# plot_cluster in this notebook, so it is undefined here unless set by hand.
plot_pc_var(ax, rs_df, ['pc1', 'pc2'], gene_ids, palette[num_genes+1:], palette[:num_genes])


In [None]:
pc_nominal_df[(pc_nominal_df['cluster_id'] == cluster_id)&((pc_nominal_df['pc_id'] == i))]

In [None]:
    cluster_gencode = gid_gencode.loc[gene_ids]

    # Sort the dataframe by 'start'
    cluster_gencode = cluster_gencode.sort_values('start')

In [None]:
# Scratch plot: gene bodies as rectangles below y=0, with the % variance
# explained by the first max_num_pcs PCs plotted above them.
# NOTE(review): relies on `cluster_gencode` and `rs_melt_df` existing at module
# level; in this notebook rs_melt_df is only assigned inside plot_pc_var.
add_text=False
max_num_pcs = 4


fig, ax = plt.subplots(figsize=(8,5))
# set axis limits: three y units are reserved below zero per gene (plus one spare)
ax.set_ylim(-(len(cluster_gencode)+1)*3,100)
x_padding = 10000
ax.set_xlim(cluster_gencode['start'].min() - x_padding, cluster_gencode['end'].max() + x_padding)


# add a rectangle for each gene location
# NOTE: the loop variable `row` overwrites the module-level `row` that other
# cells use for the selected signal.
for i in range(len(cluster_gencode)):
    row = cluster_gencode.iloc[i]
    # rectangle patch for gene body
    rect = patches.Rectangle((row['start'], -3-i*3), row['end']-row['start'], 2, linewidth=1, edgecolor=sns.color_palette()[i%10], facecolor=sns.color_palette()[i%10])
    ax.add_patch(rect)
    # black line for tss location (drawn 200 x-units wide)
    rect = patches.Rectangle((row['tss_start'], -3-i*3), 200, 2, linewidth=1, edgecolor='k', facecolor='k')
    ax.add_patch(rect)

# keep only the rows for pc1 .. pc<max_num_pcs>
plot_rs_melt_df = rs_melt_df[rs_melt_df['pc_num'].isin([f'pc{i+1}' for i in range(max_num_pcs)])]

# plot the variance explained as a line per pc plus a scatterplot
# (the lineplot is given no `ax`, so it draws on pyplot's current axes)
sns.lineplot(plot_rs_melt_df, y='value', x='position', style='pc_num', color='grey')
sns.scatterplot(plot_rs_melt_df, y='value', x='position', hue='transcript_id', style='pc_num', ax=ax)

if add_text:
    # add text to the scatterplot
    for i in range(plot_rs_melt_df.shape[0]):
        ax.text(plot_rs_melt_df['position'].iloc[i]+100, plot_rs_melt_df['value'].iloc[i]+1, 
            '{:.2f}%'.format(plot_rs_melt_df['value'].iloc[i]), horizontalalignment='left', size='medium', color='k')

# axis labels, and move the legend outside the plot area
ax.set_xlabel('Position (bp)')
ax.set_ylabel('% variance explained')
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

plt.show()

Debugging get_pcs

In [None]:
from sklearn.decomposition import PCA


In [None]:
clusters_dir = config['clusters_dir']


In [None]:
# Read the all-chromosome cluster table of every tissue and stack them,
# keyed by tissue.
clusters = [
    pd.read_csv(f'{prefix}/{clusters_dir}/{t}_clusters_all_chr.csv')
    for t in tissue_df['Tissue']
]
clusters_df = pd.concat(clusters, keys=tissue_df['Tissue'])


In [None]:
row

In [None]:
# Take the first cluster as the debugging example; its id is the sorted
# transcript ids joined with '_'.
row = clusters_df.iloc[0]
cluster_id = '_'.join(sorted(row['Transcripts'].split(',')))
expression_df = pd.read_csv(f'{prefix}/{filtered_expression_output_dir}/{row.Tissue}.v8.normalized_expression.cluster_genes.bed', sep='\t')

# sample columns are everything after the first four; read before annotating
sample_ids = expression_df.columns[4:]

# gene_id is '<cluster_id>_e_<egene_id>'; split it into its two parts
expression_df = expression_df.assign(
    egene_id=lambda df: df['gene_id'].str.split('_e_').str[1],
    cluster_id=lambda df: df['gene_id'].str.split('_e_').str[0],
)

expression_df_gid = expression_df.set_index('egene_id')

In [None]:
# Run a PCA on the cluster's expression (samples as rows, genes as columns).
transcripts = row['Transcripts'].split(',')
cluster_expression_df_gid = expression_df_gid[expression_df_gid['cluster_id']==cluster_id]
cluster = cluster_expression_df_gid.loc[transcripts]
X = cluster[sample_ids].transpose()
pca = PCA()
pc_values = pca.fit_transform(X)

# Build one id per pc: the sorted transcript ids followed by 'pc<N>'.
sorted_transcripts = sorted(transcripts)
gene_ids = [
    '_'.join([*sorted_transcripts, f'pc{pc_index+1}'])
    for pc_index in range(pc_values.shape[1])
]

In [None]:
gene_ids