# Footprints: exploratory data analysis across cell types

## Setup

In [None]:
import os
import sys
import numpy as np
import anndata as ad
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.colors import LogNorm
import seaborn as sns
from upsetplot import from_contents, UpSet

In [None]:
# Make the project root the working directory and put <root>/code on sys.path

# Relative path to the project root, keyed by the name of the directory
# the notebook was started from.
_ROOT_FROM_CWD = {
    'footprintQTL': '.',
    'code': '..',
    'fichtner': 'projects/footprintQTL',
}

cwd = os.path.basename(os.getcwd())

# NOTE(review): 'manual' looks like a placeholder to be edited by hand when
# the notebook is started from any other directory -- confirm.
PROJECT_DIR = _ROOT_FROM_CWD.get(cwd, 'manual')

os.chdir(PROJECT_DIR)
sys.path.append(f'{os.getcwd()}/code')

In [None]:
from helpers.python.utils import list_dirs
from helpers.python.anndata_utils import check_anndata

In [None]:
# User-configurable settings

from glob_vars import FOOTPRINTS_DIR, CT_MAP_ID

# Minimum number of cells per donor; serves as a sanity check.
# Ideally set to 0, since this filter should be applied during or before
# footprint computation.
min_cells_donor = 200

# Directory that holds one sub-directory per cell type.
cell_type_dir = os.path.join(
    FOOTPRINTS_DIR,
    'js_divergence',
    'ca-qtls_variant-centred_15bp',
    CT_MAP_ID,
)

## Get data

In [None]:
# Init
PEAKS_DF_COLUMNS = ['peak', 'cell_type', 'cell_type_ann', 'mean', 'std']
peak_frames = []  # One long-format frame (columns: PEAKS_DF_COLUMNS) per cell type
ct_peaks = {}  # 'cell_type': [peaks]
ct_hvps = {}  # 'cell_type': [hvps]

# Populate: one 'footprints_processed.h5ad' file is read per cell-type directory
for cell_type in list_dirs(cell_type_dir):

    adata = ad.read_h5ad(os.path.join(cell_type_dir, cell_type, 'footprints_processed.h5ad'))

    # NOTE(review): the check below only requires n_cells > 0; `min_cells_donor`
    # from the user-variables cell is not used anywhere visible here -- confirm
    # whether it was meant to be the threshold.
    check_anndata(adata, min_obs=10, obs_criteria_kwargs=[{'col': 'n_cells', 'func': lambda x: (x > 0).all()}])

    # Per-peak stats
    var = adata.var[['mean', 'std', 'highly_variable_std']]

    # Some annotations
    n_donors, n_peaks = adata.shape
    mean_n_cells, mean_n_frags, mean_n_ins = adata.obs[['n_cells', 'n_fragments', 'n_insertions']].mean()

    # Multi-line label used as the x tick label in the per-cell-type plots.
    # It is identical for every peak of a cell type, so build it once here
    # rather than once per row.
    cell_type_ann = (cell_type
                     + f'\nn_peaks={n_peaks}\n'
                     + f'n_donors={n_donors}\n'
                     + f'mean_cells={round(mean_n_cells, 1)}\n'
                     + f'mean_frags={round(mean_n_frags, 1)}\n'
                     + f'mean_ins={round(mean_n_ins, 1)}')

    ## Long-format rows for this cell type, built in one vectorised step
    ## (replaces a per-peak `iterrows()` loop); the scalars are broadcast
    peak_frames.append(pd.DataFrame({'peak': var.index.to_numpy(),
                                     'cell_type': cell_type,
                                     'cell_type_ann': cell_type_ann,
                                     'mean': var['mean'].to_numpy(),
                                     'std': var['std'].to_numpy()},
                                    columns=PEAKS_DF_COLUMNS))

    ## Peaks and HVPs
    ct_peaks[cell_type] = adata.var_names

    # Explicit comparison with True (as before) rather than using the column
    # itself as a mask, so behaviour does not depend on the column's dtype.
    ct_hvps[cell_type] = var[var['highly_variable_std'].eq(True)].index


# Cross-donor mean/std of every peak in every cell type, with a fresh 0..N-1 index.
# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when no cell-type directory was found.
if peak_frames:
    peaks_df = pd.concat(peak_frames, ignore_index=True)
else:
    peaks_df = pd.DataFrame(columns=PEAKS_DF_COLUMNS)

## Mean and standard deviation distributions per cell type

In [None]:
# Violin plot (with inner box plot) of the per-peak std, one violin per cell type
fig, ax = plt.subplots(figsize=(12, 6))
sns.violinplot(data=peaks_df, x='cell_type_ann', y='std', inner='box', ax=ax)
ax.set_title('Peak variance across donors')
plt.xticks(rotation=0)
ax.set_xlabel('')
ax.set_ylabel('std')
plt.show()

In [None]:
# Wide table: one column of per-peak means per cell type (row index is kept from peaks_df)
peaks_df_pivot_mean = peaks_df.pivot(columns='cell_type', values='mean')

# One histogram subplot per cell type
peaks_df_pivot_mean.plot.hist(
    bins=100,
    subplots=True,
    title='Mean distr. of different cell types',
    xlabel='mean',
);

In [None]:
# Wide table: one column of per-peak stds per cell type (row index is kept from peaks_df)
peaks_df_pivot_std = peaks_df.pivot(columns='cell_type', values='std')

# One histogram subplot per cell type.
# The x axis shows std values, so it is labelled 'std' (was 'mean', a
# copy-paste slip from the mean histograms).
peaks_df_pivot_std.plot(kind='hist', bins=100, subplots=True, title='Std distr. of different cell types', xlabel='std');

## Highly variable peaks

In [None]:
# Pairwise overlap of highly variable peaks (HVPs) between cell types

# Build each cell type's HVP set once instead of once per pair
hvp_sets = {ct: set(hvps) for ct, hvps in ct_hvps.items()}

# (ct1, ct2, n_shared) for every ordered pair, self-pairs included.
# `&` is set intersection, so the count is the number of HVPs shared by both
# cell types (the previous variable name `union` was misleading).
entries = [(ct1, ct2, len(hvps1 & hvps2))
           for ct1, hvps1 in hvp_sets.items()
           for ct2, hvps2 in hvp_sets.items()]

# Make hvp overlap df: square table, rows = cell_type_01, columns = cell_type_02
overlap_hvps_df = pd.DataFrame(entries, columns=['cell_type_01', 'cell_type_02', 'overlap_count'])
overlap_hvps_df = overlap_hvps_df.pivot(index='cell_type_01', columns='cell_type_02', values='overlap_count')
overlap_hvps_df

In [None]:
# Annotated heatmap of the pairwise HVP overlap counts.
# NOTE(review): vmax is hard-coded; presumably the largest count in the data
# this notebook was written for -- confirm when the input changes.
ax = sns.heatmap(
    overlap_hvps_df,
    annot=True,
    fmt='.0f',
    vmin=0,
    vmax=3502,
    cmap='rocket',
)
ax.set_xlabel("")
ax.set_ylabel("")

# Column labels go on top, rotated so they stay readable
ax.xaxis.tick_top()
plt.xticks(rotation=90)

plt.show()

In [None]:
# Turn the {cell_type: HVPs} mapping into the input passed to UpSet,
# then plot subset sizes as counts with the numbers shown.
hvps_formatted = from_contents(ct_hvps)
hvps_upset = UpSet(hvps_formatted, subset_size='count', show_counts=True)
hvps_upset.plot()

In [None]:
# What if the original peak sets (non-hvps) already had an overlapping bias?

# Repeat previous plots for original peaks and create an overlap plot with the ratio of hvps/peaks

# Build each cell type's peak set once instead of once per pair
peak_sets = {ct: set(peaks) for ct, peaks in ct_peaks.items()}

# (ct1, ct2, n_shared) for every ordered pair, self-pairs included.
# `&` is set intersection, so the count is the number of peaks shared by both
# cell types (the previous variable name `union` was misleading).
entries = [(ct1, ct2, len(peaks1 & peaks2))
           for ct1, peaks1 in peak_sets.items()
           for ct2, peaks2 in peak_sets.items()]

# Make peak overlap df: square table, rows = cell_type_01, columns = cell_type_02
overlap_peaks_df = pd.DataFrame(entries, columns=['cell_type_01', 'cell_type_02', 'overlap_count'])
overlap_peaks_df = overlap_peaks_df.pivot(index='cell_type_01', columns='cell_type_02', values='overlap_count')
overlap_peaks_df

In [None]:
# Annotated heatmap of the pairwise peak overlap counts.
# NOTE(review): vmax is hard-coded; presumably close to the largest count in
# the data this notebook was written for -- confirm when the input changes.
ax = sns.heatmap(
    overlap_peaks_df,
    annot=True,
    fmt='.0f',
    vmin=0,
    vmax=43000,
    cmap='rocket',
)
ax.set_xlabel("")
ax.set_ylabel("")

# Column labels go on top, rotated so they stay readable
ax.xaxis.tick_top()
plt.xticks(rotation=90)

plt.show()

In [None]:
# Turn the {cell_type: peaks} mapping into the input passed to UpSet,
# then plot subset sizes as counts with the numbers shown.
peaks_formatted = from_contents(ct_peaks)
peaks_upset = UpSet(peaks_formatted, subset_size='count', show_counts=True)
peaks_upset.plot()

In [None]:
# Element-wise ratio: shared HVPs / shared peaks for every pair of cell types
overlap_ratios_df = overlap_hvps_df / overlap_peaks_df

# Annotated heatmap of the ratios; the colour scale is fixed to [0, 0.1]
ax = sns.heatmap(
    overlap_ratios_df,
    annot=True,
    fmt='.4f',
    vmin=0,
    vmax=0.1,
    cmap='rocket',
)
ax.set_xlabel("")
ax.set_ylabel("")

# Column labels go on top, rotated so they stay readable
ax.xaxis.tick_top()
plt.xticks(rotation=90)

plt.show()