In [None]:
%load_ext ipy_pdcache

In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import pyranges as pr

import seaborn as sns
import matplotlib.pyplot as plt

from natsort import natsorted

from tqdm.auto import tqdm

In [None]:
# Select seaborn's 'talk' plotting context for the figures drawn in this notebook.
sns.set_context('talk')

# Parameters

In [None]:
# NOTE(review): `snakemake` is not defined anywhere in this notebook; presumably it is
# injected by Snakemake when the notebook is executed as part of a rule -- confirm.

# Input files containing the TAD tables; each one is read in the data loading loop below.
tad_fname_list = snakemake.input.tad_fname_list

# Directory that receives the figures saved below.
outdir = Path(snakemake.output.outdir)

In [None]:
# Create the output directory together with any missing parents; an already existing directory is not an error.
outdir.mkdir(parents=True, exist_ok=True)

# Helper functions

In [None]:
def read_tads(fname):
    """Read a TAD table from the CSV file `fname` and return it as a PyRanges object.

    The columns `chrname`, `tad_start` and `tad_stop` are renamed to
    `Chromosome`, `Start` and `End` before the table is passed to PyRanges.
    """
    column_map = {
        'chrname': 'Chromosome',
        'tad_start': 'Start',
        'tad_stop': 'End',
    }
    tad_table = pd.read_csv(fname).rename(columns=column_map)
    return pr.PyRanges(tad_table)

In [None]:
def remove_common_endings(ser):
    """Strip the prefix and suffix shared by all values of a string Series.

    Parameters
    ----------
    ser : pd.Series
        Series of strings (e.g. file names). Missing values are ignored when
        the common prefix/suffix is determined.

    Returns
    -------
    pd.Series
        Series with the same index in which the longest common prefix and the
        longest common suffix have been removed from every value. When fewer
        than two distinct values are present there is nothing meaningful to
        compare, so an unchanged copy is returned.
    """
    values = ser.dropna().unique().tolist()
    if len(values) < 2:
        # A single value would be its own prefix *and* suffix and be erased completely
        return ser.copy()

    source_prefix = os.path.commonprefix(values)
    # Search the suffix only in what is left after the prefix, so the two can never overlap
    remainders = [x[len(source_prefix):] for x in values]
    source_suffix = os.path.commonprefix([x[::-1] for x in remainders])[::-1]

    print(f'Removing: "{source_prefix}"..."{source_suffix}"')
    # An empty suffix must become `None`: a stop of `-0` equals 0 and would slice everything away
    stop = -len(source_suffix) or None
    return ser.str[len(source_prefix):stop]

# Load data

In [None]:
# Read every TAD file and remember which file each row came from
df_list = [
    read_tads(fname).df.assign(source=os.path.basename(fname))
    for fname in tqdm(tad_fname_list)
]
# ignore_index: every per-file frame starts counting at 0, so keeping the
# original labels would leave duplicated index values in the combined table
df_tads = pd.concat(df_list, ignore_index=True)

# Shorten the file names to their distinguishing part and store them as an
# ordered categorical in natural sort order. `reorder_categories` returns a
# new Series (its `inplace` option was removed in pandas 2.0), so the result
# has to be assigned back.
source_names = remove_common_endings(df_tads['source']).astype('category')
df_tads['source'] = source_names.cat.reorder_categories(
    natsorted(source_names.cat.categories), ordered=True)
df_tads.head()

# TAD counts

In [None]:
# Number of TAD rows per source
df_tadcount = (
    df_tads
    .groupby('source')['Chromosome']
    .count()
    .rename('tad_count')
    .reset_index()
)

# The prefix is the source name without its last dot-separated field
source_fields = df_tadcount['source'].str.split('.')
df_tadcount['prefix'] = source_fields.str[:-1].str.join('.')

df_tadcount.head()

In [None]:
# Distribution of TAD counts per dataset prefix
fig, ax = plt.subplots(figsize=(16, 6))
sns.boxplot(data=df_tadcount, x='prefix', y='tad_count', color=sns.color_palette()[0], ax=ax)

ax.set_xlabel('Dataset')
ax.set_ylabel('TAD count')
ax.tick_params(axis='x', which='both', labelsize=8, rotation=90)

# The tick labels are rotated by 90 degrees and can extend below the default
# figure margins; let savefig grow the bounding box so they are not cut off.
fig.savefig(outdir / 'tad_count_boxplot.pdf', bbox_inches='tight')

In [None]:
# One bar per source showing how many TAD rows it contributes
fig, ax = plt.subplots(figsize=(16, 6))
bar_color = sns.color_palette()[0]
sns.countplot(x='source', data=df_tads, color=bar_color, ax=ax)

ax.set_ylabel('TAD count')
ax.tick_params(axis='x', which='both', labelsize=8, rotation=90)

fig.tight_layout()
fig.savefig(outdir / 'tad_count_barplot.pdf')

# TAD lengths

In [None]:
# Per-source tables of TAD lengths (End - Start), each tagged with its source name
df_list = [
    (group['End'] - group['Start']).to_frame('tad_len').assign(source=source)
    for source, group in df_tads.groupby('source')
]

df_tadlen = pd.concat(df_list)

# The prefix is the source name without its last dot-separated field
source_fields = df_tadlen['source'].str.split('.')
df_tadlen['prefix'] = source_fields.str[:-1].str.join('.')
df_tadlen.head()

## Histograms

In [None]:
# Keep only strictly positive lengths; the x axis below is logarithmic
positive_tadlen = df_tadlen.loc[df_tadlen['tad_len'] > 0]

# One step histogram per dataset prefix, three panels per row
g = sns.displot(
    data=positive_tadlen,
    x='tad_len',
    col='prefix',
    col_wrap=3,
    height=3,
    element='step',
    log_scale=True,
)

g.set_xlabels('TAD length')
g.set_ylabels('Count')
g.set_titles(size=8)

g.savefig(outdir / 'tad_length_histograms.pdf')

## Median lengths

In [None]:
# Median TAD length per source. Only `tad_len` is aggregated: the frame also
# holds the string column `prefix`, for which a median is undefined (pandas >= 2.0
# raises instead of silently dropping such columns).
df_median_tadlen = df_tadlen.groupby('source')['tad_len'].median().reset_index()

# Store the sources as an ordered categorical in natural sort order so the bars
# are drawn in that order. `reorder_categories` returns a new Series (its
# `inplace` option was removed in pandas 2.0), so the result is assigned back.
median_sources = df_median_tadlen['source'].astype('category')
df_median_tadlen['source'] = median_sources.cat.reorder_categories(
    natsorted(median_sources.cat.categories), ordered=True)

df_median_tadlen.head()

In [None]:
# Bar chart of the median TAD length of every source
fig, ax = plt.subplots(figsize=(16, 6))
bar_color = sns.color_palette()[0]
sns.barplot(data=df_median_tadlen, x='source', y='tad_len', color=bar_color, ax=ax)

ax.set_ylabel('Median TAD length')
ax.tick_params(axis='x', which='both', labelsize=8, rotation=90)

fig.tight_layout()
fig.savefig(outdir / 'tad_median_lengths.pdf')

# TAD similarities

Jaccard index: $\frac{|\text{range intersection}|}{|\text{range union}|}$