In [None]:
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from utils import load_config

In [None]:
# Project settings returned by the project-local `utils.load_config` helper
# (presumably parsed from the pipeline's config file — loader not shown here).
config = load_config()

# Use seaborn's 'talk' plotting context for every figure in this notebook.
sns.set_context('talk')

# Load data

In [None]:
# Location of the TAD coordinate table, taken from the project config.
tad_coordinates_path = config['input_files']['tad_coordinates_hg18']

# The table is read without a header row, so the columns are named explicitly.
tad_columns = ['chrname', 'tad_start', 'tad_stop']
df_tads = pd.read_csv(tad_coordinates_path, header=None, names=tad_columns)

df_tads.head()

# Randomize TADs

## Compute additional TAD statistics

In [None]:
# Length of every TAD (stop minus start).
df_tads['tad_len'] = df_tads['tad_stop'].sub(df_tads['tad_start'])

# Attach start and chromosome of the TAD in the following row; the last row
# has no successor and therefore receives NaN in both columns.
for source_col, shifted_col in [('tad_start', 'next_tad_start'),
                                ('chrname', 'next_tad_chr')]:
    df_tads[shifted_col] = df_tads[source_col].shift(-1)

In [None]:
def func2next(row):
    """Return the gap between this TAD's stop and the next TAD's start.

    Parameters
    ----------
    row : object with attributes ``chrname``, ``next_tad_chr``,
        ``next_tad_start`` and ``tad_stop`` (e.g. one DataFrame row).

    Returns
    -------
    ``row.next_tad_start - row.tad_stop`` when the next TAD is on the same
    chromosome, otherwise ``np.nan`` (this includes a missing next TAD,
    because a NaN chromosome never compares equal).
    """
    same_chromosome = row.chrname == row.next_tad_chr
    return (row.next_tad_start - row.tad_stop) if same_chromosome else np.nan

# Row-wise evaluation: every row is handed to func2next as a Series.
df_tads['dist_to_next_tad'] = df_tads.apply(func2next, axis='columns')

In [None]:
# Remove invalid distances (e.g. between chromosomes).
# A row is invalid when any column is NaN: func2next yields NaN whenever the
# following row belongs to another chromosome or does not exist.
is_invalid = df_tads.isnull().any(axis=1)

# Show what is about to be removed. A bare expression in the middle of a cell
# is evaluated and thrown away, so the frame must be displayed explicitly.
display(df_tads[is_invalid])

# NOTE(review): this drops every TAD whose following row is on a different
# chromosome, i.e. the last TAD of each chromosome when the input is grouped
# by chromosome. The randomized set therefore contains one TAD per chromosome
# fewer than the input file — confirm that this is intended.
df_tads = df_tads.dropna()

In [None]:
# Preview the table after rows without a valid next-TAD distance were removed.
df_tads.head()

## Draw random TADs

In [None]:
# Fixed seed so that the shuffled TAD set (and every result derived from it)
# is identical after "Restart & Run All". Change it to draw another shuffle.
RANDOM_SEED = 42
rng = np.random.RandomState(RANDOM_SEED)

tad_counter = 1

data = []
for chr_, group in df_tads.groupby('chrname'):
    # Shuffle TAD lengths and inter-TAD separations independently within the
    # chromosome. Sharing one RandomState keeps successive draws different
    # from each other while the overall sequence stays deterministic.
    tad_len_distr = group['tad_len'].sample(frac=1, random_state=rng).tolist()
    tad_sep_distr = group['dist_to_next_tad'].sample(frac=1, random_state=rng).tolist()

    # every TAD of the chromosome needs exactly one length and one separation
    assert len(tad_len_distr) == len(tad_sep_distr) == group.shape[0], \
        (chr_, len(tad_len_distr), len(tad_sep_distr))

    # Lay the shuffled TADs out one after another, starting at the position
    # of the first original TAD on this chromosome.
    cur_pos = group['tad_start'].min()
    for cur_len, cur_sep in zip(tad_len_distr, tad_sep_distr):
        data.append({
            '#tad_id': tad_counter,
            'chrname': chr_,
            'tad_start': cur_pos,
            'tad_stop': cur_pos + cur_len
        })

        cur_pos += cur_len + cur_sep
        tad_counter += 1

# convert to dataframe (column order stated explicitly so that it does not
# depend on how the pandas version orders dict keys)
df_tads_rand = pd.DataFrame(
    data, columns=['#tad_id', 'chrname', 'tad_start', 'tad_stop'])

# The separations are floats (they come from a shifted column), which turns
# the running position into a float; convert the coordinates back to integers.
df_tads_rand['tad_start'] = pd.to_numeric(df_tads_rand['tad_start'], downcast='integer')
df_tads_rand['tad_stop'] = pd.to_numeric(df_tads_rand['tad_stop'], downcast='integer')

df_tads_rand.head()

In [None]:
# More validations: the shuffle must keep the number of TADs ...
n_original, n_randomized = df_tads.shape[0], df_tads_rand.shape[0]
assert n_original == n_randomized

# ... and the multiset of TAD lengths unchanged.
lengths_original = sorted(df_tads['tad_len'].tolist())
lengths_randomized = sorted(
    (df_tads_rand['tad_stop'] - df_tads_rand['tad_start']).tolist())
assert lengths_original == lengths_randomized

## Save result

In [None]:
# Write the randomized TADs as a tab-separated file with a header row and
# without the index column. The frame holds '#tad_id', 'chrname', 'tad_start'
# and 'tad_stop' at this point ('tad_len' is only added further below).
df_tads_rand.to_csv('data/tads_hg18_randomized.tsv', sep='\t', index=False)

# Compare to actual TADs

In [None]:
# Length of every randomized TAD; computed after saving, so the exported
# file does not contain this column.
df_tads_rand['tad_len'] = df_tads_rand['tad_stop'].sub(df_tads_rand['tad_start'])

## Histograms

In [None]:
# Length histograms of original and shuffled TADs, side by side.
# Plain matplotlib histograms replace the deprecated `sns.distplot`; shared
# axes and an identical bin range make the two panels directly comparable.
fig, (ax_org, ax_rand) = plt.subplots(
    1, 2, figsize=(13, 3), sharex=True, sharey=True)

len_range = (
    min(df_tads['tad_len'].min(), df_tads_rand['tad_len'].min()),
    max(df_tads['tad_len'].max(), df_tads_rand['tad_len'].max()),
)

for ax, lengths, title in [
        (ax_org, df_tads['tad_len'], 'Original TADs'),
        (ax_rand, df_tads_rand['tad_len'], 'Shuffled TADs')]:
    ax.hist(lengths.values, bins=50, range=len_range)
    ax.set_title(title)
    ax.set_xlabel('TAD length')

ax_org.set_ylabel('Number of TADs');

## Scatter plots

In [None]:
# The randomized frame is assembled chromosome by chromosome in groupby
# (sorted) order, whereas df_tads keeps the row order of the input file.
# Stable-sort both by chromosome so that the i-th original TAD is paired with
# the i-th randomized TAD of the *same* chromosome.
tads_org_sorted = df_tads.sort_values('chrname', kind='mergesort')
tads_rand_sorted = df_tads_rand.sort_values('chrname', kind='mergesort')
assert tads_org_sorted['chrname'].tolist() == tads_rand_sorted['chrname'].tolist()

plt.scatter(
    tads_org_sorted['tad_start'].values,
    tads_rand_sorted['tad_start'].values,
    s=10)

plt.xlabel('Original TAD-start position')
plt.ylabel('Randomized TAD-start position');

In [None]:
# Pair rows per chromosome: df_tads is in input-file order, df_tads_rand in
# groupby (sorted chromosome) order, so both are stable-sorted by chromosome
# before being compared position by position.
len_org_sorted = df_tads.sort_values('chrname', kind='mergesort')
len_rand_sorted = df_tads_rand.sort_values('chrname', kind='mergesort')
assert len_org_sorted['chrname'].tolist() == len_rand_sorted['chrname'].tolist()

plt.scatter(
    len_org_sorted['tad_len'].values,
    len_rand_sorted['tad_len'].values,
    s=50, alpha=.3)

plt.xlabel('Original TAD-length')
plt.ylabel('Randomized TAD-length');

# Compare final results

This part can only be run if the pipeline has been executed with
```yaml
# config.yaml
input_files:
  tad_coordinates_hg18: 'data/tads_hg18_randomized.tsv'
```
beforehand, and the obtained results were cached with
```bash
mv results/TAD_enrichment.csv results/TAD_enrichment_randomized.csv
```
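(afterwards, the pipeline has to be run once more with the original TAD coordinates so that `results/TAD_enrichment.csv` holds the non-randomized results again)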

## Read data

In [None]:
# Enrichment tables: first the run on the original TADs, then the cached run
# on the randomized TADs.
df_enr_org, df_enr_rand = map(pd.read_csv, [
    'results/TAD_enrichment.csv',
    'results/TAD_enrichment_randomized.csv',
])

In [None]:
# Preview both tables, original first and randomized second.
display(df_enr_org.head(), df_enr_rand.head())

## Scatter plots

In [None]:
TAD_border_types = df_enr_org['TAD_type'].unique().tolist()


def mylog(values):
    """Element-wise log10 that maps non-positive (and NaN) entries to NaN.

    Accepts any array-like and, unlike an `np.vectorize`d lambda without
    `otypes`, also works on empty input.
    """
    values = np.asarray(values, dtype=float)
    result = np.full(values.shape, np.nan)
    with np.errstate(invalid='ignore'):  # comparing NaN must not warn
        positive = values > 0
    np.log10(values, out=result, where=positive)
    return result


# Two rows of panels; round the column count *up* so that an odd number of
# types does not silently lose its last panel. `squeeze=False` keeps `axes`
# two-dimensional even for a single column.
n_rows = 2
n_cols = max(1, (len(TAD_border_types) + n_rows - 1) // n_rows)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(11, 8), squeeze=False)
axes_flat = axes.flatten()

for type_, ax in zip(TAD_border_types, axes_flat):
    sub_org = df_enr_org[df_enr_org['TAD_type'] == type_]
    sub_rand = df_enr_rand[df_enr_rand['TAD_type'] == type_]

    # Both tables must list the same (TAD_type, disease) pairs in the same
    # order, because the p-values are paired by position. The index is reset
    # first: comparing differently labelled frames with `==` raises a
    # ValueError instead of failing the assertion.
    keys_org = sub_org[['TAD_type', 'disease']].reset_index(drop=True)
    keys_rand = sub_rand[['TAD_type', 'disease']].reset_index(drop=True)
    assert keys_org.equals(keys_rand), \
        'original and randomized results differ in rows for %r' % (type_,)

    # p-values <= 0 become NaN and are therefore not drawn
    ax.scatter(
        -mylog(sub_org['pval_boundary']),
        -mylog(sub_rand['pval_boundary']),
        alpha=.8, s=10)
    ax.set_title(type_)
    ax.set_aspect('equal', 'datalim')
    ax.set_xlabel('original')
    ax.set_ylabel('randomized')

    # significance threshold p = 0.05 on both axes
    ax.axvline(x=-np.log10(.05), color='red', linestyle='dashed', linewidth=.5)
    ax.axhline(y=-np.log10(.05), color='red', linestyle='dashed', linewidth=.5)

# hide panels that are left over when the grid has more cells than types
for ax in axes_flat[len(TAD_border_types):]:
    ax.set_visible(False)

plt.suptitle('TAD boundary -log(p_values)')

plt.tight_layout()
plt.subplots_adjust(top=0.9)  # leave room for the suptitle
plt.savefig('images/original_vs_randomized_tad_boundary_pvalues.pdf')