In [None]:
import os

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from pyliftover import LiftOver

from utils import load_config

In [None]:
# Apply seaborn's 'talk' plotting context to the figures created in this notebook.
sns.set_context(context='talk')

In [None]:
# Read the project configuration and pull out the output locations used below.
config = load_config()
output_dirs = config['output_dirs']

results_dir = output_dirs['results']  # destination of the converted TAD table
images_dir = output_dirs['images']    # destination of the statistics figure

# Load data

In [None]:
# The input is read without a header row; the column names are assigned here.
tad_columns = ['chrname', 'tad_start', 'tad_stop']
df = pd.read_csv(
    config['input_files']['tad_coordinates'], header=None, names=tad_columns)
df.head()

# Convert hgXX -> hg38 (liftover)

In [None]:
# Converter from the configured source assembly to hg38; used by convert_pos.
source_assembly = config['parameters']['source_genomiccoordinates_version']
lo = LiftOver(source_assembly, 'hg38')

In [None]:
def convert_pos(chr_, pos):
    """Lift a single position over to hg38 using the module-level ``lo``.

    Parameters
    ----------
    chr_ : str
        Chromosome name in the source assembly (e.g. ``'chr1'``).
    pos : int
        Position on ``chr_`` in the source assembly.

    Returns
    -------
    int or float
        The converted position, or ``np.nan`` when the position cannot be
        converted: the converter returns nothing for it (``None`` or an
        empty list), or the target lies on a different chromosome.

    Raises
    ------
    AssertionError
        If the converter reports more than one target position.
    """
    res = lo.convert_coordinate(chr_, pos)

    # pyliftover returns None for an unrecognised chromosome name and an
    # empty list for a position without a match; both mean "no conversion".
    if not res:
        return np.nan

    assert len(res) == 1, res
    r_chr, r_pos, _strand, _score = res[0]

    # A hit on another chromosome is treated as a failed conversion.
    if chr_ != r_chr:
        return np.nan

    return r_pos

In [None]:
# Case of no conversion: show a source row, then a lookup that yields no result.
# Only the last expression of a cell is displayed, so the row is printed explicitly.
print(df.loc[1226])
convert_pos('chr6', 61880166)

In [None]:
def _conv(row):
    """Convert one TAD row to hg38 and return it as a new Series.

    Both boundaries of ``row`` are passed through ``convert_pos``. If either
    one comes back null, both output boundaries are NaN; otherwise the smaller
    converted value becomes ``tad_start`` and the larger one ``tad_stop``.
    """
    chrom = row['chrname']
    lifted = [convert_pos(chrom, row[col]) for col in ('tad_start', 'tad_stop')]

    if any(pd.isnull(value) for value in lifted):
        new_start, new_stop = np.nan, np.nan
    else:
        # Re-order the pair so that start <= stop after conversion.
        new_start, new_stop = min(lifted), max(lifted)

    return pd.Series({
        'chrname': chrom,
        'tad_start': new_start,
        'tad_stop': new_stop
    })

# Convert every row, drop rows containing NaN (failed conversions), and cast
# both boundary columns to int.
df_conv = (
    df.apply(_conv, axis=1)
    .dropna()
    .astype({'tad_start': int, 'tad_stop': int})
)

df_conv.head()

## Check special cases

In [None]:
# coordinate order gets switched: print the converted value of two source positions
for source_pos in (144848643, 145048643):
    print(convert_pos('chr1', source_pos))

In [None]:
# vanishing tads: rows whose converted start and stop coincide (length 0)
tad_len = df_conv['tad_stop'].sub(df_conv['tad_start'])
empty_tads = df_conv.loc[tad_len.eq(0)]

print(empty_tads.shape)
empty_tads.head()

# Save result

In [None]:
# Shapes of the table before and after conversion, then a preview of the result.
for frame in (df, df_conv):
    print(frame.shape)
df_conv.head()

In [None]:
# Write the converted TADs as a tab-separated file without the index column.
# The path is built with os.path.join, matching how the figure path is built.
df_conv.to_csv(os.path.join(results_dir, 'tads_hg38.tsv'), sep='\t', index=False)

# Comparison statistics

In [None]:
# Summary statistics of the tad_start column, one row per table
# ('before' = input, 'after' = converted); the labels end up in an 'index' column.
start_summaries = [
    frame['tad_start'].describe().rename(label).to_frame().T
    for label, frame in (('before', df), ('after', df_conv))
]
df_comp = pd.concat(start_summaries).reset_index()
df_comp

In [None]:
# Long format: one row per (before/after label, statistic) pair.
shown_stats = ['count', 'mean', 'min', 'max']
df_comp_long = df_comp.melt(id_vars=['index'], value_vars=shown_stats)
df_comp_long.head()

In [None]:
def annotated_barplot(*args, **kwargs):
    """Draw a seaborn bar plot and write each bar's value above it.

    All arguments are forwarded unchanged to ``sns.barplot``. ``data`` must be
    passed by keyword. The annotated values are read from the column named by
    the ``y`` keyword, falling back to ``'value'`` when ``y`` is absent or is
    not a column name.

    Labels are placed at x = 0, 1, 2, ... in row order, so this assumes one
    row of ``data`` per bar, drawn in that order.

    Returns
    -------
    The axes object returned by ``sns.barplot``.

    Raises
    ------
    KeyError
        If ``data`` is not passed as a keyword argument.
    """
    ax = sns.barplot(*args, **kwargs)

    # Annotate the column that was actually plotted rather than a fixed name.
    y_col = kwargs.get('y')
    if not isinstance(y_col, str):
        y_col = 'value'

    for i, height in enumerate(kwargs['data'][y_col]):
        ax.annotate(
            f'{height:,.1f}', (i, height), xycoords='data',
            ha='center', xytext=(0, 3), textcoords='offset pixels')

    return ax

# One facet per statistic; axes are not shared between facets.
g = sns.FacetGrid(df_comp_long, col='variable', sharex=False, sharey=False, height=7)
g.map_dataframe(annotated_barplot, x='index', y='value')

# Name the source assembly from the configuration instead of hard-coding it;
# the raw string keeps the backslash of the mathtext arrow intact.
source_assembly = config['parameters']['source_genomiccoordinates_version']
plt.suptitle(
    'TAD-start coordinate statistics '
    rf'(before and after {source_assembly}$\Rightarrow$hg38 liftover)')
plt.subplots_adjust(top=.85)  # leave room above the facets for the title

g.savefig(os.path.join(images_dir, 'tad_conversion_stats.pdf'))