# Goal

Investigate whether cancer-associated TAD borders fall into genomic regions that differ strongly between the hg19 and hg38 assemblies.

In [None]:
import os

import numpy as np
import pandas as pd

import pyranges as pr

import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.auto import tqdm

In [None]:
# Apply seaborn's 'talk' plotting context to all figures drawn below.
sns.set_context(context='talk')

# Load data

## SNP data

In [None]:
# Directory holding the pipeline result files that the cells below load.
source_dir = 'MY_RUN/agg_both/pipeline_run/results/'

In [None]:
# Load the pipeline's 'final' result table for the hg38 TAD coordinates.
df_final_hg38 = pd.read_csv(
    os.path.join(
        source_dir,
        'final.do_further_investigations=False,input_files+tad_coordinates=data_newleopoldtads_dixon_ES_40k_hg38_10.csv.csv',
    )
)
df_final_hg38.head(5)

In [None]:
# Select the cancer-flagged rows whose TAD_type is '20in'
# (presumably SNPs inside the 20 kb border window -- TODO confirm).
cancer_border_mask = (df_final_hg38['TAD_type'] == '20in') & df_final_hg38['is_cancer']

# Keep only the columns needed for an interval table and give them the
# column names PyRanges expects.
sub = df_final_hg38.loc[cancer_border_mask, ['snpId', 'chromosome', 'position']]
sub = sub.rename(columns={'chromosome': 'Chromosome', 'position': 'Start'})

# Represent every SNP as a 1 bp interval [Start, Start + 1).
# NOTE(review): this treats 'position' as the interval start as-is; if the
# positions are 1-based the interval is shifted by one base -- confirm.
sub = sub.assign(End=sub['Start'] + 1)

# One interval per SNP, even if the SNP occurs in several rows.
sub = sub.drop_duplicates('snpId')

gr_snps = pr.PyRanges(sub)
gr_snps

## TADs

In [None]:
# Load the tab-separated hg38 TAD table written by the pipeline.
df_tads = pd.read_csv(
    os.path.join(
        source_dir,
        'tads_hg38.do_further_investigations=False,input_files+tad_coordinates=data_newleopoldtads_dixon_ES_40k_hg38_10.csv.tsv',
    ),
    sep='\t',
)
df_tads.head(5)

In [None]:
# Width (in bp) of the window taken from each end of a TAD as its border.
border_size = 20_000

# Expose the original row number of every TAD as an explicit 'tad_idx' column.
tmp = (df_tads.reset_index()
              .rename(columns={'index': 'tad_idx'}))

# Build both border windows for all TADs at once instead of looping over rows.
# Besides being faster, this avoids the stray 'Index' column that
# `itertuples()` + `_asdict()` would add to every record.
left_borders = tmp.assign(
    Start=tmp['tad_start'],
    End=tmp['tad_start'] + border_size,
    border_side='left',
)
right_borders = tmp.assign(
    Start=tmp['tad_stop'] - border_size,
    End=tmp['tad_stop'],
    border_side='right',
)

df_tad_borders = (
    pd.concat([left_borders, right_borders], ignore_index=True)
      # Stable sort: rows stay grouped per TAD with 'left' before 'right'.
      .sort_values('tad_idx', kind='mergesort')
      .reset_index(drop=True)
      .rename(columns={'chrname': 'Chromosome'})
)

# Strip a leading 'chr' only where it is actually present; slicing with
# `.str[3:]` would silently mangle names that lack the prefix.
df_tad_borders['Chromosome'] = df_tad_borders['Chromosome'].str.replace('^chr', '', regex=True)

# Unique border identifier: '<tad_idx>_<left|right>'.
df_tad_borders['border_idx'] = df_tad_borders['tad_idx'].map(str) + '_' + df_tad_borders['border_side']
df_tad_borders.head()

In [None]:
# Wrap the border table (Chromosome / Start / End plus metadata columns)
# in a PyRanges object so it can be used in interval joins.
gr_borders = pr.PyRanges(df_tad_borders)
gr_borders

## Genome differences

In [None]:
# Load the headerless, tab-separated hg38ContigDiff table; the column names
# are supplied here (presumably the UCSC track schema -- TODO confirm).
df_contigs = pd.read_csv(
    'data/hg38ContigDiff.txt',
    sep='\t',
    header=None,
    names=[
        'bin', 'chrom', 'chromStart', 'chromEnd', 'name',
        'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb',
    ],
)

# Use the column names PyRanges expects for interval tables.
df_contigs = df_contigs.rename(columns={
    'chrom': 'Chromosome',
    'chromStart': 'Start',
    'chromEnd': 'End',
    'strand': 'Strand',
})

# Strip a leading 'chr' only where it is actually present; slicing with
# `.str[3:]` would silently mangle names that lack the prefix.
df_contigs['Chromosome'] = df_contigs['Chromosome'].str.replace('^chr', '', regex=True)

df_contigs.head()

In [None]:
# Interval view of the hg38ContigDiff table. The table has a 'Strand' column;
# the joins further down pass strandedness=False, so strand is not used for
# matching there (the trailing `.unstrand()` call is left disabled).
gr_contigs = pr.PyRanges(df_contigs)#.unstrand()
# Disabled filter: would keep only the records whose 'score' equals 0.
#gr_contigs = gr_contigs.subset(lambda df: df['score'] == 0)
gr_contigs

# Find SNP/TAD-Border overlaps

In [None]:
# Pair every TAD border with the SNP intervals it overlaps; clashing SNP
# columns get the '_snp' suffix and strand is not considered.
overlap = gr_borders.join(
    gr_snps,
    strandedness=False,
    suffix='_snp',
)
overlap.df.head(5)

In [None]:
# Number of overlapping SNPs per border, largest count first.
snps_per_border = overlap.df.groupby('border_idx')['snpId'].count()
overlap_counts = snps_per_border.sort_values(ascending=False)
overlap_counts.head(10)

In [None]:
# Use every border that appears in the SNP overlap table as the "selected"
# set. The trailing comment keeps the alternative of restricting the set to
# the five borders with the most SNPs.
selected_tad_borders = overlap_counts.index  # overlap_counts.nlargest(5).index
selected_tad_borders

In [None]:
# Restrict the border intervals to the selected (SNP-overlapping) borders.
gr_selected_borders = gr_borders.subset(
    lambda border_df: border_df['border_idx'].isin(selected_tad_borders)
)
gr_selected_borders

# Find TAD-Border/hg38DiffContig overlaps

## True

In [None]:
# Pair every hg38ContigDiff interval with the selected borders it overlaps;
# clashing border columns get the '_tadborder' suffix, strand is ignored.
gr_overlap = gr_contigs.join(
    gr_selected_borders,
    strandedness=False,
    suffix='_tadborder',
)
gr_overlap

## Random

In [None]:
# Pool of candidate control borders: every border that is not in the selected
# set. The pool is sorted because the iteration order of a set of strings
# differs between Python processes, which would make the order of this list
# (and anything sampled from it by position) irreproducible.
nonselected_borders = sorted(
    set(df_tad_borders['border_idx']) - set(selected_tad_borders)
)
len(nonselected_borders)

In [None]:
# Draw a control group with as many borders as the selected set.
# `np.random.choice` samples WITH replacement by default, which can put the
# same border into the control group several times. Sample without
# replacement whenever the pool is large enough; only fall back to sampling
# with replacement when the pool is smaller than the requested size.
n_control = len(selected_tad_borders)
random_tad_borders = np.random.choice(
    nonselected_borders,
    size=n_control,
    replace=len(nonselected_borders) < n_control,
)
random_tad_borders

In [None]:
# Restrict the border intervals to the randomly drawn control borders.
gr_random_borders = gr_borders.subset(
    lambda border_df: border_df['border_idx'].isin(random_tad_borders)
)
gr_random_borders

In [None]:
# Same join as for the selected borders, but against the random control set.
gr_overlap_random = gr_contigs.join(
    gr_random_borders,
    strandedness=False,
    suffix='_tadborder',
)
gr_overlap_random

# Merge counts

In [None]:
# Number of overlapping hg38ContigDiff records per selected border.
# Borders without any overlap are missing from the join result, so the counts
# are re-aligned to the full selected set and the gaps filled with 0.
selected_counts = (
    gr_overlap.df
              .groupby('border_idx')['name']
              .count()
              .reindex(selected_tad_borders)
              .fillna(0)
)

In [None]:
# Number of overlapping hg38ContigDiff records per control border.
# Borders without any overlap are missing from the join result, so the counts
# are re-aligned to the drawn control borders and the gaps filled with 0.
random_counts = (
    gr_overlap_random.df
                     .groupby('border_idx')['name']
                     .count()
                     .reindex(random_tad_borders)
                     .fillna(0)
)

In [None]:
# Put both count vectors side by side. The border labels are dropped so the
# two columns line up by position rather than by border id.
cancer_column = selected_counts.reset_index(drop=True)
random_column = random_counts.reset_index(drop=True)
df_data = pd.DataFrame({'cancer': cancer_column, 'random': random_column})

# Long format ('variable' = group name, 'value' = count) for seaborn.
df_long = pd.melt(df_data)
df_long.sample(5)

In [None]:
# Peek at the first rows of the wide (one column per group) table.
df_data.head(n=5)

# Plot comparison

In [None]:
# Compare the per-border overlap counts of the two groups: a violin plot for
# the distribution with the individual borders drawn on top in red.
plt.figure(figsize=(8, 6))
sns.violinplot(x='variable', y='value', data=df_long)
sns.swarmplot(x='variable', y='value', data=df_long, color='red')

plt.xlabel('TAD-Border selection')
plt.ylabel('hg38DiffContig overlap count')

plt.tight_layout()

# `savefig` does not create missing directories, so make sure the output
# folder exists before writing the figure.
output_path = 'images/hg38DiffContig_overlap.pdf'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
plt.savefig(output_path)