In [None]:
import os

import pandas as pd
import pyranges as pr

from tqdm.auto import tqdm

# Parameters

## Input files

In [None]:
# Pipeline result table that the SNP records are read from.
fname = '../pipeline_run/results/final_data.csv.gz'

# TAD files belonging to the selected Hi-C data set.
# The filter has to look at each directory entry's own name: testing `fname`
# (the CSV path above) can never match, which leaves the list empty on a
# fresh kernel.
tad_fname_list = [
    entry.path
    for entry in os.scandir('../pipeline_run/tads/data/')
    if 'Rao2014-IMR90-MboI-allreps-filtered-10kb' in entry.name
]

## Search configuration

In [None]:
# Sliding-window geometry: every window covers `region_length` positions and
# successive windows start `step_size` positions apart.
region_length = 1e6
step_size = 1e5

# A window must overlap at least `min_tad_number` TADs and show at least
# `gap_number` gaps between consecutive TADs.
min_tad_number = 5
gap_number = 1

# Minimum numbers of cancer / non-cancer boundary SNPs a window must contain.
min_border_cancer_snp_count = 4
min_border_noncancer_snp_count = 2

# Read data

## SNP data

In [None]:
# Load the full result table; `associated_genes` is read as text so pandas
# does not have to infer a type for that column.
column_types = {'associated_genes': str}
df_all = pd.read_csv(fname, dtype=column_types)
df_all.head()

In [None]:
%%time
# classify SNPs
def classify(x):
    """Return the most frequent `is_cancer` value among the rows of `x`.

    `x` is a DataFrame with an `is_cancer` column (here: the rows of one SNP).
    """
    label_counts = x['is_cancer'].value_counts()
    return label_counts.idxmax()


# Each distinct (diseaseId, snpId, is_cancer) combination is counted once;
# rows with a missing value in any of the three columns are ignored.
snp_disease_labels = (
    df_all[['diseaseId', 'snpId', 'is_cancer']].drop_duplicates().dropna()
)

# Most frequent label per SNP, stored as {snpId: is_cancer}.
snp_cancer_map = snp_disease_labels.groupby('snpId').apply(classify).to_dict()

# Peek at the first two entries.
list(snp_cancer_map.items())[:2]

In [None]:
%%time
# Value of the "<tad_source>_<window_size>" key for the TAD call set used here.
selected_idx = 'Rao2014-IMR90-MboI-allreps-filtered-10kb_10'

snp_columns = [
    'chromosome_hg19',
    'position_hg19',
    'snpId',
    'TAD_20in',
    'tad_source',
    'window_size',
]

# One row per distinct SNP / TAD-annotation combination, reshaped into the
# Chromosome / Start / End layout that PyRanges expects. Each SNP becomes an
# interval of length one.
snps_all = (
    df_all[snp_columns]
    .drop_duplicates()
    .dropna()
    .rename(columns={'chromosome_hg19': 'Chromosome', 'position_hg19': 'Start'})
    .assign(
        Chromosome=lambda df: 'chr' + df['Chromosome'].astype(str),
        End=lambda df: df['Start'] + 1,
        idx=lambda df: df['tad_source'] + '_' + df['window_size'].astype(str),
    )
    .drop(columns=['tad_source', 'window_size'])
)

# Keep only boundary SNPs of the selected TAD call set. Filtering before the
# label lookup avoids classifying rows that are discarded anyway, and the
# explicit copy means the new column is written to an independent frame
# (assigning onto a filtered slice raises SettingWithCopyWarning).
is_selected = (snps_all['TAD_20in'] == 'boundary') & (
    snps_all['idx'] == selected_idx
)
snps_boundary = snps_all.loc[is_selected].copy()

# Plain dict indexing is deliberate: a SNP without a label raises KeyError
# instead of silently producing a missing value.
snps_boundary['is_cancer'] = snps_boundary['snpId'].apply(
    lambda snp_id: snp_cancer_map[snp_id]
)

df_snps = pr.PyRanges(snps_boundary)
df_snps

## TAD data

In [None]:
tad_frames = []
# The loop variable must not be called `fname`: that name holds the path of
# the SNP table, and overwriting it breaks re-running the cell that reads it.
for tad_fname in tqdm(tad_fname_list):
    # File names are expected to consist of four dot-separated parts; the two
    # middle parts are the TAD source and the window size.
    _, tad_source, window_size, _ = os.path.basename(tad_fname).split('.')

    tad_frame = pd.read_csv(tad_fname)
    tad_frame['idx'] = f'{tad_source}_{window_size}'

    tad_frames.append(tad_frame)

# pd.concat raises an uninformative ValueError on an empty list; fail with a
# message that points at the actual cause instead.
if not tad_frames:
    raise ValueError('tad_fname_list is empty: no TAD files to read')

# Rename to the column names PyRanges expects.
tads_all = pd.concat(tad_frames).rename(
    columns={'chrname': 'Chromosome', 'tad_start': 'Start', 'tad_stop': 'End'},
)

# Keep only the TAD call set that is also used for the SNP table.
tads_selected = tads_all[
    tads_all['idx'] == 'Rao2014-IMR90-MboI-allreps-filtered-10kb_10'
]

df_tads = pr.PyRanges(tads_selected)
df_tads

# Statistics

In [None]:
# Number of cancer-labelled boundary SNPs on each chromosome.
cancer_snps = df_snps.df.query('is_cancer')
cancer_snps.groupby('Chromosome')['snpId'].count()

# Conduct search

In [None]:
# Chromosome to scan; used to subset both the TAD and the SNP ranges and to
# build each search window.
chrom = 'chr2'

In [None]:
# Restrict both range sets to the chosen chromosome. A coordinate slice can be
# added to narrow the scan, e.g. df_tads[chrom, 25_000_000:35_000_000].
sub_tads, sub_snps = df_tads[chrom], df_snps[chrom]

In [None]:
# Slide a window of `region_length` along the chromosome in steps of
# `step_size` and collect every window that satisfies the TAD / SNP criteria.
position = 0
max_range = sub_tads.df['End'].max()

# One (window start, overlapping TADs, overlapping SNPs) tuple per hit.
matches = []

# The context manager closes the progress bar even if an iteration raises.
with tqdm(total=max_range - position) as pbar:
    while True:
        pbar.update(step_size)

        # Window starting at `position` and spanning `region_length`.
        df_region = pr.PyRanges(
            pd.DataFrame(
                {
                    'Chromosome': [chrom],
                    'Start': [position],
                    'End': [position + region_length],
                }
            )
        )

        tad_list = sub_tads.overlap(df_region)
        snp_list = sub_snps.overlap(df_region)

        # A window without any TAD or any SNP cannot match; skip the counting
        # and fall through to the shared advance step below.
        if not (tad_list.empty or snp_list.empty):
            # count occurrences
            tad_count = len(tad_list)

            # A gap is a TAD that starts after the previous TAD has ended.
            # Sorting by start (stable, so already ordered input is untouched)
            # makes "previous" well defined regardless of row order.
            tad_frame = tad_list.df.sort_values('Start', kind='stable')
            gap_count = (
                (tad_frame['Start'] - tad_frame['End'].shift(1)).iloc[1:] > 0
            ).sum()

            snp_is_cancer = snp_list.df['is_cancer']
            cancer_snp_count = snp_is_cancer.sum()
            noncancer_snp_count = (~snp_is_cancer).sum()

            # check criteria
            if (
                tad_count >= min_tad_number
                and gap_count >= gap_number
                and cancer_snp_count >= min_border_cancer_snp_count
                and noncancer_snp_count >= min_border_noncancer_snp_count
                and cancer_snp_count > noncancer_snp_count
            ):
                print(
                    position,
                    tad_count,
                    gap_count,
                    cancer_snp_count,
                    noncancer_snp_count,
                )
                matches.append((position, tad_list, snp_list))

        # prepare for next iteration (single exit point for both cases)
        position += step_size
        if position >= max_range:
            break