In [45]:
import pandas as pd
import glob
import os
from collections import defaultdict

In [46]:
# Directory that is searched for the overlap tables: the notebook's working directory.
base_path = os.getcwd()

# Bin-size labels exactly as they appear inside the overlap file names.
binsizes = [
    '10kb',
    '50kb',
    '500kb',
    '1000kb',
]

# Filled by the loading step: (sample, binsize) -> set of (chr, event, gene_name) tuples.
sample_bins = dict()

# File-name prefix -> sample label; prefixes not listed here are used unchanged.
sample_id_map = {
    '4408_RDV': 'RDV',
    '4658_MM': 'MMS',
}

In [47]:
# Read every '*_<binsize>_overlaps.tsv' file and store, per (sample, binsize),
# the set of unique (chr, event, gene_name) triples found in it.
for binsize in binsizes:
    # glob.escape keeps glob metacharacters in the directory name ('[', '*', '?') literal,
    # so only the file-name part of the pattern is treated as a wildcard.
    pattern = os.path.join(glob.escape(base_path), f'*_{binsize}_overlaps.tsv')
    # Sorted so the processing order does not depend on the filesystem listing order.
    files = sorted(glob.glob(pattern))

    for file in files:
        filename = os.path.basename(file)
        # extracting sample ID: everything before '_<binsize>_' in the file name
        sample_prefix = filename.split(f'_{binsize}_')[0]
        sample = sample_id_map.get(sample_prefix, sample_prefix)

        # Columns 0, 3 and 7 (chr, event, gene_name) are forced to str. Without this,
        # pandas infers the dtype per file, so a chromosome column can be parsed as
        # int in one file and str in another; the tuples (1, ...) and ('1', ...) then
        # never match when the sets are intersected across bin sizes.
        df = pd.read_csv(file, sep='\t', header=None, dtype={0: str, 3: str, 7: str})
        # Assigning the names afterwards still fails loudly if the column count is not 8.
        df.columns = ['chr', 'start', 'end', 'event', 'gene_chr', 'gene_start', 'gene_end', 'gene_name']

        grouped = df[['chr', 'event', 'gene_name']].drop_duplicates()

        key = (sample, binsize)
        # name=None yields plain tuples, which are hashable and comparable across files.
        sample_bins[key] = set(grouped.itertuples(index=False, name=None))


In [48]:
# Consistent overlaps: (chr, event, gene) triples that are present at every bin size.

consistent_results = defaultdict(list)

samples = set(s for s, _ in sample_bins.keys())

# Iterate in sorted order: set iteration order of strings changes between kernel
# restarts (hash randomisation), which would otherwise reorder the results.
for sample in sorted(samples):
    # Fail with a readable message (same exception type as a plain dict lookup)
    # when a sample has no data for one of the bin sizes.
    missing = [b for b in binsizes if (sample, b) not in sample_bins]
    if missing:
        raise KeyError(
            f"sample {sample!r} has no overlap data for bin size(s): {', '.join(missing)}"
        )

    # genes from the first binsize
    common = sample_bins[(sample, binsizes[0])]
    # narrow down with every other bin size
    for binsize in binsizes[1:]:
        common = common.intersection(sample_bins[(sample, binsize)])

    # save; sorted on the string form of each field so the record order is
    # reproducible even if a field holds mixed types
    for chr_, event, gene in sorted(common, key=lambda triple: tuple(map(str, triple))):
        consistent_results[sample].append({
            'chr': chr_,
            'event': event,
            'gene_name': gene
        })

In [49]:
# Write one table per sample into the current working directory.
# NOTE(review): the file is tab-separated although its name ends in '.csv';
# the name is kept as-is because other tooling may depend on it.
for sample in consistent_results:
    records = consistent_results[sample]
    outname = f"{sample}_consistent_chr_event_gene_across_bins.csv"
    table = pd.DataFrame(records)
    table.to_csv(outname, sep='\t', index=False)