In [6]:
import pandas as pd
import re
import os
import glob
import snapatac2 as snap
from statsmodels.stats.multitest import multipletests

import seaborn as sns
import numpy as np
import utils
import subprocess

from pybedtools import BedTool
import matplotlib.pyplot as plt
from upsetplot import UpSet, from_contents
from matplotlib_venn import venn2
#import pyBigWig

In [7]:
def merge_region(region='PFC',condition="MC",folder='/data2st1/junyi/output/atac0627/dar/region_nt/',method='mementob',blacklist=['Doublet','NN',"Neuron"]):
    """Stack the per-cell-type result tables of one brain region / condition.

    Every CSV matching ``{folder}/{region}*{condition}_{method}.csv`` is read
    with its first column as the index, tagged with the cell-type name parsed
    from the file name, and concatenated into a single frame.

    Parameters
    ----------
    region : str
        File-name prefix; also written to the ``region`` column of the result.
    condition : str
        Condition token of the file name; also written to ``condition``.
    folder : str
        Directory that holds the CSV files.
    method : str
        Method token that ends the file name (before ``.csv``).
    blacklist : sequence of str
        A file is skipped when its *file name* contains any of these
        substrings. The default list is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        All non-empty tables stacked with a fresh integer index, plus the
        columns ``ctname``, ``region`` and ``condition``. When nothing
        matches, an empty frame carrying only ``region`` and ``condition``.
    """
    frames = []
    # glob returns paths in filesystem order; sorting makes the row order (and
    # therefore any later keep-first de-duplication) reproducible.
    for file in sorted(glob.glob(f'{folder}/{region}*{condition}_{method}.csv')):
        fname = os.path.basename(file)
        # Test the file name only: a blacklisted substring somewhere in the
        # directory path must not discard every file.
        if any(black in fname for black in blacklist):
            continue
        df_dar = pd.read_csv(file, index_col=0)
        if len(df_dar) == 0:
            continue
        # "<region>_<cell type ...>_<condition>_<method>.csv" -> "<cell type ...>"
        stem = fname.replace('.csv', '')
        df_dar['ctname'] = "_".join(stem.split('_')[1:-2])
        frames.append(df_dar)

    # Concatenate once instead of growing a frame inside the loop.
    if frames:
        df_result = pd.concat(frames, axis=0, ignore_index=True)
    else:
        df_result = pd.DataFrame()
    df_result['region'] = region
    df_result['condition'] = condition
    return df_result

def intersect_bed_files(bed_file1, bed_file2):
    """Compare two BED files and return the overlapping and remaining parts.

    Both arguments are handed to ``BedTool`` unchanged.

    Returns
    -------
    tuple
        ``(merged, non_overlapping)``: the ``intersect`` result of file 1
        against file 2 (run with ``wa=True, wb=True`` so the file-2 annotation
        columns are retained next to each file-1 record), followed by the
        ``subtract`` result of file 1 minus file 2.
    """
    query = BedTool(bed_file1)
    reference = BedTool(bed_file2)

    # Overlaps first, carrying the annotation columns of the second file.
    overlapping = query.intersect(reference, wa=True, wb=True)

    # Then whatever is left of the first file once the second is subtracted.
    leftover = query.subtract(reference)

    return overlapping, leftover

def intersect_bed_objects(bed_file1, bed_file2):
    """Same comparison as ``intersect_bed_files`` for already-built objects.

    Both arguments must provide ``intersect`` and ``subtract`` methods
    (presumably ``BedTool`` instances); they are used as passed in.

    Returns
    -------
    tuple
        ``(merged, non_overlapping)``: ``bed_file1.intersect(bed_file2,
        wa=True, wb=True)`` followed by ``bed_file1.subtract(bed_file2)``.
    """
    # Overlaps, keeping the annotation columns of the second object.
    overlapping = bed_file1.intersect(bed_file2, wa=True, wb=True)

    # Remainder of the first object after subtracting the second.
    leftover = bed_file1.subtract(bed_file2)

    return overlapping, leftover

def get_proity_region(df_in, priority = [
    "promoter",
    "UTR",
    "exon",
    "intron",
    "intergenic"   ]

):
    """Collapse per-category flag columns into one ``annotation`` label.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Must contain an ``id`` column and one column per entry of
        ``priority``; a row belongs to a category where that column equals
        ``True``. The frame is not modified.
    priority : list of str
        Category columns ordered from highest to lowest priority. The last
        entry doubles as the fallback label for rows with no flag set.
        The default list is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the ``priority`` columns, ``id`` and ``annotation``,
        where ``annotation`` is the highest-priority category flagged for
        the row.
    """
    # Select on an explicit copy so the caller's frame stays untouched and the
    # assignments below never land on a view.
    df_out = df_in.loc[:, priority + ['id']].copy()

    # The fallback label has to be set AFTER the column selection; setting it
    # before would drop it again and leave unflagged rows as NaN.
    df_out['annotation'] = priority[-1]

    # Walk from lowest to highest priority so the most important flag is
    # written last and wins.
    for column in reversed(priority):
        df_out.loc[df_out[column].eq(True), 'annotation'] = column
    return df_out


def is_promoter(row, interval_col="names", upstream=2000):
    """Tell whether a peak interval overlaps the promoter window of a gene.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``row[interval_col]`` formatted as ``"<chrom>:<start>-<end>"``
        together with ``strand``, ``gstart`` and ``gend``.
    interval_col : str
        Key holding the peak interval string.
    upstream : int
        Width of the promoter window in bases (default 2000).

    Returns
    -------
    bool
        ``True`` when the closed interval ``[start, end]`` of the peak
        intersects the promoter window, which is
        ``[gstart - upstream, gstart]`` for strand ``"+"`` and
        ``[gend, gend + upstream]`` for every other strand value.

    Notes
    -----
    Only coordinates are compared. The chromosome part of the interval string
    is ignored, so the caller must pair peaks and genes on the same chromosome.
    """
    # Parse the peak interval; split on the LAST ':' so chromosome names that
    # themselves contain ':' still parse.
    pos_part = row[interval_col].rpartition(':')[2]
    start1, end1 = map(int, pos_part.split('-'))

    if row['strand'] == "+":
        # Forward strand: the window lies immediately before the gene start.
        start2 = row['gstart'] - upstream
        end2 = row['gstart']
    else:
        # Any other strand value is handled as reverse strand: the window
        # lies immediately after the gene end.
        start2 = row['gend']
        end2 = row['gend'] + upstream

    # Two closed intervals overlap unless one ends before the other starts.
    return not (end1 < start2 or end2 < start1)

In [8]:
# DMR summary table; judging by its path it holds annotated DMR segments
# from the Nanopore differential analysis (TODO confirm).
dmr_summary_csv = '/data1st2/hannan_25/data/Nanopore_processV1/nanopore_08_differential/summary/dmr_seg_anno_2tools_nofilter.csv'
df_meta_dmr = pd.read_csv(dmr_summary_csv)

In [9]:
method = 'mementob'
folder = '/data2st1/junyi/output/atac0627/darmr/All'

# One merged table per (brain region, condition); insertion order is
# PFC/MC, PFC/MW, HIP/MC, HIP/MW, AMY/MC, AMY/MW.
dar_by_group = {
    (brain_region, cond): merge_region(region=brain_region, condition=cond, folder=folder, method=method)
    for brain_region in ('PFC', 'HIP', 'AMY')
    for cond in ('MC', 'MW')
}

# Keep the individual frames available under their original names.
df_PFC_MC = dar_by_group[('PFC', 'MC')]
df_PFC_MW = dar_by_group[('PFC', 'MW')]
df_HIP_MC = dar_by_group[('HIP', 'MC')]
df_HIP_MW = dar_by_group[('HIP', 'MW')]
df_AMY_MC = dar_by_group[('AMY', 'MC')]
df_AMY_MW = dar_by_group[('AMY', 'MW')]

# Stack everything into a single table with a fresh integer index.
df_DAR_ALL = pd.concat(list(dar_by_group.values()), axis=0, ignore_index=True)

In [15]:
# Keep one row per (gene, region, condition); the first occurrence wins.
# NOTE(review): 'ctname' is not part of the key, so rows for the same 'gene'
# value from other cell types are dropped here, before the p-value filter.
# Confirm that this is intended.
dedup_keys = ['gene', 'region', 'condition']
df_DAR_ALL = df_DAR_ALL.drop_duplicates(subset=dedup_keys, keep='first')

In [17]:
# Rows whose 'de_pval' is below 0.05.
# NOTE(review): no multiple-testing correction is applied in the cells shown
# here. Confirm whether 'de_pval' is already adjusted.
is_significant = df_DAR_ALL['de_pval'].lt(0.05)
df_DAR_sign = df_DAR_ALL[is_significant]

In [52]:
# Derive lookup columns from the DMR table's own identifiers.
comparison_label = df_meta_dmr['comparision']  # column name as spelled in the table
df_meta_dmr['brainregion'] = comparison_label.str.split('-').str[-1]  # text after the last '-'
df_meta_dmr['gender'] = comparison_label.str[0]  # first character of the label

# Second '_'-separated token of the DMR id, prefixed with "chr".
dmr_tokens = df_meta_dmr['dmr'].str.split('_')
df_meta_dmr['region'] = "chr" + dmr_tokens.str[1]

In [83]:
# Find the regions that occur only in the female ('F') DMR rows, then drop
# the significant DAR rows whose 'gene' value is one of those regions.
df_meta_dmr_f = df_meta_dmr.loc[df_meta_dmr['gender'].eq('F')]
df_meta_dmr_m = df_meta_dmr.loc[df_meta_dmr['gender'].eq('M')]

region_f_only = set(df_meta_dmr_f['region']) - set(df_meta_dmr_m['region'])

in_female_only = df_DAR_sign['gene'].isin(region_f_only)
df_DAR_sign_m = df_DAR_sign[~in_female_only]

In [104]:
# Sanity check: number of distinct 'gene' values left in df_DAR_sign_m.
df_DAR_sign_m['gene'].nunique()

34858

In [107]:
# Sanity check: number of distinct 'region' values among the 'M' DMR rows.
df_meta_dmr_m['region'].nunique()

31838

In [98]:
# Inner join: the DAR 'gene' is matched to the DMR 'region', and the DAR
# 'region' to the DMR 'brainregion'. Columns present on both sides receive
# the '_dar' / '_dmr' suffixes.
dar_keys = ['gene', 'region']
dmr_keys = ['region', 'brainregion']
df_merge_dardmr = pd.merge(
    df_DAR_sign_m,
    df_meta_dmr_m,
    how='inner',
    left_on=dar_keys,
    right_on=dmr_keys,
    suffixes=('_dar', '_dmr'),
)

In [109]:
# Persist the joined DAR/DMR table without the integer index.
merged_csv_path = '/data2st1/junyi/output/atac0627/darmr/All/merge_dardmr_mementob.csv'
df_merge_dardmr.to_csv(merged_csv_path, index=False)