### Intra-dataset pairs


In [None]:
import warnings
warnings.filterwarnings("ignore")
import scanpy as sc
import pandas as pd
import numpy as np


def make_anndata(adata, chrom, start, end, path):
    """Annotate peaks with coordinates, filter them, and save the result.

    Args:
        adata: AnnData object whose variables are peaks. It is modified in
            place: ``chr``, ``start`` and ``end`` columns are added to
            ``adata.var`` and the scanpy filter calls below are applied to it.
        chrom: Per-peak chromosome names, stored in ``adata.var['chr']``.
        start: Per-peak start coordinates, stored in ``adata.var['start']``.
        end: Per-peak end coordinates, stored in ``adata.var['end']``.
        path: Destination file passed to ``adata.write``.

    Returns:
        The filtered subset of ``adata`` (the same object that was written
        to ``path``).
    """
    # Attach the genomic coordinates to every peak.
    for column, values in (("chr", chrom), ("start", start), ("end", end)):
        adata.var[column] = values

    # Basic stats: both calls use a threshold of 0; ``adata.var['n_cells']``
    # is read right below.
    sc.pp.filter_cells(adata, min_genes=0)
    sc.pp.filter_genes(adata, min_cells=0)

    # Keep peaks accessible in more than int(1% of the number of cells) cells.
    min_cells = int(adata.shape[0] * 0.01)
    accessible = adata.var['n_cells'] > min_cells
    adata = adata[:, accessible]

    # Keep peaks located on chr1..chr22, chrX or chrY.
    allowed_chroms = [f"chr{n}" for n in range(1, 23)]
    allowed_chroms.extend(["chrX", "chrY"])
    on_allowed_chrom = adata.var['chr'].isin(allowed_chroms)
    adata = adata[:, on_allowed_chrom]

    print(adata)
    adata.write(path)
    return adata


# Load the two datasets of the pair (change the paths as needed).
adata_reference = sc.read_h5ad("/home/daozhang/scDIFF/dataPreprocessing/Temp/BoneMarrowA.h5ad")
adata_query = sc.read_h5ad("/home/daozhang/scDIFF/dataPreprocessing/Temp/BoneMarrowB.h5ad")

# Combine the reference and the query into a single AnnData object.
concat_adata = adata_reference.concatenate(adata_query)
print(concat_adata)

# Peak coordinates are taken from the var annotation of the combined object;
# the filtered result is written to the output path given below.
peak_info = concat_adata.var
concat_adata = make_anndata(
    concat_adata,
    peak_info["chrom"],
    peak_info["chromStart"],
    peak_info["chromEnd"],
    "/home/daozhang/scDIFF/dataPreprocessing/Temp/BoneMarrowA_BoneMarrowB.h5ad",
)

### Cross-platform dataset pairs

In [None]:
# load data (change as needed)
adata_reference = sc.read_h5ad("/home/daozhang/scDIFF/dataPreprocessing/Temp/MosP1.h5ad")
adata_query = sc.read_h5ad("/home/daozhang/scDIFF/dataPreprocessing/Temp/Cerebellum.h5ad")

# mapping of genomes if needed
df_query = {
    "chrom": adata_query.var['chrom'].values,
    "start": adata_query.var["chromStart"].values,
    "end": adata_query.var["chromEnd"].values,
    "peaks": adata_query.var_names.values,
}
df_query = pd.DataFrame(df_query)
df_query.to_csv("preprocess_data/query.bed", header=None, index=None, sep="\t")

!CrossMap.py bed preprocess_data/mm9ToMm10.over.chain.gz preprocess_data/query.bed preprocess_data/query_crossmap.bed

df_query = pd.read_csv("preprocess_data/query_crossmap.bed", header=None, sep="\t")

seq_len = 1344

query_start_list = []
query_end_list = []
for start, end in zip(df_query[1], df_query[2]):
    mid = (start + end) // 2
    new_start = max(start, mid - seq_len // 2)
    new_end = min(end, mid + seq_len // 2)
    query_start_list.append(new_start)
    query_end_list.append(new_end)

df_query_new = {
    "chrom": df_query[0],
    "start": query_start_list,
    "end": query_end_list,
    "peaks": df_query[3],
}
df_query_new = pd.DataFrame(df_query_new)
df_query_new.to_csv("preprocess_data/query_new.bed", header=None, index=None, sep="\t")

reference_start_list = []
reference_end_list = []
for start, end in zip(adata_reference.var["chromStart"].values, adata_reference.var["chromEnd"].values):
    start, end = int(start), int(end)
    mid = (start + end) // 2
    new_start = max(start, mid - seq_len // 2)
    new_end = min(end, mid + seq_len // 2)
    reference_start_list.append(new_start)
    reference_end_list.append(new_end)

df_reference = {
    "chrom": adata_reference.var['chrom'].values,
    "start": reference_start_list,
    "end": reference_end_list,
    "peaks": adata_reference.var_names.values,
}
df_reference = pd.DataFrame(df_reference)
df_reference.to_csv("preprocess_data/reference.bed", header=None, index=None, sep="\t")

!bedtools intersect -a preprocess_data/query_new.bed -b preprocess_data/reference.bed -wo > preprocess_data/intersect.bed

df = pd.read_csv("preprocess_data/intersect.bed", header=None, sep="\t")
df = df[df[8] > seq_len // 2]

adata_query = adata_query[:, df[3]]
print(adata_query)

adata_reference = adata_reference[:, df[7]]
print(adata_reference)

adata_reference.var_names_make_unique()
adata_query.var_names = adata_reference.var_names
print(adata_query.var_names)

def make_anndata(adata, chr, start, end, path):
    """Store peak coordinates on ``adata``, filter its peaks, and write it out.

    This repeats the ``make_anndata`` definition from the earlier cell.

    Args:
        adata: AnnData object with peaks as variables; coordinate columns are
            added to ``adata.var`` in place.
        chr: Per-peak chromosome names (stored as ``adata.var['chr']``).
            NOTE: the name shadows the ``chr`` builtin inside this function;
            it is kept so existing calls keep working.
        start: Per-peak start coordinates (stored as ``adata.var['start']``).
        end: Per-peak end coordinates (stored as ``adata.var['end']``).
        path: Output file handed to ``adata.write``.

    Returns:
        The filtered subset of ``adata`` that was written to ``path``.
    """
    peak_table = adata.var
    peak_table['chr'] = chr
    peak_table['start'] = start
    peak_table['end'] = end

    # Called with thresholds of 0; ``n_cells`` in ``adata.var`` is used below.
    sc.pp.filter_cells(adata, min_genes=0)
    sc.pp.filter_genes(adata, min_cells=0)

    # A peak must be present in more than int(0.01 * number of cells) cells.
    n_obs = adata.shape[0]
    cutoff = int(n_obs * 0.01)
    adata = adata[:, adata.var['n_cells'] > cutoff]

    # Restrict to peaks on chr1..chr22, chrX and chrY.
    autosomes = ['chr' + str(number) for number in range(1, 23)]
    kept_chromosomes = autosomes + ['chrX', 'chrY']
    adata = adata[:, adata.var['chr'].isin(kept_chromosomes)]

    print(adata)
    adata.write(path)

    return adata

# Combine the matched reference and query datasets into one AnnData object.
adata = sc.AnnData.concatenate(adata_reference, adata_query)
print(adata)

peak = adata.var_names.values

# Peak names are expected to look like "<chrom>_<start>_<end>".
# var_names_make_unique() (called earlier in this cell) appends "-<n>" to
# repeated names, so the end field may read e.g. "200-1"; strip that suffix
# before converting, otherwise int() raises ValueError on such peaks.
peak_fields = [i.split("_") for i in peak]

# Annotate peaks with the parsed coordinates, filter them, and save the result.
adata = make_anndata(adata, 
             [fields[0] for fields in peak_fields],
             [int(fields[1]) for fields in peak_fields],
             [int(fields[2].split("-")[0]) for fields in peak_fields],
             "/home/daozhang/scDIFF/dataPreprocessing/Temp/MosP1_Cerebellum.h5ad")