In [None]:
# Notebook 3: Generate ATAC AnnData Matrix
# ATAC Processing: Build Cell × Peak Matrix from Fragments Filtered by RNA Barcodes

In [1]:

import os
import pandas as pd
import numpy as np
import anndata as ad
from scipy.sparse import lil_matrix, csr_matrix
import pyranges as pr


In [4]:

# Load the RNA AnnData (described upstream as QC-filtered and annotated) and
# collect its cell barcodes; only fragments carrying one of them are kept.
rna_adata = ad.read_h5ad("results/files/ag_rna_final_cleaned_annotated.h5ad")
filtered_barcodes = set(rna_adata.obs_names)

# Load ATAC fragments and filter to RNA barcodes.
# The file is streamed in chunks and each chunk is filtered immediately, so the
# unfiltered table is never held in memory all at once.
fragments_path = "data/10k_PBMC_Multiome_nextgem_Chromium_X_atac_fragments.tsv.gz"
fragment_chunks = pd.read_csv(
    fragments_path,
    sep="\t",
    header=None,
    comment="#",  # '#'-prefixed lines are ignored by the parser
    names=['chrom', 'start', 'end', 'barcode', 'count'],
    # Pin the string columns so every chunk is parsed with the same dtype.
    dtype={'chrom': str, 'barcode': str},
    chunksize=5_000_000,
)
# Chunks keep their global row positions as index, so the concatenated frame
# is indexed exactly like a filter applied to the fully loaded table.
fragments_df = pd.concat(
    [chunk[chunk['barcode'].isin(filtered_barcodes)] for chunk in fragment_chunks]
)

# Read the ATAC peak intervals from the BED file ('#'-prefixed lines are ignored)
peaks_path = "data/10k_PBMC_Multiome_nextgem_Chromium_X_atac_peaks.bed"
peaks_df = pd.read_csv(
    peaks_path,
    sep="\t",
    header=None,
    comment="#",
)
peaks_df.columns = ['chrom', 'start', 'end']

# Give every peak an ID of the form "peak_<n>", n being its zero-based row
# position in peaks_df
peaks_df['peak_id'] = [f"peak_{row_number}" for row_number in range(peaks_df.shape[0])]

# Intersect fragments with peaks using PyRanges.
# PyRanges identifies intervals by columns named Chromosome / Start / End, so
# both tables are renamed first (non-inplace, rebinding the same names).
pyranges_columns = {"chrom": "Chromosome", "start": "Start", "end": "End"}
fragments_df = fragments_df.rename(columns=pyranges_columns)
peaks_df = peaks_df.rename(columns=pyranges_columns)

fragments_pr = pr.PyRanges(fragments_df[["Chromosome", "Start", "End", "barcode"]])
peaks_pr = pr.PyRanges(peaks_df)

# Perform intersection: one output row per (fragment, overlapping peak) pair
overlap = fragments_pr.join(peaks_pr)

# Materialise the joined table a single time and take the fragment columns
# together with the matched peak_id, instead of building the frame twice.
overlap_df = overlap.df
fragments_mapped = overlap_df[["Start", "End", "Chromosome", "barcode", "peak_id"]].copy()

# Build the cell-by-peak count matrix (rows = barcodes, columns = peaks)
unique_barcodes = sorted(fragments_mapped['barcode'].unique())
barcode_to_index = {bc: i for i, bc in enumerate(unique_barcodes)}
peak_to_index = {pid: i for i, pid in enumerate(peaks_df['peak_id'])}

num_cells = len(unique_barcodes)
num_peaks = len(peaks_df)

# Translate every overlap row to its (row, column) position in one vectorised
# pass instead of a per-row Python loop.
cell_positions = fragments_mapped['barcode'].map(barcode_to_index).to_numpy(dtype=np.int64)
peak_positions = fragments_mapped['peak_id'].map(peak_to_index).to_numpy(dtype=np.int64)

# Each overlap row contributes a count of 1; building the CSR matrix from
# coordinate triples sums entries that share the same (cell, peak) position,
# which yields the same counts as incrementing element by element.
atac_matrix_csr = csr_matrix(
    (np.ones(len(fragments_mapped), dtype=np.int32), (cell_positions, peak_positions)),
    shape=(num_cells, num_peaks),
    dtype=np.int32,
)

# Row labels: the sorted barcodes; column labels: the peak IDs as strings
cell_index = pd.Index(unique_barcodes, name="barcode")
peak_index = pd.Index(peaks_df["peak_id"].astype(str), name="peak_id")

# Wrap the sparse count matrix in an AnnData object and attach the labels
atac_adata = ad.AnnData(X=atac_matrix_csr)
atac_adata.obs_names = cell_index
atac_adata.var_names = peak_index

# Persist the ATAC AnnData as an .h5ad file
atac_adata.write("results/files/ag_atac_matrix.h5ad")
