# Setup

In [1]:
import os
import sys
import subprocess
import re
import json

import anndata as ad
import pandas as pd

In [2]:
# Move the kernel into the footprintQTL directory, based on the name of the
# directory it was started in, then make the 'code' sub-directory importable.
cwd = os.path.basename(os.getcwd())

# Relative hop to take from each recognised start directory;
# None means no change of directory is needed.
_root_hops = {
    'footprintQTL': None,
    'code': '..',
    'fichtner': 'projects/footprintQTL',
}

if cwd not in _root_hops:
    # Unrecognised start directory: report it and leave the cwd untouched.
    print('path and cwd: manually')
    print(os.getcwd())
elif _root_hops[cwd] is not None:
    os.chdir(_root_hops[cwd])

# Appended after any chdir above, so the path is relative to the final cwd.
sys.path.append(os.getcwd() + '/code')

In [3]:
from helpers.python.utils import ct_format, ct_format_alt, create_dir

### User variables

In [4]:
from glob_vars import ATAC_PEAKS_H5AD_OLD, ATAC_PEAKS_H5AD_NEW, RNA_H5AD_OLD, RNA_H5AD_NEW, CT_MAP_JSON

### Prep folders

In [5]:
# Call create_dir for both output paths (ATAC-seq first, then RNA-seq).
for h5ad_target in (ATAC_PEAKS_H5AD_NEW, RNA_H5AD_NEW):
    create_dir(h5ad_target)

### Cell-type map

In [6]:
# Identifier of the cell-type map: the JSON file name without its '.json' suffix.
# The suffix is removed explicitly because str.rstrip('.json') strips any trailing
# run of the characters '.', 'j', 's', 'o', 'n' rather than the suffix itself
# (e.g. 'neuron.json' would become 'neur').
_ct_map_fname = os.path.basename(CT_MAP_JSON)
ct_map_id = _ct_map_fname[:-len('.json')] if _ct_map_fname.endswith('.json') else _ct_map_fname

# Get ct-ann --> grouped-ct mappings
with open(CT_MAP_JSON, 'r') as f:
    ct_map = json.load(f)

# Copy each group's member list (grouped cell type -> list of original annotations).
ct_map = {key: list(members) for key, members in ct_map.items()}

# Inverted lookup: original annotation -> grouped cell type.
ct_map_i = {old_ct: new_ct for new_ct, old_cts in ct_map.items() for old_ct in old_cts}

# Format and copy ATAC-seq anndata

### .obs

In [7]:
# Read the AnnData object stored at ATAC_PEAKS_H5AD_OLD
atac_peaks_ad = ad.read_h5ad(ATAC_PEAKS_H5AD_OLD)

# Last expression of the cell: displays the object's summary
atac_peaks_ad



AnnData object with n_obs × n_vars = 288900 × 736845
    obs: 'BlacklistRatio', 'nDiFrags', 'nFrags', 'nMonoFrags', 'nMultiFrags', 'NucleosomeRatio', 'PassQC', 'PromoterRatio', 'ReadsInBlacklist', 'ReadsInPromoter', 'ReadsInTSS', 'Sample', 'TSSEnrichment', 'celltype', 'cellType', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'donor_id', 'clone', 'organoid', 'donor', 'leiden', 'stressed_vs_unstressed_celltypes', 'Batch', 'ReadsInPeaks', 'FRIP', 'barcode'
    var: 'chr', 'start', 'end', 'strand', 'peak_name', 'score', 'GC', 'nearest_gene', 'peak_type'

In [8]:
# Use the 'peak_name' column as var_names; first check that the number of
# distinct peak names equals the number of peaks.
peak_names = atac_peaks_ad.var['peak_name']
assert peak_names.nunique() == len(peak_names)
atac_peaks_ad.var_names = peak_names

In [9]:
# cell_type: remove the two cell-type columns that came with the ATAC-seq object
ct_cols_to_drop = ['cellType', 'celltype']
atac_peaks_ad.obs.drop(columns=ct_cols_to_drop, inplace=True)

In [10]:
# batch / sample: both new columns are taken from 'Sample'
# ('sample' as a categorical); the old 'Batch' column is removed.
sample_col = atac_peaks_ad.obs['Sample']
atac_peaks_ad.obs['batch'] = sample_col
atac_peaks_ad.obs['sample'] = sample_col.astype('category')
atac_peaks_ad.obs.drop(columns=['Batch'], inplace=True)

In [11]:
# barcode_batch: store the obs index as its own column
atac_peaks_ad.obs['barcode_batch'] = atac_peaks_ad.obs.index

# Fallback for inputs that do not already carry a 'barcode' column.
# Per-cell annotations live in .obs and cell names in .obs_names; the AnnData
# object itself supports neither item assignment of a column nor an `.index`
# attribute, so the previous form of this branch would have raised if taken.
if 'barcode' not in atac_peaks_ad.obs.columns:
    # NOTE(review): regex_get is not imported in any visible cell; presumably it
    # lives next to the other helpers in helpers.python.utils -- confirm.
    from helpers.python.utils import regex_get

    atac_peaks_ad.obs['barcode'] = [regex_get(i, 'barcode') for i in atac_peaks_ad.obs_names]

### Filter cells

In [12]:
# Open the AnnData object stored at RNA_H5AD_OLD in backed, read-only mode (backed='r')
rna_ad = ad.read_h5ad(RNA_H5AD_OLD, backed='r')

# Last expression of the cell: displays the object's summary
rna_ad

AnnData object with n_obs × n_vars = 265053 × 30500 backed at '/omics/groups/OE0540/internal/projects/HCA_organoid_2/cemm_sabrina-20Jul2022/outputs_allsamples/sabrina_allsamples_rna_final_after_atac.h5ad'
    obs: 'sample', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'sample_qclvl', 'donor_id', 'clone', 'organoid', 'donor', 'clone_qclvl', 'organoid_qclvl', 'donor_qclvl', 'cellbender', 'doubletfinder', 'S_score', 'G2M_score', 'phase', 'mt_cat', 'ribo_cat', 'leiden', 'CellType', 'Batch', 'celltype_predicted_vertesy', 'stressed_vs_unstressed_celltypes', 'glia_neuron_celltpyes', 'gruffi'
    var: 'gene_ids', 'feature_types', 'genome', 'interval', 'mt', 'ribo', 'n_cells_by_counts', 'mean_counts', 'log1p

In [13]:
# Cell names (obs_names) present in both the ATAC-seq and the RNA-seq object
common_cells = set(atac_peaks_ad.obs_names).intersection(rna_ad.obs_names)

In [14]:
# Keep only the cells that are also present in the RNA-seq object.
# A boolean mask over obs_names keeps the rows in their existing order. Indexing
# with list(common_cells) would order the rows by set iteration order, which for
# strings differs between Python processes (hash randomisation) and would make
# the row order of the exported file irreproducible.
shared_mask = atac_peaks_ad.obs_names.isin(common_cells)
atac_peaks_ad = atac_peaks_ad[shared_mask, :].copy()

### Correct annotations

The RNA-seq and ATAC-seq AnnData objects appear to carry different cell-type annotations for the same cells. The RNA-seq annotations are the correct ones, so the ATAC-seq annotations are replaced with them below.

In [15]:
# Copy the RNA-seq labels ('celltype_predicted_vertesy', as strings) into a
# 'cell_type' column of the ATAC-seq object for the shared cells.
shared_cells = list(common_cells)
rna_labels = rna_ad.obs.loc[shared_cells, 'celltype_predicted_vertesy'].astype(str)
atac_peaks_ad.obs.loc[shared_cells, 'cell_type'] = rna_labels

# Store the labels as a categorical and display the resulting categories.
atac_peaks_ad.obs['cell_type'] = atac_peaks_ad.obs['cell_type'].astype('category')
atac_peaks_ad.obs['cell_type'].cat.categories

Index(['DL-EN', 'Differentiating RG', 'Dividing Glia', 'Glia', 'IPC',
       'Interneurons', 'Interneurons Prog.', 'Midbrain EN', 'Stressed Neurons',
       'Stressed Prog.', 'UL-EN', 'Unclear', 'immature EN'],
      dtype='object')

Check that the correction was applied correctly: every shared cell should now carry the same label in both objects.

In [16]:
# One-column frames over the same cell list, in the same order:
# k1 holds the ATAC-seq labels, k2 the RNA-seq labels.
cells_to_check = list(common_cells)
k1 = atac_peaks_ad.obs.loc[cells_to_check, ['cell_type']]
k2 = rna_ad.obs.loc[cells_to_check, ['celltype_predicted_vertesy']]

In [17]:
# ATAC-seq labels (k1) of the cells that the RNA-seq frame (k2) labels 'Interneurons Prog.'
is_in_prog = k2['celltype_predicted_vertesy'] == 'Interneurons Prog.'
k1.loc[k2.loc[is_in_prog].index]

Unnamed: 0,cell_type
CCAACCAAGATGCCTG-1_sSL0090,Interneurons Prog.
GAACCGCTCATGTCAA-1_sSL0113A,Interneurons Prog.
CTTAATGAGCGCCTAA-1_sSL0107,Interneurons Prog.
CTTACTAGTCATCCTG-1_sSL0108,Interneurons Prog.
TGTTGTTTCCGGGACT-1_sSL0096,Interneurons Prog.
...,...
ACGCTTGAGTATCGCG-1_sSL0108,Interneurons Prog.
CTTGCTCAGTAACCCG-1_sSL0172,Interneurons Prog.
GCAATGAAGTTGGCCA-1_sSL0128,Interneurons Prog.
CGCCAAATCCTTGCAC-1_sSL0094,Interneurons Prog.


In [18]:
# True only if the ATAC-seq label equals the RNA-seq label for every cell
labels_match = k1['cell_type'] == k2['celltype_predicted_vertesy']
bool(labels_match.all())

True

In [19]:
# Change the peak coordinate system: 1-based fully closed --> 0-based half open.
# Only the start changes (start - 1), both in .var['start'] and in the peak names.
# NOTE: this cell is not idempotent; running it again shifts the starts once more.

def _shift_start(peak_name):
    """Return `peak_name` with its second ':'-separated field lowered by 1."""
    parts = peak_name.split(':')
    parts[1] = str(int(parts[1]) - 1)
    return ':'.join(parts)


atac_peaks_ad.var['start'] = atac_peaks_ad.var['start'] - 1

var_names_new = [_shift_start(peak_name) for peak_name in atac_peaks_ad.var_names]

# Keep var_names and the 'peak_name' column in sync with the shifted starts.
atac_peaks_ad.var_names = var_names_new
atac_peaks_ad.var['peak_name'] = var_names_new

### Custom cell-type grouping annotation

In [20]:
# Look up each cell's group via ct_map_i, then pass every group label through ct_format
grouped_ct = atac_peaks_ad.obs['cell_type'].map(ct_map_i)
atac_peaks_ad.obs['cell_type_custom'] = grouped_ct.apply(ct_format)

### Export

In [21]:
# Show the final set of .obs columns before export
list(atac_peaks_ad.obs.columns)

['BlacklistRatio',
 'nDiFrags',
 'nFrags',
 'nMonoFrags',
 'nMultiFrags',
 'NucleosomeRatio',
 'PassQC',
 'PromoterRatio',
 'ReadsInBlacklist',
 'ReadsInPromoter',
 'ReadsInTSS',
 'Sample',
 'TSSEnrichment',
 'n_genes_by_counts',
 'total_counts',
 'total_counts_mt',
 'donor_id',
 'clone',
 'organoid',
 'donor',
 'leiden',
 'stressed_vs_unstressed_celltypes',
 'ReadsInPeaks',
 'FRIP',
 'barcode',
 'batch',
 'sample',
 'barcode_batch',
 'cell_type',
 'cell_type_custom']

In [22]:
# Write the reformatted ATAC-seq object to ATAC_PEAKS_H5AD_NEW as .h5ad
atac_peaks_ad.write_h5ad(ATAC_PEAKS_H5AD_NEW)