In [31]:
# vim: fdm=indent
'''
author:     Fabio Zanini
date:       06/10/23
content:    Compress Zea Mays
'''
import os
import sys
import pathlib
import gzip
import h5py
import numpy as np
import pandas as pd

import anndata
import scanpy as sc

from utils import (
    root_repo_folder,
    output_folder,
    get_tissue_data_dict,
    subannotate,
    fix_annotations,
    get_celltype_order,
    collect_gene_annotations,
    store_compressed_atlas,
    )


species = 'zea_mays'
atlas_data_folder = root_repo_folder / 'data' / 'full_atlases' / 'RNA' / species
atlas_data_file = atlas_data_folder / 'GSE155178_maize_scATAC_atlas_ACR_celltype_CPM.txt.gz'
fn_out = output_folder / f'{species}.h5'


rename_dict = {}

celltype_tissue_blacklist = {
        'whole': [
            'not_clustered',
        ],
}


coarse_cell_types = []


celltype_order = [
    ('epithelial', [
      
    ]),
    ('mesenchymal', [
       
    ]),
    ('other', [
       
    ]),
]


if __name__ == '__main__':

    # Remove existing compressed atlas file if present
    if os.path.isfile(fn_out):
        os.remove(fn_out)

    compressed_atlas = {}

    tissues = ['whole']
    for tissue in tissues:
        adata_tissue = anndata.AnnData(pd.read_csv(atlas_data_file, sep='\t', index_col=0))

        # It's already in raw counts

        # cptt throughout
        sc.pp.normalize_total(
            adata_tissue,
            target_sum=1e4,
            key_added='coverage',
        )

        # Fix cell type annotations
        adata_tissue.obs['cell_type'] = adata_tissue.obs['cell_type'].str.lower()
        adata_tissue.obs['cellType'] = fix_annotations(
            adata_tissue, 'cell_type', species, tissue,
            rename_dict, coarse_cell_types,
            blacklist=celltype_tissue_blacklist,
        )


        # Correction might declare some cells as untyped/low quality
        # they have an empty string instead of an actual annotation
        if (adata_tissue.obs['cellType'] == '').sum() > 0:
            idx = adata_tissue.obs['cellType'] != ''
            adata_tissue = adata_tissue[idx]

        celltypes = get_celltype_order(
            adata_tissue.obs['cellType'].value_counts().index,
            celltype_order,
        )

        print('Add data to celltype group')
        genes = adata_tissue.var_names
        avg_ge = pd.DataFrame(
                np.zeros((len(genes), len(celltypes)), np.float32),
                index=genes,
                columns=celltypes,
                )
        frac_ge = pd.DataFrame(
                np.zeros((len(genes), len(celltypes)), np.float32),
                index=genes,
                columns=celltypes,
                )
        ncells_ge = pd.Series(
                np.zeros(len(celltypes), np.int64), index=celltypes,
                )
        for celltype in celltypes:
            idx = adata_tissue.obs['cellType'] == celltype
            Xidx = adata_tissue[idx].X
            avg_ge[celltype] = np.asarray(Xidx.mean(axis=0))[0]
            frac_ge[celltype] = np.asarray((Xidx > 0).mean(axis=0))[0]
            ncells_ge[celltype] = idx.sum()

        compressed_atlas[tissue] = {
            'features': genes,
            'celltype': {
                'avg': avg_ge,
                'frac': frac_ge,
                'ncells': ncells_ge,
            },
        }

    print('No gene annotations available')
    gene_annos = None

    print('Store compressed atlas to file')
    store_compressed_atlas(
            fn_out,
            compressed_atlas,
            tissues,
            gene_annos,
            celltype_order,
    )

KeyError: 'cell_type'

In [28]:
import pandas as pd

file_path = root_repo_folder / 'data' / 'full_atlases' / 'RNA' / 'zea_mays'/ 'GSE155178_maize_scATAC_atlas_ACR_celltype_CPM.txt.gz'
data = pd.read_csv(file_path, sep='\t')

# print(data.columns)

data
# print(adata_tissue.obs.columns)
# print(adata_tissue.obs.head())



Unnamed: 0,unknown.5.50,axillary_meristem.5.51,parenchyma.5.52,axillary_L1_layer.5.53,ground_meristem.5.54,axillary_provascular_strands.5.55,unknown.5.56,axillary_meristem_L1_layer.5.57,pith_parenchyma.1.1,endodermis.1.10,...,QC.3.39,proximal_meristem.3.40,unknown.3.41,spikelet_meristem_spikelet_pair_meristem.8.76,spikelet_meristem.8.77,lower_floral_meristem.8.78,floral_primordia.8.79,glume_primordia.8.80,inflorescence_meristem.8.81,spikelet_meristem.8.83
chr1_100071238_100071739,1.592577,1.545852,0.850742,0.899202,0.899202,1.519199,1.188539,1.686253,0.797121,0.987944,...,1.046669,1.428550,0.991523,1.118346,1.045573,0.867808,1.408304,0.799697,1.545852,0.893413
chr1_100082086_100082587,0.992677,0.968092,1.183727,0.993554,0.953473,1.519199,0.957726,1.209352,1.017792,1.140615,...,1.046669,1.428550,0.991523,1.887779,1.045573,1.429208,1.408304,0.898104,1.545852,0.998079
chr1_100084163_100084664,3.841685,4.674350,3.973017,3.721179,4.060405,3.487974,4.061896,4.082764,3.766886,3.279741,...,2.339803,4.643617,2.594112,3.902525,3.662729,3.443760,4.452419,3.769700,4.631104,3.506066
chr1_100085443_100085944,3.841685,3.444898,3.225863,3.552178,3.981088,4.195423,3.675913,3.280518,3.243692,3.508479,...,3.464491,4.127540,4.279595,2.381192,2.797192,2.360719,3.396209,2.946813,3.245528,2.655190
chr1_100086311_100086812,1.592577,2.637868,1.361634,1.666288,1.749027,3.249096,2.210745,1.209352,1.868345,2.097383,...,2.339803,1.428550,2.847269,3.477376,1.763416,1.718100,1.408304,1.930291,1.545852,1.676924
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
chrB73V4ctg98_34026_34527,0.992677,1.545852,1.827710,1.666288,1.962481,3.249096,2.118699,1.209352,1.868345,1.718100,...,1.046669,3.015824,2.319326,3.214065,2.346291,0.867808,1.408304,1.765565,1.545852,1.536140
chrB73V4ctg98_34549_35050,2.121327,3.444898,2.522080,2.375240,2.747930,3.837819,2.653646,3.196104,2.204259,1.983577,...,2.791951,1.428550,3.611699,3.477376,2.797192,3.215839,2.747930,2.633895,1.545852,2.322566
chrB73V4ctg98_39549_40050,3.973017,5.026967,4.258517,4.160192,3.962770,2.626667,4.212200,3.630802,5.031616,5.023526,...,4.113142,3.015824,4.525720,4.069875,4.118112,4.278765,3.396209,3.968154,1.545852,4.373444
chrB73V4ctg98_6293_6794,5.221655,5.963101,5.448560,5.725079,5.420174,5.210688,5.822413,5.877446,5.645953,5.733453,...,5.427363,4.407478,5.868683,5.384233,5.442825,5.441899,5.492839,5.586374,5.210688,5.706736
