In [2]:
# vim: fdm=indent

#author:joanna ahn
#date: 28/06/23
#content: drosophila melanogaster


import os
import sys
import pathlib
import gzip
import h5py
import numpy as np
import pandas as pd
import anndata
import scanpy as sc

from utils import (
    root_repo_folder,
    output_folder,
    #get_tissue_data_dict,
    fix_annotations,
    get_celltype_order,
    collect_gene_annotations,
    store_compressed_atlas,
    )

# Species identifier: used to build the output file name below and passed to
# fix_annotations in the main block.
species = 'd.melanogaster'
# Folder that get_tissue_data_dict scans for the per-tissue .h5ad files.
full_atlas_data_folder = root_repo_folder / 'data' / 'full_atlases' / 'drosophila_melanogaster'
# Gene annotation file. Currently only referenced by the commented-out
# collect_gene_annotations call in the main block.
# NOTE(review): the extension reads '.gft.gz' - possibly a typo for '.gtf.gz';
# confirm against the actual file name on disk before re-enabling that call.
anno_fn = root_repo_folder / 'data' / 'gene_annotations' / 'Drosophila_melanogaster.BDGP6.32.110.gft.gz'
# Destination of the compressed atlas. The main block deletes any existing file
# at this path before writing a new one.
fn_out = output_folder / f'{species}.h5'


def get_tissue_data_dict(species, full_atlas_data_folder, rename_dict):
    """Map each tissue name to the path of its .h5ad file.

    The folder is scanned for files whose name ends in '.h5ad' and contains
    '<s|r>_fca_biohub_<tissue>_10x.h5ad'. The '<tissue>' part is capitalized
    and then translated through rename_dict['tissues'].

    Parameters:
        species: Unused; kept so existing callers keep working.
        full_atlas_data_folder: Folder to scan (str or path-like).
        rename_dict: Dict with a 'tissues' entry mapping capitalized tissue
            names to the names that should be used instead.

    Returns:
        Dict {tissue name: pathlib.Path of the file}, with keys in
        alphabetical order.

    Raises:
        FileNotFoundError: If the folder contains no usable tissue file.
    """
    import re

    folder = pathlib.Path(full_atlas_data_folder)
    pattern = re.compile(r'(?:s|r)_fca_biohub_(.*?)_10x\.h5ad')
    tissue_renames = rename_dict['tissues']

    found = {}
    # Iterate in sorted order so the result does not depend on the arbitrary
    # order of os.listdir: if several files map to the same tissue, the
    # lexicographically last file name wins.
    for filename in sorted(os.listdir(folder)):
        # Only real .h5ad files, not e.g. 'x.h5ad.bak'.
        if not filename.endswith('.h5ad'):
            continue
        match = pattern.search(filename)
        if match is None:
            # Unrelated .h5ad file in the folder: ignore it instead of crashing.
            continue
        tissue = match.group(1)
        # The "body" file is deliberately left out.
        # NOTE(review): presumably a whole-body sample rather than a single
        # tissue - confirm.
        if tissue == "body":
            continue
        tissue = tissue.capitalize()
        tissue = tissue_renames.get(tissue, tissue)
        found[tissue] = folder / filename

    if not found:
        raise FileNotFoundError(
            f'No tissue .h5ad files found in {folder}'
        )

    # Order tissues alphabetically
    return dict(sorted(found.items()))

# Renaming tables, passed to get_tissue_data_dict and fix_annotations.
rename_dict = {
    # Applied in get_tissue_data_dict to the capitalized tissue name taken
    # from the file name (e.g. 'body_wall' -> 'Body_wall' -> 'Skin').
    'tissues': {
         "Body_wall": "Skin",
         "Head": "Brain",
    },
    # Handed to fix_annotations as part of rename_dict.
    # NOTE(review): presumably maps raw annotation labels to the short cell
    # type names used in the atlas - fix_annotations is not visible here.
    'cell_types': {
        'sensory neuron': 'neuron',
        'epithelial cell': 'epithelial',
        'glial cell': 'glial',
        'muscle cell': 'muscle',
    }
}

# List of (group name, [cell types in that group]) pairs, passed to
# get_celltype_order and store_compressed_atlas.
celltype_order = [
    # NOTE(review): 'neuron' is listed under 'immune' - this looks misplaced;
    # confirm the intended group before relying on the resulting order.
    ('immune', [
        'neuron',
        'hemocyte',
    ]),
    ('epithelial', [
        'epithelial',
    ]),
    ('endothelial', [
        # no cell types listed for this group
    ]),
    ('mesenchymal', [
        # no cell types listed for this group
    ]),
    ('other', [
        'muscle',
        'glial',
    ]),
]

if __name__ == '__main__':
    # Pipeline: for every tissue file, load the data, drop unannotated cells,
    # go back to the raw matrix, undo the log transform, harmonise the cell
    # type labels, and compute per-cell-type average expression, fraction of
    # expressing cells and cell counts. The per-tissue results are then
    # written to a single compressed atlas file.

    # Start from a clean slate: remove the output of any earlier run.
    if os.path.isfile(fn_out):
        os.remove(fn_out)

    compressed_atlas = {}

    # Use the module-level `species` instead of repeating the literal.
    tissue_sources = get_tissue_data_dict(
        species, full_atlas_data_folder, rename_dict)
    tissues = list(tissue_sources.keys())

    for tissue in tissues:
        print(tissue)

        print('Load data for this tissue')
        # read_h5ad is the explicit reader; anndata.read is a deprecated alias.
        adata_tissue = anndata.read_h5ad(tissue_sources[tissue])

        # NOTE: the consistency filter announced by the next message is
        # currently disabled (see the commented-out lines below).
        print('Exclude cells that have inconsistencies in their annotation')
        # print(adata_tissue.obs['annotation_broad'])
        # print(adata_tissue.obs['R_annotation_broad'])
        print(adata_tissue)

        # adata_tissue = adata_tissue[adata_tissue.obs['annotation_broad'] == adata_tissue.obs['R_annotation_broad']]
        #adata_tissue.obs['annotation_broad'] = adata_tissue.obs['R_annotation_broad'].astype(str)

        print('Exclude "unannotated"')
        adata_tissue = adata_tissue[adata_tissue.obs['annotation_broad'] != 'unannotated']

        print('Restart from raw data and renormalize')
        adata_tissue = adata_tissue.raw.to_adata()

        print('Data is logp1 of cptt, so undo the log bit')
        # expm1(x) == exp(x) - 1, but without the precision loss for small x.
        # Writing through `.X.data` assumes X is stored as a sparse matrix, so
        # only the stored (nonzero) entries are touched.
        adata_tissue.X.data[:] = np.expm1(adata_tissue.X.data)

        print('Now it\'s already cptt')

        print('Fix cell type annotations')
        adata_tissue.obs['cellType'] = fix_annotations(
            adata_tissue, 'annotation_broad', species, tissue,
            rename_dict, [],
        )

        print(adata_tissue.obs['cellType'].value_counts())

        # Correction might declare some cells as untyped/low quality
        # they have an empty string instead of an actual annotation
        if (adata_tissue.obs['cellType'] == '').sum() > 0:
            idx = adata_tissue.obs['cellType'] != ''
            adata_tissue = adata_tissue[idx]

        celltypes = get_celltype_order(
            adata_tissue.obs['cellType'].value_counts().index,
            celltype_order,
        )

        print('Average')
        genes = adata_tissue.var_names
        # One row per gene, one column per cell type.
        avg_ge = pd.DataFrame(
                    np.zeros((len(genes), len(celltypes)), np.float32),
                    index=genes,
                    columns=celltypes,
                    )
        frac_ge = pd.DataFrame(
                    np.zeros((len(genes), len(celltypes)), np.float32),
                    index=genes,
                    columns=celltypes,
                    )
        ncells_ge = pd.Series(
                    np.zeros(len(celltypes), np.int64), index=celltypes,
                    )
        for celltype in celltypes:
            idx = adata_tissue.obs['cellType'] == celltype
            Xidx = adata_tissue[idx].X
            # mean(axis=0) yields a single row; [0] flattens it to one
            # value per gene.
            avg_ge[celltype] = np.asarray(Xidx.mean(axis=0))[0]
            # Fraction of cells of this type with nonzero expression.
            frac_ge[celltype] = np.asarray((Xidx > 0).mean(axis=0))[0]
            ncells_ge[celltype] = idx.sum()

        compressed_atlas[tissue] = {
            'features': genes,
            'celltype': {
                'avg': avg_ge,
                'frac': frac_ge,
                'ncells': ncells_ge,
            },
        }

    print('Consolidate gene list across tissues')
    # Every tissue must list exactly the same genes in the same order as the
    # first one; merging differing gene lists is not implemented yet.
    gene_lists = [list(tdict['features']) for tdict in compressed_atlas.values()]
    genes = gene_lists[0] if gene_lists else None
    needs_union = any(genest != genes for genest in gene_lists[1:])

    if needs_union:
        raise NotImplementedError('TODO: make union of features')

    #print('Add gene annotations')
    #gene_annos = collect_gene_annotations(anno_fn, genes)

    print('Store compressed atlas to file')
    # Gene annotations are not collected yet, hence the None argument.
    store_compressed_atlas(
        fn_out,
        compressed_atlas,
        tissues,
        None,
        celltype_order,
    )










Antenna
Load data for this tissue
Exclude cells that have inconsistencies in their annotation
index
AAACCCAAGTTGGGAC-6e294c34__FCA55_Male_antenna_adult_5dWT_Luo_sample2        unannotated
AAACCCACACGTCGGT-6e294c34__FCA55_Male_antenna_adult_5dWT_Luo_sample2        unannotated
AAACCCAGTAGTAAGT-6e294c34__FCA55_Male_antenna_adult_5dWT_Luo_sample2     sensory neuron
AAACCCAGTGACTATC-6e294c34__FCA55_Male_antenna_adult_5dWT_Luo_sample2     sensory neuron
AAACCCAGTGCGACAA-6e294c34__FCA55_Male_antenna_adult_5dWT_Luo_sample2         glial cell
                                                                             ...       
TTTGGTTTCCTGGCTT-d5543d16__FCA3_MaleFemale_Antenna                       sensory neuron
TTTGGTTTCGGCTATA-d5543d16__FCA3_MaleFemale_Antenna                      epithelial cell
TTTGGTTTCTGGACTA-d5543d16__FCA3_MaleFemale_Antenna                       sensory neuron
TTTGTTGAGCAAGCCA-d5543d16__FCA3_MaleFemale_Antenna                      epithelial cell
TTTGTTGAGGTTCTAC-d55

In [None]:
# NOTE(review): scratch cell. It repeats the un-log step that the main loop
# already applies to each tissue, and it acts on whatever `adata_tissue` is
# left in the kernel. The assignment is in place, so every re-run
# exponentiates the values again - confirm whether this cell is still needed.
adata_tissue.X.data[:] = np.exp(adata_tissue.X.data) - 1

In [None]:
# Adds 1 to X and sums along axis 1; relies on `adata_tissue` left in the
# kernel by an earlier cell.
# NOTE(review): if X is a scipy sparse matrix (the `.X.data` accesses elsewhere
# suggest so - confirm), adding a nonzero scalar is not supported and this
# raises; verify what this check was meant to show.
(adata_tissue.X + 1).sum(axis=1)