In [2]:
# vim: fdm=indent

#author:joanna ahn
#date: 28/06/23
#content: compress drosophila melanogaster


import os
import numpy as np
import pandas as pd

import anndata
import scanpy as sc

from utils import (
    root_repo_folder,
    output_folder,
    #get_tissue_data_dict,
    fix_annotations,
    get_celltype_order,
    collect_gene_annotations,
    store_compressed_atlas,
    )

species = 'd.melanogaster'
full_atlas_data_folder = root_repo_folder / 'data' / 'full_atlases' / 'drosophila_melanogaster'
anno_fn = root_repo_folder / 'data' / 'gene_annotations' / 'dmel-all-r6.31.gtf.gz'  # see paper methods: https://www.science.org/doi/10.1126/science.abk2432
fn_out = output_folder / f'{species}.h5'

def get_tissue_data_dict(species, full_atlas_data_folder, rename_dict):
    result = []
    filenames = os.listdir(full_atlas_data_folder)
    fns = [x for x in filenames if '.h5ad' in x]

    for filename in fns:
        tissue_start = filename.find('biohub_') + len("biohub_")
        tissue_end = filename.rfind('_')
        tissue = filename [tissue_start:tissue_end]
        # TODO: rescue cell types that are found only in "body"
        if tissue == "body":
            continue 
          
    tissue = tissue.capitalize()
    tissue = rename_dict['tissues'].get(tissue, tissue)
          
    result.append({
        'tissue': tissue,
        'filename': full_atlas_data_folder / filename,
    })
    
    # assigning new value to result
    result = pd.DataFrame(result).set_index('tissue')
    # Order tissues alphabetically
    result = result.sort_index()['filename'].to_dict()
    return result

rename_dict = {
    'tissues': {
          'Body_wall': 'Skin',
          'Gut': 'Intestines',
          
         
    },
    'cell_types': {
        'sensory neuron': 'neuron',
        'epithelial cell': 'epithelial',
        'glial cell': 'glial',
        'muscle cell': 'muscle',
        'fat cell': 'adipocyte',
        'somatic precursor cell': 'stem cell',
        'gland': 'gland cell',
        'cardial cell': 'cardial',
        'tracheolar cell': 'tracheolar',
        'male germline cell': 'male germline',
        'female germline cell': 'female germline',
        'artefact': "",
        # TODO: split the various types in the following "systems"
        'excretory system': "",
        'male reproductive system': "",
        'female reproductive system': "",
    }
}

celltype_tissue_blacklist = {
}
      
coarse_cell_types = [
      
]


celltype_order = [
    ('immune', [
        'neuron',
        'hemocyte',
    ]),
    ('epithelial', [
        'epithelial',
    ]),
    ('endothelial', [
        
    ]),
    ('mesenchymal', [
        'adipocyte',
        'stem cell',
        'oenocyte',
    ]),
    ('other', [
        'muscle',
        'glial',
        'pigment cell',
        'gland cell',
        'tracheolar',
        'cardial',
        'male germline',
        'female germline',
        'ovary'
    ]),
    ('does not exist', [

    ])
]

if __name__ == '__main__':
#Remove existing compressed atlas file if present 
    if os.path.isfile(fn_out):
        os.remove(fn_out)

    compressed_atlas = {}

    tissue_sources = get_tissue_data_dict( 
        "d.melanogaster", full_atlas_data_folder, rename_dict)
    tissues = list(tissue_sources.keys())

#skip_tissues = ['Antenna', 'Brain', 'Gut', 'Haltere', 'Heart', 'Leg', 'Male_reproductive_glands', 'Malpighian_tubule', 'Oenocyte']
for it, tissue in enumerate(tissues):
        # if tissue in skip_tissues:
        #      continue
        
        print(tissue)

        print('Load data for this tissue')
        adata_tissue = anndata.read(tissue_sources[tissue])

        print('Exclude cells that have inconsistencies in their annotation')
        #adata_tissue = adata_tissue[adata_tissue.obs['annotation_broad'] == adata_tissue.obs['R_annotation_broad']] (no longer needed?)
        if 'R_annoation_broad' in adata_tissue.obs:
                adata_tissue.obs['annotation_broad'] = adata_tissue.obs['R_annotation_broad'].astype(str)
        elif 'S_annotation_broad' in adata_tissue.obs:
                adata_tissue.obs['annotation_broad'] = adata_tissue.obs['S_annotation_broad'].astype(str)
        
        print('Exclude "unannotated"')
        adata_tissue = adata_tissue[adata_tissue.obs['annotation_broad'] != 'unannotated']
        
        print('Restart from raw data and renormalize')
        adata_tissue = adata_tissue.raw.to_adata()

        #Renormalise to cptt
        print('Data is logp1 of cptt, so undo the log bit')
        adata_tissue.X.data[:] = np.exp(adata_tissue.X.data) - 1

        # print('Fix cell type annotations')
        adata_tissue.obs['cellType'] = fix_annotations(
            adata_tissue, 'annotation_broad', species, tissue,
            rename_dict, coarse_cell_types,
        )

        print(adata_tissue.obs['cellType'].value_counts())

        # Correction might declare some cells as untyped/low quality
        # they have an empty string instead of an actual annotation
        if (adata_tissue.obs['cellType'] == '').sum() > 0:
            idx = adata_tissue.obs['cellType'] != ''
            adata_tissue = adata_tissue[idx]

        celltypes = get_celltype_order(
            adata_tissue.obs['cellType'].value_counts().index,
            celltype_order,
        )
    
        print('Average')
        genes = adata_tissue.var_names
        avg_ge = pd.DataFrame(
                    np.zeros((len(genes), len(celltypes)), np.float32),
                    index=genes,
                    columns=celltypes,
                    )
        frac_ge = pd.DataFrame(
                    np.zeros((len(genes), len(celltypes)), np.float32),
                    index=genes,
                    columns=celltypes,
                    )
        ncells_ge = pd.Series(
                    np.zeros(len(celltypes), np.int64), index=celltypes,
                    )
        for celltype in celltypes:
            idx = adata_tissue.obs['cellType'] == celltype
            Xidx = adata_tissue[idx].X
            avg_ge[celltype] = np.asarray(Xidx.mean(axis=0))[0]
            frac_ge[celltype] = np.asarray((Xidx > 0).mean(axis=0))[0]
            ncells_ge[celltype] = idx.sum()

        compressed_atlas[tissue] = {
            "features": genes,
            "celltype": {
                "avg": avg_ge,
                "frac": frac_ge,
                "ncells": ncells_ge,
            },
        }

        print('Consolidate gene list across tissues')
        needs_union = False
        genes = None
        for tissue, tdict in compressed_atlas.items():
                genest = list(tdict['features'])
                if genes is None:
                    genes = genest
                    continue
                if genest != genes:
                    needs_union = True
                    genes = set(genes) | set(genest)

        if needs_union:
                raise NotImplementedError('TODO: make union of features')           

        # print('Add gene annotations')
        gene_annos = collect_gene_annotations(anno_fn, genes)
        
        print('Store compressed atlas to file')
        store_compressed_atlas(
                fn_out,
                compressed_atlas,
                tissues,
                None,
                celltype_order,
            )



Trachea
Load data for this tissue
Exclude cells that have inconsistencies in their annotation
Exclude "unannotated"
Restart from raw data and renormalize
Data is logp1 of cptt, so undo the log bit
              10337
tracheolar     9508
neuron          502
muscle          218
adipocyte        61
Name: cellType, dtype: int64
Average
Consolidate gene list across tissues
Store compressed atlas to file


In [None]:
tissue_specific = "Gut"
tissue_specific_obs = adata_tissue.obs[adata_tissue.obs['tissue'] == tissue_specific]
print(tissue_specific_obs.columns)

Index(['age', 'batch', 'batch_id', 'celda_decontx__clusters',
       'celda_decontx__contamination',
       'celda_decontx__doublemad_predicted_outliers', 'dissection_lab',
       'fca_id', 'fly_genetics', 'id', 'log_n_counts', 'log_n_genes',
       'n_counts', 'n_genes', 'note', 'percent_mito', 'sample_id',
       'scrublet__doublet_scores', 'scrublet__predicted_doublets',
       'scrublet__predicted_doublets_based_on_10x_chromium_spec', 'sex',
       'tissue', 'leiden_res0.4', 'leiden_res0.6', 'leiden_res0.8',
       'leiden_res1.0', 'leiden_res1.2', 'leiden_res1.4', 'leiden_res1.6',
       'leiden_res1.8', 'leiden_res10.0', 'leiden_res2.0', 'leiden_res4.0',
       'leiden_res6.0', 'leiden_res8.0', 'annotation',
       'annotation__ontology_id', 'annotation_broad',
       'annotation_broad__ontology_id', 'annotation_broad_extrapolated',
       'annotation_broad_extrapolated__ontology_id', 'R_annotation',
       'R_annotation__ontology_id', 'R_annotation_broad',
       'R_annotation_b

In [None]:
table_of_contents = tissue_specific_obs.columns
print(table_of_contents)


Index(['age', 'batch', 'batch_id', 'celda_decontx__clusters',
       'celda_decontx__contamination',
       'celda_decontx__doublemad_predicted_outliers', 'dissection_lab',
       'fca_id', 'fly_genetics', 'id', 'log_n_counts', 'log_n_genes',
       'n_counts', 'n_genes', 'note', 'percent_mito', 'sample_id',
       'scrublet__doublet_scores', 'scrublet__predicted_doublets',
       'scrublet__predicted_doublets_based_on_10x_chromium_spec', 'sex',
       'tissue', 'leiden_res0.4', 'leiden_res0.6', 'leiden_res0.8',
       'leiden_res1.0', 'leiden_res1.2', 'leiden_res1.4', 'leiden_res1.6',
       'leiden_res1.8', 'leiden_res10.0', 'leiden_res2.0', 'leiden_res4.0',
       'leiden_res6.0', 'leiden_res8.0', 'annotation',
       'annotation__ontology_id', 'annotation_broad',
       'annotation_broad__ontology_id', 'annotation_broad_extrapolated',
       'annotation_broad_extrapolated__ontology_id', 'R_annotation',
       'R_annotation__ontology_id', 'R_annotation_broad',
       'R_annotation_b