In [1]:
import pickle
from datetime import date
import os
import loompy
import pandas as pd
import scanpy as sc
import anndata as ad
import numpy as np
from src.classes import *

  numba.core.entrypoints.init_all()
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [3]:
def convert_loom_to_anndata(loom_file, ca_list=None, ra_list=None, ca_index='CellID', ra_index='Accession'):
    """Build an AnnData object from an open loom connection.

    Parameters
    ----------
    loom_file
        Open loom connection. It must expose ``.ca`` and ``.ra`` (each with
        ``keys()`` and item access by attribute name) and support
        ``loom_file[:, :]`` to read the whole matrix.
    ca_list, ra_list : sequence of str, optional
        Column / row attribute names copied into ``obs`` / ``var``. ``None``
        or an empty sequence keeps every attribute the loom file provides.
    ca_index, ra_index : str or int
        Attribute used as the index of ``obs`` / ``var``. A string must be one
        of the selected attribute names; an integer is a position in the
        selected attribute list.

    Returns
    -------
    AnnData
        ``X`` is the transposed loom matrix, ``obs`` is built from the column
        attributes and ``var`` from the row attributes. The index column is
        also kept as a regular column.

    Raises
    ------
    AssertionError
        If a string index is not among the selected attributes.
    """

    def _selected_names(requested, attrs):
        # Empty selection -> keep all attributes. Always materialise a list so
        # integer indexing works even when keys() returns a view.
        if requested is None or len(requested) == 0:
            return list(attrs.keys())
        return list(requested)

    def _resolve_index(index, names, label):
        if isinstance(index, int):
            return names[index]
        if isinstance(index, str) and index not in names:
            # Raised explicitly (instead of a bare assert) so the check also
            # runs under `python -O`; the exception type is unchanged.
            raise AssertionError(
                f'{label} index {index!r} is not among the selected attributes: {names}'
            )
        return index

    def _attr_frame(attrs, names, index_name):
        frame = pd.DataFrame({name: attrs[name] for name in names})
        # Keep the index column as a regular column too; the index inherits
        # the column's name.
        frame.index = frame[index_name]
        return frame

    # if attr lists are empty, keep original columns/rows
    ca_names = _selected_names(ca_list, loom_file.ca)
    ra_names = _selected_names(ra_list, loom_file.ra)

    # select index columns for the dataframes
    obs_index = _resolve_index(ca_index, ca_names, 'obs (column attribute)')
    var_index = _resolve_index(ra_index, ra_names, 'var (row attribute)')
    # Column attributes become obs, row attributes become var.
    print(f'The indices for obs and var will be assigned to {obs_index} and {var_index}')

    # create obs and var dataframes with the selected columns and indices
    obs_frame = _attr_frame(loom_file.ca, ca_names, obs_index)
    var_frame = _attr_frame(loom_file.ra, ra_names, var_index)

    # The loom matrix is transposed so that X has one row per obs entry.
    adata = ad.AnnData(X=loom_file[:, :].T, var=var_frame, obs=obs_frame)

    return adata

In [4]:
# Run date; it is printed and used to stamp the output directory name.
today = date.today()
print(today)

# Input and output locations (both currently point at the same data directory).
input_folder = '../../data/hesc_tx/'
output_path = '../../data/hesc_tx/'

# Per-run output directory of the form <output_path>L5_<ddmmyy>/
date_stamp = today.strftime("%d%m%y")
output_folder = f'{output_path}L5_{date_stamp}/'
print(output_folder)
os.makedirs(output_folder, exist_ok=True)

2022-12-01
../../data/hesc_tx/L5_011222/


In [41]:
# Path of the pickled AnnData used as a cache for the converted loom file.
pickle_loc = '../../data/hesc_tx/L5_agg_all_subset_notation.pkl'
# 1 forces the AnnData to be rebuilt from the loom file even when the pickle
# already exists (checked as `recreate_file == 1` where the data is loaded).
recreate_file = 1

In [38]:
# Preview the aggregated loom file. The connection is scoped to this cell so
# the file handle is released once the preview has been rendered; previously it
# stayed open until the name was rebound by a later cell.
with loompy.connect(f'{input_folder}l5_all.agg.loom') as L5_all:
    # `display` is provided by the IPython/Jupyter kernel; a bare trailing
    # expression would not be rendered from inside the `with` block.
    display(L5_all)

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
,,,,,,,Age_6w,0,0,0,0,0,0,0,0,0,0,...
,,,,,,,Age_?,0,0,0,0,0,0,0,0,0,0,...
,,,,,,,"Age_p12, p35",0,0,0,0,0,0,0,0,0,0,...
,,,,,,,"Age_p16, p24",0,0,0,0,0,0,0,0,0,0,...
,,,,,,,Age_p19,0,0,0,0,0,0,0,0,0,0,...
,,,,,,,"Age_p19, p21",99,62,81,157,103,20,94,74,9,86,...
,,,,,,,Age_p20,0,0,0,0,0,0,0,0,0,0,...
,,,,,,,Age_p21,0,0,0,0,0,0,0,0,0,0,...
,,,,,,,"Age_p21, p23",70,35,37,102,52,11,61,33,5,77,...
,,,,,,,Age_p21-23,0,0,0,0,0,0,0,0,0,0,...


In [43]:
# Build the AnnData from the loom file, or reuse the pickled copy when it
# exists and recreate_file is not 1.
if not os.path.isfile(pickle_loc) or recreate_file == 1:
    print('Creating Anndata from the original loom file')
    ca_selection = ['Class', 'ClusterName', 'Description', 'Location_based_on',
                    'Neurotransmitter', 'Region', 'MarkerGenes',
                    'TaxonomyRank1', 'TaxonomyRank2', 'TaxonomyRank3', 'TaxonomyRank4']
    ra_selection = ['Accession', 'Gene']
    # The context manager closes the loom file even if the conversion raises.
    with loompy.connect(f'{input_folder}l5_all.agg.loom') as L5_all:
        adata_all = convert_loom_to_anndata(L5_all, ca_list=ca_selection, ra_list=ra_selection,
                                            ca_index='ClusterName')
    # obs keeps a 'ClusterName' column as well; rename the index so its name
    # does not collide with that column.
    if adata_all.obs.index.name == 'ClusterName':
        adata_all.obs.index.name = 'ClusterName_idx'
    # save pickle if you want
    with open(pickle_loc, 'wb') as pickle_file:
        pickle.dump(adata_all, pickle_file)
else:
    print('loading saved pickle')
    # NOTE(review): pickle.load can execute arbitrary code; only load pickle
    # files that were produced by this notebook.
    with open(pickle_loc, 'rb') as pickle_file:
        adata_all = pickle.load(pickle_file)

adata_all

Creating Anndata from the original loom file
The indeces for var and obs will be assigned to ClusterName and Accession


  adata = ad.AnnData(X = loom_file[:, :].T, var=ad_attr[1], obs=ad_attr[0])


AnnData object with n_obs × n_vars = 265 × 27998
    obs: 'Class', 'ClusterName', 'Description', 'Location_based_on', 'Neurotransmitter', 'Region', 'MarkerGenes', 'TaxonomyRank1', 'TaxonomyRank2', 'TaxonomyRank3', 'TaxonomyRank4'
    var: 'Accession', 'Gene'

In [44]:
# Number of obs rows per TaxonomyRank4 label.
adata_all.obs['TaxonomyRank4'].value_counts()

Telencephalon projecting excitatory neurons    24
Di- and mesencephalon excitatory neurons       22
Telencephalon inhibitory interneurons          20
Cholinergic and monoaminergic neurons          16
Di- and mesencephalon inhibitory neurons       16
Peptidergic neurons                            15
Hindbrain neurons                              15
Spinal cord excitatory neurons                 11
Spinal cord inhibitory neurons                 10
Enteric neurons                                 9
Oligodendrocytes                                9
Olfactory inhibitory neurons                    9
Enteric glia                                    8
Peripheral sensory peptidergic neurons          8
Astrocytes                                      7
Cerebellum neurons                              6
Peripheral sensory non-peptidergic neurons      6
Telencephalon projecting inhibitory neurons     6
Non-glutamatergic neuroblasts                   5
Sympathetic noradrenergic neurons               5


In [45]:
# Manually curated mapping from ClusterName to the cell-type label to assign.
# The keys are also used to select which clusters are kept when subsetting.
celltypes_to_keep_dict  = {
    'MBDOP2': 'Dopaminergic neurons; mouse' , 
    'MBDOP1': 'Dopaminergic neurons; mouse' , 
    'MOL1': 'Oligodendrocytes',
    'COP1': 'Oligodendrocytes',
    'MFOL1': 'Oligodendrocytes',
    'MFOL2': 'Oligodendrocytes',
    'MSN1': 'D1 Medium Spiny Neurons; mouse',
    'MSN2': 'D2 Medium Spiny Neurons; mouse',
    'MSN3': 'D2 Medium Spiny Neurons; mouse',
    'MSN4': 'D1 Medium Spiny Neurons; mouse',
    'MSN5': 'D1/D2 Medium Spiny Neurons, striatum',
    'MSN6': 'D1 Medium Spiny Neurons; mouse',
    'TEGLU1': 'Cortical projection neurons; mouse',
    'TEGLU2': 'Cortical projection neurons; mouse',
    'TEGLU3': 'Cortical projection neurons; mouse',
    'TEGLU4': 'Cortical projection neurons; mouse',
    'TEGLU5': 'Cortical projection neurons; mouse',
    'TEGLU6': 'Cortical projection neurons; mouse',
    'TEGLU7': 'Cortical projection neurons; mouse',
    'TEGLU8': 'Cortical projection neurons; mouse',
    'TEGLU9': 'Cortical projection neurons; mouse',
    'TEGLU10': 'Cortical projection neurons; mouse',
    'TEGLU11': 'Cortical projection neurons; mouse',
    'TEGLU12': 'Cortical projection neurons; mouse',
    'TEGLU13': 'Cortical projection neurons; mouse',
    'TEGLU14': 'Cortical projection neurons; mouse',
    'TEGLU15': 'Cortical projection neurons; mouse',
    'TEGLU16': 'Cortical projection neurons; mouse',
    'TEGLU17': 'Cortical projection neurons; mouse',
    'TEGLU18': 'Cortical projection neurons; mouse',
    'TEGLU19': 'Cortical projection neurons; mouse',
    'TEGLU20': 'Cortical projection neurons; mouse',
    'TECHO': 'Cholinergic interneurons; mouse',
    'DECHO1': 'Cholinergic interneurons; mouse',
    'VLMC1': 'Vascular leptomeningeal cells; mouse',
    'VLMC2': 'Vascular leptomeningeal cells; mouse',
    'ABC': 'Vascular leptomeningeal cells; mouse',
    'ACTE1': 'Astrocytes; mouse',
    'ACTE2': 'Astrocytes; mouse',
    'ACMB': 'Astrocytes; mouse',
    'ACNT1': 'Astrocytes; mouse',
    'ACNT2': 'Astrocytes; mouse',
    'VECA' : 'Vascular; mouse',
    'VSMCA' : 'Vascular; mouse',
    'PER1' : 'Vascular; mouse',
    'PER2' : 'Vascular; mouse',
    'PER3' : 'Vascular; mouse',
    'VECC' : 'Vascular; mouse',
    'VECV' : 'Vascular; mouse',
    'PVM1' : 'Immune cells; mouse',
    'PVM2' : 'Immune cells; mouse',
    'MGL3' : 'Immune cells; mouse',
    'MGL2' : 'Immune cells; mouse',
    'MGL1' : 'Immune cells; mouse',
    'RGDG' : 'Dentate gyrus radial glia-like cells',
    'RGSZ' : 'Subventricular zone radial glia-like cells',
    'SEPNBL' : 'Glutamatergic neuroblasts, pallidum; mouse'
    # Disabled entries, kept for reference:
#     ,
#     'TEINH2': 'Inhibitory neurons, septal nucleus',
#     'MEINH1': 'Inhibitory neurons, midbrain',
#     'MEINH14': 'Inhibitory neurons, midbrain'
}

celltypes_to_discard = ['Ependymal cells']

# Fallback labels: for every cluster, take the first TaxonomyRank4 value
# (or any other annotation column) recorded for that ClusterName.
annotated_column = 'ClusterName'
annotation_column = 'TaxonomyRank4'
annot_cluster = adata_all.obs[[annotated_column, annotation_column]]
annot_dict = annot_cluster.groupby(annotated_column)[annotation_column].first().to_dict()

In [46]:
# First apply the curated labels, then map every remaining cluster name to its
# fallback annotation.
cluster_names = adata_all.obs['ClusterName']
curated_labels = cluster_names.replace(celltypes_to_keep_dict)
adata_all.obs['Celltype_assigned'] = curated_labels.replace(annot_dict)
adata_all.obs['Celltype_assigned'].value_counts(sort=True)

Di- and mesencephalon excitatory neurons       22
Cortical projection neurons; mouse             20
Telencephalon inhibitory interneurons          20
Di- and mesencephalon inhibitory neurons       16
Peptidergic neurons                            15
Hindbrain neurons                              15
Cholinergic and monoaminergic neurons          12
Spinal cord excitatory neurons                 11
Spinal cord inhibitory neurons                 10
Enteric neurons                                 9
Oligodendrocytes                                9
Olfactory inhibitory neurons                    9
Enteric glia                                    8
Peripheral sensory peptidergic neurons          8
Vascular; mouse                                 7
Cerebellum neurons                              6
Peripheral sensory non-peptidergic neurons      6
Astrocytes; mouse                               5
Sympathetic noradrenergic neurons               5
Immune cells; mouse                             5


In [47]:
selected_regions = ['Striatum', 'Midbrain ventral', 'Cortex', 'CNS']

# Region is matched by substring (regex alternation of the selected regions);
# missing Region values count as "no match" instead of propagating NaN into
# the boolean mask.
region_mask = adata_all.obs['Region'].str.contains('|'.join(selected_regions), na=False)

# Cluster names are matched exactly. A joined-regex substring match would also
# pick up clusters that are not in celltypes_to_keep_dict (e.g. 'TEGLU2'
# matches 'TEGLU21', 'ABC' matches any name containing it), which is
# inconsistent with the exact-match relabelling done via replace().
cluster_mask = adata_all.obs['ClusterName'].isin(list(celltypes_to_keep_dict.keys()))

adata_selected = adata_all[region_mask | cluster_mask, :]
# adata_selected = adata_selected[adata_selected.obs['Celltype_assigned'] != 'Ependymal cells', :]
adata_selected

View of AnnData object with n_obs × n_vars = 87 × 27998
    obs: 'Class', 'ClusterName', 'Description', 'Location_based_on', 'Neurotransmitter', 'Region', 'MarkerGenes', 'TaxonomyRank1', 'TaxonomyRank2', 'TaxonomyRank3', 'TaxonomyRank4', 'Celltype_assigned'
    var: 'Accession', 'Gene'

In [48]:
# This makes it a “real” AnnData object
# adata_selected was produced by slicing adata_all, so it is a view at this
# point. Assigning a new obs column presumably makes anndata materialise an
# independent copy (TODO confirm); as a side effect obs gains a constant
# 'value' column.
adata_selected.obs["value"] = 0 

  adata_selected.obs["value"] = 0


In [49]:
# Gene symbol with everything from the first '.' onwards removed; symbols
# without a '.' are kept unchanged (split returns the whole string).
adata_selected.var['Gene_no_alt'] = [gene.split('.')[0] for gene in adata_selected.var['Gene']]

In [50]:
# Collapse var entries that share the same 'Gene_no_alt' symbol, then index var
# by that symbol under the index name 'symbol'.
adata_selected = merge_gene_symbol_duplicates(adata_selected, symbol_column='Gene_no_alt')
adata_selected.var.index = pd.Index(adata_selected.var['Gene_no_alt'], name='symbol')
adata_selected.var

Scaled from 27998 genes incl. alternative splicing to 27794 genes without alternative splicing variants


Unnamed: 0_level_0,Accession,Gene,Gene_no_alt,value
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0610007P14RIK,ENSMUSG00000021252,0610007P14Rik,0610007P14RIK,0
0610009B22RIK,ENSMUSG00000007777,0610009B22Rik,0610009B22RIK,0
0610009L18RIK,ENSMUSG00000043644,0610009L18Rik,0610009L18RIK,0
0610009O20RIK,ENSMUSG00000024442,0610009O20Rik,0610009O20RIK,0
0610010F05RIK,ENSMUSG00000042208,0610010F05Rik,0610010F05RIK,0
...,...,...,...,...
ZYG11A,ENSMUSG00000034645,Zyg11a,ZYG11A,0
ZYG11B,ENSMUSG00000034636,Zyg11b,ZYG11B,0
ZYX,ENSMUSG00000029860,Zyx,ZYX,0
ZZEF1,ENSMUSG00000055670,Zzef1,ZZEF1,0


In [51]:
# Export path (without file extension) inside the dated output folder.
export_name = f'{output_folder}L5_agg_CTX_M_STR_CNS_selection_1000_astmerged_Tax4'

In [52]:
# Show the resolved export path.
export_name

'../../data/hesc_tx/L5_011222/L5_agg_CTX_M_STR_CNS_selection_1000_astmerged_Tax4'

In [53]:
# Write the selected AnnData to '<export_name>.loom'.
adata_selected.write_loom(f'{export_name}.loom')