In [1]:
# Imports
import numpy as np
import pandas as pd
import scanpy.api as sc
import timeit
import glob
import os

In [2]:
# Set path prefix
path_prefix = '/soe/apblair/sc_analysis/HoC_data/human/matrices_tsv/*.csv'

In [3]:
# Create ensembl to Hugo conversion dictionary
symbol2ensemble = pd.read_csv('/projects/sysbio/users/hzgong/pablo/cluster/brain_of_cells/v4/resources/gene_id_conversion.tsv', sep='\t', index_col=1)['ensembl']
symbol2ensemble = symbol2ensemble[symbol2ensemble.notnull()]
ensembl2symbol = dict(zip(symbol2ensemble.values,symbol2ensemble.index.values))

In [4]:
# Check how many different cell ages are present
cell_batch = set()
for batches in glob.glob(path_prefix):
    cell_batch.add(batches.split('/')[-1][2:5])

In [5]:
cell_dict = {key: [] for key in list(cell_batch)}
for sc_data in glob.glob(path_prefix):
    
    # Create directory path and denote by cell type and age
    directory = '/soe/apblair/sc_analysis/' + sc_data.split('/')[-1].split('.')[0]
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print ('Error: Creating directory. ' +  directory)

    for age in cell_dict.keys():
            if age == sc_data.split('/')[-1][2:5]:
                # Create anndata structure for cell batch data
                adata = sc.read(sc_data, cache=True, first_column_names=True).T
                # Add cell type to the cell batch's barcode
                adata.obs = adata.obs.rename(index={key: sc_data.split('/')[-1].split('.')[0].split('_')[-1] + "_" + key for key in adata.obs.index.tolist()})
                # Add metadata batch information to anndata object
                adata.obs['batch_name'] = sc_data.split('/')[-1].split('.')[0]
                # Convert gene ID's from Ensembl to Hugo
                adata.var = adata.var.rename(index=ensembl2symbol)
                # Append cell data to dictionary
                cell_dict[age].append({sc_data.split('/')[-1].split('.')[0].split('_')[-1]: adata})
                

In [6]:
# # Concatenate each cell batch's data sets
concatenated_cell_dict = {key: None for key in list(cell_batch)}
for keys, values in cell_dict.items():
    concatenated_cell_dict[keys] = values[0][list(values[0].keys())[-1]].concatenate([list(items.values())[0] for items in values[1:]], join='outer')

Making variable names unique for controlled concatenation.
Making variable names unique for controlled concatenation.


In [7]:
# Check concatenation
check_concatenation = set()
for keys, values in concatenated_cell_dict.items():
    for items in values.obs.index.tolist():
        check_concatenation.add(items.split("_")[0])
    print(keys, values)
    print(check_concatenation)

13W AnnData object with n_obs × n_vars = 23594 × 33694 
    obs: 'batch_name', 'batch'
{'LA', 'LV', 'RV', 'IVS', 'RA'}
11W AnnData object with n_obs × n_vars = 19411 × 33694 
    obs: 'batch_name', 'batch'
{'LA', 'LV', 'RV', 'IVS', 'RA'}


In [10]:
# Create a batch name to integer mapping dictionary,
# in case a user is interested in running a batch
# correction with regress.out function

batch_mapping_dict = {}

inc = 0
for keys in concatenated_cell_dict.keys():
    for k in concatenated_cell_dict[keys].obs['batch'].unique():
        batch_mapping_dict[k] = inc
        inc += 1

for keys in concatenated_cell_dict.keys():
    concatenated_cell_dict[keys].obs["batch"].replace(batch_mapping_dict, inplace=True)

In [12]:
concatenated_cell_dict['13W'].obs

Unnamed: 0,batch_name,batch
RV_AAACCTGAGTGGTAAT-1-0,EF13W3D_RV,5
RV_AAACCTGCAAGTACCT-1-0,EF13W3D_RV,5
RV_AAACCTGCAATGTTGC-1-0,EF13W3D_RV,5
RV_AAACCTGCACTCGACG-1-0,EF13W3D_RV,5
RV_AAACCTGGTAAGGGCT-1-0,EF13W3D_RV,5
RV_AAACCTGGTAATAGCA-1-0,EF13W3D_RV,5
RV_AAACCTGGTCAGAAGC-1-0,EF13W3D_RV,5
RV_AAACCTGTCAATCACG-1-0,EF13W3D_RV,5
RV_AAACCTGTCACCGTAA-1-0,EF13W3D_RV,5
RV_AAACCTGTCCCTAACC-1-0,EF13W3D_RV,5
