In [1]:
import numpy as np
import pandas as pd
import scanpy.api as sc
import timeit
import glob
import os

In [2]:
# Set path prefix
path_prefix = '/soe/apblair/sc_analysis/HoC_data/human/matrices_tsv/*.csv'

In [3]:
# Create ensembl to Hugo conversion dictionary
symbol2ensemble = pd.read_csv('/projects/sysbio/users/hzgong/pablo/cluster/brain_of_cells/v4/resources/gene_id_conversion.tsv', sep='\t', index_col=1)['ensembl']
symbol2ensemble = symbol2ensemble[symbol2ensemble.notnull()]
ensembl2symbol = dict(zip(symbol2ensemble.values,symbol2ensemble.index.values))

In [4]:
# Check how many different cell ages are present
cell_batch = set()
for batches in glob.glob(path_prefix):
    cell_batch.add(batches.split('/')[-1][2:5])

In [5]:
# Create a dictionary where the key is the cell age and the value
# is the cell batch's left ventricle data set in an anndata structure
cell_dict = {k:None for k in cell_batch}
for key in cell_dict.keys():
    for batches in glob.glob(path_prefix):
        if batches.split('/')[-1][2:5] == key and batches.split('/')[-1].split('_')[-1].split('.')[0] == 'LV':
            # Create anndata structure for left ventricle data set
            lv_cell = sc.read(batches, cache=True, first_column_names=True).T
            # Add cell type to the cell batch's barcode
            lv_cell.obs = lv_cell.obs.rename(index={key: batches.split('/')[-1].split('_')[-1].split('.')[0] + "_" + key for key in lv_cell.obs.index.tolist()})
            # Add metadata batch information to anndata object
            lv_cell.obs['batch_name'] = batches.split('/')[-1].split('.')[0]
            # Convert gene ID's from Ensembl to Hugo
            lv_cell.var = lv_cell.var.rename(index=ensembl2symbol)
            cell_dict[key] = lv_cell

In [6]:
for sc_data in glob.glob(path_prefix):
    
    # Create directory path and denote by cell type and age
    directory = '/soe/apblair/sc_analysis/' + sc_data.split('/')[-1].split('.')[0]
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print ('Error: Creating directory. ' +  directory)
  
    # Pass over the left ventricle data set, because it was the first anndata object created for the cell batch
    if sc_data.split('/')[-1].split('.')[0].split('_')[-1] == 'LV':
        pass
    
    else:
        print(sc_data.split('/')[-1].split('.')[0].split('_')[-1])
        for age in cell_dict.keys():
            if age == sc_data.split('/')[-1][2:5]:
                # Create anndata structure for cell batch data
                adata = sc.read(sc_data, cache=True, first_column_names=True).T
                # Add cell type to the cell batch's barcode
                adata.obs = adata.obs.rename(index={key: sc_data.split('/')[-1].split('.')[0].split('_')[-1] + "_" + key for key in adata.obs.index.tolist()})
                # Add metadata batch information to anndata object
                adata.obs['batch_name'] = sc_data.split('/')[-1].split('.')[0]
                # Convert gene ID's from Ensembl to Hugo
                adata.var = adata.var.rename(index=ensembl2symbol)
                # Concatenate the anndata structure for each cell type in the batch
                cell_dict[age].concatenate(adata, join='outer')
        break

Making variable names unique for controlled concatenation.


In [10]:
cell_dict['13W'].obs.index.tolist()[-1]

'LV_TTTGTCATCTTTACGT-1'