In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import anndata
import scanpy as sc
import scanpy_scripts as ss
import bbknn
import phate

# Widen numpy's printed line width to 180 characters so wide arrays wrap less.
np.set_printoptions(linewidth=180)
# Scanpy logging verbosity.
# NOTE(review): level 1 presumably means errors + warnings only — confirm against the scanpy docs.
sc.settings.verbosity = 1
# Colormap and default figure size come from the scanpy_scripts helper library.
# expr_cmap is not used in the visible part of this notebook.
expr_cmap = ss.lib.expression_colormap()
ss.lib.set_figsize((4, 4))

In [65]:
def read_10x_mtx(path, genome='GRCh38', **kwargs):
    """Read a 10x matrix directory and report which file layout it uses.

    The matrix is first looked for directly in ``path``; if that read fails,
    ``{path}/{genome}`` is tried instead.

    The version label is derived from the files in the directory that was
    actually read, not from which attempt succeeded: a directory containing
    ``genes.tsv`` is the legacy layout ('v2'), anything else is 'v3'. This
    keeps the label correct when ``path`` already points at the genome
    sub-directory of a legacy output.

    Parameters
    ----------
    path : str
        Matrix directory, or the parent of a ``{genome}`` sub-directory.
    genome : str
        Sub-directory name tried when reading ``path`` directly fails.
    **kwargs
        Forwarded unchanged to ``sc.read_10x_mtx``.

    Returns
    -------
    (ad, version) : tuple
        The object returned by ``sc.read_10x_mtx`` and 'v2' or 'v3'.

    Raises
    ------
    Exception
        Whatever ``sc.read_10x_mtx`` raises if the fallback read also fails.
    """
    import os  # local import so this cell does not depend on the import cell being edited

    try:
        matrix_dir = path
        ad = sc.read_10x_mtx(matrix_dir, **kwargs)
    except Exception:
        # Best-effort fallback, but no longer a bare `except:` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        matrix_dir = f'{path}/{genome}'
        ad = sc.read_10x_mtx(matrix_dir, **kwargs)

    # Honour an optional file-name prefix passed through to scanpy.
    prefix = kwargs.get('prefix') or ''
    legacy_genes_file = os.path.join(matrix_dir, f'{prefix}genes.tsv')
    version = 'v2' if os.path.isfile(legacy_genes_file) else 'v3'
    return ad, version

In [2]:
# Root directory of the Cell Ranger outputs, relative to the notebook's working directory.
cellranger_dir = '../data/skin_organoid_koehler'

In [35]:
# Tab-separated sample sheet; the loading loop reads its day, week, strain and path columns.
sample_info = pd.read_csv(f'{cellranger_dir}/samples.tsv', sep='\t')

In [36]:
# Display the sample sheet.
sample_info

Unnamed: 0,day,week,strain,path
0,day-133,17-20_fetal_wks,DSP-1,filtered_feature_bc_matrix
1,day-133,17-20_fetal_wks,DSP-2,filtered_feature_bc_matrix
2,day-133,17-20_fetal_wks,DSP-3,filtered_feature_bc_matrix
3,day-133,17-20_fetal_wks,WA25-1,filtered_feature_bc_matrix
4,day-133,17-20_fetal_wks,WA25-2,filtered_feature_bc_matrix
5,day-133,17-20_fetal_wks,WA25-3,filtered_feature_bc_matrix
6,day-29,4-7_fetal_wks,DSP,filtered_gene_bc_matrices/GRCh38
7,day-29,4-7_fetal_wks,WA25,filtered_gene_bc_matrices/GRCh38
8,day-48,7-10_fetal_wks,DSP,filtered_feature_bc_matrix
9,day-48,7-10_fetal_wks,WA25,filtered_feature_bc_matrix


In [66]:
# Accumulator for one AnnData per sample; filled by the loading loop.
ads = []

In [67]:
# Load one AnnData per row of the sample sheet.
# The list is rebuilt from scratch here so that re-running this cell never
# appends duplicate samples to a list left over from an earlier run.
ads = []
for _, row in sample_info.iterrows():
    # Layout: <root>/<day>_skin_organoids_<week>/cellranger_outputs/<strain>/outs/<path>
    mtx_dir = (
        f'{cellranger_dir}/{row["day"]}_skin_organoids_{row["week"]}'
        f'/cellranger_outputs/{row["strain"]}/outs/{row["path"]}'
    )
    ad, version = read_10x_mtx(mtx_dir, var_names='gene_ids')
    # Tag every cell of this sample with its sample-level metadata.
    for key in ('day', 'week', 'strain'):
        ad.obs[key] = row[key]
    ad.obs['version'] = version
    ads.append(ad)

In [77]:
# Save each sample to its own lzf-compressed h5ad file, named by day, week and strain.
for ad in ads:
    # The loading loop assigns one scalar per column, so the first row is
    # representative of the whole sample. `.iloc` is used because obs is indexed
    # by barcode strings and integer keys on such a Series rely on a deprecated
    # positional fallback.
    first_cell = ad.obs.iloc[0]
    day, week, strain = first_cell['day'], first_cell['week'], first_cell['strain']
    ad.write(f'{day}_{week}_{strain}.raw.h5ad', compression='lzf')

... storing 'day' as categorical
... storing 'week' as categorical
... storing 'strain' as categorical
... storing 'version' as categorical
... storing 'gene_symbols' as categorical
... storing 'feature_types' as categorical
... storing 'day' as categorical
... storing 'week' as categorical
... storing 'strain' as categorical
... storing 'version' as categorical
... storing 'gene_symbols' as categorical
... storing 'feature_types' as categorical
... storing 'day' as categorical
... storing 'week' as categorical
... storing 'strain' as categorical
... storing 'version' as categorical
... storing 'gene_symbols' as categorical
... storing 'feature_types' as categorical
... storing 'day' as categorical
... storing 'week' as categorical
... storing 'strain' as categorical
... storing 'version' as categorical
... storing 'gene_symbols' as categorical
... storing 'feature_types' as categorical
... storing 'day' as categorical
... storing 'week' as categorical
... storing 'strain' as categoric

In [95]:
# Run the scanpy_scripts scrublet helper on each per-sample AnnData.
# NOTE(review): the return value is discarded — presumably run_scrublet annotates `ad` in place; confirm.
# NOTE(review): this cell's execution count (95) is later than the concatenate (68) and final write (94)
# cells, so the saved concatenated object may not contain its results — confirm the intended order.
for ad in ads:
    ss.lib.run_scrublet(ad)

[AnnData object with n_obs × n_vars = 12403 × 33538 
     obs: 'day', 'week', 'strain', 'version'
     var: 'gene_symbols', 'feature_types',
 AnnData object with n_obs × n_vars = 11330 × 33538 
     obs: 'day', 'week', 'strain', 'version'
     var: 'gene_symbols', 'feature_types',
 AnnData object with n_obs × n_vars = 14624 × 33538 
     obs: 'day', 'week', 'strain', 'version'
     var: 'gene_symbols', 'feature_types',
 AnnData object with n_obs × n_vars = 13184 × 33538 
     obs: 'day', 'week', 'strain', 'version'
     var: 'gene_symbols', 'feature_types',
 AnnData object with n_obs × n_vars = 16779 × 33538 
     obs: 'day', 'week', 'strain', 'version'
     var: 'gene_symbols', 'feature_types',
 AnnData object with n_obs × n_vars = 11960 × 33538 
     obs: 'day', 'week', 'strain', 'version'
     var: 'gene_symbols', 'feature_types',
 AnnData object with n_obs × n_vars = 9144 × 33694 
     obs: 'day', 'week', 'strain', 'version'
     var: 'gene_symbols',
 AnnData object with n_obs × n_

In [68]:
# Stack all per-sample AnnData objects into one; `sample_id` records which input each cell came from.
# NOTE(review): AnnData.concatenate is deprecated in newer anndata in favour of anndata.concat,
# whose defaults differ — migrate deliberately, since later cells rely on the '-0' column suffix.
organoid_ad = anndata.AnnData.concatenate(*ads, batch_key='sample_id')

In [69]:
# Inspect the concatenated object (dimensions, obs and var columns).
organoid_ad

AnnData object with n_obs × n_vars = 147089 × 32991 
    obs: 'day', 'sample_id', 'strain', 'version', 'week'
    var: 'gene_symbols-0', 'feature_types-0', 'gene_symbols-1', 'feature_types-1', 'gene_symbols-2', 'feature_types-2', 'gene_symbols-3', 'feature_types-3', 'gene_symbols-4', 'feature_types-4', 'gene_symbols-5', 'feature_types-5', 'gene_symbols-6', 'gene_symbols-7', 'gene_symbols-8', 'feature_types-8', 'gene_symbols-9', 'feature_types-9', 'gene_symbols-10', 'gene_symbols-11'

In [72]:
# Reduce var to a single gene-symbol column, taken from the first sample's
# 'gene_symbols-0'; all other per-sample columns are dropped.
gene_symbols = organoid_ad.var['gene_symbols-0'].rename('gene_symbols')
organoid_ad.var = gene_symbols.to_frame()

In [88]:
# Re-inspect the object after the var columns were cleaned up.
organoid_ad

AnnData object with n_obs × n_vars = 147089 × 32991 
    obs: 'day', 'sample_id', 'strain', 'version', 'week'
    var: 'gene_symbols', 'gene_ids'

In [80]:
# Number of cells per sample, ordered by sample_id.
organoid_ad.obs.sample_id.value_counts().sort_index()

0     12403
1     11330
2     14624
3     13184
4     16779
5     11960
6      9144
7      9356
8     13914
9     15903
10     9556
11     8936
Name: sample_id, dtype: int64

In [85]:
# Relabel the cells of samples 6, 7, 10 and 11 as 'v2'; all other cells keep their label.
# The column is handled as plain strings while it is edited and re-encoded as
# categorical at the end.
# NOTE(review): the sample ids are hard-coded — presumably the legacy-layout samples; confirm against the sample sheet.
organoid_ad.obs['version'] = (
    organoid_ad.obs['version']
    .astype(str)
    .mask(organoid_ad.obs['sample_id'].isin(['6', '7', '10', '11']), 'v2')
    .astype('category')
)

In [86]:
# Number of cells per version label, to check the relabelling.
organoid_ad.obs.version.value_counts().sort_index()

v2     36992
v3    110097
Name: version, dtype: int64

In [89]:
# Number of cells per strain label; replicate suffixes such as '-1' are still present here.
organoid_ad.obs.strain.value_counts().sort_index()

DSP       32614
DSP-1     12403
DSP-2     11330
DSP-3     14624
WA25      34195
WA25-1    13184
WA25-2    16779
WA25-3    11960
Name: strain, dtype: int64

In [92]:
# Preview the strain label with everything from the first '-' onwards removed; nothing is assigned here.
organoid_ad.obs['strain'].str.partition('-')[0]

AAACCCAAGAAATTGC-1-0      DSP
AAACCCAAGAATACAC-1-0      DSP
AAACCCAAGCTAAACA-1-0      DSP
AAACCCAAGTGAACAT-1-0      DSP
AAACCCACAATAGTCC-1-0      DSP
                         ... 
TTTGTCACATGCCCGA-1-11    WA25
TTTGTCAGTAGTACCT-1-11    WA25
TTTGTCATCAAGAAGT-1-11    WA25
TTTGTCATCAGGCAAG-1-11    WA25
TTTGTCATCTTGTACT-1-11    WA25
Name: 0, Length: 147089, dtype: object

In [93]:
# Collapse replicate labels such as 'DSP-1' to the bare strain name ('DSP') by
# keeping only the text before the first '-', then store the result as categorical.
organoid_ad.obs['strain'] = (
    organoid_ad.obs['strain']
    .astype(str)
    .str.split('-', n=1)
    .str[0]
    .astype('category')
)

In [3]:
# Keep the current var_names (the samples were loaded with var_names='gene_ids')
# in a 'gene_ids' column before they are replaced by gene symbols.
organoid_ad.var['gene_ids'] = organoid_ad.var_names

In [4]:
# Use gene symbols as var_names.
# NOTE(review): the symbols are not unique (see the warning this cell emits); consider
# organoid_ad.var_names_make_unique() if downstream code indexes genes by name.
organoid_ad.var_names = organoid_ad.var['gene_symbols'].values

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [94]:
# Persist the concatenated object as an lzf-compressed h5ad file in the current working directory.
organoid_ad.write('organoid_concatenated.raw.h5ad', compression='lzf')