## Notebook setup

In [None]:
import scanpy as sc
import scanpy.external as sce
import numpy as np
import pandas as pd
import warnings, scipy.sparse as sp, matplotlib, matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.pyplot import rc_context
from collections import Counter
import matplotlib.font_manager
import pyreadr
import rpy2
from rpy2.robjects.packages import importr
import rpy2.robjects as robjects
import magic
#import seaborn as sns
import palantir
import loompy
#from scipy.sparse import csgraph

# Global Matplotlib defaults: font type 42 for PDF/PS export, Arial as the sans-serif face, 14 pt base size.
matplotlib.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'font.family': 'sans-serif',
    'font.sans-serif': 'Arial',
    'font.size': 14,
})

# Let pandas print up to 200 rows before truncating.
pd.options.display.max_rows = 200

# scanpy figure defaults, quiet logging, and a header listing package versions.
sc.set_figure_params(
    dpi=80,
    dpi_save=300,
    color_map='Spectral_r',
    vector_friendly=True,
    transparent=True,
)
sc.settings.verbosity = 0  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()

In [None]:
# preset color palettes and color maps
# Categorical palette (23 entries). Every entry must be distinct, otherwise two categories
# become indistinguishable in a plot: the second occurrence of '#B5EFB5' (index 20) was
# replaced by '#DEA0FD'.
user_defined_palette = [
    '#F6222E', '#16FF32', '#3283FE', '#FEAF16', '#BDCDFF', '#3B00FB',
    '#1CFFCE', '#C075A6', '#F8A19F', '#B5EFB5', '#FBE426', '#C4451C',
    '#2ED9FF', '#c1c119', '#8b0000', '#FE00FA', '#1CBE4F', '#1C8356',
    '#0e452b', '#AA0DFE', '#DEA0FD', '#325A9B', '#90AD1C',
]

# Sequential ramp from pale blue to pure blue.
user_defined_cmap_markers = LinearSegmentedColormap.from_list(
    name='mycmap',
    colors=[
        "#E6E6FF", "#CCCCFF", "#B2B2FF", "#9999FF",
        "#6666FF", "#3333FF", "#0000FF",
    ],
)
# Diverging ramp: pure blue -> pale blue -> pale green -> pure green.
user_defined_cmap_degs = LinearSegmentedColormap.from_list(
    name='mycmap',
    colors=[
        "#0000FF", "#3333FF", "#6666FF", "#9999FF", "#B2B2FF", "#CCCCFF", "#E6E6FF",
        "#E6FFE6", "#CCFFCC", "#B2FFB2", "#99FF99", "#66FF66", "#33FF33", "#00FF00",
    ],
)

In [None]:
# Render Matplotlib figures inside the notebook output cells.
%matplotlib inline 

## Perform quality control and clean-up samples

### Load cellranger output files

In [None]:
# Cell Ranger filtered MEX matrix directory for the `20mo_Foxn1CreRosa26tdTom_f` sample.
path_to_10xdata = "../data/external/10xGenomics/Chromium/Foxn1_lineage_tracing/20mo_Foxn1CreRosa26tdTom_f/cr-results/outs/filtered_feature_bc_matrix/"

# Read the matrix with genes indexed by symbol, without using the on-disk cache.
adata_20mo_tmt_f = sc.read_10x_mtx(path_to_10xdata, var_names="gene_symbols", cache=False)
# De-duplicate the symbol index (not needed when `var_names='gene_ids'` is used in `sc.read_10x_mtx`).
adata_20mo_tmt_f.var_names_make_unique()


In [None]:
# Cell Ranger filtered MEX matrix directory for the `20mo_Foxn1CreRosa26tdTom_m` sample.
path_to_10xdata = "../data/external/10xGenomics/Chromium/Foxn1_lineage_tracing/20mo_Foxn1CreRosa26tdTom_m/cr-results/outs/filtered_feature_bc_matrix/"

# Read the matrix with genes indexed by symbol, without using the on-disk cache.
adata_20mo_tmt_m = sc.read_10x_mtx(path_to_10xdata, var_names="gene_symbols", cache=False)
# De-duplicate the symbol index (not needed when `var_names='gene_ids'` is used in `sc.read_10x_mtx`).
adata_20mo_tmt_m.var_names_make_unique()

In [None]:
# directory where the 10X MEX-format gene-barcode matrices are stored
# (Cell Ranger output of the `20mo_wt_ctrl_m` sample)
path_to_10xdata = "../data/external/10xGenomics/Chromium/Foxn1_lineage_tracing/20mo_wt_ctrl_m/cr-results/outs/filtered_feature_bc_matrix/"

# `cache=False` matches the two other sample loads in this notebook: the matrix is always
# parsed from the MEX files, so a stale cache file can never be picked up silently.
adata_20mo_wt_m = sc.read_10x_mtx(path = path_to_10xdata,
                        var_names = "gene_symbols",
                        cache = False)
adata_20mo_wt_m.var_names_make_unique()  # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`

In [None]:
# Stack the per-sample objects into one AnnData. Passing a mapping makes each dictionary key
# the value written to obs['sample'] for its object, so names and objects cannot drift out of order.
# Add further samples as extra `name: object` entries.
adata = sc.concat(
    {
        'mo20_CD45neg_Foxn1creTom_f': adata_20mo_tmt_f,
        'mo20_CD45neg_Foxn1creTom_m': adata_20mo_tmt_m,
        'mo20_CD45neg_wt_m': adata_20mo_wt_m,
    },
    label='sample',
    join='outer',
    index_unique='@',  # delimiter between the original barcode and the sample key
)

In [None]:
adata.raw = adata  # keep a copy of the raw adata
# Randomize the order of cells (fixed seed for reproducibility) so that plotting order does
# not follow the sample order produced by the concatenation.
np.random.seed(42)
index_list = np.arange(adata.shape[0])
np.random.shuffle(index_list)
# `.copy()` materialises the reordered object; without it `adata` is only a view of the
# unshuffled parent, and the obs/var assignments in the following cells would have to
# convert it implicitly.
adata = adata[index_list].copy()

In [None]:
# Dimensions of the combined, shuffled object.
adata.shape

In [None]:
# Dataset-wide metadata: every cell receives the same stage and day label.
adata.obs['stage'], adata.obs['day'] = '20mo', 'd0'

In [None]:
# One color per metadata column; a single entry suffices because `stage` and `day` each hold
# a single value. Presumably scanpy reads `<obs key>_colors` from `uns` when plotting — TODO confirm.
adata.uns['stage_colors'] =  ['#FF8072']
adata.uns['day_colors'] = ['#0080FF']

### Calculate quality control metrics and perform standard data clean-up

In [None]:
# Add scanpy's per-cell and per-gene QC columns to obs / var.
sc.pp.calculate_qc_metrics(adata, inplace=True)
# Preserve the sequencing depth measured before any filtering or normalization, both on the
# linear and on the log10 scale.
adata.obs['original_total_counts'] = adata.obs['total_counts']
adata.obs['log10_original_total_counts'] = np.log10(adata.obs['total_counts'])

In [None]:
# Flag gene families by symbol prefix. `str.startswith` compares literal prefixes (it is not
# a regular expression); both upper-case and title-case spellings are listed.
# mitochondrial genes
adata.var['mt'] = adata.var_names.str.startswith(('MT-', 'mt-'))
# ribosomal genes
adata.var['ribo'] = adata.var_names.str.startswith(('RPS', 'RPL', 'Rps', 'Rpl'))
# hemoglobin genes.
# The former prefixes '^Hb' / '^HB' contained a regex anchor that `startswith` takes
# literally, so no gene was ever flagged. The prefixes below target the globin gene families
# only, so unrelated symbols that merely begin with "Hb" (e.g. Hbegf) are left alone.
# NOTE(review): confirm this prefix list against the gene annotation in use. Hemoglobin genes
# are now really flagged (and dropped by the later gene-removal cell), so re-check the
# hard-coded cluster IDs chosen further down.
hb_prefixes = ('Hba-', 'Hbb-', 'Hbq1', 'HBA', 'HBB', 'HBD', 'HBE1', 'HBG', 'HBM', 'HBQ', 'HBZ')
adata.var['hb'] = adata.var_names.str.startswith(hb_prefixes)

# For each cell, compute the fraction of counts coming from each flagged gene family.
# `np.asarray(...).ravel()` flattens the (n_cells, 1) result of a sparse row sum into a plain
# 1-D array, and the per-cell total is computed only once.
counts_per_cell = np.asarray(adata.X.sum(axis=1)).ravel()
for frac_key, flag_key in (('mito_frac', 'mt'), ('ribo_frac', 'ribo'), ('hb_frac', 'hb')):
    family_mask = adata.var[flag_key].to_numpy()
    family_counts = np.asarray(adata.X[:, family_mask].sum(axis=1)).ravel()
    adata.obs[frac_key] = family_counts / counts_per_cell

#### Identify doublet cells

In [None]:
# Doublet scoring with Scrublet using a manually chosen score cutoff of 0.35 and a fixed seed.
# The `predicted_doublet` / `doublet_score` obs columns used later in this notebook are
# presumably written by this call — TODO confirm.
# NOTE(review): the three samples are scored together on the concatenated object; check
# whether per-sample scoring would be more appropriate.
sc.external.pp.scrublet(adata, threshold=0.35, random_state=42) # choose threshold manually

In [None]:
# check manual threshold
# (plots the doublet-score distribution so the cutoff chosen above can be inspected)
sc.external.pl.scrublet_score_distribution(adata)

#### Remove not expressed genes

In [None]:
# remove genes that are not expressed in any cells (remove columns with all 0s)
# `min_cells=1` keeps every gene detected in at least one cell; `adata` is filtered in place.
sc.pp.filter_genes(adata, min_cells=1)

#### Remove ribosomal protein and hemoglobin genes

In [None]:
# Drop every gene flagged as ribosomal-protein or hemoglobin in a single pass.
# `.copy()` turns the selection into a standalone AnnData so that the in-place
# normalization and log-transform below do not operate on a view of the unfiltered object.
genes_to_keep = ~(adata.var['ribo'] | adata.var['hb']).to_numpy()
adata = adata[:, genes_to_keep].copy()
# Dimensions after gene removal (the last expression is what the cell displays).
adata.shape

#### Normalize for each cell's library size

In [None]:
# `sc.pp.normalize_per_cell` is deprecated in scanpy; the two calls below reproduce what it did:
# 1) discard cells without any counts and record the per-cell totals in obs['n_counts'],
# 2) rescale every remaining cell to a total of 10,000 counts.
sc.pp.filter_cells(adata, min_counts=1)
sc.pp.normalize_total(adata, target_sum=10**4)

#### Log-transform counts

In [None]:
# Apply log(1 + x) to the library-size-normalized matrix in place.
sc.pp.log1p(adata)

### Select subset of principal components 

In [None]:
# Exploratory PCA on all genes with 200 components; the variance plot in the next cell is
# used to choose how many to keep.
rng = np.random.RandomState(42)
sc.tl.pca(
    adata,
    n_comps=200,
    use_highly_variable=False,
    svd_solver='arpack',
    random_state=rng,
)

In [None]:
def observe_variance(anndata_object):
    """Plot per-component and cumulative explained variance of a fitted PCA.

    Parameters
    ----------
    anndata_object
        Object whose ``.uns['pca']['variance_ratio']`` holds the fraction of variance
        explained by each principal component. It is only read, never modified.

    Returns
    -------
    None
        The two-panel figure is rendered via ``plt.show()``. Earlier versions returned the
        uncalled ``plt.show`` function, which made the cell print a function repr.
    """
    variance_ratio = np.asarray(anndata_object.uns['pca']['variance_ratio'])
    pc_index = np.arange(len(variance_ratio))  # shared x-axis for both panels

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

    # variance per principal component
    ax1.scatter(pc_index, variance_ratio, s=4)
    ax1.set_xlabel('PC')
    ax1.set_ylabel('Fraction of variance explained\n')
    ax1.set_title('Fraction of variance explained per PC\n')

    # cumulative variance explained
    ax2.scatter(pc_index, np.cumsum(variance_ratio), s=4)
    ax2.set_xlabel('PC')
    ax2.set_ylabel('Cumulative fraction of variance\nexplained')
    ax2.set_title('Cumulative fraction of variance\nexplained by PCs')

    fig.tight_layout()
    plt.show()


observe_variance(adata)

In [None]:
# Final PCA for the first pass: 70 components on all genes, overwriting the exploratory fit.
rng = np.random.RandomState(42)
sc.tl.pca(
    adata,
    n_comps=70,
    use_highly_variable=False,
    svd_solver='arpack',
    random_state=rng,
)

In [None]:
# Build the k-nearest-neighbour graph (k = 15) with a fixed seed.
sc.pp.neighbors(
    adata,
    n_neighbors=15,
    random_state=42,
)

In [None]:
# Compute the UMAP embedding with scanpy's default settings.
sc.tl.umap(adata)

#### Sample metadata

In [None]:
# UMAP panels colored by the sample-level annotations (stage, day, sample).
sc.set_figure_params(
    dpi=80, dpi_save=300, color_map='viridis', vector_friendly=False, transparent=True
)
sc.pl.umap(
    adata,
    color=['stage', 'day', 'sample'],
    color_map='Spectral_r', use_raw=False,
    ncols=15, wspace=0.3, size=15, sort_order=False,
    frameon=False, add_outline=True, outline_width=[0.6, 0.05],
)

#### QC metrics

In [None]:
# UMAP panels colored by QC metrics: sequencing depth, detected genes, ribosomal and
# mitochondrial count fractions.
sc.set_figure_params(
    dpi=80, dpi_save=300, color_map='viridis', vector_friendly=False, transparent=True
)
sc.pl.umap(
    adata,
    color=['log10_original_total_counts', 'n_genes_by_counts','ribo_frac', 'mito_frac'],
    palette=user_defined_palette, color_map='Spectral_r', use_raw=False,
    ncols=5, wspace=0.2, size=15, sort_order=False,
    frameon=False, add_outline=True, outline_width=[0.6, 0.05],
)

#### Potential contaminant populations

In [None]:
# UMAP panels colored by the expression of genes used to spot potential contaminant populations.
sc.set_figure_params(
    dpi=80, dpi_save=300, color_map='viridis', vector_friendly=False, transparent=True
)
sc.pl.umap(
    adata,
    color=['Ptprc', 'Spi1', 'Igkc', 'Gcm2', 'Pth'],
    palette=user_defined_palette, color_map='Spectral_r', use_raw=False,
    ncols=5, wspace=0.1, size=15, sort_order=False,
    frameon=False, add_outline=True, outline_width=[0.6, 0.05],
)

#### Canonical markers

In [None]:
# UMAP panels colored by the expression of canonical marker genes.
sc.set_figure_params(
    dpi=80, dpi_save=300, color_map='viridis', vector_friendly=False, transparent=True
)
sc.pl.umap(
    adata,
    color=[
        'Epcam', 'H2-Aa', 'Pdgfra', 'Pecam1', 'Cdh5',
        'Nkain4', 'Upk3b', 'Acta2', 'Myl9', 'S100b',
    ],
    palette=user_defined_palette, color_map='Spectral_r', use_raw=False,
    ncols=5, wspace=0.1, size=15, sort_order=False,
    frameon=False, add_outline=True, outline_width=[0.6, 0.05],
)

### Run unsupervised clustering analysis (Leiden)

In [None]:
# Leiden clustering over a sweep of resolutions; each run stores its labels in obs under
# 'leiden_<resolution>' (e.g. 'leiden_1.2').
for resolution_parameter in [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0]:
    sc.tl.leiden(
        adata,
        resolution=resolution_parameter,
        key_added=f'leiden_{resolution_parameter}',
        random_state=42,
    )

We can visualize the clustering to see which clusters match with the cells that we would like to filter out. Inspect the list of QC metrics and canonical markers to make your choice.

In [None]:
# UMAP panels for the Leiden clusterings at resolutions 0.2 to 1.0.
sc.set_figure_params(
    dpi=80, dpi_save=300, color_map='viridis', vector_friendly=False, transparent=True
)
sc.pl.umap(
    adata,
    color=[f'leiden_{res}' for res in (0.2, 0.4, 0.6, 0.8, 1.0)],
    palette=user_defined_palette, color_map='Spectral_r', use_raw=False,
    ncols=5, wspace=0.7, size=15, sort_order=False,
    frameon=False, add_outline=True, outline_width=[0.6, 0.05],
)

In [None]:
# UMAP panels for the Leiden clusterings at resolutions 1.2 to 2.0.
sc.set_figure_params(
    dpi=80, dpi_save=300, color_map='viridis', vector_friendly=False, transparent=True
)
sc.pl.umap(
    adata,
    color=[f'leiden_{res}' for res in (1.2, 1.4, 1.6, 1.8, 2.0)],
    palette=user_defined_palette, color_map='Spectral_r', use_raw=False,
    ncols=5, wspace=0.7, size=15, sort_order=False,
    frameon=False, add_outline=True, outline_width=[0.6, 0.05],
)

In [None]:
# Single UMAP panel for the resolution-1.2 clustering, the one used for cluster-based filtering below.
sc.set_figure_params(
    dpi=80, dpi_save=300, color_map='viridis', vector_friendly=False, transparent=True
)
sc.pl.umap(
    adata,
    color=['leiden_1.2'],
    palette=user_defined_palette, color_map='Spectral_r', use_raw=False,
    ncols=5, wspace=0.7, size=15, sort_order=False,
    frameon=False, add_outline=True, outline_width=[0.6, 0.05],
)

### Filter out bad quality cells by cluster

Choose which clusters you want to remove, and add them to the list clusters_to_remove below.

In [None]:
# Resolution-1.2 Leiden clusters selected for removal (labels are strings).
clusters_to_remove = ['0', '1', '2', '3', '4', '6', '9', '11', '12', '13', '22', '23', '24']
# Boolean mask, True for cells to keep; vectorized membership test instead of a Python loop.
cluster_filter = ~adata.obs['leiden_1.2'].isin(clusters_to_remove).to_numpy()
print('Total number of cells pre-filtering: ' + str(adata.shape[0]))
print('Number of cells to keep after filtering: ' + str(cluster_filter.sum()))
# `.copy()` detaches the kept cells from the unfiltered object, so later in-place steps
# (PCA, neighbors, ...) work on real data instead of a view.
adata_filtered = adata[cluster_filter].copy()

In [None]:
# Same resolution-1.2 view, now restricted to the cells that survived cluster-based filtering.
sc.set_figure_params(
    dpi=80, dpi_save=300, color_map='viridis', vector_friendly=False, transparent=True
)
sc.pl.umap(
    adata_filtered,
    color=['leiden_1.2'],
    palette=user_defined_palette, color_map='Spectral_r', use_raw=False,
    ncols=5, wspace=0.5, size=15, sort_order=False,
    frameon=False, add_outline=True, outline_width=[0.6, 0.05],
)

In [None]:
# Keep a second name for the cluster-filtered object.
# NOTE(review): this is an alias, not a copy — `QC1` and `adata_filtered` refer to the same object.
QC1 = adata_filtered

In [None]:
# From here on `adata` refers to the cluster-filtered object (the unfiltered one is no longer bound to `adata`).
adata = adata_filtered

In [None]:
# Dimensions after cluster-based filtering.
adata.shape

### Filter out doublets and cell contaminants

This is also a good place to remove the doublets identified above, as well as any other cell population that does not match your experimental strategy.

In [None]:
# Keep only cells that were not called doublets. `.copy()` makes the result a standalone
# object, so the PCA / neighbors / UMAP re-analysis below does not run on a view.
adata = adata[adata.obs['predicted_doublet'] == False].copy()

In [None]:
# Visual check after doublet removal: the doublet call and the underlying score per cell.
sc.pl.umap(
    adata,
    color=['predicted_doublet', 'doublet_score'],
    palette=user_defined_palette, color_map='Spectral_r', use_raw=False,
    ncols=5, wspace=0.1, size=15, sort_order=False,
    frameon=False, add_outline=True, outline_width=[0.6, 0.05],
)

Let's now inspect if the right number of doublet cells has been removed from the dataset.

In [None]:
# Dimensions after doublet removal; compare the cell count with the previous `adata.shape` output.
adata.shape

### Reanalyze data after removal of cells

Once we have removed all unwanted cells, we need to reanalyze our data in a similar way to steps 4 and 5. 

In [None]:
# Exploratory 200-component PCA on the filtered cells (all genes), used to re-pick the number of components.
rng = np.random.RandomState(42)
sc.tl.pca(
    adata,
    n_comps=200,
    use_highly_variable=False,
    svd_solver='arpack',
    random_state=rng,
)

In [None]:
# Explained-variance plots for the PCA just recomputed on the filtered cells.
observe_variance(adata)

🕹️ Remember to choose a different number of principal components based on the re-analysis

In [None]:
# Final PCA for the filtered data: 65 components on all genes.
rng = np.random.RandomState(42)
sc.tl.pca(
    adata,
    n_comps=65,
    use_highly_variable=False,
    svd_solver='arpack',
    random_state=rng,
)

In [None]:
# Rebuild the k-nearest-neighbour graph (k = 15) on the filtered cells.
sc.pp.neighbors(
    adata,
    n_neighbors=15,
)

In [None]:
# Recompute the UMAP embedding from the neighbour graph of the filtered cells.
sc.tl.umap(adata)

In [None]:
# Marker-gene expression on the UMAP recomputed after filtering.
sc.set_figure_params(
    dpi=80, dpi_save=300, color_map='viridis', vector_friendly=False, transparent=True
)
sc.pl.umap(
    adata,
    color=[
        'Epcam', 'H2-Aa', 'Pdgfra', 'Pecam1', 'Cdh5', 'Vwf', 'Plvap',
        'Bmp4', 'Nkain4', 'Upk3b', 'Acta2', 'Myl9', 'S100b',
    ],
    palette=user_defined_palette, color_map='Spectral_r', use_raw=False,
    ncols=5, wspace=0.1, size=15, sort_order=False,
    frameon=False, add_outline=True, outline_width=[0.6, 0.05],
)

In [None]:
# Sample-level annotations on the UMAP recomputed after filtering.
sc.set_figure_params(
    dpi=80, dpi_save=300, color_map='viridis', vector_friendly=False, transparent=True
)
sc.pl.umap(
    adata,
    color=['stage', 'day', 'sample'],
    color_map='Spectral_r', use_raw=False,
    ncols=5, wspace=0.3, size=15, sort_order=False,
    frameon=False, add_outline=True, outline_width=[0.6, 0.05],
)

In [None]:
# Per-sample proportions of each `keep_cell` value (`normalize='index'` makes every row sum to 1).
# NOTE(review): no visible cell of this notebook creates obs['keep_cell'], so on a fresh
# kernel this lookup presumably fails with a KeyError — confirm where the column should come from.
crosstb = pd.crosstab(adata.obs['sample'], adata.obs['keep_cell'], normalize='index')

In [None]:
# Horizontal stacked bars of the per-sample proportions, drawn in a narrow 3 x 7 inch figure.
with rc_context({'figure.figsize': (3, 7)}):
    ax = crosstb.plot(kind="barh", stacked=True, edgecolor = "black", color=user_defined_palette)
    ax.grid(False)
    # Dashed vertical reference line at x = 0. The original called `lines.Line2D`, but no
    # `lines` name is imported anywhere in this notebook (NameError); the class is reached
    # through the already-imported `matplotlib` package instead (`matplotlib.lines` is loaded
    # together with `matplotlib.pyplot`).
    ax.add_artist(matplotlib.lines.Line2D([0,0], [0,100], color='black',  lw=1,  ls='--'))
    # NOTE(review): called without any data, so this presumably draws nothing; if the intent
    # was to hide the legend, a different call is needed — confirm.
    ax.plot(legend=None)

## Data for Fig. 1 [pt2]

In [None]:
# Flag the 3,500 most variable genes (Seurat flavor, 20 expression bins); the result is stored in place.
sc.pp.highly_variable_genes(
    adata,
    flavor='seurat',
    n_top_genes=3500,
    n_bins=20,
    inplace=True,
)

In [None]:
# Exploratory 200-component PCA, now restricted to the highly variable genes.
rng = np.random.RandomState(42)
sc.tl.pca(
    adata,
    n_comps=200,
    use_highly_variable=True,
    svd_solver='arpack',
    random_state=rng,
)

In [None]:
# Explained-variance plots for the PCA computed on the highly variable genes.
observe_variance(adata)

In [None]:
# Final PCA on the highly variable genes with 55 components.
rng = np.random.RandomState(42)
sc.tl.pca(
    adata,
    n_comps=55,
    use_highly_variable=True,
    svd_solver='arpack',
    random_state=rng,
)

In [None]:
# Harmony batch correction across the values of obs['sample']; the neighbors call in the
# next cell reads the result from 'X_pca_harmony'.
sce.pp.harmony_integrate(adata, 'sample')

In [None]:
# Neighbour graph (k = 15) built on the Harmony-corrected representation, then a fresh UMAP.
sc.pp.neighbors(
    adata,
    use_rep='X_pca_harmony',
    n_neighbors=15,
)
sc.tl.umap(adata)

In [None]:
# Sample-level annotations on the UMAP computed from the Harmony-corrected representation.
sc.pl.umap(
    adata,
    color=['stage', 'day', 'sample'],
    color_map='Spectral_r',
    use_raw=False,
    ncols=4,
    wspace=0.3,
    size=15,
    sort_order=False,
    frameon=False,
    add_outline=True,
    outline_width=[0.6, 0.05],
)

In [None]:
# Marker-gene expression on the UMAP computed from the Harmony-corrected representation.
sc.set_figure_params(
    dpi=80, dpi_save=300, color_map='viridis', vector_friendly=False, transparent=True
)
sc.pl.umap(
    adata,
    color=[
        'Epcam', 'H2-Aa', 'Pdgfra', 'Pecam1', 'Cdh5', 'Vwf', 'Plvap',
        'Bmp4', 'Nkain4', 'Upk3b', 'Acta2', 'Myl9', 'S100b',
    ],
    palette=user_defined_palette, color_map='Spectral_r', use_raw=False,
    ncols=5, wspace=0.1, size=15, sort_order=False,
    frameon=False, add_outline=True, outline_width=[0.6, 0.05],
)

In [None]:
# Leiden sweep on the integrated graph, resolutions 0.1 to 1.0. Keys follow the pattern
# 'leiden_<resolution>', so the columns for 0.2, 0.4, 0.6, 0.8 and 1.0 written by the earlier
# sweep are overwritten here.
for resolution_parameter in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
    sc.tl.leiden(
        adata,
        resolution=resolution_parameter,
        key_added=f'leiden_{resolution_parameter}',
        random_state=42,
    )

We can visualize the clustering to see which clusters match with the cells that we would like to filter out. Inspect the list of QC metrics and canonical markers to make your choice.

In [None]:
# UMAP panels for all ten Leiden resolutions (0.1 to 1.0) of the integrated data.
sc.set_figure_params(
    dpi=80, dpi_save=300, color_map='viridis', vector_friendly=False, transparent=True
)
sc.pl.umap(
    adata,
    color=[
        f'leiden_{res}'
        for res in (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0)
    ],
    palette=user_defined_palette, color_map='Spectral_r', use_raw=False,
    ncols=5, wspace=0.7, size=15, sort_order=False,
    frameon=False, add_outline=True, outline_width=[0.6, 0.05],
)

### Create group annotation

In [None]:
# Lookup from resolution-0.5 Leiden cluster label to cell-type group.
cluster_to_cell_type = {
    '0': 'FB', '2': 'FB', '9': 'FB',
    '1': 'EC', '6': 'EC', '10': 'EC',
    '3': 'TEC', '5': 'TEC', '11': 'TEC',
    '8': '6:MEC',
    '4': '7:vSMC/PC', '7': '7:vSMC/PC',
    '12': '8:nmSC',
}
# Any cluster label missing from the lookup is annotated as 'ERROR' so it stands out.
adata.obs['cell_type'] = [
    cluster_to_cell_type.get(cluster, 'ERROR') for cluster in adata.obs['leiden_0.5']
]

In [None]:
# Six colors, one per cell-type label assigned above.
# NOTE(review): presumably matched to the labels in category order rather than the order in
# which they were written above; an unexpected 'ERROR' label would add a seventh category — confirm.
adata.uns['cell_type_colors'] = ['#d62728','#19c9b3', '#FFA5D2', '#ff7f0e','#199919', '#aa40fc']

In [None]:
# UMAP of the annotated cell types alongside the stage and day labels.
sc.pl.umap(
    adata,
    color=['cell_type', 'stage', 'day'],
    color_map='Spectral_r',
    use_raw=False,
    ncols=4,
    wspace=0.3,
    size=15,
    sort_order=False,
    frameon=False,
    add_outline=True,
    outline_width=[0.6, 0.05],
)

In [None]:
# Output location of the annotated object (path relative to the notebook's working directory).
path_to_h5ad = '../output/metadata/anndata_objects/Fig1pt2.h5ad'

In [None]:
# Save the annotated object to disk.
# NOTE(review): assumes the target directory already exists — confirm.
adata.write(path_to_h5ad)

In [None]:
# Reload the object that was just written, replacing the in-memory `adata`.
adata = sc.read_h5ad(path_to_h5ad)
# Explicitly set the stored log base to None after reloading.
# NOTE(review): presumably a workaround for downstream scanpy functions that read
# uns['log1p']['base'] — confirm it is still required.
adata.uns['log1p']["base"] = None