# Import and settings

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os, sys, joblib, gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import anndata
import scanpy as sc
import sctk as sk
import scrublet

expr_cmap = sk.expression_colormap()
np.set_printoptions(linewidth=180)
sc.settings.verbosity = 1



# Load data

## Read metadata

In [3]:
METADATA_TSV = "../data/misc/fetal_skin_samples.txt"
RACHEL19_ANNOT_TSV = "../data/misc/fetal_annotation_rachel-201904.txt"

In [4]:
df = pd.read_csv(METADATA_TSV, sep="\t")

In [5]:
df

Unnamed: 0,sanger_id,chemistry,donor,gender,pcw,sorting,sample
0,4834STDY7002871,SC3Pv2,F15,male,12,CD45P,F15_male_12+0PCW
1,4834STDY7002872,SC3Pv2,F15,male,12,CD45N,F15_male_12+0PCW
2,4834STDY7002879,SC3Pv2,F16,male,8,CD45P,F16_male_8+1PCW
3,4834STDY7002880,SC3Pv2,F16,male,8,CD45N,F16_male_8+1PCW
4,4834STDY7002883,SC3Pv2,F17,male,9,Total,F17_male_9+1PCW
5,4834STDY7038752,SC3Pv2,F17,male,9,CD45P,F17_male_9+1PCW
6,4834STDY7038753,SC3Pv2,F17,male,9,CD45N,F17_male_9+1PCW
7,FCAImmP7241240,SC3Pv2,F19,female,10,CD45P,F19_female_10+2PCW
8,FCAImmP7241241,SC3Pv2,F19,female,10,CD45N,F19_female_10+2PCW
9,FCAImmP7316886,SC3Pv2,F33,female,9,CD45P,F33_female_9+5PCW


In [6]:
df = df[~df.chemistry.str.endswith("VDJ") & ~df.donor.isin(["F15"])].reset_index(
    drop=True
)

In [7]:
len(df.sanger_id.unique())

45

In [8]:
df.sorting.value_counts()

CD45N     21
CD45P     21
CD45en     2
Total      1
Name: sorting, dtype: int64

In [9]:
df.chemistry.value_counts()

SC3Pv2     24
SC5P-R2    21
Name: chemistry, dtype: int64

In [10]:
len(df.donor.unique())

15

## Read count matrices

In [11]:
samples = []
ads = []
n_sample = df.shape[0]
fig, axs = plt.subplots(
    ncols=5,
    nrows=int(np.ceil(n_sample / 5)),
    sharex=True,
    sharey=True,
    figsize=(25, np.ceil(n_sample / 5) * 5),
)
sc.settings.verbosity = 0
for i, sid in enumerate(df["sanger_id"]):
    h5cr = f"../data/h5cr/{sid}_raw.h5"
    source_csv = f"../data/h5cr/{sid}_cell_source.csv"

    if not (os.path.exists(h5cr) and os.path.exists(source_csv)):
        continue
    samples.append(sid)
    print(sid)

    # Read 10X raw matrix here to make knee plot, will subset to called cells later
    ad = sc.read_10x_h5(h5cr)
    ad.var_names_make_unique()
    src = pd.read_csv(source_csv)
    sc.pp.filter_cells(ad, min_counts=1)

    # Make knee plot
    colors = pd.Series(["k"] * ad.shape[0])
    k_cr_called = ad.obs_names.str.replace("-1", "").isin(
        src.Barcode[src.CellSource == "CellRanger"]
    )
    k_ed_called = ad.obs_names.str.replace("-1", "").isin(
        src.Barcode[src.CellSource == "EmptyDrops"]
    )
    k_both_called = ad.obs_names.str.replace("-1", "").isin(
        src.Barcode[src.CellSource == "Both"]
    )
    colors[k_both_called] = "g"
    colors[k_cr_called] = "b"
    colors[k_ed_called] = "r"
    sk.plot_metric_by_rank(
        ad,
        ax=axs[i // 5, i - (i // 5) * 5],
        c=colors.values,
        swap_axis=True,
        title=f"{sid}, {df.chemistry[df.sanger_id==sid].iloc[0]}, {df.sorting[df.sanger_id==sid].iloc[0]}",
        s=5,
    )

    # Now subset to called cells
    ad = ad[ad.obs_names.str.replace("-1", "").isin(src.Barcode.values), :]
    k_cr_called = ad.obs_names.str.replace("-1", "").isin(
        src.Barcode[src.CellSource == "CellRanger"]
    )
    k_ed_called = ad.obs_names.str.replace("-1", "").isin(
        src.Barcode[src.CellSource == "EmptyDrops"]
    )
    k_both_called = ad.obs_names.str.replace("-1", "").isin(
        src.Barcode[src.CellSource == "Both"]
    )
    cell_caller = np.array(["Both"] * ad.shape[0])
    cell_caller[k_cr_called] = "CellRanger"
    cell_caller[k_ed_called] = "EmptyDrops"
    ad.obs["cell_caller"] = cell_caller

    # Run scrublet
    sk.run_scrublet(ad)
    ad.obs.rename(columns={"bh_pval": "bh_doublet_pval"}, inplace=True)
    ads.append(ad)

fig.savefig("knee_plots_per_run.png", bbox_inches="tight")
sc.settings.verbosity = 1

4834STDY7002879


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


4834STDY7002880


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


4834STDY7002883


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


4834STDY7038752


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


4834STDY7038753


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7241240


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7241241


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7316886


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7316887


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7316888


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7316896


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7316897


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7352189


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7352190


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7352191


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7462240


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7462241


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7528290


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7528291


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7528296


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7555848


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7555858


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7579212


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7579213


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7579224


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7803024


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7803025


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7803026


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7803027


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7803034


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7803035


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7803042


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7803043


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7862094


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7862095


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7862096


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7964502


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7964503


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7964504


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7964505


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7964506


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7964507


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7964508


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7964509


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


FCAImmP7964510


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.


# Process

## Pool samples

In [12]:
fetal_ad = anndata.AnnData.concatenate(
    *ads, batch_key="sanger_id", batch_categories=samples
)

In [13]:
fetal_ad

AnnData object with n_obs × n_vars = 235201 × 33694
    obs: 'n_counts', 'cell_caller', 'scrublet_score', 'scrublet_score_z', 'cluster_scrublet_score', 'bh_doublet_pval', 'scrublet_done', 'sanger_id'
    var: 'gene_ids'

In [14]:
fetal_ad.obs = (
    fetal_ad.obs.reset_index()
    .merge(df, how="left", on=["sanger_id"])
    .set_index("index")
)

In [15]:
fetal_ad.obs.cell_caller.value_counts()

Both    193381
Empt     35637
Cell      6183
Name: cell_caller, dtype: int64

In [16]:
fetal_ad.obs["chemistry_sorting"] = fetal_ad.obs.chemistry + "_" + fetal_ad.obs.sorting

In [17]:
fetal_ad.obs.groupby(["chemistry_sorting", "sanger_id"]).size().groupby(
    "chemistry_sorting"
).mean()

chemistry_sorting
SC3Pv2_CD45N      5001.090909
SC3Pv2_CD45P      3251.750000
SC3Pv2_Total      1374.000000
SC5P-R2_CD45N     8103.800000
SC5P-R2_CD45P     4978.777778
SC5P-R2_CD45en    6973.500000
dtype: float64

In [7]:
fetal_ad

AnnData object with n_obs × n_vars = 235201 × 33694
    obs: 'bh_doublet_pval', 'cell_caller', 'cluster_scrublet_score', 'doublet_pval', 'mt_prop', 'n_counts', 'n_genes', 'sanger_id', 'scrublet_score', 'chemistry', 'donor', 'gender', 'pcw', 'sorting', 'sample', 'chemistry_sorting'
    var: 'gene_ids', 'cc'

In [8]:
annot = pd.read_csv(
    RACHEL19_ANNOT_TSV,
    sep="\t",
    header=None,
    names=["index", "annot"],
    index_col=0,
)

In [14]:
fetal_ad.obs["annot"] = fetal_ad.obs.merge(
    annot, how="left", left_index=True, right_index=True
)["annot"].astype(str)

In [15]:
fetal_ad.obs.head()

Unnamed: 0_level_0,bh_doublet_pval,cell_caller,cluster_scrublet_score,doublet_pval,mt_prop,n_counts,n_genes,sanger_id,scrublet_score,chemistry,donor,gender,pcw,sorting,sample,chemistry_sorting,annot
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
AAACCTGCACTCTGTC-1-4834STDY7002879,0.907861,Empt,0.164557,0.455284,0.202597,385.0,239,4834STDY7002879,0.10559,SC3Pv2,F16,male,8,CD45P,F16_male_8+1PCW,SC3Pv2_CD45P,
AAACCTGGTCAGTGGA-1-4834STDY7002879,0.907861,Both,0.157082,0.5,0.062532,5917.0,1776,4834STDY7002879,0.225806,SC3Pv2,F16,male,8,CD45P,F16_male_8+1PCW,SC3Pv2_CD45P,fs_Macrophage
AAAGATGGTCGATTGT-1-4834STDY7002879,0.907861,Both,0.157082,0.5,0.030894,10261.0,2750,4834STDY7002879,0.149606,SC3Pv2,F16,male,8,CD45P,F16_male_8+1PCW,SC3Pv2_CD45P,fs_Monocyte
AAAGCAAAGATGTGGC-1-4834STDY7002879,0.882352,Both,0.225806,0.150885,0.012647,7749.0,2308,4834STDY7002879,0.20197,SC3Pv2,F16,male,8,CD45P,F16_male_8+1PCW,SC3Pv2_CD45P,fs_Macrophage
AAAGCAACAAGCCGTC-1-4834STDY7002879,0.907861,Empt,0.164557,0.455284,0.197686,1037.0,606,4834STDY7002879,0.225806,SC3Pv2,F16,male,8,CD45P,F16_male_8+1PCW,SC3Pv2_CD45P,


## Save object

In [26]:
fetal_ad.write("../data/h5ad/fetal_skin_raw.20190926.h5ad", compression="gzip")

... storing 'cell_caller' as categorical
... storing 'sanger_id' as categorical
... storing 'chemistry' as categorical
... storing 'donor' as categorical
... storing 'gender' as categorical
... storing 'sorting' as categorical
... storing 'sample' as categorical
... storing 'chemistry_sorting' as categorical
