In [3]:
import os
import muon as mu
import numpy as np
import scanpy as sc
import scirpy as ir
from cycler import cycler
from matplotlib import cm as mpl_cm
from matplotlib import pyplot as plt

# temporary fix for deprecated matplotlib functionality
import IPython.display
from matplotlib_inline.backend_inline import set_matplotlib_formats

IPython.display.set_matplotlib_formats = set_matplotlib_formats

sc.set_figure_params(figsize=(4, 4))
sc.settings.verbosity = 2  # verbosity: errors (0), warnings (1), info (2), hints (3)

In [10]:
use_3k = True  # use 3k or 10k dataset

# Path to saved file
file_path = "data/wu2020_3k.h5mu" if use_3k else "data/wu2020.h5mu"

# Check if the file already exists
if os.path.exists(file_path):
    print("Loading wu2020 dataset from disk...")
    mdata = mu.read(file_path)
else:
    print("Downloading wu2020 dataset...")
    if use_3k:
        mdata = ir.datasets.wu2020_3k()
    else:
        mdata = ir.datasets.wu2020()
    print("Saving dataset to disk...")
    os.makedirs("data", exist_ok=True)
    mdata.write(file_path)

print(mdata)

Downloading file 'wu2020_3k.h5mu' from 'https://scverse-exampledata.s3.eu-west-1.amazonaws.com/scirpy/wu2020_3k.h5mu' to '/Users/karl/Library/Caches/scirpy/0.22.3'.


Downloading wu2020 dataset...


100%|█████████████████████████████████████| 17.3M/17.3M [00:00<00:00, 22.7GB/s]


Saving dataset to disk...
MuData object with n_obs × n_vars = 3000 × 30727
  2 modalities
    gex:	3000 x 30727
      obs:	'cluster_orig', 'patient', 'sample', 'source'
      uns:	'cluster_orig_colors'
      obsm:	'X_umap_orig'
    airr:	3000 x 0
      obs:	'high_confidence', 'is_cell', 'clonotype_orig'
      obsm:	'airr'


  self._update_attr("var", axis=0, join_common=join_common)
  self._update_attr("obs", axis=1, join_common=join_common)
  self._update_attr("var", axis=0, join_common=join_common)
  self._update_attr("obs", axis=1, join_common=join_common)


In [11]:
sc.pp.filter_genes(mdata["gex"], min_cells=10)
sc.pp.filter_cells(mdata["gex"], min_genes=100)

filtered out 18877 genes that are detected in less than 10 cells


In [12]:
sc.pp.normalize_per_cell(mdata["gex"])
sc.pp.log1p(mdata["gex"])
sc.pp.highly_variable_genes(mdata["gex"], flavor="cell_ranger", n_top_genes=5000)
sc.tl.pca(mdata["gex"])
sc.pp.neighbors(mdata["gex"])

normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added
    'n_counts', counts per cell before normalization (adata.obs)
extracting highly variable genes
    finished (0:00:00)
computing PCA
    with n_comps=50


  sc.pp.normalize_per_cell(mdata["gex"])
  normalize_per_cell(


    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished (0:00:00)


In [13]:
ir.pp.index_chains(mdata)

Filtering chains...
Indexing VJ chains...
Indexing VDJ chains...
build result array


In [14]:
ir.tl.chain_qc(mdata)

Stored result in `mdata.obs["airr:receptor_type"]`.
Stored result in `mdata.obs["airr:receptor_subtype"]`.
Stored result in `mdata.obs["airr:chain_pairing"]`.


In [15]:
mu.pp.filter_obs(mdata, "airr:chain_pairing", lambda x: x != "multichain")
mu.pp.filter_obs(mdata, "airr:chain_pairing", lambda x: ~np.isin(x, ["orphan VDJ", "orphan VJ"]))

In [16]:
# using default parameters, `ir_dist` will compute nucleotide sequence identity
ir.pp.ir_dist(mdata)
ir.tl.define_clonotypes(mdata, receptor_arms="all", dual_ir="primary_only")
mdata.obs.groupby("gex:source", dropna=False).size()

Computing sequence x sequence distance matrix for VJ sequences.
Computing sequence x sequence distance matrix for VDJ sequences.
Initializing lookup tables. 
Computing clonotype x clonotype distances.
Stored result in `mdata.obs["airr:clone_id"]`.
Stored result in `mdata.obs["airr:clone_id_size"]`.


  mdata.obs.groupby("gex:source", dropna=False).size()


gex:source
Blood    107
NAT      756
Tumor    982
dtype: int64

In [17]:
print(mdata)

MuData object with n_obs × n_vars = 1845 × 30727
  2 modalities
    gex:	1845 x 11850
      obs:	'cluster_orig', 'patient', 'sample', 'source', 'n_genes', 'n_counts'
      var:	'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
      uns:	'cluster_orig_colors', 'log1p', 'hvg', 'pca', 'neighbors'
      obsm:	'X_umap_orig', 'X_pca'
      varm:	'PCs'
      obsp:	'distances', 'connectivities'
    airr:	1845 x 0
      obs:	'high_confidence', 'is_cell', 'clonotype_orig', 'receptor_type', 'receptor_subtype', 'chain_pairing', 'clone_id', 'clone_id_size'
      uns:	'chain_indices', 'ir_dist_nt_identity', 'clone_id'
      obsm:	'airr', 'chain_indices'


In [18]:
mdata.update()
# Get metadata from both modalities
gex_obs = mdata.mod["gex"].obs
airr_obs = mdata.mod["airr"].obs

  self._update_attr("var", axis=0, join_common=join_common)
  self._update_attr("obs", axis=1, join_common=join_common)


In [19]:
gex_obs

Unnamed: 0,cluster_orig,patient,sample,source,n_genes,n_counts
RN2_AGAGCGACAGATTGCT-1,4.4-FOS,Renal2,RN2,NAT,740,1712.0
LN1_GTCATTTCAATGAAAC-1,8.2-Tem,Lung1,LN1,NAT,1264,3195.0
LN2_GCACTCTCAGGGATTG-2,4.4-FOS,Lung2,LN2,NAT,2086,6866.0
LN4_GAAACTCTCATCATTC-1,8.2-Tem,Lung4,LN4,NAT,250,357.0
LT6_AGTTGGTGTACCGCTG-1,4.2-RPL32,Lung6,LT6,Tumor,919,3646.0
...,...,...,...,...,...,...
LT6_CTCAGAATCAGAGGTG-1,4.6a-Treg,Lung6,LT6,Tumor,1670,3470.0
LT5_GACCTGGAGGAGTAGA-1,4.5-IL6ST,Lung5,LT5,Tumor,1265,2866.0
RT3_GCAGTTAGTATGAAAC-1,4.2-RPL32,Renal3,RT3,Tumor,574,1773.0
RT1_TAAGAGATCCTTAATC-1,4.5-IL6ST,Renal1,RT1,Tumor,673,1977.0


In [20]:
airr_obs

Unnamed: 0,high_confidence,is_cell,clonotype_orig,receptor_type,receptor_subtype,chain_pairing,clone_id,clone_id_size
RN2_AGAGCGACAGATTGCT-1,True,True,renal2.tnb.C1362,TCR,TRA+TRB,single pair,0,1
LN1_GTCATTTCAATGAAAC-1,True,True,lung1.tn.C25,TCR,TRA+TRB,single pair,1,1
LN2_GCACTCTCAGGGATTG-2,True,True,lung2.tn.C5631,TCR,TRA+TRB,single pair,2,1
LN4_GAAACTCTCATCATTC-1,True,True,lung4.tn.C3988,TCR,TRA+TRB,single pair,3,1
LT6_AGTTGGTGTACCGCTG-1,True,True,lung6.tnb.C999,TCR,TRA+TRB,single pair,4,1
...,...,...,...,...,...,...,...,...
LT6_CTCAGAATCAGAGGTG-1,True,True,lung6.tnb.C122,TCR,TRA+TRB,single pair,1522,1
LT5_GACCTGGAGGAGTAGA-1,True,True,lung5.tn.C1505,TCR,TRA+TRB,single pair,1523,1
RT3_GCAGTTAGTATGAAAC-1,True,True,renal3.tnb.C176,TCR,TRA+TRB,single pair,928,4
RT1_TAAGAGATCCTTAATC-1,True,True,renal1.tnb.C83,TCR,TRA+TRB,single pair,1524,1


In [21]:
# Merge metadata (index = cell barcodes)
merged_obs = gex_obs.join(airr_obs, how="inner")
merged_obs

Unnamed: 0,cluster_orig,patient,sample,source,n_genes,n_counts,high_confidence,is_cell,clonotype_orig,receptor_type,receptor_subtype,chain_pairing,clone_id,clone_id_size
RN2_AGAGCGACAGATTGCT-1,4.4-FOS,Renal2,RN2,NAT,740,1712.0,True,True,renal2.tnb.C1362,TCR,TRA+TRB,single pair,0,1
LN1_GTCATTTCAATGAAAC-1,8.2-Tem,Lung1,LN1,NAT,1264,3195.0,True,True,lung1.tn.C25,TCR,TRA+TRB,single pair,1,1
LN2_GCACTCTCAGGGATTG-2,4.4-FOS,Lung2,LN2,NAT,2086,6866.0,True,True,lung2.tn.C5631,TCR,TRA+TRB,single pair,2,1
LN4_GAAACTCTCATCATTC-1,8.2-Tem,Lung4,LN4,NAT,250,357.0,True,True,lung4.tn.C3988,TCR,TRA+TRB,single pair,3,1
LT6_AGTTGGTGTACCGCTG-1,4.2-RPL32,Lung6,LT6,Tumor,919,3646.0,True,True,lung6.tnb.C999,TCR,TRA+TRB,single pair,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LT6_CTCAGAATCAGAGGTG-1,4.6a-Treg,Lung6,LT6,Tumor,1670,3470.0,True,True,lung6.tnb.C122,TCR,TRA+TRB,single pair,1522,1
LT5_GACCTGGAGGAGTAGA-1,4.5-IL6ST,Lung5,LT5,Tumor,1265,2866.0,True,True,lung5.tn.C1505,TCR,TRA+TRB,single pair,1523,1
RT3_GCAGTTAGTATGAAAC-1,4.2-RPL32,Renal3,RT3,Tumor,574,1773.0,True,True,renal3.tnb.C176,TCR,TRA+TRB,single pair,928,4
RT1_TAAGAGATCCTTAATC-1,4.5-IL6ST,Renal1,RT1,Tumor,673,1977.0,True,True,renal1.tnb.C83,TCR,TRA+TRB,single pair,1524,1


In [23]:
# Step 1: Filter for Renal2 patient
renal2_obs = merged_obs[merged_obs["patient"] == "Renal2"]
# Step 2: Find clonotypes found in blood
blood_obs = merged_obs[merged_obs["source"] == "Blood"]
blood_clonotypes = set(blood_obs["clonotype_orig"].dropna())

blood_obs

Unnamed: 0,cluster_orig,patient,sample,source,n_genes,n_counts,high_confidence,is_cell,clonotype_orig,receptor_type,receptor_subtype,chain_pairing,clone_id,clone_id_size
LB6_GACTGCGTCGTTACGA-1,4.1-Trm,Lung6,LB6,Blood,956,2742.0,True,True,lung6.tnb.C10105,TCR,TRA+TRB,single pair,15,1
LB6_CCTCTGAGTATAGTAG-1,8.3a-Trm,Lung6,LB6,Blood,1749,8397.0,True,True,lung6.tnb.C9344,TCR,TRA+TRB,single pair,53,1
RB2_CAAGTTGCAATAGAGT-1,8.3c-Trm,Renal2,RB2,Blood,899,2340.0,True,True,renal2.tnb.C6,TCR,TRA+TRB,single pair,68,20
LB6_CCTAGCTAGGACCACA-1,8.2-Tem,Lung6,LB6,Blood,834,1992.0,True,True,lung6.tnb.C2059,TCR,TRA+TRB,single pair,97,1
RB3_AGTGAGGGTTGGTGGA-1,4.5-IL6ST,Renal3,RB3,Blood,819,2723.0,True,True,renal3.tnb.C1629,TCR,TRA+TRB,single pair,106,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RB2_CGGTTAAGTGATGTGG-1,8.3b-Trm,Renal2,RB2,Blood,1031,2829.0,True,True,renal2.tnb.C23,TCR,TRA+TRB,single pair,101,19
LB6_TGAGAGGAGATCTGAA-1,4.3-TCF7,Lung6,LB6,Blood,1269,4970.0,True,True,lung6.tnb.C11312,TCR,TRA+TRB,single pair,1493,1
LB6_GTATTCTAGCTGCCCA-1,4.3-TCF7,Lung6,LB6,Blood,1014,3831.0,True,True,lung6.tnb.C10712,TCR,TRA+TRB,single pair,1509,1
RB1_GAAACTCCAATCGGTT-1,8.3c-Trm,Renal1,RB1,Blood,1033,2386.0,True,True,renal1.tnb.C1460,TCR,TRA+TRB,single pair,1510,1


In [24]:
# Step 3: Filter Renal2 cells with clonotypes NOT found in blood
renal2_independent = renal2_obs[
    (~renal2_obs["clonotype_orig"].isin(blood_clonotypes)) &
    (renal2_obs["clonotype_orig"].notna())
]
renal2_independent

Unnamed: 0,cluster_orig,patient,sample,source,n_genes,n_counts,high_confidence,is_cell,clonotype_orig,receptor_type,receptor_subtype,chain_pairing,clone_id,clone_id_size
RN2_AGAGCGACAGATTGCT-1,4.4-FOS,Renal2,RN2,NAT,740,1712.0,True,True,renal2.tnb.C1362,TCR,TRA+TRB,single pair,0,1
RN2_CTCGAAATCACAATGC-1,4.1-Trm,Renal2,RN2,NAT,982,2338.0,True,True,renal2.tnb.C2060,TCR,TRA+TRB,single pair,46,1
RN2_CTCGGAGTCCAGATCA-1,8.3a-Trm,Renal2,RN2,NAT,1989,6864.0,True,True,renal2.tnb.C1242,TCR,TRA+TRB,single pair,61,1
RN2_GCATGATTCTAGCACA-1,4.6a-Treg,Renal2,RN2,NAT,963,2458.0,True,True,renal2.tnb.C2838,TCR,TRA+TRB,single pair,67,1
RN2_AGTAGTCTCAACACTG-1,3.1-MT,Renal2,RN2,NAT,741,1682.0,True,True,renal2.tnb.C915,TCR,TRA+TRB,single pair,124,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RT2_TGGCCAGCAAACGTGG-1,4.1-Trm,Renal2,RT2,Tumor,1240,3205.0,True,True,renal2.tnb.C85,TCR,TRA+TRB,single pair,1465,1
RT2_ACTTTCAAGGACATTA-1,8.3c-Trm,Renal2,RT2,Tumor,1410,4375.0,True,True,renal2.tnb.C13,TCR,TRA+TRB,single pair,244,2
RT2_TCACGAATCGTGGGAA-1,4.1-Trm,Renal2,RT2,Tumor,1196,2851.0,True,True,renal2.tnb.C174,TCR,TRA+TRB,single pair,1498,1
RN2_CGTTGGGCACAACGCC-1,8.3a-Trm,Renal2,RN2,NAT,605,1349.0,True,True,renal2.tnb.C2357,TCR,TRA+TRB,single pair,1508,1


In [25]:
NAT_clonotypes = renal2_independent[renal2_independent["source"] == "NAT"]["clonotype_orig"].unique()
Tumor_clonotypes = renal2_independent[renal2_independent["source"] == "Tumor"]["clonotype_orig"].unique()

renal2_ind_tumor_only = renal2_independent[
    (renal2_independent["clonotype_orig"].isin(Tumor_clonotypes)) &
    (~renal2_independent["clonotype_orig"].isin(NAT_clonotypes))
]
renal2_ind_NAT_only = renal2_independent[
    (renal2_independent["clonotype_orig"].isin(NAT_clonotypes)) &
    (~renal2_independent["clonotype_orig"].isin(Tumor_clonotypes))
]
renal2_ind_Dual = renal2_independent[
    (renal2_independent["clonotype_orig"].isin(NAT_clonotypes)) &
    (renal2_independent["clonotype_orig"].isin(Tumor_clonotypes))
]

total_ind = renal2_independent["clone_id_size"].sum()
n_ind = renal2_ind_NAT_only[renal2_ind_NAT_only['clone_id_size'] == 1]["clone_id_size"].sum()
N_ind = renal2_ind_NAT_only[renal2_ind_NAT_only['clone_id_size'] >= 2]["clone_id_size"].sum()
D_ind = renal2_ind_Dual["clone_id_size"].sum()
T_ind = renal2_ind_tumor_only[renal2_ind_tumor_only['clone_id_size'] >= 2]["clone_id_size"].sum()
t_ind = renal2_ind_tumor_only[renal2_ind_tumor_only['clone_id_size'] == 1]["clone_id_size"].sum()

print(f'{total_ind = }')
print(f'{n_ind = }')
print(f'{N_ind = }')
print(f'{D_ind = }')
print(f'{T_ind = }')
print(f'{t_ind = }')
print(f'{n_ind + N_ind + D_ind + T_ind + t_ind = }')

print(f'{n_ind/total_ind = }')
print(f'{N_ind/total_ind = }')
print(f'{D_ind/total_ind = }')
print(f'{T_ind/total_ind = }')
print(f'{t_ind/total_ind = }')

total_ind = np.int64(134)
n_ind = np.int64(72)
N_ind = np.int64(12)
D_ind = np.int64(8)
T_ind = np.int64(17)
t_ind = np.int64(25)
n_ind + N_ind + D_ind + T_ind + t_ind = np.int64(134)
n_ind/total_ind = np.float64(0.5373134328358209)
N_ind/total_ind = np.float64(0.08955223880597014)
D_ind/total_ind = np.float64(0.05970149253731343)
T_ind/total_ind = np.float64(0.12686567164179105)
t_ind/total_ind = np.float64(0.1865671641791045)
