## Doublet Removal - Pseudo Doublets

In [5]:
import scanpy as sc
import numpy as np
import pandas as pd
from matplotlib import rcParams
import matplotlib.pyplot as plt
%matplotlib inline
import scipy
import os
import anndata

sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

# data files are in the filtered matrix folder from CellRanger which has been renamed to the sample name
sample_name = r'sc85_1g' #r'sc70_1' #r'sc72_1'
data_path = r"../raw_data/"
data_files_path = data_path + sample_name + r""

results_path = 'results/'
results_file = results_path + sample_name + '-db.h5ad'  # the file that will store the analysis results
metrics_file = results_path + sample_name + '_db_metrics.csv'  # the file that will store the metrics
# make results folder if it doesn't exist
if not os.path.exists(results_path):
    os.makedirs(results_path, exist_ok=False)

scanpy==1.10.3 anndata==0.11.0 umap==0.5.7 numpy==1.26.4 scipy==1.14.1 pandas==2.2.3 scikit-learn==1.5.2 statsmodels==0.14.4 igraph==0.11.8 louvain==0.8.2 pynndescent==0.5.13


#### Setup R environment

In [26]:
import os
os.environ['R_HOME'] = r"C:\Program Files\R\R-4.4.2"  
import anndata2ri # order matters, comes after defining 'R_HOME
import logging
import rpy2.rinterface_lib.callbacks as rcb
import rpy2.robjects as ro
rcb.logger.setLevel(logging.ERROR)
ro.pandas2ri.activate()
anndata2ri.activate()

%reload_ext rpy2.ipython

  anndata2ri.activate()


In [27]:
%%R
.libPaths(c("C:/Users/leeh1/AppData/Local/R/win-library/4.4", .libPaths()))
library(Seurat)
library(scater)
library(scDblFinder)
library(BiocParallel)


    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

Loading required package: SeuratObject
Loading required package: sp

Attaching package: 'SeuratObject'

The following objects are masked from 'package:base':

    intersect, t

Loading required package: SingleCellExperiment
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: 'MatrixGenerics'

The following objects are masked from 'package:matrixStats':

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts,

#### Load data

Read in count data into an [`AnnData`](https://anndata.readthedocs.io/en/latest/anndata.AnnData.html) object.  
The matrix.mtx file contains a sparse matrix of the counts.  
Barcodes file contains the sample names in the format: AAACCCAAGACCATAA-1  
Features file contains the gene id in the format: ENSMUSG00000051951, Xkr4, Gene Expression

The number of counts per cell will be relatively low since only UMIs are counted by Cellranger

In [6]:
adata = sc.read_10x_mtx(
    data_files_path,              # the directory with the `.mtx` file
    var_names='gene_symbols',     # use gene symbols for the variable names (variables-axis index)
    cache=True)                   # write a cache file for faster subsequent reading

# adata.var_names_make_unique()  # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`
print('Data matrix is sparse:', scipy.sparse.issparse(adata.X))
print()

# make the obs names unique by adding the sample name
adata.obs_names = [g.split("-")[0] + '_' + sample_name for g in adata.obs_names]
print(adata.obs_names[0:2])
print()
print('Number of cells =', f"{adata.n_obs:,.0f}")
print('Number of genes =', f"{adata.n_vars:,.0f}")
print('Number of counts =', f"{adata.X.sum():,.0f}")
print('Mean counts per cell =', f"{adata.X.sum()/adata.n_obs:,.0f}")
adata

... writing an h5ad cache file to speedup reading next time
Data matrix is sparse: True

Index(['AAACAGCCAACCGCCA_sc85_1g', 'AAACAGCCAAGGTATA_sc85_1g'], dtype='object')

Number of cells = 15,830
Number of genes = 36,530
Number of counts = 13,291,714
Mean counts per cell = 840


AnnData object with n_obs × n_vars = 15830 × 36530
    var: 'gene_ids', 'feature_types'

In [7]:
n_cells = adata.n_obs
n_genes = adata.n_vars
n_counts = adata.X.sum()    
counts_per_cell = round(n_counts / n_cells)

df_metrics = pd.DataFrame([n_cells, n_genes, n_counts, counts_per_cell], \
                  index = ['Number of cells', 'Number of genes', 'Number of counts','Mean counts per cell']).T
df_metrics.iloc[:, 1:] = df_metrics.iloc[:,1:].applymap('{:,.0f}'.format)
df_metrics.index = [sample_name]
df_metrics

  df_metrics.iloc[:, 1:] = df_metrics.iloc[:,1:].applymap('{:,.0f}'.format)
Name: Number of genes, dtype: object' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df_metrics.iloc[:, 1:] = df_metrics.iloc[:,1:].applymap('{:,.0f}'.format)
Name: Number of counts, dtype: object' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df_metrics.iloc[:, 1:] = df_metrics.iloc[:,1:].applymap('{:,.0f}'.format)
Name: Mean counts per cell, dtype: object' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df_metrics.iloc[:, 1:] = df_metrics.iloc[:,1:].applymap('{:,.0f}'.format)


Unnamed: 0,Number of cells,Number of genes,Number of counts,Mean counts per cell
sc85_1g,15830.0,36530,13291714,840


### Import Doublet IDs

In [9]:
doublets = np.load('doublets_sc85_1g.npy')
print(len(doublets))

4753


### Isolating Singlets

In [17]:
adata.obs['doublet_class'] = [1.0 if name in doublets else 0.0 for name in adata.obs_names]

In [18]:
print(adata.obs['doublet_class'])

AAACAGCCAACCGCCA_sc85_1g    0.0
AAACAGCCAAGGTATA_sc85_1g    1.0
AAACAGCCAATAGCAA_sc85_1g    1.0
AAACAGCCACCAGGTT_sc85_1g    1.0
AAACAGCCATGAATCT_sc85_1g    0.0
                           ... 
TTTGTTGGTTAAGTGT_sc85_1g    0.0
TTTGTTGGTTACGCGG_sc85_1g    0.0
TTTGTTGGTTAGGATT_sc85_1g    0.0
TTTGTTGGTTGCCTCA_sc85_1g    0.0
TTTGTTGGTTGGCGTG_sc85_1g    0.0
Name: doublet_class, Length: 15830, dtype: float64


In [30]:
bdata = adata[adata.obs['doublet_class'] == 0.0].copy()

In [31]:
print(len(bdata))

11077


## Creating Pseudo Doublets

In [32]:
singlet_data = bdata.X.T
n_pseudo_doubs = int(0.1 * bdata.shape[0])
print("Number of pseudo_doublets to be generated: " + str(n_pseudo_doubs))

Number of pseudo_doublets to be generated: 1107


In [33]:
%%R -i singlet_data -i n_pseudo_doubs -o pseudo_doublets -o pd_count_matrix -o names

pseudo_doublets = getArtificialDoublets(
    x=singlet_data,   
    n=n_pseudo_doubs,  
    resamp=0.1, 
    propRandom=0,  
    selMode="proportional",  
    meta.triplets=FALSE  
)

pd_count_matrix <- pseudo_doublets$counts
str(pd_count_matrix)
names <- pd_count_matrix@Dimnames[[2]]
str(names)

Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
  ..@ i       : int [1:975229] 19 42 86 104 127 170 180 205 224 231 ...
  ..@ p       : int [1:1108] 0 1030 2060 2779 3675 4429 5328 6257 6913 7848 ...
  ..@ Dim     : int [1:2] 25051 1107
  ..@ Dimnames:List of 2
  .. ..$ : NULL
  .. ..$ : chr [1:1107] "rDbl.1" "rDbl.2" "rDbl.3" "rDbl.4" ...
  ..@ x       : num [1:975229] 1 1 1 1 1 1 1 2 1 1 ...
  ..@ factors : list()
 chr [1:1107] "rDbl.1" "rDbl.2" "rDbl.3" "rDbl.4" "rDbl.5" "rDbl.6" ...


In [34]:
print("pd_count_matrix shape:", pd_count_matrix)

pd_count_matrix shape: <Compressed Sparse Column sparse matrix of dtype 'float64'
	with 975229 stored elements and shape (25051, 1107)>
  Coords	Values
  (19, 0)	1.0
  (42, 0)	1.0
  (86, 0)	1.0
  (104, 0)	1.0
  (127, 0)	1.0
  (170, 0)	1.0
  (180, 0)	1.0
  (205, 0)	2.0
  (224, 0)	1.0
  (231, 0)	1.0
  (314, 0)	1.0
  (337, 0)	1.0
  (350, 0)	1.0
  (406, 0)	1.0
  (437, 0)	2.0
  (449, 0)	1.0
  (478, 0)	1.0
  (480, 0)	1.0
  (499, 0)	1.0
  (544, 0)	1.0
  (550, 0)	1.0
  (608, 0)	1.0
  (610, 0)	1.0
  (634, 0)	1.0
  (664, 0)	2.0
  :	:
  (24254, 1106)	0.5
  (24355, 1106)	0.5
  (24409, 1106)	0.5
  (24562, 1106)	2.0
  (24567, 1106)	0.5
  (24617, 1106)	0.5
  (24687, 1106)	0.5
  (24792, 1106)	0.5
  (24839, 1106)	1.0
  (24908, 1106)	0.5
  (24911, 1106)	0.5
  (25011, 1106)	0.5
  (25015, 1106)	19.0
  (25017, 1106)	47.5
  (25019, 1106)	1.5
  (25023, 1106)	1.0
  (25029, 1106)	1.0
  (25032, 1106)	6.0
  (25034, 1106)	0.5
  (25035, 1106)	3.0
  (25036, 1106)	3.0
  (25037, 1106)	2.0
  (25040, 1106)	0.5
  (25044

In [35]:
print("names", names)

names ['rDbl.1' 'rDbl.2' 'rDbl.3' ... 'rDbl.1105' 'rDbl.1106' 'rDbl.1107']


#### Checking `bdata`

In [36]:
print(bdata.X)

<Compressed Sparse Column sparse matrix of dtype 'float32'
	with 5434695 stored elements and shape (11077, 25051)>
  Coords	Values
  (4530, 0)	1.0
  (6108, 0)	1.0
  (6236, 0)	1.0
  (7478, 0)	1.0
  (7906, 0)	1.0
  (8707, 0)	1.0
  (7653, 2)	1.0
  (10528, 4)	1.0
  (9, 5)	1.0
  (218, 5)	1.0
  (829, 5)	1.0
  (927, 5)	1.0
  (1080, 5)	1.0
  (1315, 5)	1.0
  (1457, 5)	1.0
  (1827, 5)	1.0
  (2542, 5)	1.0
  (2570, 5)	1.0
  (2768, 5)	1.0
  (2912, 5)	1.0
  (3050, 5)	1.0
  (3624, 5)	1.0
  (3667, 5)	1.0
  (3692, 5)	1.0
  (4559, 5)	1.0
  :	:
  (2836, 25050)	1.0
  (2855, 25050)	1.0
  (2877, 25050)	1.0
  (3721, 25050)	1.0
  (4567, 25050)	1.0
  (4699, 25050)	1.0
  (4727, 25050)	1.0
  (4825, 25050)	1.0
  (5319, 25050)	1.0
  (5967, 25050)	1.0
  (6073, 25050)	1.0
  (6201, 25050)	1.0
  (6377, 25050)	1.0
  (6836, 25050)	1.0
  (6954, 25050)	1.0
  (7014, 25050)	1.0
  (7362, 25050)	1.0
  (8248, 25050)	1.0
  (8276, 25050)	1.0
  (8510, 25050)	1.0
  (8850, 25050)	1.0
  (9598, 25050)	1.0
  (10063, 25050)	1.0
  (1061

#### Creating `pseudo_doublet_adata`

In [37]:
import anndata
pseudo_doublet_adata = anndata.AnnData(pd_count_matrix)
pseudo_doublet_adata = pseudo_doublet_adata.T
pseudo_doublet_adata.obs_names = names
pseudo_doublet_adata.var_names = bdata.var_names
pseudo_doublet_adata.obs["doublet"] = 1.0

In [38]:
print(pseudo_doublet_adata.shape)
print(pseudo_doublet_adata.obs_names)
print(pseudo_doublet_adata.var_names)
print(pseudo_doublet_adata.obs)
print(pseudo_doublet_adata.X)

(1107, 25051)
Index(['rDbl.1', 'rDbl.2', 'rDbl.3', 'rDbl.4', 'rDbl.5', 'rDbl.6', 'rDbl.7',
       'rDbl.8', 'rDbl.9', 'rDbl.10',
       ...
       'rDbl.1098', 'rDbl.1099', 'rDbl.1100', 'rDbl.1101', 'rDbl.1102',
       'rDbl.1103', 'rDbl.1104', 'rDbl.1105', 'rDbl.1106', 'rDbl.1107'],
      dtype='object', length=1107)
Index(['LOC139071432', 'LOC107385520', 'LOC139072245', 'LOC139063175',
       'LOC139064302', 'LOC139072279', 'LOC129159217', 'LOC139071444',
       'LOC129160091', 'LOC107372990',
       ...
       'KEG92_t17', 'KEG92_t18', 'KEG92_t19', 'KEG92_p03', 'KEG92_p02',
       'KEG92_t20', 'KEG92_p01', 'KEG92_t21', 'KEG92_t22', 'KEG92_t23'],
      dtype='object', length=25051)
           doublet
rDbl.1         1.0
rDbl.2         1.0
rDbl.3         1.0
rDbl.4         1.0
rDbl.5         1.0
...            ...
rDbl.1103      1.0
rDbl.1104      1.0
rDbl.1105      1.0
rDbl.1106      1.0
rDbl.1107      1.0

[1107 rows x 1 columns]
<Compressed Sparse Row sparse matrix of dtype 'float64

#### Concatenating adata objects

In [39]:
adata_combined = anndata.concat([pseudo_doublet_adata, bdata], axis=0, join="inner")

In [40]:
print(adata_combined.shape)

(12184, 25051)


In [41]:
print(adata_combined.obs)
print(adata_combined.var)
print(adata_combined.X)

Empty DataFrame
Columns: []
Index: [rDbl.1, rDbl.2, rDbl.3, rDbl.4, rDbl.5, rDbl.6, rDbl.7, rDbl.8, rDbl.9, rDbl.10, rDbl.11, rDbl.12, rDbl.13, rDbl.14, rDbl.15, rDbl.16, rDbl.17, rDbl.18, rDbl.19, rDbl.20, rDbl.21, rDbl.22, rDbl.23, rDbl.24, rDbl.25, rDbl.26, rDbl.27, rDbl.28, rDbl.29, rDbl.30, rDbl.31, rDbl.32, rDbl.33, rDbl.34, rDbl.35, rDbl.36, rDbl.37, rDbl.38, rDbl.39, rDbl.40, rDbl.41, rDbl.42, rDbl.43, rDbl.44, rDbl.45, rDbl.46, rDbl.47, rDbl.48, rDbl.49, rDbl.50, rDbl.51, rDbl.52, rDbl.53, rDbl.54, rDbl.55, rDbl.56, rDbl.57, rDbl.58, rDbl.59, rDbl.60, rDbl.61, rDbl.62, rDbl.63, rDbl.64, rDbl.65, rDbl.66, rDbl.67, rDbl.68, rDbl.69, rDbl.70, rDbl.71, rDbl.72, rDbl.73, rDbl.74, rDbl.75, rDbl.76, rDbl.77, rDbl.78, rDbl.79, rDbl.80, rDbl.81, rDbl.82, rDbl.83, rDbl.84, rDbl.85, rDbl.86, rDbl.87, rDbl.88, rDbl.89, rDbl.90, rDbl.91, rDbl.92, rDbl.93, rDbl.94, rDbl.95, rDbl.96, rDbl.97, rDbl.98, rDbl.99, rDbl.100, ...]

[12184 rows x 0 columns]
Empty DataFrame
Columns: []
Index: [LOC13

In [42]:
adata_combined.write("pseudo_doublets_sc85_1g_2.0.h5ad")

### Exporting Pseudo Doublets

In [43]:
with open("./results/pseudo_sc85_1g_2.0_ids.txt", "w") as txt_file:
    for cell in names:
        txt_file.write(cell + "\n")