In [1]:
import scanpy as sc
import pandas as pd
import scipy.io
import os

# Parameters

In [2]:
# The values in this parameters cell are defaults; they will be overridden by
# values specified at execution time.
# Directory the input files are read from (joined with sample_name below).
path_data = "../test"
# Directory the generated .h5ad files are written to.
path_out = "../test"
# Prefix shared by every input and output file name in this notebook.
sample_name = "test"

In [3]:
# List the input directory so the files available to this run are visible.
# Done in pure Python rather than `!ls $path_data`: path_data is supplied at
# execution time, and interpolating it unquoted into a shell command breaks on
# paths containing spaces or shell metacharacters (and is not portable).
sorted(os.listdir(path_data))

test.doublet-score.pickle       test_sparse_counts_barcodes.csv
test.doublets.pickle            test_sparse_counts_genes.csv
test.filtered.h5ad              test_sparse_molecule_counts.mtx
test.raw.h5ad                   test_sparse_read_counts.mtx
test_dense.csv


# Dense Count Matrix

## Load

In [4]:
# Read <sample_name>_dense.csv from path_data; the first CSV column is used
# as the row index. The frame is displayed as the cell output.
dense_csv_path = os.path.join(path_data, sample_name + "_dense.csv")
df_dense = pd.read_csv(dense_csv_path, index_col=0)
df_dense

Unnamed: 0,CLUSTER,ARHGAP33,UPF1,SPPL2B,C19ORF60,TBXA2R,PAF1,MARK4,CCDC124,CEACAM21,...,AC026803.1,METAZOA_SRP.6,SEPT14P19,CTD-3113P16.11,LLNLF-173C4.2,WASH5P,LINC01002,AC008993.2,BISPR,LLNLR-245B6.1
120703436113835,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
120703436351717,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
120726897253220,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
120769892956524,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
120778570709861,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240695043287403,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
241030031863077,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
241038756308851,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
241038775465902,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Drop the CLUSTER Column

In [5]:
# Remove the CLUSTER column, leaving the remaining columns untouched.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it once the column is already gone is a no-op
# rather than a KeyError.
df_dense = df_dense.drop(columns=["CLUSTER"], errors="ignore")

In [6]:
# Re-display the frame to confirm that the CLUSTER column has been removed.
df_dense

Unnamed: 0,ARHGAP33,UPF1,SPPL2B,C19ORF60,TBXA2R,PAF1,MARK4,CCDC124,CEACAM21,TRAPPC6A,...,AC026803.1,METAZOA_SRP.6,SEPT14P19,CTD-3113P16.11,LLNLF-173C4.2,WASH5P,LINC01002,AC008993.2,BISPR,LLNLR-245B6.1
120703436113835,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
120703436351717,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
120726897253220,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
120769892956524,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
120778570709861,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240695043287403,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
241030031863077,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
241038756308851,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
241038775465902,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Convert to AnnData

In [7]:
# Cast the row labels to strings before building the AnnData object
# (presumably because AnnData expects string observation names — confirm).
df_dense.index = df_dense.index.astype(str)

In [8]:
# Wrap the dense table in an AnnData object: its rows become observations and
# its columns become variables (see the .obs / .var listings below).
adata_filtered = sc.AnnData(df_dense)

In [9]:
# Show the AnnData summary (n_obs × n_vars) as the cell output.
adata_filtered

AnnData object with n_obs × n_vars = 1087 × 1320

In [10]:
# Inspect the variable annotations; the index carries the column names of df_dense.
adata_filtered.var

ARHGAP33
UPF1
SPPL2B
C19ORF60
TBXA2R
...
WASH5P
LINC01002
AC008993.2
BISPR
LLNLR-245B6.1


In [11]:
# Inspect the observation annotations; the index carries the (string) row labels of df_dense.
adata_filtered.obs

120703436113835
120703436351717
120726897253220
120769892956524
120778570709861
...
240695043287403
241030031863077
241038756308851
241038775465902
241184489491892


In [12]:
# Write a summary of the filtered object to stdout, labelled "FILTERED".
print("FILTERED", adata_filtered)

FILTERED AnnData object with n_obs × n_vars = 1087 × 1320


## Write to Disk

In [13]:
# Save the filtered AnnData to <path_out>/<sample_name>.filtered.h5ad.
filtered_h5ad_path = os.path.join(path_out, sample_name + ".filtered.h5ad")
adata_filtered.write_h5ad(filtered_h5ad_path)

# Sparse Count Matrix

## Load

In [14]:
# Read the molecule-count matrix from its Matrix Market (.mtx) file.
mol_counts = scipy.io.mmread(
    os.path.join(path_data, sample_name + "_sparse_molecule_counts.mtx")
)

In [15]:
# Inspect the loaded matrix (shape, dtype, number of stored elements, storage format).
mol_counts

<5941x1347 sparse matrix of type '<class 'numpy.int64'>'
	with 64426 stored elements in COOrdinate format>

In [16]:
# Read the read-count matrix from its Matrix Market (.mtx) file.
# Currently not being used: it is loaded and displayed, but nothing below
# stores it in an AnnData object.
read_counts = scipy.io.mmread(
    os.path.join(path_data, sample_name + "_sparse_read_counts.mtx")
)

In [17]:
# Inspect the loaded matrix (shape, dtype, number of stored elements, storage format).
read_counts

<5941x1347 sparse matrix of type '<class 'numpy.int64'>'
	with 64426 stored elements in COOrdinate format>

In [18]:
# Load the gene names: the CSV has no header row, its first column is used as
# the index, and the column labelled 1 holds the names.
genes_csv_path = os.path.join(path_data, sample_name + "_sparse_counts_genes.csv")
genes_table = pd.read_csv(genes_csv_path, index_col=0, header=None)
features = genes_table[1].values
features

array(['ARHGAP33', 'UPF1', 'SPPL2B', ..., 'AC008993.2', 'BISPR',
       'LLNLR-245B6.1'], dtype=object)

In [19]:
# Load the barcodes: the CSV has no header row, its first column is used as
# the index, and the column labelled 1 holds the barcodes, which are cast to
# str (matching the string row labels used for the dense matrix).
barcodes_csv_path = os.path.join(path_data, sample_name + "_sparse_counts_barcodes.csv")
barcodes_table = pd.read_csv(barcodes_csv_path, index_col=0, header=None)
barcodes = barcodes_table[1].values.astype(str)
barcodes

array(['120703424092390', '120703436056350', '120703436113835', ...,
       '241184490047774', '241184504207219', '241184535430878'],
      dtype='<U21')

## Convert to AnnData

In [20]:
# .X contains molecule counts.
# toarray() produces a plain 2-D ndarray; the previous todense() call returns
# the legacy np.matrix class, which NumPy advises against using.
# NOTE(review): the matrix is very sparse; passing the sparse matrix itself
# would save memory but changes the type of .X for downstream readers — confirm
# before switching.
adata_raw = sc.AnnData(mol_counts.toarray())

In [21]:
# Label the rows with the barcodes and the columns with the gene names loaded
# above. This assumes both CSV files list their entries in the same order as
# the matrix rows/columns — TODO confirm against the producer of these files.
adata_raw.obs_names = barcodes
adata_raw.var_names = features

In [22]:
# Show the AnnData summary (n_obs × n_vars) as the cell output.
adata_raw

AnnData object with n_obs × n_vars = 5941 × 1347

In [23]:
# Inspect the observation annotations; the index carries the barcodes assigned above.
adata_raw.obs

120703424092390
120703436056350
120703436113835
120703436319462
120703436351717
...
241184489491892
241184489753509
241184490047774
241184504207219
241184535430878


In [24]:
# Inspect the variable annotations; the index carries the gene names assigned above.
adata_raw.var

ARHGAP33
UPF1
SPPL2B
C19ORF60
TBXA2R
...
WASH5P
LINC01002
AC008993.2
BISPR
LLNLR-245B6.1


In [25]:
# Molecule counts: display the data matrix stored in .X.
adata_raw.X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [26]:
# Write a summary of the raw object to stdout, labelled "RAW".
print("RAW", adata_raw)

RAW AnnData object with n_obs × n_vars = 5941 × 1347


## Write to Disk

In [27]:
# Save the raw AnnData to <path_out>/<sample_name>.raw.h5ad.
raw_h5ad_path = os.path.join(path_out, sample_name + ".raw.h5ad")
adata_raw.write_h5ad(raw_h5ad_path)