In [1]:
import pandas as pd
import numpy as np
from cnmf import cNMF
import scanpy as sc
import os
import scipy.sparse as sp
import yaml
import requests
from starcat import starCAT, BuildConsensusReference

def download_file_from_google_drive(file_id, destination):
    URL = "https://drive.google.com/uc?export=download"

    session = requests.Session()

    response = session.get(URL, params = { 'id' : file_id }, stream = True)
    token = get_confirm_token(response)

    if token:
        params = { 'id' : file_id, 'confirm' : token }
        response = session.get(URL, params = params, stream = True)

    save_response_content(response, destination)    

def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value

    return None

def save_response_content(response, destination):
    CHUNK_SIZE = 32768

    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)

In [2]:
test_dir = '.'
datafn = os.path.join(test_dir, 'example_data.h5ad')
download_file_from_google_drive("1Rr3DWumYaFDOiPpf8sAFw0X2BkJlZNFP", datafn)

## Test initialization

In [3]:
cat_obj = starCAT()

Using reference from starCAT database
Downloading reference TCAT.V1 to cache
Making empty cache directory "./cache"
Caching reference to ./cache/TCAT.V1


In [4]:
cat_obj.ref.iloc[0:5, 0:5]

Unnamed: 0,A1BG,AARD,AARSD1,ABCA1,ABCB1
CellCycle-G2M,2.032614,22.965553,17.423538,3.478179,2.297279
Translation,35.445282,0.0,9.245893,0.477994,0.0
HLA,18.192997,14.63267,2.686475,3.937182,0.0
ISG,0.436212,0.0,18.078197,17.354506,0.0
Mito,10.293049,0.0,52.669895,14.615502,3.341488


In [5]:
cat_obj.scores, cat_obj.score_data

(None,
 {'scores': {'continuous': [{'name': 'ASA',
     'normalization': 'normalized',
     'columns': ['TIMD4/TIM3', 'ICOS/CD38', 'CTLA4/CD38', 'OX40/EBI3']},
    {'name': 'Proliferation',
     'normalization': 'normalized',
     'columns': ['CellCycle-G2M', 'CellCycle-S', 'CellCycle-Late-S']}],
   'discrete': [{'name': 'ASA_binary',
     'normalization': 'normalized',
     'columns': ['TIMD4/TIM3', 'ICOS/CD38', 'CTLA4/CD38', 'OX40/EBI3'],
     'threshold': 0.0625},
    {'name': 'Proliferation_binary',
     'normalization': 'normalized',
     'columns': ['CellCycle-G2M', 'CellCycle-S', 'CellCycle-Late-S'],
     'threshold': 0.1},
    {'name': 'Multinomial_Label',
     'normalization': 'normalized',
     'file': 'multinomial_lineage_classifier.py',
     'function': 'compute_lineage'}]}})

In [6]:
cat_obj.ref_name, cat_obj.score_path

('TCAT.V1', './cache/TCAT.V1/TCAT.V1.scores.yaml')

In [7]:
cat_obj = starCAT(reference = './cache/TCAT.V1/TCAT.V1.reference.tsv')


Using user specified reference spectra file ./cache/TCAT.V1/TCAT.V1.reference.tsv
No scores provided


In [8]:
cat_obj.ref.iloc[0:5, 0:5]

Unnamed: 0,A1BG,AARD,AARSD1,ABCA1,ABCB1
CellCycle-G2M,2.032614,22.965553,17.423538,3.478179,2.297279
Translation,35.445282,0.0,9.245893,0.477994,0.0
HLA,18.192997,14.63267,2.686475,3.937182,0.0
ISG,0.436212,0.0,18.078197,17.354506,0.0
Mito,10.293049,0.0,52.669895,14.615502,3.341488


In [9]:
cat_obj.scores, cat_obj.score_data

(None, None)

In [10]:
reference = 'other'
cat_obj = starCAT(reference = reference)

Using reference from starCAT database


Exception: other is not found in list of pre-built reference names. It is also not a valid path to a reference file which would need to end in .tsv or .txt. Please provide a valid file path or a reference string from among the following [TCAT.V1,MYELOID.GLIOMA.V1,BONEMARROW.CD34POS.HSPC.V1]

## Test ```load_counts```

In [11]:
adata = cat_obj.load_counts(datafn)

In [12]:
test_dir

'.'

In [13]:
X = pd.DataFrame(adata.X.todense(), index = adata.obs.index, columns = adata.var.index)
X_filt = X.head(5000)

In [14]:
fname = 'haoetal_pbmc_multimodal.merged.T.raw.ADTfixedADT_70.20221022FiltForcNMF'

In [15]:
X_filt.to_csv(os.path.join(test_dir, '%s.txt' % fname), sep = '\t')


In [16]:
# X_filt.to_csv(os.path.join(test_dir, '%s.txt' % fname), sep = '\t')
# save_df_to_npz(X_filt, os.path.join(test_dir, '%s.npz' % fname))

In [17]:
counts_fn = os.path.join(test_dir, '%s.txt' % fname)
adata = cat_obj.load_counts(counts_fn)

In [18]:
! test -f pbmc3k_filtered_gene_bc_matrices.tar.gz || curl https://cf.10xgenomics.com/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz -o pbmc3k_filtered_gene_bc_matrices.tar.gz
! tar -xzf pbmc3k_filtered_gene_bc_matrices.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 7443k  100 7443k    0     0  10.8M      0 --:--:-- --:--:-- --:--:-- 10.8M--:--:--     0


In [23]:
mtxfn = 'filtered_gene_bc_matrices/hg19/matrix.mtx'
adata = cat_obj.load_counts(counts_fn)

In [24]:
adata, adata.X

(AnnData object with n_obs × n_vars = 5000 × 20807,
 array([[0., 0., 0., ..., 0., 0., 4.],
        [0., 0., 0., ..., 2., 2., 6.],
        [0., 0., 0., ..., 5., 0., 5.],
        ...,
        [0., 0., 0., ..., 5., 1., 5.],
        [0., 0., 0., ..., 4., 4., 7.],
        [0., 0., 0., ..., 2., 5., 7.]]))

In [25]:
display(adata.obs.head(2)), display(adata.var.head(2))

CATGCCTAGTCGATAA-1-gPlexA4
AAGACCTGTAGCGTCC-1-gPlexC6


OR4F5
OR4F29


(None, None)

## Test ```fit_transform```

In [26]:
cat_obj = starCAT('TCAT.V1')

Using reference from starCAT database
Loading reference from existing cache file for reference TCAT.V1


In [27]:
adata = cat_obj.load_counts(datafn)

In [28]:
usage, scores = cat_obj.fit_transform(adata)

3412 out of 3412 genes in the reference overlap with the query


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [29]:
usage.head(2)

Unnamed: 0,CellCycle-G2M,Translation,HLA,ISG,Mito,Doublet-RBC,gdT,CellCycle-S,Cytotoxic,Doublet-Platelet,...,Tfh-2,OX40/EBI3,CD172a/MERTK,IEG3,Doublet-Fibroblast,SOX4/TOX2,CD40LG/TXNIP,Tph,Exhaustion,Tfh-1
CATGCCTAGTCGATAA-1-gPlexA4,3.9e-05,0.001042,0.001223,0.000162,0.004898,0.002502,0.092426,0.000356,0.226792,0.00054,...,0.001774,0.001226,0.002313,0.000944,0.034961,0.000796,0.036057,0.001384,0.018341,0.002657
AAGACCTGTAGCGTCC-1-gPlexC6,0.000246,0.100023,0.002991,0.042354,0.005411,0.003336,0.024678,0.008877,0.006033,0.003284,...,0.012168,0.002587,0.074996,0.012574,0.010415,0.007521,0.06545,0.000279,0.00143,0.021316


In [30]:
scores.head(2)

Unnamed: 0,ASA,Proliferation,ASA_binary,Proliferation_binary,Multinomial_Label
CATGCCTAGTCGATAA-1-gPlexA4,0.001556,0.00052,False,False,CD8_TEMRA
AAGACCTGTAGCGTCC-1-gPlexC6,0.012503,0.01191,False,False,CD4_Naive


In [31]:
adata.X = np.array(adata.X.todense())

In [32]:
usage, scores = cat_obj.fit_transform(adata)

3412 out of 3412 genes in the reference overlap with the query


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [33]:
usage.head(2)

Unnamed: 0,CellCycle-G2M,Translation,HLA,ISG,Mito,Doublet-RBC,gdT,CellCycle-S,Cytotoxic,Doublet-Platelet,...,Tfh-2,OX40/EBI3,CD172a/MERTK,IEG3,Doublet-Fibroblast,SOX4/TOX2,CD40LG/TXNIP,Tph,Exhaustion,Tfh-1
CATGCCTAGTCGATAA-1-gPlexA4,3.9e-05,0.001042,0.001223,0.000162,0.004898,0.002502,0.09242,0.000356,0.226792,0.00054,...,0.001775,0.001226,0.002313,0.000944,0.034966,0.000796,0.036061,0.001384,0.018342,0.002658
AAGACCTGTAGCGTCC-1-gPlexC6,0.000246,0.100023,0.002991,0.042353,0.005411,0.003336,0.02468,0.008878,0.006033,0.003284,...,0.012167,0.002587,0.074998,0.012574,0.010415,0.007521,0.065452,0.000279,0.00143,0.021316


In [34]:
scores.head(2)

Unnamed: 0,ASA,Proliferation,ASA_binary,Proliferation_binary,Multinomial_Label
CATGCCTAGTCGATAA-1-gPlexA4,0.001556,0.00052,False,False,CD8_TEMRA
AAGACCTGTAGCGTCC-1-gPlexC6,0.012503,0.011911,False,False,CD4_Naive


In [35]:
scores['Multinomial_Label'].value_counts()

Multinomial_Label
CD4_Naive    4408
CD4_CM       2949
CD4_EM       1825
CD8_TEMRA    1367
CD8_EM       1150
CD8_Naive    1072
CD8_CM        916
Treg          661
gdT           360
MAIT          292
Name: count, dtype: int64

In [36]:
cat_obj_noscore = starCAT(reference = './cache/TCAT.V1/TCAT.V1.reference.tsv')

Using user specified reference spectra file ./cache/TCAT.V1/TCAT.V1.reference.tsv
No scores provided


In [37]:
usage, scores = cat_obj_noscore.fit_transform(adata)

3412 out of 3412 genes in the reference overlap with the query




In [38]:
usage.head(2)

Unnamed: 0,CellCycle-G2M,Translation,HLA,ISG,Mito,Doublet-RBC,gdT,CellCycle-S,Cytotoxic,Doublet-Platelet,...,Tfh-2,OX40/EBI3,CD172a/MERTK,IEG3,Doublet-Fibroblast,SOX4/TOX2,CD40LG/TXNIP,Tph,Exhaustion,Tfh-1
CATGCCTAGTCGATAA-1-gPlexA4,3.9e-05,0.001042,0.001223,0.000162,0.004898,0.002502,0.09242,0.000356,0.226792,0.00054,...,0.001775,0.001226,0.002313,0.000944,0.034966,0.000796,0.036061,0.001384,0.018342,0.002658
AAGACCTGTAGCGTCC-1-gPlexC6,0.000246,0.100023,0.002991,0.042353,0.005411,0.003336,0.02468,0.008878,0.006033,0.003284,...,0.012167,0.002587,0.074998,0.012574,0.010415,0.007521,0.065452,0.000279,0.00143,0.021316


In [39]:
scores

In [40]:
cat_obj_noscore.score_path

## Test ```save_usage```

In [41]:
test_dir

'.'

In [42]:
fname = 'haoetal_pbmc_multimodal.merged.T.raw.ADTfixedADT_70.20221022FiltForcNMF'

In [43]:
name = 'starCAT_%s' % fname
name

'starCAT_haoetal_pbmc_multimodal.merged.T.raw.ADTfixedADT_70.20221022FiltForcNMF'

In [44]:
output_dir = test_dir
output_dir

'.'

In [45]:
cat_obj.usage.head(2)

Unnamed: 0,CellCycle-G2M,Translation,HLA,ISG,Mito,Doublet-RBC,gdT,CellCycle-S,Cytotoxic,Doublet-Platelet,...,Tfh-2,OX40/EBI3,CD172a/MERTK,IEG3,Doublet-Fibroblast,SOX4/TOX2,CD40LG/TXNIP,Tph,Exhaustion,Tfh-1
CATGCCTAGTCGATAA-1-gPlexA4,1.668303e-07,4e-06,5e-06,6.942827e-07,2.1e-05,1.1e-05,0.000397,2e-06,0.000975,2e-06,...,8e-06,5e-06,1e-05,4e-06,0.00015,3e-06,0.000155,6e-06,7.9e-05,1.1e-05
AAGACCTGTAGCGTCC-1-gPlexC6,1.502442e-06,0.00061,1.8e-05,0.0002583635,3.3e-05,2e-05,0.000151,5.4e-05,3.7e-05,2e-05,...,7.4e-05,1.6e-05,0.000458,7.7e-05,6.4e-05,4.6e-05,0.000399,2e-06,9e-06,0.00013


In [46]:
cat_obj.scores.head(2)

Unnamed: 0,ASA,Proliferation,ASA_binary,Proliferation_binary,Multinomial_Label
CATGCCTAGTCGATAA-1-gPlexA4,0.001556,0.00052,False,False,CD8_TEMRA
AAGACCTGTAGCGTCC-1-gPlexC6,0.012503,0.011911,False,False,CD4_Naive


In [47]:
cat_obj.save_results(output_dir, name)

Saving usages to ./starCAT_haoetal_pbmc_multimodal.merged.T.raw.ADTfixedADT_70.20221022FiltForcNMF.rf_usage_normalized.txt
Saving scores to ./starCAT_haoetal_pbmc_multimodal.merged.T.raw.ADTfixedADT_70.20221022FiltForcNMF.scores.txt


## Test ```build_reference```

In [48]:
! wget https://zenodo.org/records/13368101/files/Example_refbuilder.tar.gz

--2026-02-18 12:40:52--  https://zenodo.org/records/13368101/files/Example_refbuilder.tar.gz
Resolving zenodo.org (zenodo.org)... 137.138.52.235, 188.185.43.153, 137.138.153.219, ...
Connecting to zenodo.org (zenodo.org)|137.138.52.235|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 24294951 (23M) [application/octet-stream]
Saving to: ‘Example_refbuilder.tar.gz’


2026-02-18 12:40:53 (19.1 MB/s) - ‘Example_refbuilder.tar.gz’ saved [24294951/24294951]



In [49]:
! tar -zxvf Example_refbuilder.tar.gz   

Example_refbuilder/
Example_refbuilder/UKCOVID.20231005.Iter200/
Example_refbuilder/UKCOVID.20231005.Iter200/cnmf_tmp/
Example_refbuilder/UKCOVID.20231005.Iter200/cnmf_tmp/UKCOVID.20231005.Iter200.tpm_stats.df.npz
Example_refbuilder/UKCOVID.20231005.Iter200/UKCOVID.20231005.Iter200.overdispersed_genes.txt
Example_refbuilder/UKCOVID.20231005.Iter200/UKCOVID.20231005.Iter200.gene_spectra_tpm.k_44.dt_0_2.txt
Example_refbuilder/UKCOVID.20231005.Iter200/UKCOVID.20231005.Iter200.gene_spectra_score.k_44.dt_0_2.txt
Example_refbuilder/HIVVaccine.20230907.Iter200/
Example_refbuilder/HIVVaccine.20230907.Iter200/cnmf_tmp/
Example_refbuilder/HIVVaccine.20230907.Iter200/cnmf_tmp/HIVVaccine.20230907.Iter200.tpm_stats.df.npz
Example_refbuilder/HIVVaccine.20230907.Iter200/HIVVaccine.20230907.Iter200.overdispersed_genes.txt
Example_refbuilder/HIVVaccine.20230907.Iter200/HIVVaccine.20230907.Iter200.gene_spectra_tpm.k_31.dt_0_15.txt
Example_refbuilder/HIVVaccine.20230907.Iter200/HIVVaccine.20230907.Iter20

In [50]:
! rm Example_refbuilder.tar.gz   

In [51]:
cnmf_paths = ['Example_refbuilder/COMBAT.20230927.Iter200/',
              'Example_refbuilder/HIVVaccine.20230907.Iter200',
              'Example_refbuilder/UKCOVID.20231005.Iter200']
Ks = [35, 31, 44]
dts = [0.15, 0.15, 0.2]
outdir = './Example_refbuilder'
prefix = 'starcat_ref'

In [52]:
refbuilder = BuildConsensusReference(cnmf_paths, ks=Ks, density_thresholds=dts,
                                    output_dir=outdir, prefix=prefix)

In [53]:
clus_df, spectra_tpm_grouped, spectra_scores_grouped, hvgs_union, top_genes = refbuilder.cluster_cnmf_results()

In [54]:
clus_df.head()

Unnamed: 0,COMBAT.20230927.Iter200,HIVVaccine.20230907.Iter200,UKCOVID.20231005.Iter200
cGEP1,COMBAT.20230927.Iter200:23,,UKCOVID.20231005.Iter200:30
cGEP2,COMBAT.20230927.Iter200:10,HIVVaccine.20230907.Iter200:10,UKCOVID.20231005.Iter200:15
cGEP3,COMBAT.20230927.Iter200:15,HIVVaccine.20230907.Iter200:24,UKCOVID.20231005.Iter200:17
cGEP4,COMBAT.20230927.Iter200:25,HIVVaccine.20230907.Iter200:25,UKCOVID.20231005.Iter200:27
cGEP5,COMBAT.20230927.Iter200:29,HIVVaccine.20230907.Iter200:30,UKCOVID.20231005.Iter200:35


In [55]:
query_adata = sc.read('./Example_refbuilder/example_memoryT_query.h5ad')

In [56]:
custom_tcat = starCAT(reference='./Example_refbuilder/starcat_refstarcat_consensus_spectra_normalized.filtered.txt', 
               cachedir='./Example_refbuilder/starcat_cache')

Using user specified reference spectra file ./Example_refbuilder/starcat_refstarcat_consensus_spectra_normalized.filtered.txt
No scores provided


In [57]:
usage, _ = custom_tcat.fit_transform(query_adata)

2093 out of 3093 genes in the reference overlap with the query


## End

In [58]:
! pip --version
! pip list

pip 25.3 from /home/dak26/miniforge3/envs/cnmf_env_20260122/lib/python3.10/site-packages/pip (python 3.10)
Package                   Version     Editable project location
------------------------- ----------- ----------------------------------------------------------
anndata                   0.10.7
anyio                     4.12.1
argon2-cffi               25.1.0
argon2-cffi-bindings      25.1.0
array-api-compat          1.13.0
arrow                     1.4.0
asttokens                 3.0.1
async-lru                 2.1.0
attrs                     25.4.0
babel                     2.17.0
backports.zstd            1.3.0
beautifulsoup4            4.14.3
biopython                 1.86
bleach                    6.3.0
Brotli                    1.2.0
cached-property           1.5.2
certifi                   2026.1.4
cffi                      2.0.0
charset-normalizer        3.4.4
click                     8.3.1
cloudpickle               3.1.2
cnmf                      1.7.0
colorama          