In [11]:
import scvelo as scv
import anndata as ad
import scanpy as sc
import pandas as pd
import mygene
from scipy.sparse import csr_matrix, save_npz, load_npz

In [2]:
# 1. Raw count table.
#    The file is read as tab-separated text; its header row becomes the column labels
#    and its first column becomes the row index. The working assumption is that the
#    header holds cell names and the first column holds gene names.
counts = pd.read_table(
    "/home/users/kzlin/kzlinlab/data/greanleaf_brain_multiome/rna_counts.tsv.gz",
    index_col=0,
)

# 2. Per-cell metadata table.
#    Assumed to have one row per cell, with the first column (used as the index here)
#    matching the cell names found in `counts.columns`.
cell_metadata = pd.read_table(
    "/home/users/kzlin/kzlinlab/data/greanleaf_brain_multiome/rna_cell_metadata.txt",
    index_col=0,
)

In [3]:
# Inspect the row labels of the counts table; the output below lists
# ENSG-prefixed gene identifiers.
counts.index

Index(['ENSG00000243485', 'ENSG00000237613', 'ENSG00000186092',
       'ENSG00000238009', 'ENSG00000239945', 'ENSG00000239906',
       'ENSG00000241599', 'ENSG00000236601', 'ENSG00000284733',
       'ENSG00000235146',
       ...
       'ENSG00000198712', 'ENSG00000228253', 'ENSG00000198899',
       'ENSG00000198938', 'ENSG00000198840', 'ENSG00000212907',
       'ENSG00000198886', 'ENSG00000198786', 'ENSG00000198695',
       'ENSG00000198727'],
      dtype='object', length=33355)

In [6]:
# Report the table dimensions (rows, columns) and its Python type, one per line.
print(counts.shape, type(counts), sep="\n")

(33355, 57868)
<class 'pandas.core.frame.DataFrame'>


In [5]:
# Display the counts table; pandas renders a truncated preview of rows and columns.
counts

Unnamed: 0,hft_w20_p3_r1_AAACCCAAGCTGCGAA,hft_w20_p3_r1_AAACCCAAGGTAGTAT,hft_w20_p3_r1_AAACCCACAACTCCAA,hft_w20_p3_r1_AAACCCACATAGTCAC,hft_w20_p3_r1_AAACCCAGTACAGGTG,hft_w20_p3_r1_AAACCCAGTACGGTTT,hft_w20_p3_r1_AAACCCAGTACTCGCG,hft_w20_p3_r1_AAACCCAGTATGTCCA,hft_w20_p3_r1_AAACCCAGTGTATTCG,hft_w20_p3_r1_AAACCCAGTTGCTCAA,...,hft_w16_p7_r2_TTTGGTTTCACGACTA,hft_w16_p7_r2_TTTGGTTTCCCTTTGG,hft_w16_p7_r2_TTTGGTTTCCTCAGAA,hft_w16_p7_r2_TTTGGTTTCGGAATGG,hft_w16_p7_r2_TTTGTTGCAATTCGTG,hft_w16_p7_r2_TTTGTTGCAGCACCCA,hft_w16_p7_r2_TTTGTTGCAGGCTACC,hft_w16_p7_r2_TTTGTTGGTCGCTTAA,hft_w16_p7_r2_TTTGTTGGTCGTACAT,hft_w16_p7_r2_TTTGTTGGTTAGTTCG
ENSG00000243485,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000237613,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000186092,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000238009,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000239945,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000212907,0,2,3,0,0,0,0,0,2,1,...,1,1,0,0,0,0,1,0,0,0
ENSG00000198886,6,76,39,7,34,42,48,19,40,29,...,9,21,2,5,30,15,21,12,10,3
ENSG00000198786,1,5,5,3,11,3,5,7,9,2,...,2,7,1,2,3,3,5,3,0,0
ENSG00000198695,0,1,0,0,1,0,0,1,0,2,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Convert the dense counts table into a compressed-sparse-row matrix
# (same orientation as `counts`: rows follow counts.index, columns follow counts.columns).
sparse_counts = csr_matrix(counts.to_numpy())

In [12]:
# `sparse_counts` is expected to be a csr_matrix at this point.
# Write it to disk in SciPy's .npz format.
save_npz(
    file="/home/users/kzlin/kzlinlab/projects/veloUncertainty/out/kevin/Writeup13/rna_sparse_counts.npz",
    matrix=sparse_counts,
)

# To read the matrix back in a later session:
# loaded_sparse_counts = load_npz("/home/users/kzlin/kzlinlab/projects/veloUncertainty/out/kevin/Writeup13/rna_sparse_counts.npz")

In [None]:
# The .npz file stores only the numeric matrix, so write the row labels (genes) and
# column labels (cells) to plain-text files, one label per line, without a header.
counts.index.to_frame(index=False).to_csv(
    "/home/users/kzlin/kzlinlab/projects/veloUncertainty/out/kevin/Writeup13/rna_gene_names.txt",
    header=False,
    index=False,
)
counts.columns.to_frame(index=False).to_csv(
    "/home/users/kzlin/kzlinlab/projects/veloUncertainty/out/kevin/Writeup13/rna_cell_names.txt",
    header=False,
    index=False,
)

In [14]:
# 3. Spliced count table (the unspliced table is read in a separate cell).
#    Parsed the same way as the raw counts: tab-separated, header row as column labels,
#    first column as the row index (assumed: genes as rows, cells as columns).
spliced = pd.read_table(
    "/home/users/kzlin/kzlinlab/data/greanleaf_brain_multiome/rna_spliced_counts.tsv.gz",
    index_col=0,
)

In [18]:
# Inspect the row labels of the spliced table; the output below reports 32648 entries,
# fewer than the 33355 rows of `counts`.
spliced.index

Index(['ENSG00000243485', 'ENSG00000237613', 'ENSG00000186092',
       'ENSG00000238009', 'ENSG00000239945', 'ENSG00000239906',
       'ENSG00000241599', 'ENSG00000236601', 'ENSG00000284733',
       'ENSG00000235146',
       ...
       'ENSG00000198712', 'ENSG00000228253', 'ENSG00000198899',
       'ENSG00000198938', 'ENSG00000198840', 'ENSG00000212907',
       'ENSG00000198886', 'ENSG00000198786', 'ENSG00000198695',
       'ENSG00000198727'],
      dtype='object', length=32648)

In [19]:
# Build a CSR copy of the spliced table and save it to disk, together with its row
# labels (genes) and column labels (cells) as headerless one-per-line text files.
sparse_spliced_counts = csr_matrix(spliced.to_numpy())
save_npz(
    file="/home/users/kzlin/kzlinlab/projects/veloUncertainty/out/kevin/Writeup13/rna_sparse_spliced_counts.npz",
    matrix=sparse_spliced_counts,
)
spliced.index.to_frame(index=False).to_csv(
    "/home/users/kzlin/kzlinlab/projects/veloUncertainty/out/kevin/Writeup13/rna_spliced_gene_names.txt",
    header=False,
    index=False,
)
spliced.columns.to_frame(index=False).to_csv(
    "/home/users/kzlin/kzlinlab/projects/veloUncertainty/out/kevin/Writeup13/rna_spliced_cell_names.txt",
    header=False,
    index=False,
)

In [20]:
# Unspliced count table: tab-separated, header row as column labels,
# first column as the row index.
unspliced = pd.read_table(
    "/home/users/kzlin/kzlinlab/data/greanleaf_brain_multiome/rna_unspliced_counts.tsv.gz",
    index_col=0,
)

In [21]:
# Build a CSR copy of the unspliced table and save it to disk, together with its row
# labels (genes) and column labels (cells) as headerless one-per-line text files.
sparse_unspliced_counts = csr_matrix(unspliced.to_numpy())
save_npz(
    file="/home/users/kzlin/kzlinlab/projects/veloUncertainty/out/kevin/Writeup13/rna_sparse_unspliced_counts.npz",
    matrix=sparse_unspliced_counts,
)
unspliced.index.to_frame(index=False).to_csv(
    "/home/users/kzlin/kzlinlab/projects/veloUncertainty/out/kevin/Writeup13/rna_unspliced_gene_names.txt",
    header=False,
    index=False,
)
unspliced.columns.to_frame(index=False).to_csv(
    "/home/users/kzlin/kzlinlab/projects/veloUncertainty/out/kevin/Writeup13/rna_unspliced_cell_names.txt",
    header=False,
    index=False,
)

In [23]:
# Inspect the column labels of the unspliced table; the output below reports 57868
# entries, the same number as the columns of `counts`.
unspliced.columns

Index(['hft_w20_p3_r1_AAACCCAAGCTGCGAA', 'hft_w20_p3_r1_AAACCCAAGGTAGTAT',
       'hft_w20_p3_r1_AAACCCACAACTCCAA', 'hft_w20_p3_r1_AAACCCACATAGTCAC',
       'hft_w20_p3_r1_AAACCCAGTACAGGTG', 'hft_w20_p3_r1_AAACCCAGTACGGTTT',
       'hft_w20_p3_r1_AAACCCAGTACTCGCG', 'hft_w20_p3_r1_AAACCCAGTATGTCCA',
       'hft_w20_p3_r1_AAACCCAGTGTATTCG', 'hft_w20_p3_r1_AAACCCAGTTGCTCAA',
       ...
       'hft_w16_p7_r2_TTTGGTTTCACGACTA', 'hft_w16_p7_r2_TTTGGTTTCCCTTTGG',
       'hft_w16_p7_r2_TTTGGTTTCCTCAGAA', 'hft_w16_p7_r2_TTTGGTTTCGGAATGG',
       'hft_w16_p7_r2_TTTGTTGCAATTCGTG', 'hft_w16_p7_r2_TTTGTTGCAGCACCCA',
       'hft_w16_p7_r2_TTTGTTGCAGGCTACC', 'hft_w16_p7_r2_TTTGTTGGTCGCTTAA',
       'hft_w16_p7_r2_TTTGTTGGTCGTACAT', 'hft_w16_p7_r2_TTTGTTGGTTAGTTCG'],
      dtype='object', length=57868)

In [24]:
# 4. Put every table into the cell order used by `counts`.
#    `counts`, `spliced`, and `unspliced` need matching cell columns (and, eventually,
#    matching gene rows), and `cell_metadata` needs its rows in that same cell order.
#    Only the cell axis is handled here: columns of the spliced/unspliced tables and
#    rows of the metadata are selected by label, so a cell missing from any of them
#    raises a KeyError.
spliced = spliced.loc[:, counts.columns]
unspliced = unspliced.loc[:, counts.columns]
cell_metadata = cell_metadata.loc[counts.columns, :]

In [25]:
# Genes present in all three tables: start from the raw-count row labels and keep
# only those also found in the spliced and then the unspliced row labels.
common_genes = (
    counts.index
    .intersection(spliced.index)
    .intersection(unspliced.index)
)
print(common_genes)

Index(['ENSG00000243485', 'ENSG00000237613', 'ENSG00000186092',
       'ENSG00000238009', 'ENSG00000239945', 'ENSG00000239906',
       'ENSG00000241599', 'ENSG00000236601', 'ENSG00000284733',
       'ENSG00000235146',
       ...
       'ENSG00000198712', 'ENSG00000228253', 'ENSG00000198899',
       'ENSG00000198938', 'ENSG00000198840', 'ENSG00000212907',
       'ENSG00000198886', 'ENSG00000198786', 'ENSG00000198695',
       'ENSG00000198727'],
      dtype='object', length=32648)


In [None]:
# 5. Create the AnnData object (observations = cells, variables = genes).
#    `counts` is stored genes x cells, so the matrix is transposed for AnnData.
#    The already-built sparse copy is reused instead of `counts.T.values`: the table is
#    33355 x 57868 (about 1.9 billion entries), so a second dense, transposed array
#    would be a large and unnecessary allocation.
#
#    NOTE(review): `counts` has 33355 genes while `spliced`/`unspliced` have 32648;
#    restrict to `common_genes` before attaching spliced/unspliced layers.

# Guard against stale kernel state: the sparse copy must still describe `counts`.
assert sparse_counts.shape == counts.shape, (
    "sparse_counts does not match counts; re-run the cell that builds sparse_counts"
)
# obs rows must line up one-to-one with the cell columns of `counts`.
assert cell_metadata.index.equals(counts.columns), (
    "cell_metadata rows are not aligned with counts.columns"
)

adata = ad.AnnData(
    X=sparse_counts.T.tocsr(),  # cells x genes, kept sparse
    obs=cell_metadata,
    var=pd.DataFrame(index=counts.index),  # gene metadata; only the gene index for now
)