In [1]:
import anndata as ad
import pandas as pd
import scanpy as sc
from pathlib import Path                
from scipy.io import mmwrite            

In [2]:
!pip list

Package             Version
------------------- ------------
absl-py             2.1.0
aiohttp             3.9.5
aiosignal           1.3.1
anndata             0.10.8
appnope             0.1.4
array_api_compat    1.7.1
asttokens           2.4.1
async-timeout       4.0.3
attrs               23.2.0
chex                0.1.86
click               8.1.8
comm                0.2.2
contextlib2         21.6.0
contourpy           1.2.1
cycler              0.12.1
debugpy             1.6.7
decorator           5.1.1
docrep              0.3.2
etils               1.5.2
exceptiongroup      1.2.0
executing           2.0.1
filelock            3.15.4
flax                0.8.5
fonttools           4.53.1
frozenlist          1.4.1
fsspec              2024.6.1
get-annotations     0.1.2
h5py                3.11.0
idna                3.7
igraph              0.11.8
importlib_metadata  8.0.0
importlib_resources 6.4.0
ipykernel           6.29.5
ipython             8.18.1
jax                 0.4.30
jaxlib          

In [3]:
# Data source: https://cellxgene.cziscience.com/collections/1ca90a2d-2943-483d-b678-b809bf464c30
# Dataset: the "Microglia-PVM - MTG: Seattle Alzheimer's Disease Atlas (SEA-AD)" cells

# Location of the downloaded .h5ad file on the local machine
sea_ad_h5ad = "/Users/kevinlin/Library/CloudStorage/Dropbox/Collaboration-and-People/sumie-katie/out/ADRC_workshop_2025/ec43c19b-1693-42c8-9200-423d649aa8cf.h5ad"

# Read the file into an AnnData object and display its summary
adata = ad.read_h5ad(sea_ad_h5ad)
adata

AnnData object with n_obs × n_vars = 40000 × 36412
    obs: 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'is_primary_data', 'Neurotypical reference', 'Class', 'Subclass', 'Supertype', 'Age at death', 'Years of education', 'Cognitive status', 'ADNC', 'Braak stage', 'Thal phase', 'CERAD score', 'APOE4 status', 'Lewy body disease pathology', 'LATE-NC stage', 'Microinfarct pathology', 'Specimen ID', 'donor_id', 'PMI', 'Number of UMIs', 'Genes detected', 'Fraction mitochrondrial UMIs', 'suspension_type', 'development_stage_ontology_term_id', 'Continuous Pseudo-progression Score', 'tissue_type', 'cell_type', 'assay', 'disease', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    uns: 'ADNC_colors', 'APOE

In [4]:
# Swap the variable (gene) identifiers for the names held in var["feature_name"].
# The index is printed before and after so the change is visible in the output.
print(adata.var_names)

# Cast the column to plain strings before using it as the new index
gene_symbols = adata.var["feature_name"].astype(str)
adata.var_names = gene_symbols

print(adata.var_names)

Index(['ENSG00000000003', 'ENSG00000000005', 'ENSG00000000419',
       'ENSG00000000457', 'ENSG00000000460', 'ENSG00000000938',
       'ENSG00000000971', 'ENSG00000001036', 'ENSG00000001084',
       'ENSG00000001167',
       ...
       'ENSG00000288616', 'ENSG00000288631', 'ENSG00000288642',
       'ENSG00000288649', 'ENSG00000288675', 'ENSG00000288701',
       'ENSG00000288702', 'ENSG00000288705', 'ENSG00000288709',
       'ENSG00000288722'],
      dtype='object', length=36412)
Index(['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112', 'FGR', 'CFH', 'FUCA2',
       'GCLC', 'NFYA',
       ...
       'ELOA3DP', 'ELOA3P', 'CDR1', 'ACTL10', 'PANO1', 'PRRC2B', 'UGT1A3',
       'UGT1A5', 'F8A2', 'F8A1'],
      dtype='object', name='feature_name', length=36412)


In [5]:
# Remove every entry stored in .uns (the load output lists entries such as
# 'ADNC_colors' there). One of several steps that strip the object down
# before it is written out as adata_simplified.h5ad.
adata.uns.clear()          # empties the existing mapping in place; nothing is reassigned
adata

AnnData object with n_obs × n_vars = 40000 × 36412
    obs: 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'is_primary_data', 'Neurotypical reference', 'Class', 'Subclass', 'Supertype', 'Age at death', 'Years of education', 'Cognitive status', 'ADNC', 'Braak stage', 'Thal phase', 'CERAD score', 'APOE4 status', 'Lewy body disease pathology', 'LATE-NC stage', 'Microinfarct pathology', 'Specimen ID', 'donor_id', 'PMI', 'Number of UMIs', 'Genes detected', 'Fraction mitochrondrial UMIs', 'suspension_type', 'development_stage_ontology_term_id', 'Continuous Pseudo-progression Score', 'tissue_type', 'cell_type', 'assay', 'disease', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    obsm: 'X_scVI', 'X_umap'


In [6]:
# Remove every matrix stored in .layers.
# NOTE(review): the AnnData summary shows no `layers` entry either before or
# after this cell, so this looks like a precaution — confirm.
adata.layers.clear()      # empties the existing mapping in place
adata

AnnData object with n_obs × n_vars = 40000 × 36412
    obs: 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'is_primary_data', 'Neurotypical reference', 'Class', 'Subclass', 'Supertype', 'Age at death', 'Years of education', 'Cognitive status', 'ADNC', 'Braak stage', 'Thal phase', 'CERAD score', 'APOE4 status', 'Lewy body disease pathology', 'LATE-NC stage', 'Microinfarct pathology', 'Specimen ID', 'donor_id', 'PMI', 'Number of UMIs', 'Genes detected', 'Fraction mitochrondrial UMIs', 'suspension_type', 'development_stage_ontology_term_id', 'Continuous Pseudo-progression Score', 'tissue_type', 'cell_type', 'assay', 'disease', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    obsm: 'X_scVI', 'X_umap'


In [7]:
# Delete all pairwise-observation matrices (connectivities / distances) in .obsp.
# NOTE(review): the AnnData summary shows no `obsp` entry either before or
# after this cell, so this looks like a precaution — confirm.
adata.obsp.clear()        # empties the existing mapping in place
adata

AnnData object with n_obs × n_vars = 40000 × 36412
    obs: 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'is_primary_data', 'Neurotypical reference', 'Class', 'Subclass', 'Supertype', 'Age at death', 'Years of education', 'Cognitive status', 'ADNC', 'Braak stage', 'Thal phase', 'CERAD score', 'APOE4 status', 'Lewy body disease pathology', 'LATE-NC stage', 'Microinfarct pathology', 'Specimen ID', 'donor_id', 'PMI', 'Number of UMIs', 'Genes detected', 'Fraction mitochrondrial UMIs', 'suspension_type', 'development_stage_ontology_term_id', 'Continuous Pseudo-progression Score', 'tissue_type', 'cell_type', 'assay', 'disease', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    obsm: 'X_scVI', 'X_umap'

In [8]:
# Raw count data: show the top-left 5x5 corner of adata.raw.X as a dense array
adata.raw.X[:5, :5].toarray()

array([[0., 0., 2., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 3., 0., 0.],
       [0., 0., 0., 2., 1.],
       [0., 0., 0., 0., 0.]], dtype=float32)

In [9]:
# Export the raw counts held in `adata.raw` as a 10x-style triplet of files
# (matrix.mtx, features.tsv, barcodes.tsv) inside `out_dir`.

# 1) Make a *stand-alone* AnnData built from what is stored in `adata.raw`
adata_raw = adata.raw.to_adata()

# 2) Choose an output directory and create it if needed.
#    parents=True also creates missing intermediate directories, so the cell
#    does not fail when the parent folder has not been created yet.
out_dir = Path("/Users/kevinlin/Library/CloudStorage/Dropbox/Collaboration-and-People/sumie-katie/out/ADRC_workshop_2025/adata_raw_10x")
out_dir.mkdir(parents=True, exist_ok=True)

# 3) (a) Write the matrix in Matrix Market format.
#        AnnData holds observations (cells) in rows and variables (genes) in
#        columns; the transpose gives the features x barcodes orientation of a
#        10x matrix.mtx. `.tocsr()` converts the transposed matrix to CSR
#        (transposing generally changes the sparse storage format, so this is
#        not necessarily a no-op).
mmwrite(out_dir / "matrix.mtx", adata_raw.X.T.tocsr())

#    (b) Write the feature table
#        • column order: <gene_id> <gene_name> <feature_type>
#        • 10x v3+ spec calls this 'features.tsv' (earlier kits used 'genes.tsv')
features = adata_raw.var
features_out = pd.DataFrame({
    "gene_id"      : features.index,           # index of raw.var, used as the stable identifier
    "gene_name"    : features["feature_name"],
    "feature_type" : features.get("feature_biotype", "Gene")  # fall back to "Gene" if the column is absent
})
features_out.to_csv(out_dir / "features.tsv", sep="\t", header=False, index=False)

#    (c) Write the barcode file: one observation name per line.
#        sep="\t" keeps the writer in TSV mode, so a name containing a comma
#        is not wrapped in quotes as it would be under the default CSV separator.
pd.Series(adata_raw.obs_names).to_csv(out_dir / "barcodes.tsv", sep="\t", header=False, index=False)

# NOTE(review): the three files are written uncompressed. Some 10x readers look
# for gzip-compressed files (*.gz) when a 'features.tsv' layout is used —
# confirm what the downstream loader expects.

In [10]:
# Delete the raw data from the object. The raw counts were already exported
# to the adata_raw_10x directory in the previous cell.
adata.raw = None        # assigning None discards the stored raw data (this is a reassignment, not an in-place clear)
adata

AnnData object with n_obs × n_vars = 40000 × 36412
    obs: 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'is_primary_data', 'Neurotypical reference', 'Class', 'Subclass', 'Supertype', 'Age at death', 'Years of education', 'Cognitive status', 'ADNC', 'Braak stage', 'Thal phase', 'CERAD score', 'APOE4 status', 'Lewy body disease pathology', 'LATE-NC stage', 'Microinfarct pathology', 'Specimen ID', 'donor_id', 'PMI', 'Number of UMIs', 'Genes detected', 'Fraction mitochrondrial UMIs', 'suspension_type', 'development_stage_ontology_term_id', 'Continuous Pseudo-progression Score', 'tissue_type', 'cell_type', 'assay', 'disease', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    obsm: 'X_scVI', 'X_umap'

In [11]:
# Normalized data: show the top-left 5x5 corner of adata.X as a dense array
adata.X[:5, :5].toarray()

array([[0.       , 0.       , 1.0842869, 0.       , 0.       ],
       [0.       , 0.       , 0.       , 0.       , 0.       ],
       [0.       , 0.       , 1.8683271, 0.       , 0.       ],
       [0.       , 0.       , 0.       , 1.5740774, 1.0692327],
       [0.       , 0.       , 0.       , 0.       , 0.       ]],
      dtype=float32)

In [12]:
# Export the per-observation (cell) metadata table to CSV
obs_csv = "/Users/kevinlin/Library/CloudStorage/Dropbox/Collaboration-and-People/sumie-katie/out/ADRC_workshop_2025/adata_obs.csv"
adata.obs.to_csv(obs_csv)

# Replace .obs with a column-less frame that keeps only the original index
# (the cell barcodes)
barcodes_only = pd.DataFrame(index=adata.obs.index)
adata.obs = barcodes_only
adata

AnnData object with n_obs × n_vars = 40000 × 36412
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    obsm: 'X_scVI', 'X_umap'

In [13]:
# Export the per-variable (gene) metadata table to CSV
var_csv = "/Users/kevinlin/Library/CloudStorage/Dropbox/Collaboration-and-People/sumie-katie/out/ADRC_workshop_2025/adata_var.csv"
adata.var.to_csv(var_csv)

# Replace .var with a column-less frame that keeps only the original index
# (the feature names assigned to var_names earlier in the notebook)
names_only = pd.DataFrame(index=adata.var.index)
adata.var = names_only
adata

AnnData object with n_obs × n_vars = 40000 × 36412
    obsm: 'X_scVI', 'X_umap'

In [14]:
# Write the stripped-down object to disk, gzip-compressed at level 9
simplified_h5ad = "/Users/kevinlin/Library/CloudStorage/Dropbox/Collaboration-and-People/sumie-katie/out/ADRC_workshop_2025/adata_simplified.h5ad"
adata.write_h5ad(simplified_h5ad, compression='gzip', compression_opts=9)
