In [25]:
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
import pandas as pd
pd.set_option('display.max_rows', 1000)
import sys
from pathlib import Path

# The notebook is expected to run from a sub-directory of the repository, so the
# repository root is the parent of the current working directory.
repo_dir = Path.cwd().parent.absolute()

# Make project packages importable. Guarded so that re-running this cell does
# not keep appending duplicate entries to sys.path.
if str(repo_dir) not in sys.path:
    sys.path.append(str(repo_dir))



In [30]:
from src.utils import setup_data_dir
from pathlib import Path
# Project helper from src.utils. Per the recorded cell output it makes sure the
# GenePT embedding archive is present and extracted, skipping files that already exist.
setup_data_dir()
# Root folder for every data file read or written by this notebook.
data_dir = repo_dir / "data"

File already exists at /Users/rj/personal/GenePT-tools/data/GenePT_emebdding_v2.zip
Extracting files...
Extracting GenePT_emebdding_v2/
Skipping GenePT_emebdding_v2/NCBI_UniProt_summary_of_genes.json - already exists with same size
Skipping GenePT_emebdding_v2/GenePT_gene_embedding_ada_text.pickle - already exists with same size
Skipping GenePT_emebdding_v2/GenePT_gene_protein_embedding_model_3_text.pickle. - already exists with same size
Skipping GenePT_emebdding_v2/NCBI_summary_of_genes.json - already exists with same size
Extraction complete!
Setup finished!


In [31]:
import requests

# Direct download URL of the .h5ad file (hosted on datasets.cellxgene.cziscience.com).
dataset = "https://datasets.cellxgene.cziscience.com/10df7690-6d10-4029-a47e-0f071bb2df83.h5ad"
# dataset_id = "10df7690-6d10-4029-a47e-0f071bb2df83"

# Local path the download is cached at; later cells read the file from here.
file_path = data_dir / "1m_cells.h5ad"  # adjust this path as needed




In [32]:

if not file_path.exists():
    # Stream into a temporary sibling file and rename only after the transfer
    # finished. An interrupted download therefore never leaves a truncated
    # .h5ad behind that the exists() check above would accept on the next run.
    partial_path = file_path.with_name(file_path.name + ".part")
    # The timeout bounds the connection attempt and each wait for data, so a
    # stalled server cannot hang the kernel forever.
    with requests.get(dataset, stream=True, timeout=60) as response:
        # Fail loudly on 4xx/5xx instead of saving an error page as the dataset.
        response.raise_for_status()
        with open(partial_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:  # filter out keep-alive chunks
                    file.write(chunk)
    partial_path.replace(file_path)


In [33]:
# import anndata as ad

# # Read the h5ad file in backed mode (memory-mapped, read-only)
# adata = ad.read_h5ad(file_path, backed='r')

In [38]:
import h5py

# Peek at the raw HDF5 layout of the file without loading any expression data.
with h5py.File(file_path, 'r') as f:
    # List the members of the X group (the expression matrix)
    print("Contents of X group:", list(f['X'].keys()))
    
    # List the per-cell (obs) and per-gene (var) metadata entries
    print("\nContents of obs group:", list(f['obs'].keys()))
    print("Contents of var group:", list(f['var'].keys()))
    
    # If X contains a sparse matrix, it likely has 'data', 'indices', and 'indptr';
    # their shapes give the number of stored values and the length of the pointer array
    if 'data' in f['X']:
        print("\nShape of X/data:", f['X']['data'].shape)
        print("Shape of X/indices:", f['X']['indices'].shape)
        print("Shape of X/indptr:", f['X']['indptr'].shape)

Contents of X group: ['data', 'indices', 'indptr']

Contents of obs group: ['10X_run', '_index', '_scvi_batch', '_scvi_labels', 'ambient_removal', 'anatomical_position', 'assay', 'assay_ontology_term_id', 'broad_cell_class', 'cdna_plate', 'cdna_well', 'cell_type', 'cell_type_ontology_term_id', 'compartment', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_assay', 'donor_id', 'donor_method', 'donor_tissue', 'donor_tissue_assay', 'ethnicity_original', 'free_annotation', 'is_primary_data', 'library_plate', 'manually_annotated', 'method', 'n_genes_by_counts', 'notes', 'observation_joinid', 'organism', 'organism_ontology_term_id', 'pct_counts_ercc', 'pct_counts_mt', 'published_2022', 'replicate', 'sample_id', 'sample_number', 'scvi_leiden_donorassay_full', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_in_publication', 'tissue_ontology_term_id'

In [50]:
import numpy as np
from scipy import sparse

def load_subset_sparse(file_path, start_row=0, n_rows=1000):
    """
    Load a contiguous block of rows from the CSR-encoded X matrix of an h5ad file.

    Only the slices of ``X/data`` and ``X/indices`` that belong to the requested
    rows are read, so the full matrix never has to be held in memory.

    Args:
        file_path: Path to h5ad file
        start_row: Index of the first row to load (must be >= 0)
        n_rows: Maximum number of rows to load (must be >= 0). A request that
            runs past the last stored row is truncated to the rows that exist.

    Returns:
        scipy.sparse.csr_matrix holding the loaded rows, with as many columns
        as the stored matrix

    Raises:
        ValueError: If start_row or n_rows is negative, or if X is marked as
            CSC-encoded (its indptr would then index columns, not rows).
    """
    if start_row < 0 or n_rows < 0:
        raise ValueError("start_row and n_rows must be non-negative")

    with h5py.File(file_path, 'r') as f:
        x_group = f['X']
        attrs = x_group.attrs

        # AnnData tags sparse groups with their layout ('encoding-type', or
        # 'h5sparse_format' in legacy files). Row slicing via indptr is only
        # valid for CSR, so refuse CSC instead of returning garbage.
        encoding = attrs.get('encoding-type', attrs.get('h5sparse_format'))
        if isinstance(encoding, bytes):
            encoding = encoding.decode('utf-8')
        if encoding in ('csc_matrix', 'csc'):
            raise ValueError("X is stored in CSC layout; row-wise loading requires CSR")

        # indptr has one entry per row plus a trailing end marker.
        total_rows = x_group['indptr'].shape[0] - 1
        first = min(start_row, total_rows)
        last = min(start_row + n_rows, total_rows)

        # Row pointers for the requested rows (one extra entry marks the end)
        indptr = x_group['indptr'][first:last + 1]
        start_idx = int(indptr[0])
        end_idx = int(indptr[-1])

        # Load only the stored values / column indices of those rows
        data = x_group['data'][start_idx:end_idx]
        indices = x_group['indices'][start_idx:end_idx]

        # Adjust indptr to start at 0
        indptr = indptr - start_idx

        # Take the column count from the matrix's own shape attribute. Counting
        # the categories of var/feature_name (the fallback for files without a
        # shape attribute) only counts *unique* names and would under-count if
        # two genes ever shared a name.
        if 'shape' in attrs:
            n_cols = int(attrs['shape'][1])
        elif 'h5sparse_shape' in attrs:
            n_cols = int(attrs['h5sparse_shape'][1])
        else:
            n_cols = len(f['var']['feature_name']['categories'])

        # Use the number of rows actually read, which may be < n_rows at the tail
        return sparse.csr_matrix((data, indices, indptr), shape=(last - first, n_cols))

# Example usage:
# Load first 1000 cells
matrix_subset = load_subset_sparse(file_path, start_row=0, n_rows=1000)
print("Subset shape:", matrix_subset.shape)
# Density = stored (non-zero) entries divided by the total number of cells in the block
print("Subset density:", matrix_subset.nnz / (matrix_subset.shape[0] * matrix_subset.shape[1]))

Subset shape: (1000, 61759)
Subset density: 0.030184685632863226


In [49]:
# Show the HDF5 dataset holding the categories of var/feature_name (its repr
# includes the shape and dtype).
with h5py.File(file_path, 'r') as f:
    print(f['var']['feature_name']["categories"])
    # print(f['X']['indices'][:10])

<HDF5 dataset "categories": shape (61759,), type "|O">


In [40]:
# NOTE(review): the recorded output below reports shape (1000, 2), which comes
# from an earlier run; the current load_subset_sparse cell prints (1000, 61759).
# Re-run top to bottom to refresh it.
matrix_subset

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 1864176 stored elements and shape (1000, 2)>

In [36]:
# Compare the two gene identifiers stored in var: the Ensembl IDs and the
# categorical feature names.
with h5py.File(file_path, 'r') as f:
    # Handles to the per-gene Ensembl IDs and the feature_name group
    ensembl_ids = f['var']['ensembl_id']
    gene_names = f['var']['feature_name']
    # First ten Ensembl IDs (stored as bytes, with a ".<version>" suffix per the output)
    print(ensembl_ids[0:10])
    # feature_name is categorical: 'categories' holds the names and 'codes'
    # holds, per gene, an index into 'categories'
    print(gene_names['categories'][:10])
    print(gene_names['codes'][:10])


[b'ENSG00000000003.15' b'ENSG00000000005.6' b'ENSG00000000419.14'
 b'ENSG00000000457.14' b'ENSG00000000460.17' b'ENSG00000000938.13'
 b'ENSG00000000971.17' b'ENSG00000001036.14' b'ENSG00000001084.13'
 b'ENSG00000001167.15']
[b'5S_rRNA_ENSG00000276861' b'5S_rRNA_ENSG00000277411'
 b'5S_rRNA_ENSG00000277488' b'5S_rRNA_ENSG00000285609'
 b'5S_rRNA_ENSG00000285626' b'5S_rRNA_ENSG00000285674'
 b'5S_rRNA_ENSG00000285776' b'5S_rRNA_ENSG00000285912'
 b'5S_rRNA_ENSG00000288601' b'5_8S_rRNA_ENSG00000275877']
[58098 57279  6814 52787  2922 28568  4406 28967 29310 41479]


In [10]:
# Build a Series of the dataset's Ensembl IDs with the version suffix removed,
# in the order they are stored in var.
with h5py.File(file_path, 'r') as f:
    # Handles into the var group (gene_names is not used further in this cell)
    gene_names = f['var']['feature_name']
    ensembl_ids = f['var']['ensembl_id']
    # Decode each bytes ID and keep only the part before the first '.',
    # e.g. b'ENSG00000000003.15' -> 'ENSG00000000003'
    major_ensembl_ids = pd.Series(
        ensembl_id.decode('utf-8').split('.')[0]
        for ensembl_id in ensembl_ids
    )
    

In [37]:
n_cols = len(f['var']['feature_name'])  # Total number of possible genes
return sparse.csr_matrix((data, indices, indptr), shape=(n_rows, n_cols))

ValueError: Invalid location identifier (invalid location identifier)

In [11]:
import h5py

# Inspect how per-cell metadata is laid out in the obs group.
with h5py.File(file_path, 'r') as f:
    # Look at the structure of obs group in detail
    obs_group = f['obs']
    print("Type of obs group:", type(obs_group))
    print("Keys in obs group:", list(obs_group.keys()))
    
    # Let's look at one specific column to understand its structure
    cell_type_data = obs_group['cell_type']
    print("\nType of cell_type data:", type(cell_type_data))
    # Only objects that expose a shape (e.g. plain datasets) get their shape printed
    if hasattr(cell_type_data, 'shape'):
        print("Shape of cell_type data:", cell_type_data.shape)

Type of obs group: <class 'h5py._hl.group.Group'>
Keys in obs group: ['10X_run', '_index', '_scvi_batch', '_scvi_labels', 'ambient_removal', 'anatomical_position', 'assay', 'assay_ontology_term_id', 'broad_cell_class', 'cdna_plate', 'cdna_well', 'cell_type', 'cell_type_ontology_term_id', 'compartment', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_assay', 'donor_id', 'donor_method', 'donor_tissue', 'donor_tissue_assay', 'ethnicity_original', 'free_annotation', 'is_primary_data', 'library_plate', 'manually_annotated', 'method', 'n_genes_by_counts', 'notes', 'observation_joinid', 'organism', 'organism_ontology_term_id', 'pct_counts_ercc', 'pct_counts_mt', 'published_2022', 'replicate', 'sample_id', 'sample_number', 'scvi_leiden_donorassay_full', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_in_publication', 'tissue_ontology_term_id', 'tis

In [12]:
from datasets import load_dataset

# Load the "gene_info" configuration of the source-data dataset and convert its
# 'train' split to a DataFrame. Per the recorded output the table has the
# columns gene_name, ensembl_id and gene_type.
gene_info_table_dataset = load_dataset("honicky/genept-composable-embeddings-source-data", "gene_info")
gene_info_table = gene_info_table_dataset['train'].to_pandas()
gene_info_table.head()


  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,gene_name,ensembl_id,gene_type
0,TSPAN6,ENSG00000000003,protein_coding
1,TNMD,ENSG00000000005,protein_coding
2,DPM1,ENSG00000000419,protein_coding
3,SCYL3,ENSG00000000457,protein_coding
4,C1orf112,ENSG00000000460,protein_coding


In [13]:
# Pre-computed gene embeddings read from the generated-embeddings folder.
# NOTE(review): presumably indexed by gene name, since the next step merges on the index — confirm.
gene_embedding = pd.read_parquet(data_dir / "generated/embeddings/embedding_associations_age_drugs_pathways_openai_large.parquet")


# Plan: rows that end up sharing an Ensembl ID are combined by taking the mean embedding


In [15]:
# Attach ensembl_id / gene_type to each embedding by matching the embedding
# table's index against gene_info_table.gene_name. merge() defaults to an inner
# join, so embeddings without a matching gene_name are dropped.
gene_embeddings_with_ensembl_id = gene_embedding.merge(gene_info_table, left_index=True, right_on='gene_name')



In [16]:
# Example of one Ensembl ID that appears on more than one row (two different
# gene names in the recorded output).
gene_embeddings_with_ensembl_id[gene_embeddings_with_ensembl_id.ensembl_id == 'ENSG00000222005']

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3065,3066,3067,3068,3069,3070,3071,gene_name,ensembl_id,gene_type
83316,-0.017322,0.015861,-0.009243,0.002859,0.002974,-0.015873,0.029082,-0.009793,0.026003,0.014036,...,-0.027633,0.008933,-0.018642,0.021656,-0.008706,-0.005382,-0.021889,LINC01118,ENSG00000222005,
83105,-0.028728,0.013531,-0.01305,-0.003901,0.019633,-0.024284,0.029168,0.003008,0.012114,0.019853,...,-0.017858,-0.001864,-0.021449,0.01812,-0.018822,-0.017803,-0.023334,LINC01119,ENSG00000222005,


In [17]:
import numpy as np

# Inner-join the dataset's version-less Ensembl IDs with the embedding table,
# keeping only the identifying columns.
matching_ensembl_ids = (
    pd.DataFrame(major_ensembl_ids, columns=['ensembl_id'])
    .merge(gene_embeddings_with_ensembl_id, on='ensembl_id')
    .loc[:, ['gene_name', 'ensembl_id']]
)

# Count the matched rows per Ensembl ID and show the IDs that matched more than once.
ensembl_id_counts = matching_ensembl_ids['ensembl_id'].value_counts()
ensembl_id_counts.loc[ensembl_id_counts > 1]


ensembl_id
ENSG00000000003    2
ENSG00000243485    2
ENSG00000222005    2
ENSG00000264405    2
ENSG00000204792    2
ENSG00000201388    2
ENSG00000222345    2
ENSG00000276234    2
ENSG00000187838    2
ENSG00000202377    2
ENSG00000226364    2
ENSG00000269433    2
ENSG00000183598    2
ENSG00000267151    2
ENSG00000236790    2
ENSG00000264073    2
ENSG00000265134    2
ENSG00000238936    2
ENSG00000270722    2
ENSG00000226444    2
ENSG00000269099    2
ENSG00000204397    2
ENSG00000249532    2
ENSG00000206603    2
ENSG00000284917    2
ENSG00000268942    2
ENSG00000206903    2
ENSG00000063587    2
ENSG00000274020    2
ENSG00000255154    2
ENSG00000090857    2
ENSG00000254508    2
ENSG00000206897    2
ENSG00000251866    2
ENSG00000226419    2
ENSG00000264448    2
ENSG00000206785    2
ENSG00000197927    2
ENSG00000245080    2
ENSG00000269586    2
ENSG00000227518    2
ENSG00000145491    2
ENSG00000221164    2
ENSG00000250331    2
ENSG00000223770    2
ENSG00000269955    2
ENSG00000207187    2
EN

In [18]:
# Get embeddings without metadata columns
# Every column that is not gene metadata holds one embedding dimension
embedding_cols = [
    col
    for col in gene_embeddings_with_ensembl_id.columns
    if col not in ('gene_name', 'ensembl_id', 'gene_type')
]

# Collapse all rows that share an Ensembl ID into their element-wise mean
merged_embeddings = (
    gene_embeddings_with_ensembl_id
    .groupby('ensembl_id', as_index=False)[embedding_cols]
    .mean()
)

# Averaging shrinks vector length, so scale every row back to unit L2 norm
embedding_values = merged_embeddings[embedding_cols].to_numpy()
norms = np.linalg.norm(embedding_values, axis=1, keepdims=True)
merged_embeddings[embedding_cols] = embedding_values / norms

# Report the table size before and after collapsing duplicates
print(f"Original shape: {gene_embeddings_with_ensembl_id.shape}")
print(f"After merging duplicates: {merged_embeddings.shape}")

# Sanity check: no Ensembl ID may occur more than once any more
duplicate_check = merged_embeddings['ensembl_id'].value_counts()
print("\nNumber of remaining duplicates:", (duplicate_check > 1).sum())

Original shape: (37220, 3075)
After merging duplicates: (36573, 3073)

Number of remaining duplicates: 0


In [19]:
# Confirm the layout after de-duplication: 'ensembl_id' followed by the
# embedding dimensions (column labels are strings per the recorded output).
merged_embeddings.columns

Index(['ensembl_id', '0', '1', '2', '3', '4', '5', '6', '7', '8',
       ...
       '3062', '3063', '3064', '3065', '3066', '3067', '3068', '3069', '3070',
       '3071'],
      dtype='object', length=3073)

In [20]:
def _decode_categorical(categories, codes):
    """Expand categorical codes into their category values (negative code -> None)."""
    categories = np.asarray(categories)
    if categories.dtype.kind in ('O', 'S'):
        # String categories are read as bytes; decode those and leave
        # anything that is already text (or another object) untouched.
        labels = np.array([
            cat.decode('utf-8') if isinstance(cat, bytes) else cat
            for cat in categories
        ])
    else:
        # Numeric / boolean categories need no decoding
        labels = categories

    codes = np.asarray(codes)
    missing = codes < 0
    if not missing.any():
        # One vectorised lookup instead of a Python-level loop per cell
        return labels[codes]

    # A negative code marks a missing value. Plain indexing would wrap around
    # and silently report the *last* category, so fill those slots with None.
    values = np.empty(codes.shape, dtype=object)
    values[~missing] = labels[codes[~missing]]
    values[missing] = None
    return values


def load_cell_metadata(file_path, start_row=0, n_rows=1000, columns=None):
    """
    Load per-cell (obs) metadata for a contiguous range of cells.

    Args:
        file_path: Path to h5ad file
        start_row: Starting row index
        n_rows: Number of rows to load
        columns: List of metadata columns to load (if None, load all)

    Returns:
        Dictionary mapping column name to a numpy array with one entry per
        loaded cell. Columns stored as plain datasets are returned as read.
        Categorical columns (a group with 'categories' and 'codes') are
        expanded to their category values; cells whose code is negative
        (missing value) are returned as None, which makes that column an
        object array. Requested columns that do not exist in obs, and groups
        that are not categorical, are skipped.
    """
    with h5py.File(file_path, 'r') as f:
        obs = f['obs']
        metadata = {}
        obs_keys = list(obs.keys()) if columns is None else columns
        row_slice = slice(start_row, start_row + n_rows)
        for key in obs_keys:
            if key not in obs:
                continue
            column_group = obs[key]
            if isinstance(column_group, h5py.Dataset):
                metadata[key] = column_group[row_slice]
            elif 'categories' in column_group and 'codes' in column_group:
                metadata[key] = _decode_categorical(
                    column_group['categories'][:],
                    column_group['codes'][row_slice],
                )
    return metadata

# Example usage:
# NOTE(review): n_rows is a hard-coded cell count, presumably the size of this
# particular dataset — confirm, or derive it from the file instead.
metadata = load_cell_metadata(file_path, start_row=0, n_rows=1136219, 
                            columns=['cell_type', 'total_counts', 'broad_cell_class'])
print("Loaded metadata keys:", list(metadata.keys()))

# Print first few values of each column
for key in metadata:
    print(f"\nFirst 5 values of {key}:")
    print(metadata[key][:5])

Loaded metadata keys: ['cell_type', 'total_counts', 'broad_cell_class']

First 5 values of cell_type:
['naive thymus-derived CD4-positive, alpha-beta T cell' 'B cell' 'B cell'
 'B cell' 'CD8-positive, alpha-beta T cell']

First 5 values of total_counts:
[648388. 404690. 579976. 496511. 453314.]

First 5 values of broad_cell_class:
['t cell' 'lymphocyte of b lineage' 'lymphocyte of b lineage'
 'lymphocyte of b lineage' 't cell']


In [21]:
import pandas as pd
# Number of loaded cells per cell_type label, most frequent first
pd.Series(metadata['cell_type']).value_counts()

B cell                                                  114495
CD4-positive, alpha-beta T cell                          97130
fibroblast                                               83338
CD8-positive, alpha-beta T cell                          76277
neutrophil                                               69539
macrophage                                               69072
stromal cell of ovary                                    35003
endothelial cell                                         33703
basal cell                                               33526
plasma cell                                              27349
bladder urothelial cell                                  27241
monocyte                                                 25753
mesenchymal stem cell                                    23499
smooth muscle cell                                       18425
classical monocyte                                       18200
luminal epithelial cell of mammary gland               

In [22]:
# Number of loaded cells per broad_cell_class label, most frequent first
pd.Series(metadata['broad_cell_class']).value_counts()

t cell                             208196
lymphocyte of b lineage            141844
myeloid leukocyte                  128294
fibroblast                         100791
granulocyte                         78199
endothelial cell                    63106
contractile cell                    56202
stem cell                           43527
stromal cell                        39919
epithelial cell                     35641
intestinal epithelial cell          34178
transitional epithelial cell        27241
duct epithelial cell                22707
glandular epithelial cell           21942
innate lymphoid cell                15202
epithelial cell of lung             14710
erythroid lineage cell              14613
endo-epithelial cell                14519
stratified epithelial cell          12323
conjunctival epithelial cell        11087
cardiac endothelial cell            10092
kidney epithelial cell               9278
hepatocyte                           7414
male germ cell                    

In [23]:
# # Load both expression data and metadata for 1000 cells
# matrix = load_subset_sparse(file_path, start_row=0, n_rows=1000)
# metadata = load_cell_metadata(file_path, start_row=0, n_rows=1000, 
#                             columns=['cell_type', 'total_counts'])

# # Get unique cell types in this subset
# unique_cell_types = np.unique(metadata['cell_type'])
# print("\nUnique cell types in subset:", unique_cell_types)

# # Calculate average expression for each cell type
# for cell_type in unique_cell_types[:3]:  # Show first 3 cell types
#     mask = metadata['cell_type'] == cell_type
#     avg_expression = matrix[mask].mean(axis=0)
#     print(f"\nAverage number of expressed genes in {cell_type}:", 
#           (avg_expression > 0).sum())


Unique cell types in subset: ['B cell' 'CD4-positive, alpha-beta T cell'
 'CD8-positive, alpha-beta T cell' 'CD8-positive, alpha-beta thymocyte'
 'T cell' 'basal cell' 'capillary endothelial cell'
 'cardiac endothelial cell' 'classical monocyte' 'endothelial cell'
 'endothelial cell of artery' 'erythrocyte' 'innate lymphoid cell'
 'intrahepatic cholangiocyte' 'macrophage' 'mature NK T cell'
 'mesenchymal stem cell' 'mesothelial cell' 'myeloid cell'
 'myeloid dendritic cell'
 'naive thymus-derived CD4-positive, alpha-beta T cell'
 'natural killer cell' 'neutrophil' 'non-classical monocyte' 'pericyte'
 'plasma cell' 'pulmonary alveolar type 2 cell'
 'regular atrial cardiac myocyte' 'regulatory T cell'
 'respiratory goblet cell' 'skeletal muscle satellite stem cell'
 'smooth muscle cell' 'tendon cell'
 'vascular associated smooth muscle cell' 'vein endothelial cell']

Average number of expressed genes in B cell: 0

Average number of expressed genes in CD4-positive, alpha-beta T cell: 1



: 

In [51]:
# NOTE(review): `embedding_c` is not defined in any visible cell and looks like a
# truncated name; the recorded output (a 1-tuple shape) must come from an
# earlier expression. Confirm the intended name or remove this scratch cell.
embedding_c

(1136218,)

In [23]:
# Print one bullet per entry of the obs (per-cell) and var (per-gene) groups.
with h5py.File(file_path, 'r') as f:
    print("Cell (obs) metadata columns:")
    print("\n".join(f"- {key}" for key in f['obs'].keys()))
    
    print("\nGene (var) metadata columns:")
    print("\n".join(f"- {key}" for key in f['var'].keys()))

Cell (obs) metadata columns:
- 10X_run
- _index
- _scvi_batch
- _scvi_labels
- ambient_removal
- anatomical_position
- assay
- assay_ontology_term_id
- broad_cell_class
- cdna_plate
- cdna_well
- cell_type
- cell_type_ontology_term_id
- compartment
- development_stage
- development_stage_ontology_term_id
- disease
- disease_ontology_term_id
- donor_assay
- donor_id
- donor_method
- donor_tissue
- donor_tissue_assay
- ethnicity_original
- free_annotation
- is_primary_data
- library_plate
- manually_annotated
- method
- n_genes_by_counts
- notes
- observation_joinid
- organism
- organism_ontology_term_id
- pct_counts_ercc
- pct_counts_mt
- published_2022
- replicate
- sample_id
- sample_number
- scvi_leiden_donorassay_full
- self_reported_ethnicity
- self_reported_ethnicity_ontology_term_id
- sex
- sex_ontology_term_id
- suspension_type
- tissue
- tissue_in_publication
- tissue_ontology_term_id
- tissue_type
- total_counts
- total_counts_ercc
- total_counts_mt

Gene (var) metadata column

In [24]:
# NOTE(review): raises NameError on a fresh kernel — the only assignment to
# `matrix` sits in a cell that is now fully commented out. Possibly
# `matrix_subset` was meant; confirm or remove this cell.
matrix

NameError: name 'matrix' is not defined