In [6]:
import h5py
import anndata as ad
import pandas as pd
import numpy as np

# Path to the .h5ad file to inspect; being relative, it is resolved against
# the notebook's current working directory. Change it to point at another file.
file_path = '../star/mtx_conversions/combined_filtered_matrix.h5ad'
def print_attrs(name, obj):
    """Print ``name: shape dtype`` for an HDF5 dataset; ignore anything else.

    Used below as the callback passed to ``visititems``. It always returns
    ``None``.

    Args:
        name: Name under which the object was visited.
        obj: The visited object; only ``h5py.Dataset`` instances are printed.
    """
    if not isinstance(obj, h5py.Dataset):
        return
    print(f"{name}: {obj.shape} {obj.dtype}")

# Summarise the on-disk layout of the .h5ad file (opened read-only):
#   1. names of the root members,
#   2. shape/dtype of what is stored under 'X',
#   3. shape/dtype of every dataset under 'obs' and 'var',
#   4. the first 10 values of each dataset sitting directly under 'obs' / 'var'.
with h5py.File(file_path, 'r') as f:
    # Print the root groups
    print("Root groups:")
    for group in f.keys():
        print(group)

    # Explore 'X'. In the file inspected here it is a group holding the
    # data / indices / indptr arrays, but 'X' may also be a single
    # dataset (dense storage), which has no .keys(); handle both so the cell
    # does not crash with AttributeError on such files.
    if 'X' in f:
        x_node = f['X']
        if isinstance(x_node, h5py.Group):
            print("\nDatasets in group 'X':")
            for dataset in x_node.keys():
                print(dataset)
                print(x_node[dataset].shape)
                print(x_node[dataset].dtype)
        else:
            print("\nDataset 'X':")
            print(x_node.shape)
            print(x_node.dtype)

    # Shape and dtype of every dataset below 'obs' and 'var'; visititems also
    # descends into sub-groups such as '__categories'.
    for group_name in ('obs', 'var'):
        if group_name in f:
            print(f"\nDatasets in group '{group_name}':")
            f[group_name].visititems(print_attrs)

    # Read the datasets that are direct children of 'obs' / 'var' into memory
    # (sub-groups are skipped) and preview them.
    group_data = {}
    for group_name in ('obs', 'var'):
        if group_name not in f:
            continue
        group_data[group_name] = {
            name: node[:]
            for name, node in f[group_name].items()
            if isinstance(node, h5py.Dataset)
        }
        print(f"\nData in '{group_name}' group:")
        for key, value in group_data[group_name].items():
            print(f"{key}: {value[:10]}")  # Print the first 10 elements

# Keep the variable names this cell has always exposed to later cells.
obs_data = group_data.get('obs', {})
var_data = group_data.get('var', {})

Root groups:
X
obs
var

Datasets in group 'X':
data
(15985,)
float32
indices
(15985,)
int32
indptr
(4,)
int32

Datasets in group 'obs':
__categories/fastq_1: (1,) object
__categories/fastq_2: (1,) object
_index: (3,) object
fastq_1: (3,) int8
fastq_2: (3,) int8
sample: (3,) object

Datasets in group 'var':
__categories/gene_symbol: (29683,) object
_index: (29744,) object
gene_symbol: (29744,) int16

Data in 'obs' group:
_index: [b'sample3_sample3_filtered' b'sample2_sample2_filtered'
 b'sample1_sample1_filtered']
fastq_1: [0 0 0]
fastq_2: [0 0 0]
sample: [b'sample3_filtered' b'sample2_filtered' b'sample1_filtered']

Data in 'var' group:
_index: [b'DDX11L1' b'WASH7P' b'MIR6859-1' b'MIR1302-2' b'FAM138A' b'OR4F5'
 b'LOC101927589' b'LOC729737' b'LOC100996442' b'LOC102723897']
gene_symbol: [ 4842 28533 17727 16753  6320 19526 12228 11192 11728 14151]


In [10]:
# Alternative input: '../star/mtx_conversions/combined_filtered_matrix.h5ad'
file_path = 'filtered_ms_adata.h5ad'

# Load the .h5ad file
adata = ad.read_h5ad(file_path)

# Convert the matrix to a pandas DataFrame (one row per observation, one
# column per variable).
# NOTE(review): the displayed values are non-integer (e.g. 2.573534), so X
# looks normalised rather than raw counts — confirm before calling them counts.
count_matrix = adata.to_df()

# Count the zero entries once; every comparison scans the whole frame, so the
# result is stored instead of being recomputed.
num_zeros = (count_matrix == 0).sum().sum()

# Total number of elements (rows * columns)
total_elements = count_matrix.size

print(f"Number of zeros in the count matrix: {num_zeros}")
print(f"Total number of elements in the count matrix: {total_elements}")
print(f"Percentage of zeros in the count matrix: {num_zeros / total_elements * 100:.2f}%")

# Last expression: rich (truncated) display of the frame
count_matrix

Number of zeros in the count matrix: 35834553
Total number of elements in the count matrix: 40404000
Percentage of zeros in the count matrix: 88.69%


Unnamed: 0,ENSG00000000971,ENSG00000002330,ENSG00000002549,ENSG00000002586,ENSG00000002745,ENSG00000003096,ENSG00000003137,ENSG00000003249,ENSG00000003989,ENSG00000004059,...,ENSG00000276432,ENSG00000276644,ENSG00000277443,ENSG00000277494,ENSG00000277586,ENSG00000277633,ENSG00000277656,ENSG00000277734,ENSG00000277883,ENSG00000279228
SRR9123033-AAACCTGAGCTAGCCC,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,2.573534,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0
SRR9123033-AAACCTGCAGACGCCT,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,2.619293,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0
SRR9123033-AAAGATGGTAGCGTAG,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0
SRR9123033-AAAGATGGTGCCTGGT,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,2.488792,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0
SRR9123033-AAAGTAGCACCAGGTC,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,2.380725,0.000000,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR9123052-TTTGGTTCATCCGGGT,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,1.137901,...,0.0,0.0,0.000000,0.000000,1.137901,0.0,1.137901,0.0,0.0,0.0
SRR9123052-TTTGGTTCATCGATGT,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.000000,1.410930,0.0,0.000000,0.0,0.0,0.0
SRR9123052-TTTGGTTGTCAACTGT,1.529693,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.585284,...,0.0,0.0,0.000000,0.000000,1.430790,0.0,0.000000,0.0,0.0,0.0
SRR9123052-TTTGTCAAGGCTACGA,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.0,1.488002,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0


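# Load h5ad data using scanpy
- We set the path of the h5ad file in the `file_path` variable.
- We inspect three main annotation attributes of the loaded object:
    - `obs` (dataframe)
    - `var` (dataframe)
    - `obsm` (a keyed collection of arrays rather than a dataframe)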

In [41]:
import scanpy as sc

# Input file for this section. Alternative inputs that can be swapped in:
#   'filtered_ms_adata.h5ad'
#   'Kim2020_Lung.h5ad'
file_path = '../star/mtx_conversions/combined_filtered_matrix.h5ad'

# Read the file from disk with scanpy
adata = sc.read_h5ad(file_path)

## List all columns (and `obsm` keys) contained in these dataframes

In [42]:

# Column names of the obs and var dataframes, as plain Python lists
obs_columns = adata.obs.columns.tolist()
var_columns = adata.var.columns.tolist()

# Keys currently stored in obsm
obsm_keys = adata.obsm.keys()

# Each report is a header line followed by the value on its own line
print("Columns in obs:", obs_columns, sep="\n")
print("\nColumns in var:", var_columns, sep="\n")
print("\nKeys in obsm:", obsm_keys, sep="\n")

Columns in obs:
['sample', 'fastq_1', 'fastq_2', 'celltype']

Columns in var:
['gene_symbol']

Keys in obsm:
KeysView(AxisArrays with keys: )


## Add a new column to a dataframe using Scanpy
- We add a placeholder column `celltype` (filled with NaN) to the `obs` dataframe and save the result back to the h5ad file.

In [40]:
import scanpy as sc
import pandas as pd
import numpy as np

# Load the .h5ad file
file_path = '../star/mtx_conversions/combined_filtered_matrix.h5ad'
adata = sc.read_h5ad(file_path)

# This cell writes back to the very file it reads, so it has to be safe to
# re-run: add the all-NaN placeholder only when 'celltype' is missing.
# Assigning NaN unconditionally would erase any cell-type labels that were
# stored in the file after an earlier run.
if 'celltype' not in adata.obs.columns:
    # Scalar assignment broadcasts NaN to every observation
    adata.obs['celltype'] = np.nan
    adata.write(file_path)
else:
    print(f"'celltype' already present in obs; {file_path} left unchanged")