# TCGA GBM scRNA-seq Create h5ad file

This notebook reads in scRNA-seq data from a GDC `.tar.gz` download and prepares it for downstream analysis.

In [2]:
import os
import tarfile
import glob
import scanpy as sc
import anndata as ad
import pandas as pd
import loompy
import numpy as np
import h5py

sc.settings.verbosity = 3
sc.logging.print_header()

  and (v := getattr(pkg, "__version__", None))
  sc.logging.print_header()


Package,Version
Component,Info
scanpy,1.11.1
anndata,0.11.4
pandas,2.2.3
loompy,3.0.8
numpy,2.2.6
h5py,3.13.0
Python,"3.12.9 | packaged by Anaconda, Inc. | (main, Feb 6 2025, 12:55:12) [Clang 14.0.6 ]"
OS,macOS-26.3-arm64-arm-64bit
CPU,"14 logical CPU cores, arm"
GPU,No GPU found

Dependency,Version
jupyter_client,8.6.3
stack_data,0.6.3
Pygments,2.19.1
parso,0.8.4
matplotlib,3.10.3
tqdm,4.67.1
natsort,8.4.0
numpy-groupies,0.11.3
toolz,1.0.0
typing_extensions,4.15.0


## 1. Define Paths

In [None]:
# Path to the .tar.gz file on Google Drive
TARBALL_PATH = (
    "{path_of_saved_TCGA_files}"
)

# Local directory to extract contents into
EXTRACT_DIR = os.path.join(os.getcwd(), "data", "gdc_extract")
os.makedirs(EXTRACT_DIR, exist_ok=True)

print(f"Tarball path: {TARBALL_PATH}")
print(f"Extract dir:  {EXTRACT_DIR}")
print(f"Tarball exists: {os.path.exists(TARBALL_PATH)}")
print(f"Tarball size:  {os.path.getsize(TARBALL_PATH) / 1e9:.2f} GB")

Tarball path: /Users/jarrettevans/Library/CloudStorage/GoogleDrive-jevans2532@gmail.com/My Drive/Biomedical Data Science/Projects/TCGA_GBM_HD5_scRNA/gdc_download_20260222_212327.487958.tar.gz
Extract dir:  /Users/jarrettevans/Documents/Biomedical Data Science/TCGA_GBM_scRNA-seq/data/gdc_extract
Tarball exists: False


FileNotFoundError: [Errno 2] No such file or directory: '/Users/jarrettevans/Library/CloudStorage/GoogleDrive-jevans2532@gmail.com/My Drive/Biomedical Data Science/Projects/TCGA_GBM_HD5_scRNA/gdc_download_20260222_212327.487958.tar.gz'

## 2. Inspect the Tarball Contents

Before extracting, let's list the files inside the archive to understand its structure.

In [4]:
# List the contents of the tarball without extracting
with tarfile.open(TARBALL_PATH, "r:gz") as tar:
    members = tar.getmembers()
    print(f"Total files in archive: {len(members)}\n")
    for m in members:
        size_mb = m.size / 1e6
        print(f"  {m.name}  ({size_mb:.2f} MB)")

Total files in archive: 18

  MANIFEST.txt  (0.00 MB)
  06820e2c-9eb7-4e71-a1c3-976d561e659d/28433d89-2612-4c04-8a5d-9d43a9fe9fb4.seurat.1000x1000.loom  (65.30 MB)
  08418175-0464-4b33-86e3-416c30189bd2/cf280871-b2eb-4a63-9a3b-2dfd4f9b6972.seurat.1000x1000.loom  (50.35 MB)
  18d92af9-a7d0-47f2-84e2-6f4cc0dee298/122044eb-4d84-4d1e-b80e-8f3ecd688d7b.seurat.1000x1000.loom  (111.67 MB)
  20e86156-cdd7-4bea-8b17-2e630720df44/8cd63888-410b-4d09-9c8a-c82482a8ccc7.seurat.1000x1000.loom  (246.96 MB)
  21521784-3a62-4989-bd29-0f920e1eaa2b/ac9a3b04-a1df-4230-8057-b29e0f10f321.seurat.1000x1000.loom  (218.74 MB)
  44dcf3b7-977a-410f-8901-412697377924/9c78ef6c-a5e6-4466-b756-2ca6415a8115.seurat.1000x1000.loom  (218.54 MB)
  4a0d48cd-1c46-4d59-b4b0-ae842bf1323c/e07b97ed-9c67-4f9c-b5c8-7fc35c67dc4f.seurat.1000x1000.loom  (62.91 MB)
  5212fe36-192c-44a3-88f4-4c6e696b583c/627f8f16-34c0-4051-86e1-9cd438a370ca.seurat.1000x1000.loom  (48.56 MB)
  7eac574d-04fa-435d-8b4d-35da0100ecb9/55e91447-1779-4234-896a

## 3. Extract the Tarball

In [9]:
# Extract all files to the local extract directory
with tarfile.open(TARBALL_PATH, "r:gz") as tar:
    tar.extractall(path=EXTRACT_DIR)

print("Extraction complete!")
print(f"\nExtracted contents:")
for root, dirs, files in os.walk(EXTRACT_DIR):
    level = root.replace(EXTRACT_DIR, "").count(os.sep)
    indent = "  " * level
    print(f"{indent}{os.path.basename(root)}/")
    sub_indent = "  " * (level + 1)
    for f in files:
        fpath = os.path.join(root, f)
        size_mb = os.path.getsize(fpath) / 1e6
        print(f"{sub_indent}{f}  ({size_mb:.2f} MB)")

  tar.extractall(path=EXTRACT_DIR)


Extraction complete!

Extracted contents:
gdc_extract/
  MANIFEST.txt  (0.00 MB)
  9079231e-6906-4548-9f4f-684a86ed9b16/
    d3edc3b1-9f4c-4ff7-a6a4-3e712d4230a8.seurat.1000x1000.loom  (83.10 MB)
  21521784-3a62-4989-bd29-0f920e1eaa2b/
    ac9a3b04-a1df-4230-8057-b29e0f10f321.seurat.1000x1000.loom  (218.74 MB)
  903b7af4-a936-4c0e-8cff-87085c3faa31/
    2ccc7858-b1d6-4c62-9831-03c5f915e539.seurat.1000x1000.loom  (241.50 MB)
  f8b54be7-f58e-4881-9147-40df9cef487b/
    f5316799-2f79-47a2-ac45-0d0742055d67.seurat.1000x1000.loom  (155.20 MB)
  06820e2c-9eb7-4e71-a1c3-976d561e659d/
    28433d89-2612-4c04-8a5d-9d43a9fe9fb4.seurat.1000x1000.loom  (65.30 MB)
  c8ea9c15-368d-460c-9775-5037a5f1790a/
    f90da4a6-d318-4563-b5bc-7316e7ba9a00.seurat.1000x1000.loom  (108.80 MB)
  8c0685b3-521e-45a6-9677-f8e3f186e09b/
    1ad609b5-8c9f-4741-8a89-df03b6808b6d.seurat.1000x1000.loom  (180.39 MB)
  5212fe36-192c-44a3-88f4-4c6e696b583c/
    627f8f16-34c0-4051-86e1-9cd438a370ca.seurat.1000x1000.loom  (48.5

## 4. Load the Data

GDC scRNA-seq downloads typically contain `.h5` (10x HDF5), `.h5ad`, or `.loom` files. The cell below auto-detects the file type and loads accordingly.

In [5]:
# Discover all relevant data files
h5_files = glob.glob(os.path.join(EXTRACT_DIR, "**", "*.h5"), recursive=True)
h5ad_files = glob.glob(os.path.join(EXTRACT_DIR, "**", "*.h5ad"), recursive=True)
loom_files = glob.glob(os.path.join(EXTRACT_DIR, "**", "*.loom"), recursive=True)
mtx_dirs = glob.glob(os.path.join(EXTRACT_DIR, "**", "matrix.mtx*"), recursive=True)
tsv_files = glob.glob(os.path.join(EXTRACT_DIR, "**", "*.tsv*"), recursive=True)

print(f"Found {len(h5_files)} .h5 files")
print(f"Found {len(h5ad_files)} .h5ad files")
print(f"Found {len(loom_files)} .loom files")
print(f"Found {len(mtx_dirs)} matrix.mtx files")
print(f"Found {len(tsv_files)} .tsv files")

for f in h5_files:
    print(f"  .h5:   {f}")
for f in h5ad_files:
    print(f"  .h5ad: {f}")
for f in loom_files:
    print(f"  .loom: {f}")

Found 0 .h5 files
Found 0 .h5ad files
Found 17 .loom files
Found 0 matrix.mtx files
Found 0 .tsv files
  .loom: /Users/jarrettevans/Documents/Biomedical Data Science/TCGA_GBM_scRNA-seq/data/gdc_extract/9079231e-6906-4548-9f4f-684a86ed9b16/d3edc3b1-9f4c-4ff7-a6a4-3e712d4230a8.seurat.1000x1000.loom
  .loom: /Users/jarrettevans/Documents/Biomedical Data Science/TCGA_GBM_scRNA-seq/data/gdc_extract/21521784-3a62-4989-bd29-0f920e1eaa2b/ac9a3b04-a1df-4230-8057-b29e0f10f321.seurat.1000x1000.loom
  .loom: /Users/jarrettevans/Documents/Biomedical Data Science/TCGA_GBM_scRNA-seq/data/gdc_extract/903b7af4-a936-4c0e-8cff-87085c3faa31/2ccc7858-b1d6-4c62-9831-03c5f915e539.seurat.1000x1000.loom
  .loom: /Users/jarrettevans/Documents/Biomedical Data Science/TCGA_GBM_scRNA-seq/data/gdc_extract/f8b54be7-f58e-4881-9147-40df9cef487b/f5316799-2f79-47a2-ac45-0d0742055d67.seurat.1000x1000.loom
  .loom: /Users/jarrettevans/Documents/Biomedical Data Science/TCGA_GBM_scRNA-seq/data/gdc_extract/06820e2c-9eb7-4e71

In [6]:
# Load the data based on detected file type
adata_list = []

if h5ad_files:
    # Load .h5ad files directly
    for f in h5ad_files:
        print(f"Loading h5ad: {os.path.basename(f)}")
        adata_list.append(sc.read_h5ad(f))

elif h5_files:
    # Load 10x HDF5 files
    for f in h5_files:
        print(f"Loading 10x h5: {os.path.basename(f)}")
        try:
            adata_list.append(sc.read_10x_h5(f))
        except Exception as e:
            print(f"  Could not read as 10x h5, trying generic h5...")
            # Inspect the HDF5 structure for non-standard formats
            with h5py.File(f, "r") as h5f:
                print(f"  HDF5 keys: {list(h5f.keys())}")
                def print_h5_structure(name, obj):
                    print(f"    {name}: {type(obj).__name__}")
                h5f.visititems(print_h5_structure)

elif loom_files:
    # Load .loom files
    for f in loom_files:
        print(f"Loading loom: {os.path.basename(f)}")
        adata_list.append(sc.read_loom(f))

elif mtx_dirs:
    # Load from Market Matrix format (10x style)
    for mtx_file in mtx_dirs:
        mtx_parent = os.path.dirname(mtx_file)
        print(f"Loading 10x mtx from: {mtx_parent}")
        adata_list.append(sc.read_10x_mtx(mtx_parent))

else:
    print("No recognized data files found. Please inspect the extracted directory above.")

print(f"\nLoaded {len(adata_list)} AnnData object(s)")

Loading loom: d3edc3b1-9f4c-4ff7-a6a4-3e712d4230a8.seurat.1000x1000.loom
Loading loom: ac9a3b04-a1df-4230-8057-b29e0f10f321.seurat.1000x1000.loom
Loading loom: 2ccc7858-b1d6-4c62-9831-03c5f915e539.seurat.1000x1000.loom
Loading loom: f5316799-2f79-47a2-ac45-0d0742055d67.seurat.1000x1000.loom
Loading loom: 28433d89-2612-4c04-8a5d-9d43a9fe9fb4.seurat.1000x1000.loom
Loading loom: f90da4a6-d318-4563-b5bc-7316e7ba9a00.seurat.1000x1000.loom
Loading loom: 1ad609b5-8c9f-4741-8a89-df03b6808b6d.seurat.1000x1000.loom
Loading loom: 627f8f16-34c0-4051-86e1-9cd438a370ca.seurat.1000x1000.loom
Loading loom: 122044eb-4d84-4d1e-b80e-8f3ecd688d7b.seurat.1000x1000.loom
Loading loom: e07b97ed-9c67-4f9c-b5c8-7fc35c67dc4f.seurat.1000x1000.loom
Loading loom: a1a4024e-4730-4060-bdac-9f20e35f32f3.seurat.1000x1000.loom
Loading loom: 968424fb-d1db-480a-b6c9-09d1e599c8bf.seurat.1000x1000.loom
Loading loom: 9c78ef6c-a5e6-4466-b756-2ca6415a8115.seurat.1000x1000.loom
Loading loom: f9b4fd54-d06d-4ccd-ad0c-7bd009dac9bf.

In [7]:
# If multiple AnnData objects, concatenate; otherwise use the single one
if len(adata_list) == 1:
    adata = adata_list[0]
elif len(adata_list) > 1:
    print("Concatenating multiple AnnData objects...")
    adata = ad.concat(adata_list, join="outer", label="batch")
    adata.obs_names_make_unique()
else:
    raise ValueError("No data was loaded. Check the extracted files above.")

print(f"\nAnnData object:")
print(adata)
print(f"\nShape: {adata.shape[0]} cells × {adata.shape[1]} genes")
print(f"\nobs columns: {list(adata.obs.columns)}")
print(f"var columns: {list(adata.var.columns)}")

Concatenating multiple AnnData objects...

AnnData object:
AnnData object with n_obs × n_vars = 184495 × 39853
    obs: 'SCT_snn_res.0.5', 'nCount_RNA', 'nCount_SCT', 'nFeature_RNA', 'nFeature_SCT', 'orig.ident', 'seurat_clusters', 'batch'
    layers: 'counts'

Shape: 184495 cells × 39853 genes

obs columns: ['SCT_snn_res.0.5', 'nCount_RNA', 'nCount_SCT', 'nFeature_RNA', 'nFeature_SCT', 'orig.ident', 'seurat_clusters', 'batch']
var columns: []


  utils.warn_names_duplicates("obs")


In [8]:
# Quick look at the data
adata.obs.head()

Unnamed: 0_level_0,SCT_snn_res.0.5,nCount_RNA,nCount_SCT,nFeature_RNA,nFeature_SCT,orig.ident,seurat_clusters,batch
CellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AAACCCAAGATGTTCC-1,2,1687.0,1669.0,1222,1221,1,2,0
AAACCCAAGCCACTCG-1,1,5401.0,1520.0,2297,959,1,1,0
AAACCCAAGGTGTGAC-1,3,2796.0,2147.0,1911,1860,1,3,0
AAACCCACAGGATTCT-1,7,3297.0,2176.0,2057,1803,1,7,0
AAACCCAGTCCAGCCA-1,4,2106.0,1918.0,1468,1467,1,4,0


In [9]:
adata.var.head()

ENSG00000000003.15
ENSG00000000005.6
ENSG00000000419.13
ENSG00000000457.14
ENSG00000000460.17


In [10]:
adata.write("gbm_data.h5ad", compression="gzip")

In [None]:
# Read the file back into an AnnData object
adata = ad.read_h5ad("gbm_data.h5ad")

# Check the object to make sure it loaded correctly
print(adata)