In [4]:
import pandas as pd
import pyarrow.dataset as ds
import gcsfs
import os
import subprocess

# Initialize GCS file system
fs = gcsfs.GCSFileSystem()

# GCS base path
gcs_base_path = "gs://arc-ctc-scbasecamp/2025-02-25/"

# STARsolo feature type
feature_type = "GeneFull_Ex50pAS"

# Metadata path
metadata_path = os.path.join(gcs_base_path, "metadata", feature_type)

# Get the sample metadata file path for Homo sapiens
sample_metadata_path = os.path.join(metadata_path, "Homo_sapiens", "sample_metadata.parquet")

# Load the metadata
sample_metadata = ds.dataset(sample_metadata_path, filesystem=fs, format="parquet").to_table().to_pandas()

# Filter for A549 cell line
a549_samples = sample_metadata[sample_metadata["cell_line"].str.contains("A549", na=False)]

# Print the results
print(f"Found {len(a549_samples)} samples with A549 cell line:")
print(a549_samples[["srx_accession", "tissue", "disease", "perturbation"]])

Found 34 samples with A549 cell line:
      srx_accession              tissue  \
531      ERX8792190               other   
2708    SRX21897869               other   
2854    SRX25289894                lung   
3361    SRX17488180                 eye   
3466    SRX17915869               other   
3638    SRX19004457               ovary   
4899    SRX24227811     endocrine gland   
5137    SRX25289882               other   
5253    SRX26771412               other   
7881    SRX17150748                lung   
7882    SRX17150749                lung   
7883    SRX17150750                lung   
7884    SRX17150747                lung   
8473    SRX17915870               other   
8500    SRX17941758                lung   
8505    SRX17941757                lung   
9522    SRX19215444                lung   
9526    SRX19215443                lung   
12529   SRX21897873                lung   
12533   SRX21897872               other   
12736   SRX22159982                lung   
15533   SRX25289

In [5]:
a549_samples.to_csv("a549_samples.csv", index=False)


In [6]:
a549_samples


Unnamed: 0,entrez_id,srx_accession,file_path,obs_count,lib_prep,tech_10x,cell_prep,organism,tissue,disease,perturbation,cell_line,czi_collection_id,czi_collection_name
531,21270666,ERX8792190,gs://arc-ctc-scbasecamp/2025-02-25/h5ad/GeneFu...,1634,10x_Genomics,vdj,single_cell,Homo sapiens,other,lung adenocarcinoma,"Ritonavir, gemcitabine, cisplatin","A549, H522, H23, H838",62ef75e4-cbea-454e-a0ce-998ec40223d3,Cross-tissue immune cell analysis reveals tiss...
2708,29793049,SRX21897869,gs://arc-ctc-scbasecamp/2025-02-25/h5ad/GeneFu...,13259,10x_Genomics,3_prime_gex,single_cell,Homo sapiens,other,lung adenocarcinoma,"CAR T cell therapy, SUV39H1 knockout, CD19 tum...","CAR-T cells, A549",,
2854,33632199,SRX25289894,gs://arc-ctc-scbasecamp/2025-02-25/h5ad/GeneFu...,4372,10x_Genomics,3_prime_gex,single_cell,Homo sapiens,lung,prostate cancer,uninfected (mock treatment),A549,,
3361,24323208,SRX17488180,gs://arc-ctc-scbasecamp/2025-02-25/h5ad/GeneFu...,8182,10x_Genomics,3_prime_gex,single_cell,Homo sapiens,eye,age-related macular degeneration,in vitro culture of iPSC-RPE,"iPSC-RPE, A549",,
3466,24854651,SRX17915869,gs://arc-ctc-scbasecamp/2025-02-25/h5ad/GeneFu...,2663,10x_Genomics,3_prime_gex,single_cell,Homo sapiens,other,Influenza A virus (A/WSN/1933(H1N1)),NS1 4xstop (mutant),A549 and MDCK-SIAT1,,
3638,26154125,SRX19004457,gs://arc-ctc-scbasecamp/2025-02-25/h5ad/GeneFu...,5529,10x_Genomics,3_prime_gex,single_cell,Homo sapiens,ovary,ovarian cancer,Bexmarilimab,A549 cells,,
4899,32539833,SRX24227811,gs://arc-ctc-scbasecamp/2025-02-25/h5ad/GeneFu...,38909,10x_Genomics,3_prime_gex,single_cell,Homo sapiens,endocrine gland,PitNET (Pituitary Neuroendocrine Tumor),ACME HS dissociation,"A549, RAW264.7",,
5137,33632187,SRX25289882,gs://arc-ctc-scbasecamp/2025-02-25/h5ad/GeneFu...,4361,10x_Genomics,3_prime_gex,single_cell,Homo sapiens,other,infected with H1N1 (A/California/07/2009),8 hours post infection,A549,,
5253,36177696,SRX26771412,gs://arc-ctc-scbasecamp/2025-02-25/h5ad/GeneFu...,11968,10x_Genomics,other,single_cell,Homo sapiens,other,"cancer types such as melanoma, pancreatic canc...","mixed sample treatments include DMSO, ARS-1620...","mixed sample (A375, HTT114, SKMEL2, MiaPaca-2,...",,
7881,23953746,SRX17150748,gs://arc-ctc-scbasecamp/2025-02-25/h5ad/GeneFu...,12222,10x_Genomics,3_prime_gex,single_cell,Homo sapiens,lung,duodenal gastrointestinal stromal tumors,irradiated A549 cells (6 Gy γ-ray treatment),A549,,


In [7]:


# Create a directory to save the files
output_dir = "a549_data"
os.makedirs(output_dir, exist_ok=True)

# Download the files using gsutil
for i, row in a549_samples.iterrows():
    file_path = row["file_path"]
    output_file = os.path.join(output_dir, os.path.basename(file_path))
    
    print(f"Downloading {file_path} to {output_file}...")
    
    # Using subprocess to call gsutil
    fs.get(file_path, output_file)

print(f"All A549 h5ad files downloaded to {output_dir}")

Downloading gs://arc-ctc-scbasecamp/2025-02-25/h5ad/GeneFull_Ex50pAS/Homo_sapiens/ERX8792190.h5ad to a549_data/ERX8792190.h5ad...
Downloading gs://arc-ctc-scbasecamp/2025-02-25/h5ad/GeneFull_Ex50pAS/Homo_sapiens/SRX21897869.h5ad to a549_data/SRX21897869.h5ad...
Downloading gs://arc-ctc-scbasecamp/2025-02-25/h5ad/GeneFull_Ex50pAS/Homo_sapiens/SRX25289894.h5ad to a549_data/SRX25289894.h5ad...
Downloading gs://arc-ctc-scbasecamp/2025-02-25/h5ad/GeneFull_Ex50pAS/Homo_sapiens/SRX17488180.h5ad to a549_data/SRX17488180.h5ad...
Downloading gs://arc-ctc-scbasecamp/2025-02-25/h5ad/GeneFull_Ex50pAS/Homo_sapiens/SRX17915869.h5ad to a549_data/SRX17915869.h5ad...
Downloading gs://arc-ctc-scbasecamp/2025-02-25/h5ad/GeneFull_Ex50pAS/Homo_sapiens/SRX19004457.h5ad to a549_data/SRX19004457.h5ad...
Downloading gs://arc-ctc-scbasecamp/2025-02-25/h5ad/GeneFull_Ex50pAS/Homo_sapiens/SRX24227811.h5ad to a549_data/SRX24227811.h5ad...
Downloading gs://arc-ctc-scbasecamp/2025-02-25/h5ad/GeneFull_Ex50pAS/Homo_sapi

In [6]:
# Read the specific SRX26771412.h5ad file
import scanpy as sc
file_path = os.path.join("a549_data/SRX26771412.h5ad")
adata_srx26771412 = sc.read_h5ad(file_path)

In [10]:
adata_srx26771412

AnnData object with n_obs × n_vars = 11968 × 36601
    obs: 'gene_count', 'umi_count', 'SRX_accession'
    var: 'gene_symbols', 'feature_types'

In [8]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import anndata

# Directory containing the downloaded A549 files
data_dir = "a549_data"

# List all h5ad files in the directory
a549_files = [file for file in os.listdir(data_dir) if file.endswith('.h5ad')]
print(f"Found {len(a549_files)} A549 h5ad files")

# Load all datasets
adatas = []
for file in a549_files:
    file_path = os.path.join(data_dir, file)
    print(f"Loading {file}...")
    try:
        adata = sc.read_h5ad(file_path)
        # Extract accession ID from filename
        accession = file.replace('.h5ad', '')
        adata.obs['sample_id'] = accession
        adatas.append(adata)
    except Exception as e:
        print(f"Error loading {file}: {e}")


# Combine the data
print("Combining data...")
combined = adatas[0].concatenate(adatas[1:], join='outer', index_unique='-')
print(f"Combined data shape: {combined.shape}")

# Basic preprocessing
sc.pp.filter_cells(combined, min_genes=200)
sc.pp.filter_genes(combined, min_cells=10)

Found 34 A549 h5ad files
Loading SRX17915870.h5ad...
Loading SRX17150748.h5ad...
Loading SRX25289884.h5ad...
Loading SRX21897873.h5ad...
Loading SRX25289889.h5ad...
Loading SRX25289882.h5ad...
Loading SRX17150747.h5ad...
Loading SRX25289890.h5ad...
Loading ERX8792190.h5ad...
Loading SRX19215443.h5ad...
Loading SRX25289894.h5ad...
Loading SRX17915869.h5ad...
Loading SRX25289893.h5ad...
Loading SRX17488180.h5ad...
Loading SRX26771412.h5ad...
Loading SRX17150749.h5ad...
Loading SRX25289891.h5ad...
Loading SRX21897869.h5ad...
Loading SRX19215444.h5ad...
Loading SRX22159982.h5ad...
Loading SRX17150750.h5ad...
Loading SRX25289892.h5ad...
Loading SRX25289886.h5ad...
Loading SRX25289887.h5ad...
Loading SRX17941758.h5ad...
Loading SRX24227811.h5ad...
Loading SRX21897872.h5ad...
Loading SRX17941757.h5ad...
Loading SRX25289881.h5ad...
Loading SRX25289879.h5ad...
Loading SRX25289885.h5ad...
Loading SRX25289888.h5ad...
Loading SRX25289880.h5ad...
Loading SRX19004457.h5ad...
Combining data...


  combined = adatas[0].concatenate(adatas[1:], join='outer', index_unique='-')


Combined data shape: (252338, 36601)


In [9]:
# Extract metadata from sample_id and add to combined.obs
a549_samples = pd.read_csv("a549_samples.csv")

# Create a mapping from SRX accession to metadata columns
metadata_mapping = a549_samples.set_index('srx_accession')

# Add metadata columns to combined.obs
for col in ['tissue', 'disease', 'perturbation', 'cell_line']:
    if col in metadata_mapping.columns:
        combined.obs[col] = combined.obs['sample_id'].map(
            metadata_mapping[col].to_dict()
        )

# Display the combined object
combined

AnnData object with n_obs × n_vars = 246262 × 33388
    obs: 'gene_count', 'umi_count', 'SRX_accession', 'sample_id', 'batch', 'n_genes', 'tissue', 'disease', 'perturbation', 'cell_line'
    var: 'gene_symbols', 'feature_types', 'n_cells'

In [10]:
output_file = "a549_combined_data.h5ad"
combined.write_h5ad(output_file)


In [14]:
combined


AnnData object with n_obs × n_vars = 246262 × 33388
    obs: 'gene_count', 'umi_count', 'SRX_accession', 'sample_id', 'batch', 'n_genes', 'tissue', 'disease', 'perturbation', 'cell_line'
    var: 'gene_symbols', 'feature_types', 'n_cells'

In [6]:
metadata = []
output_dir = "a549_combined_data"
os.makedirs(output_dir, exist_ok=True)


# List all h5ad files in the directory
a549_files = [file for file in os.listdir(data_dir) if file.endswith('.h5ad')]
print(f"Found {len(a549_files)} A549 h5ad files")

# Save basic metadata for each file
for file in a549_files:
    file_path = os.path.join(data_dir, file)
    try:
        # Load basic info without reading full data
        adata = sc.read_h5ad(file_path, backed='r')
        
        metadata.append({
            'filename': file,
            'filepath': file_path,
            'n_obs': adata.n_obs,
            'n_vars': adata.n_vars,
            'obs_keys': list(adata.obs.keys()),
            'var_keys': list(adata.var.keys()),
        })
        
        print(f"Processed metadata for {file}")
        
    except Exception as e:
        print(f"Error processing {file}: {e}")

# Save metadata as CSV
metadata_df = pd.DataFrame(metadata)
metadata_df.to_csv(os.path.join(output_dir, "a549_files_metadata.csv"), index=False)
print(f"Saved metadata for {len(metadata_df)} files")

# Create a readme file with loading instructions
with open(os.path.join(output_dir, "README.txt"), 'w') as f:
    f.write("A549 Cell Line Data Collection\n")
    f.write("=============================\n\n")
    f.write(f"This directory contains metadata for {len(metadata_df)} A549 cell line h5ad files.\n\n")
    f.write("To recreate the combined dataset, use the following Python code:\n\n")
    f.write("```python\n")
    f.write("import scanpy as sc\n")
    f.write("import pandas as pd\n")
    f.write("import os\n\n")
    f.write("# Load the metadata\n")
    f.write("metadata = pd.read_csv('a549_files_metadata.csv')\n\n")
    f.write("# Load all datasets\n")
    f.write("adatas = []\n")
    f.write("for file_path in metadata['filepath']:\n")
    f.write("    adata = sc.read_h5ad(file_path)\n")
    f.write("    # Add sample ID from filename\n")
    f.write("    adata.obs['sample_id'] = os.path.basename(file_path).replace('.h5ad', '')\n")
    f.write("    adatas.append(adata)\n\n")
    f.write("# Combine datasets\n")
    f.write("combined = adatas[0].concatenate(adatas[1:], join='outer', index_unique='-')\n")
    f.write("print(f'Combined data shape: {combined.shape}')\n")
    f.write("```\n")

print(f"Created documentation in {output_dir}/README.txt")

Found 34 A549 h5ad files
Processed metadata for SRX17915870.h5ad
Processed metadata for SRX17150748.h5ad
Processed metadata for SRX25289884.h5ad
Processed metadata for SRX21897873.h5ad
Processed metadata for SRX25289889.h5ad
Processed metadata for SRX25289882.h5ad
Processed metadata for SRX17150747.h5ad
Processed metadata for SRX25289890.h5ad
Processed metadata for ERX8792190.h5ad
Processed metadata for SRX19215443.h5ad
Processed metadata for SRX25289894.h5ad
Processed metadata for SRX17915869.h5ad
Processed metadata for SRX25289893.h5ad
Processed metadata for SRX17488180.h5ad
Processed metadata for SRX26771412.h5ad
Processed metadata for SRX17150749.h5ad
Processed metadata for SRX25289891.h5ad
Processed metadata for SRX21897869.h5ad
Processed metadata for SRX19215444.h5ad
Processed metadata for SRX22159982.h5ad
Processed metadata for SRX17150750.h5ad
Processed metadata for SRX25289892.h5ad
Processed metadata for SRX25289886.h5ad
Processed metadata for SRX25289887.h5ad
Processed metada