In [5]:
import scanpy as sc
import pandas as pd

# Normalization functions
def normalize_category(value):
    """Return a cleaned category label, using "Unknown" for absent values.

    Args:
        value: A single label. Missing values (as judged by ``pd.isna``),
            blank strings and any spelling of "unknown" regardless of case
            are all reported as "Unknown". Non-string values are converted
            with ``str`` before cleaning.

    Returns:
        str: The label with surrounding whitespace removed, or "Unknown".
    """
    if pd.isna(value):
        return "Unknown"
    # Non-string labels (e.g. numeric codes) have no .strip(); stringify them
    # instead of raising AttributeError.
    text = value if isinstance(value, str) else str(value)
    text = text.strip()
    # Compare case-insensitively so "UNKNOWN"/"Unknown" collapse to one label.
    if text == "" or text.casefold() == "unknown":
        return "Unknown"
    return text

def normalized_age(age):
    """Bucket an age value into a decade label such as "40-49".

    Accepts numbers (``45``, ``62.5``), numeric strings (``"45"``,
    ``"45-year-old human stage"``) and ordinal decade phrases
    (``"fifth decade human stage"``). The label is built as
    ``"<d>0-<d>9"`` where ``d`` is the decade index, so the first decade
    is written "00-09" and ages of 100 or more give e.g. "100-109".

    Args:
        age: The raw age value for one observation.

    Returns:
        str: The decade label, "Unknown" for missing values, or the result
        of ``normalize_category`` when no leading number can be found.
    """
    if pd.isna(age):
        return "Unknown"
    # Ordinal word -> first year of that decade. "nineth" is kept next to the
    # correct spelling "ninth" so previously accepted input still maps.
    age_cases = {
        "first": 0, "second": 10, "third": 20, "fourth": 30,
        "fifth": 40, "sixth": 50, "seventh": 60, "eighth": 70,
        "ninth": 80, "nineth": 80, "tenth": 90,
    }
    if isinstance(age, str):
        text = age.strip()
        words = text.split()
        first_word = words[0].casefold() if words else ""
        age_str = str(age_cases.get(first_word, text))
    else:
        age_str = str(age)

    # Use the whole leading run of digits, not just the first character;
    # otherwise 5 would land in "50-59" and 105 in "10-19".
    digit_count = len(age_str) - len(age_str.lstrip("0123456789"))
    leading_digits = age_str[:digit_count]
    if leading_digits:
        decade = int(leading_digits) // 10
        return f"{decade}0-{decade}9"

    # No usable number: fall back to generic label cleaning. Non-strings are
    # passed as text because the fallback strips whitespace.
    return normalize_category(age if isinstance(age, str) else age_str)

def normalize_race(race):
    """Return a cleaned race label, folding one known alias.

    Args:
        race: A single race label. Missing values (per ``pd.isna``) become
            "Unknown"; "African American" is reported as
            "Black or African American"; everything else is passed to
            ``normalize_category``.

    Returns:
        str: The normalized race label.
    """
    if pd.isna(race):
        return "Unknown"
    # Compare on trimmed text so a padded "African American " still hits the
    # alias instead of slipping through as its own category.
    label = race.strip() if isinstance(race, str) else race
    if label == "African American":
        return "Black or African American"
    return normalize_category(race)

# Extract SPP1 gene expression and normalize the data
def find_spp1_gene_and_normalize(input_h5ad, gene_name, gene_column):
    """Load one gene's expression from an h5ad file alongside normalized obs.

    The file is opened in backed read-only mode, the single variable whose
    ``gene_column`` value equals ``gene_name`` is located, and its expression
    column is joined to a copy of the observation table. The 'age', 'sex' and
    'race' columns are then normalized, or created as "Unknown" when absent.

    Args:
        input_h5ad: Path to the h5ad file.
        gene_name: Value to look for in ``gene_column`` (e.g. "SPP1").
        gene_column: Name of the ``var`` column that holds gene identifiers.

    Returns:
        pandas.DataFrame: The obs columns plus one column named ``gene_name``,
        indexed like ``obs``.

    Raises:
        ValueError: If the gene is not found, or if it matches more than one
            variable (a single output column cannot represent that).
    """
    matrix = sc.read_h5ad(input_h5ad, backed='r')
    try:
        # Boolean mask over var rows selecting the requested gene
        gene_mask = matrix.var[gene_column] == gene_name
        match_count = int(gene_mask.sum())
        if match_count == 0:
            raise ValueError(f"Gene {gene_name} not found in {gene_column}")
        if match_count > 1:
            raise ValueError(
                f"Gene {gene_name} matches {match_count} entries in {gene_column}; "
                "expected exactly one"
            )

        expression = matrix[:, gene_mask].X
        # A sparse slice must be densified before it goes into a DataFrame;
        # otherwise the cells would hold sparse objects instead of numbers.
        if hasattr(expression, "toarray"):
            expression = expression.toarray()

        # Copy obs so the result does not depend on the backed object.
        obs = matrix.obs.copy()
    finally:
        # Backed mode keeps the file open; release the handle in all cases.
        matrix.file.close()

    expression_df = pd.DataFrame(expression, index=obs.index, columns=[gene_name])

    # Merge obs data with the gene expression column
    result_df = pd.concat([obs, expression_df], axis=1)

    column_normalizers = (
        ('age', normalized_age),
        ('sex', normalize_category),
        ('race', normalize_race),
    )
    for column, normalizer in column_normalizers:
        if column in result_df.columns:
            # Cast to object first: if the column is categorical, applying a
            # function may leave missing entries untouched rather than passing
            # them to the normalizer (NOTE(review): confirm against the
            # installed pandas version).
            result_df[column] = result_df[column].astype(object).apply(normalizer)
        else:
            # Column is missing from obs: fill with "Unknown"
            result_df[column] = "Unknown"

    return result_df

# Combine datasets and save the results
def combine_and_save_data(datasets, gene_name, kpmp_gene_column, hubmap_gene_column, output_csv):
    """Extract one gene from several h5ad datasets and write a combined CSV.

    Args:
        datasets: Mapping of dataset label -> h5ad file path. Labels
            containing "KPMP" are looked up with ``kpmp_gene_column``; all
            others use ``hubmap_gene_column``.
        gene_name: Gene identifier to extract from every dataset.
        kpmp_gene_column: ``var`` column holding gene names in KPMP files.
        hubmap_gene_column: ``var`` column holding gene names in other files.
        output_csv: Destination path for the combined table.

    Returns:
        None. The combined table is written to ``output_csv``; each row
        carries a 'dataset' column with its source label.
    """
    frames = []

    for dataset_name, h5ad_path in datasets.items():
        print(f"Processing dataset: {dataset_name}")

        # Select the gene column based on the dataset source
        gene_column = kpmp_gene_column if 'KPMP' in dataset_name else hubmap_gene_column

        normalized_data = find_spp1_gene_and_normalize(h5ad_path, gene_name, gene_column)
        normalized_data['dataset'] = dataset_name  # Tag rows with their source
        frames.append(normalized_data)

    # Concatenate once at the end instead of re-copying the growing table on
    # every iteration; pd.concat rejects an empty list, so fall back to an
    # empty frame when no datasets were given.
    combined_df = pd.concat(frames) if frames else pd.DataFrame()

    # Save the combined results to a CSV file
    combined_df.to_csv(output_csv)
    print(f"Combined and normalized {gene_name} gene expression data has been saved to {output_csv}")

# Dataset file paths
# Where the combined table is written
output_csv = 'combined_spp1_expression.csv'

# Dataset label -> h5ad file; labels containing "KPMP" select the KPMP gene column
datasets = {
    'KPMP SC RNAseq': 'kpmp-sc-rnaseq.h5ad',
    'KPMP SN RNAseq': 'kpmp-sn-rnaseq.h5ad',
    'HuBMAP Left Kidney': 'hubmap-LK-processed.h5ad',
    'HuBMAP Right Kidney': 'hubmap-RK-processed.h5ad',
}

# Extract SPP1 from every dataset; KPMP and HuBMAP files name their gene column differently
combine_and_save_data(
    datasets=datasets,
    gene_name='SPP1',
    kpmp_gene_column='feature_name',
    hubmap_gene_column='hugo_symbol',
    output_csv=output_csv,
)


Processing dataset: KPMP SC RNAseq
Processing dataset: KPMP SN RNAseq
Processing dataset: HuBMAP Left Kidney
Processing dataset: HuBMAP Right Kidney
Combined and normalized SPP1 gene expression data has been saved to combined_spp1_expression.csv
