# Normalize HuBMAP and KPMP datasets for comparison

## Install prerequisite libraries

In [None]:
!pip install anndata pandas seaborn matplotlib

## Part 1: extract obs variables to csv

### Imports / functions

In [2]:
import anndata
import pandas as pd

In [3]:
def h5ad_obs_to_csv(input_h5ad):
    """Write the ``obs`` table of an .h5ad file to a CSV file next to it.

    The output path is ``input_h5ad`` with every occurrence of ``.h5ad``
    replaced by ``.obs.csv`` (e.g. ``sample.h5ad`` -> ``sample.obs.csv``).

    Args:
        input_h5ad: Path (str) of the .h5ad file to read.
    """
    output_csv = input_h5ad.replace('.h5ad', '.obs.csv')
    # The file is opened read-only in backed mode, so the handle stays open
    # until it is closed explicitly.
    x = anndata.read_h5ad(input_h5ad, backed='r')
    try:
        x.obs.to_csv(output_csv)
    finally:
        # Release the underlying file handle even if the export fails.
        x.file.close()


### Datasets to load

In [4]:
# Collection label -> name of the .h5ad file holding that collection.
datasets = dict([
    ("KPMP SC RNAseq", "kpmp-sc-rnaseq.h5ad"),
    ("KPMP SN RNAseq", "kpmp-sn-rnaseq.h5ad"),
    ("HuBMAP Left Kidney", "hubmap-LK-processed.h5ad"),
    ("HuBMAP Right Kidney", "hubmap-RK-processed.h5ad"),
])

### Write out dataset obs variables to separate csv files

In [5]:
# Export the obs table of every configured .h5ad file to its own CSV file.
for h5ad_path in datasets.values():
    h5ad_obs_to_csv(h5ad_path)

In [None]:
# Show the first three lines of every CSV file in the working directory.
!head -3 *.csv

## Part 2: normalize and combine obs datasets

### Imports / functions

In [7]:
def normalize_category(str):
    """Trim a categorical value and map blank/"unknown" to "Unknown".

    Args:
        str: The raw category text. (The parameter name shadows the builtin
            ``str`` inside this function only.)

    Returns:
        "Unknown" when the trimmed text is empty or exactly "unknown";
        otherwise the trimmed text.
    """
    trimmed = str.strip()
    return "Unknown" if trimmed in ("", "unknown") else trimmed

# First age of the decade named by each ordinal word, e.g. "third ..." -> 20.
# The misspelling "nineth" is kept next to "ninth" for backward compatibility.
_DECADE_START_BY_ORDINAL = {
    "first": 0,
    "second": 10,
    "third": 20,
    "fourth": 30,
    "fifth": 40,
    "sixth": 50,
    "seventh": 60,
    "eighth": 70,
    "ninth": 80,
    "nineth": 80,
    "tenth": 90,
}


def normalized_age(age):
    """Bin an age value into a decade label such as "40-49".

    Accepted inputs:
      * text starting with an ordinal word ("third decade ..." -> "20-29");
      * text or numbers starting with a whole number of years
        ("67.0" -> "60-69", "40-49" -> "40-49", "7" -> "00-09",
        "105" -> "100-109");
      * anything else is passed to ``normalize_category`` (blank/"unknown"
        become "Unknown", other text is returned trimmed).

    Args:
        age: Raw age value; ``None`` is treated as an empty string.

    Returns:
        The decade label, or the normalized category text when no age can
        be extracted.
    """
    text = "" if age is None else str(age).strip()

    first_word = text.split(" ")[0].lower()
    decade_start = _DECADE_START_BY_ORDINAL.get(first_word)

    if decade_start is None:
        # Take the full run of leading ASCII digits. Looking only at the first
        # character would turn "7" into "70-79" and "105" into "10-19".
        leading_digits = text[: len(text) - len(text.lstrip("0123456789"))]
        if leading_digits == "":
            return normalize_category(text)
        decade_start = int(leading_digits)

    decade = decade_start // 10
    return f"{decade}0-{decade}9"

def normalize_race(race):
    """Map a raw race/ethnicity value onto the shared vocabulary.

    "African American" is renamed to "Black or African American"; every
    other value is cleaned up by ``normalize_category``.
    """
    if race != "African American":
        return normalize_category(race)
    return "Black or African American"

def normalize_hubmap_row(row, collection):
    """Convert one HuBMAP obs row into the shared normalized schema.

    Args:
        row: Mapping of column name -> text for one cell. The columns read
            are "uuid", "cell_id", "predicted_CLID", "predicted_label",
            "n_genes", "age", "sex" and "race".
        collection: Label of the collection the row belongs to.

    Returns:
        A dict with the normalized fields. "consortium", "as_id" and
        "disease" are fixed values for every HuBMAP row.
    """
    return dict(
        consortium="HuBMAP",
        collection=collection,
        dataset_id=row["uuid"],
        cell_id=row["cell_id"],
        # Same anatomical-structure ID for every HuBMAP row
        # (presumably kidney - TODO confirm).
        as_id="UBERON:0002113",
        cl_id=row["predicted_CLID"],
        cl_label=row["predicted_label"],
        gene_count=int(row["n_genes"]),
        age=normalized_age(row["age"]),
        # Title-case first so e.g. "male" and "Male" collapse to one value.
        sex=normalize_category(row["sex"].title()),
        race=normalize_race(row["race"]),
        disease="normal",
    )

def normalize_kpmp_row(row, collection):
    """Convert one KPMP obs row into the shared normalized schema.

    Args:
        row: Mapping of column name -> text for one cell. The unnamed first
            column (key "") supplies the cell ID; the dataset ID comes from
            "LibraryID", else "library_id", else "Unknown".
        collection: Label of the collection the row belongs to.

    Returns:
        A dict with the normalized fields.
    """
    fallback_library_id = row.get("library_id", "Unknown")
    return dict(
        consortium="KPMP",
        collection=collection,
        dataset_id=row.get("LibraryID", fallback_library_id),
        cell_id=row[""],
        as_id=row["tissue_ontology_term_id"],
        cl_id=row["cell_type_ontology_term_id"],
        cl_label=row["cell_type"],
        # The value goes through float() first, so text like "1234.0" is accepted.
        # NOTE(review): nCount_RNA is typically a total-count column rather
        # than the number of genes detected - confirm this is the intended
        # source for "gene_count".
        gene_count=int(float(row["nCount_RNA"])),
        age=normalized_age(row["Age_binned"]),
        # Title-case first so e.g. "male" and "Male" collapse to one value.
        sex=normalize_category(row["sex"].title()),
        race=normalize_race(row["self_reported_ethnicity"]),
        disease=normalize_category(row["disease"]),
    )

# Column order of the normalized CSV output.
fields = [
    "consortium",
    "collection",
    "dataset_id",
    "cell_id",
    "as_id",
    "cl_id",
    "cl_label",
    "gene_count",
    "age",
    "sex",
    "race",
    "disease",
]

### Read in obs data and write out a normalized csv

In [8]:
import csv
import gzip
# Combine the obs CSV of every collection into one gzip-compressed CSV that
# uses the shared normalized schema.
# UTF-8 is requested explicitly for both files: text-mode open()/gzip.open()
# otherwise fall back to the platform's locale encoding, while pandas writes
# and reads CSV as UTF-8 by default.
with gzip.open("all-normalized-obs.csv.gz", 'wt', compresslevel=9, encoding="utf-8", newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fields)
    writer.writeheader()
    for (collection, h5ad) in datasets.items():
        obs_csv = h5ad.replace(".h5ad", ".obs.csv")
        # The consortium is a property of the file, so choose the normalizer
        # once per file instead of once per row.
        if h5ad.startswith("hubmap-"):
            normalize_row = normalize_hubmap_row
        else:
            normalize_row = normalize_kpmp_row
        with open(obs_csv, encoding="utf-8", newline='') as obs_csvfile:
            reader = csv.DictReader(obs_csvfile)
            # Stream rows straight from the reader to the writer so no file
            # has to be held in memory.
            writer.writerows(normalize_row(row, collection) for row in reader)


In [None]:
# Show the first three lines (header plus two rows) of the combined CSV.
!zcat all-normalized-obs.csv.gz | head -3

## Part 3: show basic results

### Imports / functions

In [18]:
import seaborn as sns
import matplotlib.pyplot as plt

### Display unique values in normalized data

In [None]:
# Load the combined, normalized obs table (one row per cell).
df = pd.read_csv("all-normalized-obs.csv.gz")

# Create a dictionary to store unique values for each column
unique_values = {column: df[column].unique() for column in df.columns}

# Print unique values for each column
for column, values in unique_values.items():
    if len(values) < 100:
        # Values may be numbers or NaN, so convert to text before joining.
        print(f"\"{column}\" values ({len(values)}): {', '.join(map(str, values))}")
    else:
        # Only the first 200 characters are shown. Each joined value adds at
        # least the 2-character separator, so the first 200 values always
        # cover them and the remaining values need not be joined.
        preview = ', '.join(map(str, values[:200]))[:200]
        print(f"\"{column}\" values ({len(values)}): {preview}...")

### Visualize distributions of metadata by cell

In [None]:
# Set the aesthetic style of the plots: use seaborn's "whitegrid" theme
sns.set_theme(style="whitegrid")

def plot_distribution(x, column):
    """Show a bar chart of the number of rows (cells) per value of a column.

    Bars are ordered by the text form of the value, ignoring case.

    Args:
        x: DataFrame with one row per cell.
        column: Name of the column of ``x`` whose values are counted.
    """
    # Take the categories from the frame being plotted, not from a notebook
    # global, so the function works for any frame it is given. Missing
    # values are dropped, and str() keeps the sort key valid for numbers.
    categories = sorted(x[column].dropna().unique(), key=lambda v: str(v).lower())

    plt.figure(figsize=(8, 4))
    plt.xticks(rotation=90, fontsize=8)
    sns.countplot(x=column, data=x, order=categories)
    # Reserve room at the bottom for the rotated tick labels.
    plt.gcf().subplots_adjust(bottom=0.4)
    plt.xlabel(column)
    plt.ylabel('cell count')
    plt.tight_layout()
    plt.show()

# Plot every column that has fewer than 100 distinct values; columns with
# more (e.g. per-cell identifiers) are skipped.
low_cardinality_columns = [name for name in df.columns if len(unique_values[name]) < 100]
for column in low_cardinality_columns:
    plot_distribution(df, column)
