# Download Data 
This is the Data needed for the Jupyter Notebooks in the folder DataAnalysis

In [1]:
from pathlib import Path
import gc
import hdf5plugin
import numpy as np
import os
import pandas as pd
import requests
import scanpy as sc

General Settings:

In [2]:
# Directory:
data_dir = Path(".") / "Data"
data_dir.mkdir(exist_ok=True)

# Download function:
def download_file(url, output_dir, use_cache=True):
    filename = output_dir / os.path.basename(url)
    
    if use_cache and filename.exists():
        print(f"File already exists, skipping: {filename}")
        return filename

    response = requests.get(url, stream=True)
    response.raise_for_status()
    
    with open(filename, "wb") as file:
        for chunk in response.iter_content(chunk_size=8192):
            file.write(chunk)
    
    print(f"Downloaded: {filename}")
    return filename

## Hepatocytes Data 

In [25]:
# Download data files:
file_urls = [
    "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE84nnn/GSE84498/suppl/GSE84498%5Fexperimental%5Fdesign.txt.gz",
    "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE84nnn/GSE84498/suppl/GSE84498%5Fumitab.txt.gz"
    ]
for url in file_urls:
    download_file(url=url, output_dir=data_dir)

File already exists, skipping: Data/GSE84498%5Fexperimental%5Fdesign.txt.gz
File already exists, skipping: Data/GSE84498%5Fumitab.txt.gz


In [28]:
# Save the AnnData object:
obs = (pd.read_csv(data_dir / os.path.basename(file_urls[0]), sep="\t")
       .set_index("well"))
count_df = (pd.read_csv(data_dir / os.path.basename(file_urls[1]), sep="\t")
            .set_index("gene").T
            .loc[obs.index, :])
adata_hep = sc.AnnData(
    X = count_df.values.astype(np.float32),
    obs = obs, 
    var = pd.DataFrame(index=[c.split(";")[0] for c in count_df.columns])
)
adata_hep = adata_hep[:, adata_hep.X.sum(axis=0) >= 20].copy()
# remove batches of different cells (probably non-hepatocytes)
adata_hep = adata_hep[~adata_hep.obs["batch"].isin(["AB630", "AB631"])].copy()
adata_hep.write_h5ad( data_dir / "adata_hep.h5ad")

## Non-classical Monocytes

In [5]:
# File URL to download
url = "https://datasets.cellxgene.cziscience.com/4532eea4-24b7-461a-93f5-fe437ee96f0a.h5ad"

# Download the file
download_file(url=url, output_dir=data_dir)

File already exists, skipping: Data/4532eea4-24b7-461a-93f5-fe437ee96f0a.h5ad


PosixPath('Data/4532eea4-24b7-461a-93f5-fe437ee96f0a.h5ad')

In [7]:
# Save the AnnData object:
adata_ncM = sc.read_h5ad(data_dir / "4532eea4-24b7-461a-93f5-fe437ee96f0a.h5ad")
adata_ncM.obs["Status"] = adata_ncM.obs["disease_state"].map({
    "managed": "Managed",
    "na": "Healthy",
    "flare": "Flare",
    "treated": "Treated"
})
adata_ncM = adata_ncM[adata_ncM.obs["author_cell_type"]=="ncM", :].copy() # only consider non-classical monocytes
adata_ncM = adata_ncM[adata_ncM.obs["Status"] != "Treated", :].copy() # remove samples with "treated" status
# remove columns we don"t need
adata_ncM.obs.drop(columns=["mapped_reference_annotation", "cell_type_ontology_term_id", "is_primary_data", 
                        "cell_state", "tissue_ontology_term_id", "development_stage_ontology_term_id", 
                        "tissue", "organism", "tissue_type", "suspension_type", "organism_ontology_term_id",
                        "assay_ontology_term_id", "suspension_enriched_cell_types", "suspension_uuid",
                        "self_reported_ethnicity_ontology_term_id", "disease_ontology_term_id",
                        "sex_ontology_term_id"], 
                        inplace=True)
# create new index
adata_ncM.obs.index = [s.split("-")[0] + "-" + str(len(s.split("-"))) + "-" + str(donor_id) 
                   for s, donor_id in zip(adata_ncM.obs.index, adata_ncM.obs["donor_id"].to_list())]
# remove obsm we don't need
del adata_ncM.obsm["X_pca"], adata_ncM.obsm["X_umap"], adata_ncM.uns
gc.collect()

# use the raw counts
adata_ncM.X = adata_ncM.raw.X

# use gene symbols instead of ensembl IDs
assert len(adata_ncM.var["feature_name"]) == len(adata_ncM.var["feature_name"].unique())
adata_ncM.var = adata_ncM.var.set_index("feature_name")

adata_ncM.write_h5ad( data_dir / "adata_ncM.h5ad")

AnnData expects .var.index to contain strings, but got values like:
    ['MIR1302-2HG', 'FAM138A', 'OR4F5', 'ENSG00000238009.6', 'ENSG00000239945.1']

    Inferred to be: categorical

  value_idx = self._prep_dim_index(value.index, attr)
