In [3]:
!pip install cellxgene_census
!pip install scanpy
!pip install mygene
!pip install pandas openpyxl
import urllib
import urllib.request
from pathlib import Path

import cellxgene_census as cellxgene
import numpy as np
import pandas as pd
import scanpy as sc

Collecting cellxgene_census
  Downloading cellxgene_census-1.16.2-py3-none-any.whl.metadata (5.2 kB)
Collecting tiledbsoma!=1.14.1,>=1.12.3 (from cellxgene_census)
  Downloading tiledbsoma-1.14.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.4 kB)
Collecting anndata (from cellxgene_census)
  Downloading anndata-0.11.1-py3-none-any.whl.metadata (8.2 kB)
Collecting s3fs>=2021.06.1 (from cellxgene_census)
  Downloading s3fs-2024.10.0-py3-none-any.whl.metadata (1.7 kB)
Collecting aiobotocore<3.0.0,>=2.5.4 (from s3fs>=2021.06.1->cellxgene_census)
  Downloading aiobotocore-2.15.2-py3-none-any.whl.metadata (23 kB)
Collecting scanpy>=1.9.2 (from tiledbsoma!=1.14.1,>=1.12.3->cellxgene_census)
  Downloading scanpy-1.10.4-py3-none-any.whl.metadata (9.3 kB)
Collecting somacore==1.0.17 (from tiledbsoma!=1.14.1,>=1.12.3->cellxgene_census)
  Downloading somacore-1.0.17-py3-none-any.whl.metadata (1.5 kB)
Collecting tiledb~=0.32.0 (from tiledbsoma!=1.14.1,>=1.12.3->cellxgene_ce

In [4]:
# Data Download and Loading Cell
#
# Fetches three .h5ad datasets from the CELLxGENE dataset host (skipping any
# that are already on disk) and loads them as `adata1`, `adata2`, `adata3`.

# Every dataset URL is this prefix followed by the file name.
DATASET_BASE_URL = 'https://datasets.cellxgene.cziscience.com/'

# Define file paths and URLs for data download
file1 = "3d690bcf-c9d3-4fcf-b7e1-e0e622bbf958.h5ad"
file2 = "ee226a77-6ec1-4a16-b653-8cbacd3876bc.h5ad"
file3 = "7bb8238f-b5a7-4bbd-9c00-244e2b72e140.h5ad"

url1 = DATASET_BASE_URL + file1
url2 = DATASET_BASE_URL + file2
url3 = DATASET_BASE_URL + file3


def download_if_missing(url, filename):
    """Download `url` to `filename` unless `filename` already exists.

    The payload is written to a sibling ``<filename>.part`` file first and is
    renamed to `filename` only after the transfer has finished, so an
    interrupted download is not mistaken for a complete file on the next run.

    Parameters
    ----------
    url : str
        Location to fetch.
    filename : str
        Local path to store the downloaded file at.

    Returns
    -------
    pathlib.Path
        The destination path.
    """
    destination = Path(filename)
    if destination.exists():
        return destination
    partial = destination.with_name(destination.name + ".part")
    urllib.request.urlretrieve(url, str(partial))
    partial.replace(destination)
    return destination


# Download datasets (no-op for files that are already present)
for dataset_url, dataset_file in ((url1, file1), (url2, file2), (url3, file3)):
    download_if_missing(dataset_url, dataset_file)


# Load datasets
adata1 = sc.read_h5ad(file1)
adata2 = sc.read_h5ad(file2)
adata3 = sc.read_h5ad(file3)



###################################

# Cleaning up dataset to remove outliers
#
# Cells are kept only if their gene count, UMI count and mitochondrial fraction
# all lie strictly inside the ranges defined below. The mitochondrial range is
# the 1.5 * IQR fence computed over the cells of all three datasets together.

# obs column names used for QC; the spelling 'mitochrondrial' is the actual
# column name in the data and must not be "corrected".
GENES_COLUMN = 'Genes detected'
UMIS_COLUMN = 'Number of UMIs'
MITO_COLUMN = 'Fraction mitochrondrial UMIs'

# NOTE(review): this call is echoed in the cell's warning output;
# `AnnData.concatenate` appears to be deprecated in favour of `anndata.concat`
# in recent anndata releases -- confirm and migrate once the differences in
# how `var` columns are merged have been checked.
adata_merged = adata1.concatenate(adata2, adata3, join='outer',batch_key='batch')

q1 = np.percentile(adata_merged.obs[MITO_COLUMN], 25)
q3 = np.percentile(adata_merged.obs[MITO_COLUMN], 75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr


# Filter cells based on QC metrics
min_genes = 200
max_genes = 5000
min_umis = 500
max_umis = 30000


def filter_cells_by_qc(adata, gene_range, umi_range, mito_range):
    """Return the cells of `adata` whose QC metrics lie strictly inside each range.

    Parameters
    ----------
    adata
        Object with an `.obs` table containing the three QC columns and
        supporting boolean row indexing (`adata[mask]`).
    gene_range, umi_range, mito_range : tuple
        `(low, high)` pairs; a cell passes when `low < value < high` for the
        corresponding column. Both ends are exclusive.

    Returns
    -------
    The result of indexing `adata` with the combined boolean mask.
    """
    obs = adata.obs
    passes_qc = None
    for column, (low, high) in (
        (GENES_COLUMN, gene_range),
        (UMIS_COLUMN, umi_range),
        (MITO_COLUMN, mito_range),
    ):
        in_range = (obs[column] > low) & (obs[column] < high)
        passes_qc = in_range if passes_qc is None else passes_qc & in_range
    return adata[passes_qc]


gene_limits = (min_genes, max_genes)
umi_limits = (min_umis, max_umis)
mito_limits = (lower_bound, upper_bound)

adata_1_filtered = filter_cells_by_qc(adata1, gene_limits, umi_limits, mito_limits)
adata_2_filtered = filter_cells_by_qc(adata2, gene_limits, umi_limits, mito_limits)
adata_3_filtered = filter_cells_by_qc(adata3, gene_limits, umi_limits, mito_limits)


# Data Filtering Cell
#
# Restricts the datasets to their shared genes, keeps only cells whose
# "disease" label is "dementia" or "normal", and saves the result.
# The saved datasets are built from the QC-filtered objects
# (`adata_1_filtered`, ...) so that the outlier removal is actually reflected
# in the output files.


# Find the genes common to all three datasets
common_genes = adata_1_filtered.var_names.intersection(adata_2_filtered.var_names).intersection(adata_3_filtered.var_names)


# Restrict each loaded dataset to the common genes
adata1 = adata1[:, common_genes]
adata2 = adata2[:, common_genes]
adata3 = adata3[:, common_genes]


# Disease labels to retain, matched against the "disease" obs column
DISEASE_LABELS = ["dementia", "normal"]


def select_genes_and_disease(adata_qc, genes, labels):
    """Subset `adata_qc` to `genes` and to cells whose "disease" is in `labels`.

    Parameters
    ----------
    adata_qc
        QC-filtered AnnData-like object with a "disease" column in `.obs`.
    genes
        Variable names to keep (column index).
    labels : list of str
        Accepted values of the "disease" column.

    Returns
    -------
    The subset obtained by indexing first on genes, then on the disease mask.
    """
    gene_subset = adata_qc[:, genes]
    return gene_subset[gene_subset.obs["disease"].isin(labels)]


adata1_filtered = select_genes_and_disease(adata_1_filtered, common_genes, DISEASE_LABELS)
adata2_filtered = select_genes_and_disease(adata_2_filtered, common_genes, DISEASE_LABELS)
adata3_filtered = select_genes_and_disease(adata_3_filtered, common_genes, DISEASE_LABELS)


# Save filtered datasets to .h5ad files for future use
adata1_filtered.write("filtered_adata1.h5ad")
adata2_filtered.write("filtered_adata2.h5ad")
adata3_filtered.write("filtered_adata3.h5ad")


print("Filtered datasets have been saved as 'filtered_adata1.h5ad' and 'filtered_adata2.h5ad' and 'filtered_adata3.h5ad'.")


# Data Conversion and Concatenation Cell
#
# Reads the three saved, filtered datasets back from disk and stacks them into
# a single object using an outer join; the 'batch' key records which input each
# cell came from.

# Load the filtered datasets
adata1_filtered, adata2_filtered, adata3_filtered = (
    sc.read_h5ad(saved_path)
    for saved_path in (
        "filtered_adata1.h5ad",
        "filtered_adata2.h5ad",
        "filtered_adata3.h5ad",
    )
)

# Concatenate dataset 1 with datasets 2 and 3
adata_filtered_merged = adata1_filtered.concatenate(
    adata2_filtered,
    adata3_filtered,
    join='outer',
    batch_key='batch',
)


  adata_merged = adata1.concatenate(adata2, adata3, join='outer',batch_key='batch')


Filtered datasets have been saved as 'filtered_adata1.h5ad' and 'filtered_adata2.h5ad' and 'filtered_adata3.h5ad'.


  adata_filtered_merged = adata1_filtered.concatenate(adata2_filtered, adata3_filtered, join='outer',batch_key='batch')


In [5]:
# Display the names of the per-cell metadata (obs) columns of the merged dataset.
adata_filtered_merged.obs.columns

Index(['assay_ontology_term_id', 'cell_type_ontology_term_id',
       'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'organism_ontology_term_id', 'sex_ontology_term_id',
       'tissue_ontology_term_id', 'is_primary_data', 'Neurotypical reference',
       'Class', 'Subclass', 'Supertype', 'Age at death', 'Years of education',
       'Cognitive status', 'ADNC', 'Braak stage', 'Thal phase', 'CERAD score',
       'APOE4 status', 'Lewy body disease pathology', 'LATE-NC stage',
       'Microinfarct pathology', 'Specimen ID', 'donor_id', 'PMI',
       'Number of UMIs', 'Genes detected', 'Fraction mitochrondrial UMIs',
       'suspension_type', 'development_stage_ontology_term_id',
       'Continuous Pseudo-progression Score', 'tissue_type', 'cell_type',
       'assay', 'disease', 'organism', 'sex', 'tissue',
       'self_reported_ethnicity', 'development_stage', 'observation_joinid',
       'batch'],
      dtype='object')