**Code used for subsetting data to the 6 major cell types that have and have both UT and stimulated cells.**

In [1]:
import argparse
import anndata as an
import pandas as pd
import scanpy as sc
from pathlib import Path
import random

In [2]:
DATA_DIR = Path("../Data/SCDRS/multiome")
file = DATA_DIR / "mo_all_20240517_scanpyv1.10.2_annotated_agesexcovid_raw.h5ad"
adata = an.read_h5ad(file)
adata

AnnData object with n_obs × n_vars = 874200 × 36601
    obs: 'cell_id', 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'barcode', 'barcode_1', 'barcode_lane', 'lane', 'batch', 'percent.mt', 'nCount_RNA_mad', 'nFeature_RNA_mad', 'confined_best_match_sample', 'confined_second_match_sample', 'confined_best_match_correlation', 'confined_second_match_correlation', 'confined_condition', 'unconfined_best_match_sample', 'unconfined_second_match_sample', 'unconfined_best_match_correlation', 'unconfined_second_match_correlation', 'unconfined_condition', 'soup_cluster', 'soup_status', 'soup_singlet_posterior', 'soup_doublet_posterior', 'soup_log_prob_singleton', 'soup_log_prob_doublet', 'soup_cluster0', 'soup_cluster1', 'soup_cluster2', 'soup_cluster3', 'soup_cluster4', 'soup_cluster5', 'soup_cluster6', 'soup_cluster7', 'sample_final', 'final_condition', 'predicted.mo_10x_cell_type', 'predicted.mo_10x_cell_type.score', 'predicted.mo_10x_cell_type.lowerres', 'nCount_SCT', 'nFeature_SCT', 'SCT_snn_res

In [3]:
for threshold in [50, 100, 300, 800]:
    n = (adata.X > 0).sum(axis=0).A1  # Get non-zero counts per gene
    print(f"min_cells={threshold}: {(n >= threshold).sum()} genes retained")

min_cells=50: 31008 genes retained
min_cells=100: 29333 genes retained
min_cells=300: 25892 genes retained
min_cells=800: 22112 genes retained


In [3]:
# Basic filtering
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=80)

In [None]:
adata.obs_keys()

In [6]:
adata.obs["sample_final"].unique()

['MO205', 'UGLI012768', 'UGLI011032', 'MO204', 'UGLI021599', ..., 'MO210', 'MO216', 'MO215', 'MO211', 'MO212']
Length: 264
Categories (264, object): ['MO2E', 'MO2F', 'MO2G', 'MO2H', ..., 'UGLI034065', 'UGLI035701', 'UGLI035866', 'UGLI036253']

In [7]:
adata.obs["condition_final"].unique()

['24hCA', 'UT']
Categories (2, object): ['24hCA', 'UT']

In [38]:
samples = list(adata.obs["sample_final"].unique())

random_samples = random.sample(samples, 10)

print(random_samples)

['MO62', 'MO138', 'MO200', 'MO100', 'MO149', 'MO906', 'UGLI021423', 'MO90', 'UGLI028084', 'MO66']


In [4]:
# Extract relevant columns
df = adata.obs[["sample_final", "condition_final"]]

# Filter only rows with condition_final "24hCA" or "UT"
df_filtered = df[df["condition_final"].isin(["24hCA", "UT"])]

# Group by sample_final and get the unique set of condition_final values for each
condition_sets = df_filtered.groupby("sample_final")["condition_final"].unique()

# Find sample_final entries where both "24hCA" and "UT" are present
samples_with_both = condition_sets[condition_sets.apply(lambda x: set(["24hCA", "UT"]).issubset(set(x)))].index.tolist()

print(samples_with_both)


['MO3A', 'MO3B', 'MO11G', 'MO51', 'MO56', 'MO61', 'MO65', 'MO72', 'MO73', 'MO77', 'MO78', 'MO80', 'MO81', 'MO91', 'MO92', 'MO96', 'MO100', 'MO111', 'MO112', 'MO116', 'MO117', 'MO130', 'MO131', 'MO134', 'MO137', 'MO140', 'MO145', 'MO146', 'MO149', 'MO150', 'MO153', 'MO184', 'MO203', 'MO301', 'MO302', 'MO407', 'MO1001', 'MO1002', 'MO1107', 'UGLI001858', 'UGLI008810', 'UGLI009341', 'UGLI009951', 'UGLI015616', 'UGLI023249', 'UGLI023668', 'UGLI025281', 'UGLI027720', 'UGLI030350', 'UGLI035866', 'UGLI036253']


  condition_sets = df_filtered.groupby("sample_final")["condition_final"].unique()


In [11]:
print(condition_sets)

sample_final
MO2E          ['24hCA']
Categories (2, object): ['24hCA', 'UT']
MO2F          ['24hCA']
Categories (2, object): ['24hCA', 'UT']
MO2G          ['24hCA']
Categories (2, object): ['24hCA', 'UT']
MO2H          ['24hCA']
Categories (2, object): ['24hCA', 'UT']
MO3A          ['UT', '24hCA']
Categories (2, object): ['24hC...
                                    ...                        
UGLI033928    ['24hCA']
Categories (2, object): ['24hCA', 'UT']
UGLI034065    ['24hCA']
Categories (2, object): ['24hCA', 'UT']
UGLI035701    ['24hCA']
Categories (2, object): ['24hCA', 'UT']
UGLI035866    ['UT', '24hCA']
Categories (2, object): ['24hC...
UGLI036253    ['24hCA', 'UT']
Categories (2, object): ['24hC...
Name: condition_final, Length: 264, dtype: object


In [10]:
# Double check
for sample in samples_with_both:
    x = adata.obs["condition_final"][adata.obs["sample_final"] == sample].unique()
    if len(x) != 2:
        
        print("nope")

In [12]:
mask = (
    (adata.obs["condition_final"].isin(["UT"])) &
    (adata.obs["sample_final"].isin(samples_with_both)) &
    (adata.obs["celltype_imputed_lowerres"].isin(['B', 'monocyte', 'NK', 'DC', 'CD8T', 'CD4T']))
)
subsetted_data = adata[mask].copy()
subsetted_data.obs["celltype_imputed_lowerres"].value_counts()

celltype_imputed_lowerres
CD4T        49053
monocyte    38337
CD8T        17428
NK          15272
B            9350
DC           1295
Name: count, dtype: int64

In [13]:
# Write file
out_file = DATA_DIR / "UT+24hCA_6ct_UT_raw.h5ad"
subsetted_data.write_h5ad(out_file)
print(f"Adata written to {out_file}")

Adata written to /groups/umcg-franke-scrna/tmp02/projects/multiome/ongoing/sc-scoring-student-project/scScoring/Data/SCDRS/multiome/UT+24hCAsamples_6ct_UT_raw.h5ad


In [None]:
mask = (
    (adata.obs["condition_final"].isin(["24hCA"])) &
    (adata.obs["sample_final"].isin(samples_with_both)) &
    (adata.obs["celltype_imputed_lowerres"].isin(['B', 'monocyte', 'NK', 'DC', 'CD8T', 'CD4T']))
)
subsetted_data = adata[mask].copy()
subsetted_data.obs["celltype_imputed_lowerres"].value_counts()

In [None]:
# Write file
out_file = DATA_DIR / "UT+24hCA_6ct_24hCA_raw.h5ad"
subsetted_data.write_h5ad(out_file)