# Make custom objects reflecting samples for sub-cohorts of interest

In [1]:
# Libraries used by this notebook
import pandas as pd
import numpy as np
import scanpy as sc
import anndata as ad
import harmonypy as hm
import pp, cna, os, pickle
# Override the 's' entry of pp.umapprops
# (presumably the scatter marker size used for UMAP plots -- confirm in pp)
pp.umapprops['s'] = 0.4
import multianndata as mad
import matplotlib.pyplot as plt
from scipy.io import mmwrite
# Filesystem locations:
#   src_folder -- inputs read below (<celltype>_expr.h5ad, sample_meta.csv, cell_meta.csv)
#   res_folder -- per-cell-type result objects are written under gwas_<celltype>/
#   fig_dir    -- not referenced by the cells shown in this notebook
fig_dir = '/data/srlab/lrumker/datasets/onek1k/figs/'
src_folder = "/data/srlab/lrumker/datasets/onek1k/pheno/"
res_folder = "/data/srlab/lrumker/MCSC_Project/cna-qtl/results/"
# Seed NumPy's global RNG (each object-building cell re-seeds with 0 as well)
np.random.seed(0)

In [2]:
# Collect the vocabulary of free-text disease annotations:
# every distinct label found in Autoimmune_Disease_Other or Other_Disease,
# after splitting multi-entry annotations on ", ".
meta = pd.read_csv(src_folder + "sample_meta.csv", index_col=0)
dz_labels = meta.Other_Disease.value_counts().index
aid_labels = meta.Autoimmune_Disease_Other.value_counts().index
other_dz = np.concatenate([aid_labels, dz_labels])
other_dz = np.unique(np.concatenate([label.split(", ") for label in other_dz]))

In [3]:
# Distinct cancer-type labels, after splitting multi-entry annotations on ", "
ca_types = meta.Ca_Type.value_counts().index
ca_types = np.unique(np.concatenate([label.split(", ") for label in ca_types]))

## Only donors with known absence of autoimmune disease

Excludes donors lacking clinical metadata or with known autoimmune disease

In [4]:
# For each cell type, build a MultiAnnData object restricted to donors that have documented
# clinical metadata and no recorded autoimmune disease, run the expression preprocessing
# (gene filtering, normalization, variable-gene selection, PCA, Harmony, neighbor graph, UMAP),
# compute the NAM with cna, and write the object to
# res_folder/gwas_<celltype>/<celltype>_noAIDs.h5ad
for celltype in ["Myeloid", "NK"]:
    print(celltype)
    np.random.seed(0)
    d = sc.read_h5ad(src_folder+celltype+"_expr.h5ad")

    # Remove individuals without documented clinical information
    # (sample metadata is re-read on every iteration because `meta` is filtered in place below)
    meta = pd.read_csv(src_folder+"sample_meta.csv", index_col = 0)
    # Columns with fewer than 400 missing values are dropped, so meta_clin keeps only the
    # columns with >=400 missing values; a donor missing ALL of those columns is removed
    meta_clin = meta.drop(columns = meta.columns[pd.isna(meta).sum(axis=0)<400], inplace = False) # 32 clinical variables
    rm_ids = meta_clin.index[pd.isna(meta_clin).sum(axis=1)==meta_clin.shape[1]]
    meta = meta.drop(index=rm_ids)
    
    # Remove individuals with any labeled autoimmune disease
    # NOTE(review): other_dz is assembled here but not used again within this cell
    other_dz = meta.Other_Disease.value_counts().index
    other_dz = np.concatenate([meta.Autoimmune_Disease_Other.value_counts().index, other_dz])
    # Terms searched for (case-sensitive substring match) in the free-text disease fields
    AIDs = ['MS', 'SLE', 'Sarcoidosis', "Sjogren's", 'ankylosing spondylitis', 'multiple sclerosis',
            "Crohn's", 'dermatomyositis', 'lupus', 'palindromic rheumatism','psoriasis', 
            'sarcoidosis', 'scleroderma', 'psoriatic arthritis']
    # non_null marks donors whose Other_Disease entry is not a float
    # (presumably: not NaN, i.e. a text annotation is present -- confirm against sample_meta.csv)
    meta['non_null'] = [True if type(meta.Other_Disease.values[i]) is not float else False \
                                 for i in np.arange(meta.shape[0])]
    meta['Any_AID'] = np.repeat(False, meta.shape[0])
    # Accumulate Any_AID over the terms; only non_null donors are evaluated.
    # NOTE(review): short terms such as 'MS' match any annotation containing that substring.
    # NOTE(review): a non_null donor whose Autoimmune_Disease_Other entry is NaN would raise a
    # TypeError in the `in` test below.
    for AID_term in AIDs:
        meta.loc[meta.non_null, 'Any_AID'] = [True if meta.Any_AID.values[i] or \
                                             AID_term in meta.Autoimmune_Disease_Other.values[i] or \
                                  AID_term in meta.Other_Disease.values[i] else False \
                                 for i in np.where(meta.non_null)[0]]
    AID_ids = np.array(meta.index[meta.Any_AID])
    meta = meta.drop(index=AID_ids)
    
    # Remove individuals with tabulated autoimmune disease
    # (equality tests also drop donors whose value in these columns is missing)
    meta = meta.loc[meta.Autoimmune_Disease==0,:]
    meta = meta.loc[meta.Rheumatoid_arthritis==0,:]
    meta = meta.loc[meta.Diabetes_type1==0,:]
    meta = meta.loc[meta.UlcerativeColitis==0,:]
    meta = meta.loc[meta.Autoimmune_Disease_Other=="N",:]
    # Boolean mask over cells: True for cells from a retained donor
    keep_ids = meta.index
    keep_cells = np.repeat(False, d.obs.shape[0])
    for sel_donor in keep_ids:
        keep_cells[np.where(d.obs.individual.values==sel_donor)] = True

    d = d[keep_cells,:]
    print("Keeping "+str(len(np.unique(d.obs.individual)))+" samples with known absence of autoimmune disease")

    # Retain only samples with at least 25 cells
    # (counted within this cell type's expression object, after the donor filter above)
    cellcount = pd.DataFrame(d.obs.individual.value_counts())
    cellcount.columns = ['n_cells']
    keep_ids = cellcount.index[cellcount.n_cells>=25]
    keep_cells = np.repeat(False, d.obs.shape[0])
    for sel_donor in keep_ids:
        keep_cells[np.where(d.obs.individual.values==sel_donor)] = True

    d = d[keep_cells,:]
    print("Keeping "+str(len(np.unique(d.obs.individual)))+" samples with at least 25 cells")

    # Remove all HLA- genes (21)
    # (any gene whose name contains the substring 'HLA-')
    d.var['HLA'] = ['HLA-' in d.var.index[i] for i in np.arange(d.var.shape[0])]
    d = d[:,~d.var.HLA.values]

    # Remove cell cycle genes
    cc_genes = ["MCM5","PCNA","TYMS","FEN1","MCM2","MCM4", "RRM1","UNG","GINS2","MCM6",
                "CDCA7","DTL","PRIM1","UHRF1","MLF1IP","HELLS","RFC2","RPA2","NASP", 
                "RAD51AP1","GMNN","WDR76","SLBP","CCNE2","UBR7","POLD3","MSH2","ATAD2",
                "RAD51","RRM2", "CDC45", "CDC6", "EXO1", "TIPIN", "DSCC1", "BLM", "CASP8AP2",
                "USP1","CLSPN","POLA1","CHAF1B","BRIP1","E2F8","HMGB2","CDK1","NUSAP1","UBE2C",
                "BIRC5","TPX2","TOP2A","NDC80","CKS2","NUF2","CKS1B","MKI67","TMPO","CENPF",
                "TACC3","FAM64A","SMC4","CCNB2","CKAP2L","CKAP2","AURKB","BUB1","KIF11",
                "ANP32E","TUBB4B","GTSE1","KIF20B","HJURP","CDCA3","HN1", "CDC20", "TTK",
                "CDC25C", "KIF2C", "RANGAP1", "NCAPD2", "DLGAP5", "CDCA2", "CDCA8", "ECT2", 
                "KIF23", "HMMR", "AURKA", "PSRC1", "ANLN", "LBR", "CKAP5", "CENPE", "CTCF",
                "NEK2","G2E3","GAS2L3","CBX5","CENPA"]
    d.var['CC'] = [d.var.index[i] in cc_genes for i in np.arange(d.var.shape[0])]
    d = d[:,~d.var.CC.values]

    # Remove hemoglobin genes (polymorphic)
    hb_genes = ['HBB', 'HBA2', 'HBD', 'HBA1']
    d.var['HB'] = [d.var.index[i] in hb_genes for i in np.arange(d.var.shape[0])]
    d = d[:,~d.var.HB.values]

    # Remove platelet genes
    plt_genes = ['PF4', 'PPBP']
    d.var['Plt'] = [d.var.index[i] in plt_genes for i in np.arange(d.var.shape[0])]
    d = d[:,~d.var.Plt.values]

    sc.pp.normalize_total(d, target_sum=1e4) #normalize expr
    sc.pp.log1p(d) #logarithmize

    # variable gene selection
    # (minimum-dispersion cutoff is chosen per cell type; only the current celltype's entry is used)
    min_disp = {'Myeloid':0.51, 'B':0.39, 'NK': 0.37, 'T': 0.31, 'allcells':0.30}
    sc.pp.highly_variable_genes(d, min_disp=min_disp[celltype]) #np.sum(d.var.highly_variable)

    # Exclude genes whose normalized dispersion exceeds 11 from the variable-gene set
    high_dispersion = d.var.dispersions_norm > 11
    d.var.loc[high_dispersion, 'highly_variable'] = False

    d = d[:, d.var.highly_variable]

    sc.pp.scale(d, max_value=10) # Scale each gene to unit variance
    sc.tl.pca(d, svd_solver='arpack') # PCA

    # Harmonize over batch (theta of 2 is default, if >1 batch variable, thetas should sum to 1)
    # Only the first 20 PCs are passed to Harmony, with 'pool' as the batch variable.
    # NOTE: the "allcells" branch is not reached for the cell types iterated in this cell.
    if celltype == "allcells":
        ho = hm.run_harmony(d.obsm['X_pca'][:,:20], d.obs, ['pool'], max_iter_harmony = 50, theta = 2)
    else:
        # sel sigma 0.2 > default of 0.1 --> encourages softer clustering b/c all one major type
        ho = hm.run_harmony(d.obsm['X_pca'][:,:20], d.obs, ['pool'], 
                            nclust = 50, sigma = 0.2, max_iter_harmony = 50, theta = 2)
    # Store the transposed corrected embedding (presumably transposed to cells x PCs -- confirm
    # Z_corr orientation in harmonypy)
    d.obsm['harmpca'] = ho.Z_corr.T

    print("graph")
    sc.pp.neighbors(d, use_rep = 'harmpca') # neighbor graph on the Harmony-corrected PCs
    print("umap")
    sc.tl.umap(d) # umap

    # Load cell metadata
    cell_meta = pd.read_csv(src_folder+"cell_meta.csv", index_col = 0)
    cell_meta['batch'] = cell_meta.pool_number.values # 'batch' is a copy of pool_number
    d.obs['id'] = d.obs.individual.values # sample identifier used by MultiAnnData below
    d.obs['preQC_celltype'] = d.obs['predicted.celltype.l2'].values

    # Keep a fixed set of per-cell columns, then join selected cell_meta columns on the obs index
    d.obs = d.obs.loc[:,['id', 'i_RawExpr', 'majortype', 'celltype', 'ref_UMAP1', 'ref_UMAP2', 'preQC_celltype']]
    d.obs = d.obs.join(cell_meta.loc[:,['nCount_RNA', 'nFeature_RNA', 'pool', 'percent.mt', 'batch',
            'sex', 'age', 'indiv_barcode']])

    # make anndata object
    d = mad.MultiAnnData(d, sampleid='id')
    # Attributes set on the object before calling cna
    # (presumably read by cna's association routines -- confirm in cna)
    d.use_R2 = False
    d.scale_variance = False
    d.count_factor = 0

    # aggregate sample metadata imported per-cell
    d.obs_to_sample(['sex', 'age', 'batch'])
    d.samplem['sex_M'] = (d.samplem.sex==1)*1 # From 1 vs 2 to a 0/1 indicator (1 where sex == 1)
    d.samplem = d.samplem.drop(columns = ['sex'])

    # add other clinical metadata
    # (only the six genotype PCs are joined from the filtered sample metadata)
    d.samplem = d.samplem.join(meta.loc[:,['gPC1', 'gPC2', 'gPC3', 'gPC4', 'gPC5', 'gPC6']])

    # Re-assign each samplem column from a plain Python list (string-cast for the categorical names).
    # NOTE(review): none of the `categorical` columns is added to samplem by the code in this cell,
    # so the second loop looks like a no-op here -- confirm.
    categorical = ['Autoimmune_Disease_Other', 'Ca_Type', 'Eye_DiseaseType', 'Other_Disease', 'Other_Meds']
    for attribute in d.samplem.columns:
        if attribute not in categorical: d.samplem[attribute] = d.samplem[attribute].values.tolist()
    for attribute in d.samplem.columns:
        if attribute in categorical: d.samplem[attribute] = d.samplem[attribute].values.astype(str).tolist()

    # build NAM, compute NAM-PCs corrected for batch and covariates
    # (ks is set to the number of samples)
    covs = ['age', 'sex_M', 'gPC1', 'gPC2', 'gPC3', 'gPC4', 'gPC5', 'gPC6']
    cna.tl.nam(d, batches=d.samplem.batch, covs=d.samplem[covs], ks=[d.samplem.shape[0]])

    # save data objects
    d.write(res_folder+"gwas_"+celltype+"/"+celltype+"_noAIDs.h5ad")

Myeloid


Trying to set attribute `.var` of view, copying.


Keeping 472 samples with known absence of autoimmune disease
Keeping 247 samples with at least 25 cells


Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
  view_to_actual(adata)
  view_to_actual(adata)
2023-11-01 11:52:23,560 - harmonypy - INFO - Iteration 1 of 50
2023-11-01 11:52:28,178 - harmonypy - INFO - Iteration 2 of 50
2023-11-01 11:52:32,900 - harmonypy - INFO - Iteration 3 of 50
2023-11-01 11:52:37,827 - harmonypy - INFO - Iteration 4 of 50
2023-11-01 11:52:42,016 - harmonypy - INFO - Iteration 5 of 50
2023-11-01 11:52:45,335 - harmonypy - INFO - Converged after 5 iterations


graph
umap


  exec(code_obj, self.user_global_ns, self.user_ns)


['id' 'majortype' 'celltype' 'preQC_celltype' 'pool' 'indiv_barcode']
consider casting to numeric types where appropriate, and
consider re-coding text-valued columns with pandas.get_dummies


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'pool' as categorical


NK


Trying to set attribute `.var` of view, copying.


Keeping 473 samples with known absence of autoimmune disease
Keeping 454 samples with at least 25 cells


Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
  view_to_actual(adata)
  view_to_actual(adata)
2023-11-01 11:58:08,585 - harmonypy - INFO - Iteration 1 of 50
2023-11-01 11:58:30,217 - harmonypy - INFO - Iteration 2 of 50
2023-11-01 11:58:53,156 - harmonypy - INFO - Iteration 3 of 50
2023-11-01 11:59:19,472 - harmonypy - INFO - Iteration 4 of 50
2023-11-01 11:59:36,299 - harmonypy - INFO - Iteration 5 of 50
2023-11-01 11:59:50,502 - harmonypy - INFO - Iteration 6 of 50
2023-11-01 12:00:06,286 - harmonypy - INFO - Iteration 7 of 50
2023-11-01 12:00:20,206 - harmonypy - INFO - Iteration 8 of 50
2023-11-01 12:00:35,214 - harmonypy - INFO - Iteration 9 of 50
2023-11-01 12:00:48,561 - harmonypy - INFO - Iteration 10 of 50
2023-11-01 12:01:00,991 - harmonypy - INFO - Converged after 10 iterations


graph
umap


  exec(code_obj, self.user_global_ns, self.user_ns)


['id' 'majortype' 'celltype' 'preQC_celltype' 'pool' 'indiv_barcode']
consider casting to numeric types where appropriate, and
consider re-coding text-valued columns with pandas.get_dummies


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'pool' as categorical


In [5]:
# Print the number of samples (rows of samplem) stored in each saved noAIDs object
for celltype in ('NK', 'Myeloid'):
    d = cna.read(f"{res_folder}gwas_{celltype}/{celltype}_noAIDs.h5ad")
    n_samples = d.samplem.shape[0]
    print(n_samples)

454
247


## Only donors with known absence of asthma

Excludes donors lacking clinical metadata or with known asthma

In [6]:
# Build the NK MultiAnnData object restricted to donors that have documented clinical metadata
# and no asthma noted in the free-text disease fields; the pipeline otherwise mirrors the
# noAIDs cell and the object is written to res_folder/gwas_NK/NK_noAsthma.h5ad
celltype="NK"
np.random.seed(0)
d = sc.read_h5ad(src_folder+celltype+"_expr.h5ad")

# Remove individuals without documented clinical information
meta = pd.read_csv(src_folder+"sample_meta.csv", index_col = 0)
# Columns with fewer than 400 missing values are dropped, so meta_clin keeps only the
# columns with >=400 missing values; a donor missing ALL of those columns is removed
meta_clin = meta.drop(columns = meta.columns[pd.isna(meta).sum(axis=0)<400], inplace = False) # 32 clinical variables
rm_ids = meta_clin.index[pd.isna(meta_clin).sum(axis=1)==meta_clin.shape[1]]
meta = meta.drop(index=rm_ids)

# Remove individuals with asthma noted in the free-text disease fields
# non_null marks donors whose Other_Disease entry is not a float
# (presumably: not NaN, i.e. a text annotation is present -- confirm against sample_meta.csv)
meta['non_null'] = [True if type(meta.Other_Disease.values[i]) is not float else False \
                             for i in np.arange(meta.shape[0])]
meta['Asthma'] = np.repeat(False, meta.shape[0])
# Case-sensitive substring match for 'asthma', evaluated only for non_null donors.
# NOTE(review): a donor with a missing Other_Disease entry is never flagged, even if
# Autoimmune_Disease_Other mentions asthma; 'Asthma' (capitalized) would not match either.
meta.loc[meta.non_null, 'Asthma'] = [True if 'asthma' in meta.Autoimmune_Disease_Other.values[i] or \
                              'asthma' in meta.Other_Disease.values[i] else False \
                             for i in np.where(meta.non_null)[0]]
Asthma_ids = np.array(meta.index[meta.Asthma.values])    
    
meta = meta.drop(index=Asthma_ids)
# Boolean mask over cells: True for cells from a retained donor
keep_ids = meta.index
keep_cells = np.repeat(False, d.obs.shape[0])
for sel_donor in keep_ids:
    keep_cells[np.where(d.obs.individual.values==sel_donor)] = True

d = d[keep_cells,:]
print("Keeping "+str(len(np.unique(d.obs.individual)))+" samples with known absence of asthma")

# Retain only samples with at least 25 cells
# (counted within this cell type's expression object, after the donor filter above)
cellcount = pd.DataFrame(d.obs.individual.value_counts())
cellcount.columns = ['n_cells']
keep_ids = cellcount.index[cellcount.n_cells>=25]
keep_cells = np.repeat(False, d.obs.shape[0])
for sel_donor in keep_ids:
    keep_cells[np.where(d.obs.individual.values==sel_donor)] = True

d = d[keep_cells,:]
print("Keeping "+str(len(np.unique(d.obs.individual)))+" samples with at least 25 cells")

# Remove all HLA- genes (21)
# (any gene whose name contains the substring 'HLA-')
d.var['HLA'] = ['HLA-' in d.var.index[i] for i in np.arange(d.var.shape[0])]
d = d[:,~d.var.HLA.values]

# Remove cell cycle genes
cc_genes = ["MCM5","PCNA","TYMS","FEN1","MCM2","MCM4", "RRM1","UNG","GINS2","MCM6",
            "CDCA7","DTL","PRIM1","UHRF1","MLF1IP","HELLS","RFC2","RPA2","NASP", 
            "RAD51AP1","GMNN","WDR76","SLBP","CCNE2","UBR7","POLD3","MSH2","ATAD2",
            "RAD51","RRM2", "CDC45", "CDC6", "EXO1", "TIPIN", "DSCC1", "BLM", "CASP8AP2",
            "USP1","CLSPN","POLA1","CHAF1B","BRIP1","E2F8","HMGB2","CDK1","NUSAP1","UBE2C",
            "BIRC5","TPX2","TOP2A","NDC80","CKS2","NUF2","CKS1B","MKI67","TMPO","CENPF",
            "TACC3","FAM64A","SMC4","CCNB2","CKAP2L","CKAP2","AURKB","BUB1","KIF11",
            "ANP32E","TUBB4B","GTSE1","KIF20B","HJURP","CDCA3","HN1", "CDC20", "TTK",
            "CDC25C", "KIF2C", "RANGAP1", "NCAPD2", "DLGAP5", "CDCA2", "CDCA8", "ECT2", 
            "KIF23", "HMMR", "AURKA", "PSRC1", "ANLN", "LBR", "CKAP5", "CENPE", "CTCF",
            "NEK2","G2E3","GAS2L3","CBX5","CENPA"]
d.var['CC'] = [d.var.index[i] in cc_genes for i in np.arange(d.var.shape[0])]
d = d[:,~d.var.CC.values]

# Remove hemoglobin genes (polymorphic)
hb_genes = ['HBB', 'HBA2', 'HBD', 'HBA1']
d.var['HB'] = [d.var.index[i] in hb_genes for i in np.arange(d.var.shape[0])]
d = d[:,~d.var.HB.values]

# Remove platelet genes
plt_genes = ['PF4', 'PPBP']
d.var['Plt'] = [d.var.index[i] in plt_genes for i in np.arange(d.var.shape[0])]
d = d[:,~d.var.Plt.values]

sc.pp.normalize_total(d, target_sum=1e4) #normalize expr
sc.pp.log1p(d) #logarithmize

# variable gene selection
# (minimum-dispersion cutoff is chosen per cell type; only the 'NK' entry is used here)
min_disp = {'Myeloid':0.51, 'B':0.39, 'NK': 0.37, 'T': 0.31, 'allcells':0.30}
sc.pp.highly_variable_genes(d, min_disp=min_disp[celltype]) #np.sum(d.var.highly_variable)

# Exclude genes whose normalized dispersion exceeds 11 from the variable-gene set
high_dispersion = d.var.dispersions_norm > 11
d.var.loc[high_dispersion, 'highly_variable'] = False

d = d[:, d.var.highly_variable]

sc.pp.scale(d, max_value=10) # Scale each gene to unit variance
sc.tl.pca(d, svd_solver='arpack') # PCA

# Harmonize over batch (theta of 2 is default, if >1 batch variable, thetas should sum to 1)
# Only the first 20 PCs are passed to Harmony, with 'pool' as the batch variable.
# NOTE: celltype is fixed to "NK" in this cell, so the "allcells" branch is not reached.
if celltype == "allcells":
    ho = hm.run_harmony(d.obsm['X_pca'][:,:20], d.obs, ['pool'], max_iter_harmony = 50, theta = 2)
else:
    # sel sigma 0.2 > default of 0.1 --> encourages softer clustering b/c all one major type
    ho = hm.run_harmony(d.obsm['X_pca'][:,:20], d.obs, ['pool'], 
                        nclust = 50, sigma = 0.2, max_iter_harmony = 50, theta = 2)
# Store the transposed corrected embedding (presumably transposed to cells x PCs -- confirm
# Z_corr orientation in harmonypy)
d.obsm['harmpca'] = ho.Z_corr.T

print("graph")
sc.pp.neighbors(d, use_rep = 'harmpca') # neighbor graph on the Harmony-corrected PCs
print("umap")
sc.tl.umap(d) # umap

# Load cell metadata
cell_meta = pd.read_csv(src_folder+"cell_meta.csv", index_col = 0)
cell_meta['batch'] = cell_meta.pool_number.values # 'batch' is a copy of pool_number
d.obs['id'] = d.obs.individual.values # sample identifier used by MultiAnnData below
d.obs['preQC_celltype'] = d.obs['predicted.celltype.l2'].values

# Keep a fixed set of per-cell columns, then join selected cell_meta columns on the obs index
d.obs = d.obs.loc[:,['id', 'i_RawExpr', 'majortype', 'celltype', 'ref_UMAP1', 'ref_UMAP2', 'preQC_celltype']]
d.obs = d.obs.join(cell_meta.loc[:,['nCount_RNA', 'nFeature_RNA', 'pool', 'percent.mt', 'batch',
        'sex', 'age', 'indiv_barcode']])

# make anndata object
d = mad.MultiAnnData(d, sampleid='id')
# Attributes set on the object before calling cna
# (presumably read by cna's association routines -- confirm in cna)
d.use_R2 = False
d.scale_variance = False
d.count_factor = 0

# aggregate sample metadata imported per-cell
d.obs_to_sample(['sex', 'age', 'batch'])
d.samplem['sex_M'] = (d.samplem.sex==1)*1 # From 1 vs 2 to a 0/1 indicator (1 where sex == 1)
d.samplem = d.samplem.drop(columns = ['sex'])

# add other clinical metadata
# (only the six genotype PCs are joined from the filtered sample metadata)
d.samplem = d.samplem.join(meta.loc[:,['gPC1', 'gPC2', 'gPC3', 'gPC4', 'gPC5', 'gPC6']])

# Re-assign each samplem column from a plain Python list (string-cast for the categorical names).
# NOTE(review): none of the `categorical` columns is added to samplem by the code in this cell,
# so the second loop looks like a no-op here -- confirm.
categorical = ['Autoimmune_Disease_Other', 'Ca_Type', 'Eye_DiseaseType', 'Other_Disease', 'Other_Meds']
for attribute in d.samplem.columns:
    if attribute not in categorical: d.samplem[attribute] = d.samplem[attribute].values.tolist()
for attribute in d.samplem.columns:
    if attribute in categorical: d.samplem[attribute] = d.samplem[attribute].values.astype(str).tolist()

# build NAM, compute NAM-PCs corrected for batch and covariates
# (ks is set to the number of samples)
covs = ['age', 'sex_M', 'gPC1', 'gPC2', 'gPC3', 'gPC4', 'gPC5', 'gPC6']
cna.tl.nam(d, batches=d.samplem.batch, covs=d.samplem[covs], ks=[d.samplem.shape[0]])

# save data objects
d.write(res_folder+"gwas_"+celltype+"/"+celltype+"_noAsthma.h5ad")

Trying to set attribute `.var` of view, copying.


Keeping 461 samples with known absence of asthma
Keeping 444 samples with at least 25 cells


Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
  view_to_actual(adata)
  view_to_actual(adata)
2023-11-01 12:03:17,962 - harmonypy - INFO - Iteration 1 of 50
2023-11-01 12:03:46,663 - harmonypy - INFO - Iteration 2 of 50
2023-11-01 12:04:13,892 - harmonypy - INFO - Iteration 3 of 50
2023-11-01 12:04:39,864 - harmonypy - INFO - Iteration 4 of 50
2023-11-01 12:04:53,656 - harmonypy - INFO - Iteration 5 of 50
2023-11-01 12:05:07,058 - harmonypy - INFO - Iteration 6 of 50
2023-11-01 12:05:21,160 - harmonypy - INFO - Iteration 7 of 50
2023-11-01 12:05:37,776 - harmonypy - INFO - Iteration 8 of 50
2023-11-01 12:05:54,420 - harmonypy - INFO - Iteration 9 of 50
2023-11-01 12:06:07,298 - harmonypy - INFO - Iteration 10 of 50
2023-11-01 12:06:20,228 - harmonypy - INFO - Converged after 10 iterations


graph
umap


  exec(code_obj, self.user_global_ns, self.user_ns)


['id' 'majortype' 'celltype' 'preQC_celltype' 'pool' 'indiv_barcode']
consider casting to numeric types where appropriate, and
consider re-coding text-valued columns with pandas.get_dummies


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'pool' as categorical


In [7]:
# Print the number of samples (rows of samplem) stored in the saved noAsthma object
for celltype in ('NK',):
    d = cna.read(f"{res_folder}gwas_{celltype}/{celltype}_noAsthma.h5ad")
    n_samples = d.samplem.shape[0]
    print(n_samples)

444


## Only donors with known metadata (no ovarian cancer cases recorded)

Excludes donors lacking clinical metadata. None with clinical metadata have a record of ovarian cancer.

In [8]:
# Build the NK MultiAnnData object restricted to donors that have documented clinical metadata
# (no disease-based exclusion is applied in this cell); the pipeline otherwise mirrors the
# other cohort cells and the object is written to res_folder/gwas_NK/NK_KnownMeta.h5ad
celltype="NK"
np.random.seed(0)
d = sc.read_h5ad(src_folder+celltype+"_expr.h5ad")

# Remove individuals without documented clinical information
meta = pd.read_csv(src_folder+"sample_meta.csv", index_col = 0)
# Columns with fewer than 400 missing values are dropped, so meta_clin keeps only the
# columns with >=400 missing values; a donor missing ALL of those columns is removed
meta_clin = meta.drop(columns = meta.columns[pd.isna(meta).sum(axis=0)<400], inplace = False) # 32 clinical variables
rm_ids = meta_clin.index[pd.isna(meta_clin).sum(axis=1)==meta_clin.shape[1]]
meta = meta.drop(index=rm_ids)
# Boolean mask over cells: True for cells from a retained donor
keep_ids = meta.index
keep_cells = np.repeat(False, d.obs.shape[0])
for sel_donor in keep_ids:
    keep_cells[np.where(d.obs.individual.values==sel_donor)] = True

d = d[keep_cells,:]
# The only donor filter in this cell is the presence of clinical metadata, so report that
# (the message previously carried over the asthma wording from the noAsthma cell)
print("Keeping "+str(len(np.unique(d.obs.individual)))+" samples with documented clinical metadata")

# Retain only samples with at least 25 cells
# (counted within this cell type's expression object, after the donor filter above)
cellcount = pd.DataFrame(d.obs.individual.value_counts())
cellcount.columns = ['n_cells']
keep_ids = cellcount.index[cellcount.n_cells>=25]
keep_cells = np.repeat(False, d.obs.shape[0])
for sel_donor in keep_ids:
    keep_cells[np.where(d.obs.individual.values==sel_donor)] = True

d = d[keep_cells,:]
print("Keeping "+str(len(np.unique(d.obs.individual)))+" samples with at least 25 cells")

# Remove all HLA- genes (21)
# (any gene whose name contains the substring 'HLA-')
d.var['HLA'] = ['HLA-' in d.var.index[i] for i in np.arange(d.var.shape[0])]
d = d[:,~d.var.HLA.values]

# Remove cell cycle genes
cc_genes = ["MCM5","PCNA","TYMS","FEN1","MCM2","MCM4", "RRM1","UNG","GINS2","MCM6",
            "CDCA7","DTL","PRIM1","UHRF1","MLF1IP","HELLS","RFC2","RPA2","NASP", 
            "RAD51AP1","GMNN","WDR76","SLBP","CCNE2","UBR7","POLD3","MSH2","ATAD2",
            "RAD51","RRM2", "CDC45", "CDC6", "EXO1", "TIPIN", "DSCC1", "BLM", "CASP8AP2",
            "USP1","CLSPN","POLA1","CHAF1B","BRIP1","E2F8","HMGB2","CDK1","NUSAP1","UBE2C",
            "BIRC5","TPX2","TOP2A","NDC80","CKS2","NUF2","CKS1B","MKI67","TMPO","CENPF",
            "TACC3","FAM64A","SMC4","CCNB2","CKAP2L","CKAP2","AURKB","BUB1","KIF11",
            "ANP32E","TUBB4B","GTSE1","KIF20B","HJURP","CDCA3","HN1", "CDC20", "TTK",
            "CDC25C", "KIF2C", "RANGAP1", "NCAPD2", "DLGAP5", "CDCA2", "CDCA8", "ECT2", 
            "KIF23", "HMMR", "AURKA", "PSRC1", "ANLN", "LBR", "CKAP5", "CENPE", "CTCF",
            "NEK2","G2E3","GAS2L3","CBX5","CENPA"]
d.var['CC'] = [d.var.index[i] in cc_genes for i in np.arange(d.var.shape[0])]
d = d[:,~d.var.CC.values]

# Remove hemoglobin genes (polymorphic)
hb_genes = ['HBB', 'HBA2', 'HBD', 'HBA1']
d.var['HB'] = [d.var.index[i] in hb_genes for i in np.arange(d.var.shape[0])]
d = d[:,~d.var.HB.values]

# Remove platelet genes
plt_genes = ['PF4', 'PPBP']
d.var['Plt'] = [d.var.index[i] in plt_genes for i in np.arange(d.var.shape[0])]
d = d[:,~d.var.Plt.values]

sc.pp.normalize_total(d, target_sum=1e4) #normalize expr
sc.pp.log1p(d) #logarithmize

# variable gene selection
# (minimum-dispersion cutoff is chosen per cell type; only the 'NK' entry is used here)
min_disp = {'Myeloid':0.51, 'B':0.39, 'NK': 0.37, 'T': 0.31, 'allcells':0.30}
sc.pp.highly_variable_genes(d, min_disp=min_disp[celltype]) #np.sum(d.var.highly_variable)

# Exclude genes whose normalized dispersion exceeds 11 from the variable-gene set
high_dispersion = d.var.dispersions_norm > 11
d.var.loc[high_dispersion, 'highly_variable'] = False

d = d[:, d.var.highly_variable]

sc.pp.scale(d, max_value=10) # Scale each gene to unit variance
sc.tl.pca(d, svd_solver='arpack') # PCA

# Harmonize over batch (theta of 2 is default, if >1 batch variable, thetas should sum to 1)
# Only the first 20 PCs are passed to Harmony, with 'pool' as the batch variable.
# NOTE: celltype is fixed to "NK" in this cell, so the "allcells" branch is not reached.
if celltype == "allcells":
    ho = hm.run_harmony(d.obsm['X_pca'][:,:20], d.obs, ['pool'], max_iter_harmony = 50, theta = 2)
else:
    # sel sigma 0.2 > default of 0.1 --> encourages softer clustering b/c all one major type
    ho = hm.run_harmony(d.obsm['X_pca'][:,:20], d.obs, ['pool'], 
                        nclust = 50, sigma = 0.2, max_iter_harmony = 50, theta = 2)
# Store the transposed corrected embedding (presumably transposed to cells x PCs -- confirm
# Z_corr orientation in harmonypy)
d.obsm['harmpca'] = ho.Z_corr.T

print("graph")
sc.pp.neighbors(d, use_rep = 'harmpca') # neighbor graph on the Harmony-corrected PCs
print("umap")
sc.tl.umap(d) # umap

# Load cell metadata
cell_meta = pd.read_csv(src_folder+"cell_meta.csv", index_col = 0)
cell_meta['batch'] = cell_meta.pool_number.values # 'batch' is a copy of pool_number
d.obs['id'] = d.obs.individual.values # sample identifier used by MultiAnnData below
d.obs['preQC_celltype'] = d.obs['predicted.celltype.l2'].values

# Keep a fixed set of per-cell columns, then join selected cell_meta columns on the obs index
d.obs = d.obs.loc[:,['id', 'i_RawExpr', 'majortype', 'celltype', 'ref_UMAP1', 'ref_UMAP2', 'preQC_celltype']]
d.obs = d.obs.join(cell_meta.loc[:,['nCount_RNA', 'nFeature_RNA', 'pool', 'percent.mt', 'batch',
        'sex', 'age', 'indiv_barcode']])

# make anndata object
d = mad.MultiAnnData(d, sampleid='id')
# Attributes set on the object before calling cna
# (presumably read by cna's association routines -- confirm in cna)
d.use_R2 = False
d.scale_variance = False
d.count_factor = 0

# aggregate sample metadata imported per-cell
d.obs_to_sample(['sex', 'age', 'batch'])
d.samplem['sex_M'] = (d.samplem.sex==1)*1 # From 1 vs 2 to a 0/1 indicator (1 where sex == 1)
d.samplem = d.samplem.drop(columns = ['sex'])

# add other clinical metadata
# (only the six genotype PCs are joined from the filtered sample metadata)
d.samplem = d.samplem.join(meta.loc[:,['gPC1', 'gPC2', 'gPC3', 'gPC4', 'gPC5', 'gPC6']])

# Re-assign each samplem column from a plain Python list (string-cast for the categorical names).
# NOTE(review): none of the `categorical` columns is added to samplem by the code in this cell,
# so the second loop looks like a no-op here -- confirm.
categorical = ['Autoimmune_Disease_Other', 'Ca_Type', 'Eye_DiseaseType', 'Other_Disease', 'Other_Meds']
for attribute in d.samplem.columns:
    if attribute not in categorical: d.samplem[attribute] = d.samplem[attribute].values.tolist()
for attribute in d.samplem.columns:
    if attribute in categorical: d.samplem[attribute] = d.samplem[attribute].values.astype(str).tolist()

# build NAM, compute NAM-PCs corrected for batch and covariates
# (ks is set to the number of samples)
covs = ['age', 'sex_M', 'gPC1', 'gPC2', 'gPC3', 'gPC4', 'gPC5', 'gPC6']
cna.tl.nam(d, batches=d.samplem.batch, covs=d.samplem[covs], ks=[d.samplem.shape[0]])

# save data objects
d.write(res_folder+"gwas_"+celltype+"/"+celltype+"_KnownMeta.h5ad")

Keeping 534 samples with known absence of asthma
Keeping 513 samples with at least 25 cells


Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
Trying to set attribute `.var` of view, copying.
  view_to_actual(adata)
  view_to_actual(adata)
2023-11-01 12:08:32,952 - harmonypy - INFO - Iteration 1 of 50
2023-11-01 12:09:06,048 - harmonypy - INFO - Iteration 2 of 50
2023-11-01 12:09:40,176 - harmonypy - INFO - Iteration 3 of 50
2023-11-01 12:10:14,974 - harmonypy - INFO - Iteration 4 of 50
2023-11-01 12:10:49,287 - harmonypy - INFO - Iteration 5 of 50
2023-11-01 12:11:09,067 - harmonypy - INFO - Iteration 6 of 50
2023-11-01 12:11:25,050 - harmonypy - INFO - Iteration 7 of 50
2023-11-01 12:11:40,220 - harmonypy - INFO - Iteration 8 of 50
2023-11-01 12:11:54,581 - harmonypy - INFO - Iteration 9 of 50
2023-11-01 12:12:10,774 - harmonypy - INFO - Iteration 10 of 50
2023-11-01 12:12:26,924 - harmonypy - INFO - Converged after 10 iterations


graph
umap


  exec(code_obj, self.user_global_ns, self.user_ns)


['id' 'majortype' 'celltype' 'preQC_celltype' 'pool' 'indiv_barcode']
consider casting to numeric types where appropriate, and
consider re-coding text-valued columns with pandas.get_dummies


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'pool' as categorical


In [9]:
# Print the number of samples (rows of samplem) stored in the saved KnownMeta object
for celltype in ('NK',):
    d = cna.read(f"{res_folder}gwas_{celltype}/{celltype}_KnownMeta.h5ad")
    n_samples = d.samplem.shape[0]
    print(n_samples)

513


## Downsampled datasets for power simulations

In [None]:
# For every (fraction, cell type) pair, build a MultiAnnData object from a random subset of
# donors (fraction frac_keep of the donors with >=25 cells), run the same preprocessing as the
# cohort cells (UMAP is not computed here), compute the NAM with cna, and write the object to
# the nonnull_sims/downsampled/ folder.
# NOTE(review): cell_meta.csv and sample_meta.csv are re-read on every inner iteration although
# neither file depends on frac_keep or celltype; the reads could be hoisted above the loops.
for frac_keep in [0.8, 0.6, 0.4, 0.2]:
    print(frac_keep)
    for celltype in ["allcells", "NK", "B", "T", "Myeloid"]:
        print(celltype)
        # Re-seed before each object so the donor draw below is reproducible per (fraction, cell type)
        np.random.seed(0)
        d = sc.read_h5ad(src_folder+celltype+"_expr.h5ad")

        # Retain only samples with at least 25 cells
        cellcount = pd.DataFrame(d.obs.individual.value_counts())
        cellcount.columns = ['n_cells']
        keep_ids = cellcount.index[cellcount.n_cells>=25]
        keep_cells = np.repeat(False, d.obs.shape[0])
        for sel_donor in keep_ids:
            keep_cells[np.where(d.obs.individual.values==sel_donor)] = True
        d = d[keep_cells,:]
        print("Keeping "+str(len(np.unique(d.obs.individual)))+" samples with at least 25 cells")

        # Downsample dataset
        # (draw N_keep donors without replacement; int() truncates frac_keep * n_donors)
        all_ids = np.unique(d.obs.individual)
        N_keep = int(frac_keep*len(all_ids))
        keep_ids = all_ids[np.random.choice(np.arange(len(all_ids)), N_keep, replace = False)]
        keep_cells = np.repeat(False, d.obs.shape[0])
        for sel_donor in keep_ids:
            keep_cells[np.where(d.obs.individual.values==sel_donor)] = True
        d = d[keep_cells,:]
        print("Keeping "+str(len(np.unique(d.obs.individual)))+" samples after downsampling")

        # Remove all HLA- genes (21)
        # (any gene whose name contains the substring 'HLA-')
        d.var['HLA'] = ['HLA-' in d.var.index[i] for i in np.arange(d.var.shape[0])]
        d = d[:,~d.var.HLA.values]

        # Remove cell cycle genes
        cc_genes = ["MCM5","PCNA","TYMS","FEN1","MCM2","MCM4", "RRM1","UNG","GINS2","MCM6",
                    "CDCA7","DTL","PRIM1","UHRF1","MLF1IP","HELLS","RFC2","RPA2","NASP", 
                    "RAD51AP1","GMNN","WDR76","SLBP","CCNE2","UBR7","POLD3","MSH2","ATAD2",
                    "RAD51","RRM2", "CDC45", "CDC6", "EXO1", "TIPIN", "DSCC1", "BLM", "CASP8AP2",
                    "USP1","CLSPN","POLA1","CHAF1B","BRIP1","E2F8","HMGB2","CDK1","NUSAP1","UBE2C",
                    "BIRC5","TPX2","TOP2A","NDC80","CKS2","NUF2","CKS1B","MKI67","TMPO","CENPF",
                    "TACC3","FAM64A","SMC4","CCNB2","CKAP2L","CKAP2","AURKB","BUB1","KIF11",
                    "ANP32E","TUBB4B","GTSE1","KIF20B","HJURP","CDCA3","HN1", "CDC20", "TTK",
                    "CDC25C", "KIF2C", "RANGAP1", "NCAPD2", "DLGAP5", "CDCA2", "CDCA8", "ECT2", 
                    "KIF23", "HMMR", "AURKA", "PSRC1", "ANLN", "LBR", "CKAP5", "CENPE", "CTCF",
                    "NEK2","G2E3","GAS2L3","CBX5","CENPA"]
        d.var['CC'] = [d.var.index[i] in cc_genes for i in np.arange(d.var.shape[0])]
        d = d[:,~d.var.CC.values]

        # Remove hemoglobin genes (polymorphic)
        hb_genes = ['HBB', 'HBA2', 'HBD', 'HBA1']
        d.var['HB'] = [d.var.index[i] in hb_genes for i in np.arange(d.var.shape[0])]
        d = d[:,~d.var.HB.values]

        # Remove platelet genes
        plt_genes = ['PF4', 'PPBP']
        d.var['Plt'] = [d.var.index[i] in plt_genes for i in np.arange(d.var.shape[0])]
        d = d[:,~d.var.Plt.values]

        sc.pp.normalize_total(d, target_sum=1e4) #normalize expr
        sc.pp.log1p(d) #logarithmize

        # variable gene selection
        # (minimum-dispersion cutoff is looked up per cell type)
        min_disp = {'Myeloid':0.51, 'B':0.39, 'NK': 0.37, 'T': 0.31, 'allcells':0.30}
        sc.pp.highly_variable_genes(d, min_disp=min_disp[celltype]) #np.sum(d.var.highly_variable)

        # Exclude genes whose normalized dispersion exceeds 11 from the variable-gene set
        high_dispersion = d.var.dispersions_norm > 11
        d.var.loc[high_dispersion, 'highly_variable'] = False

        d = d[:, d.var.highly_variable]

        sc.pp.scale(d, max_value=10) # Scale each gene to unit variance
        sc.tl.pca(d, svd_solver='arpack') # PCA

        # Harmonize over batch (theta of 2 is default, if >1 batch variable, thetas should sum to 1)
        # Only the first 20 PCs are passed to Harmony, with 'pool' as the batch variable;
        # "allcells" uses Harmony's default nclust/sigma, single cell types use nclust=50, sigma=0.2
        if celltype == "allcells":
            ho = hm.run_harmony(d.obsm['X_pca'][:,:20], d.obs, ['pool'], max_iter_harmony = 50, theta = 2)
        else:
            # sel sigma 0.2 > default of 0.1 --> encourages softer clustering b/c all one major type
            ho = hm.run_harmony(d.obsm['X_pca'][:,:20], d.obs, ['pool'], 
                                nclust = 50, sigma = 0.2, max_iter_harmony = 50, theta = 2)
        # Store the transposed corrected embedding (presumably transposed to cells x PCs -- confirm
        # Z_corr orientation in harmonypy)
        d.obsm['harmpca'] = ho.Z_corr.T

        print("graph")
        sc.pp.neighbors(d, use_rep = 'harmpca') # neighbor graph on the Harmony-corrected PCs
        # UMAP computation is disabled for the downsampled objects
        #print("umap")
        #sc.tl.umap(d) # umap    

        # Load cell metadata
        cell_meta = pd.read_csv(src_folder+"cell_meta.csv", index_col = 0)
        cell_meta['batch'] = cell_meta.pool_number.values # 'batch' is a copy of pool_number
        d.obs['id'] = d.obs.individual.values # sample identifier used by MultiAnnData below
        d.obs['preQC_celltype'] = d.obs['predicted.celltype.l2'].values

        # Keep a fixed set of per-cell columns, then join selected cell_meta columns on the obs index
        d.obs = d.obs.loc[:,['id', 'i_RawExpr', 'majortype', 'celltype', 'ref_UMAP1', 'ref_UMAP2', 'preQC_celltype']]
        d.obs = d.obs.join(cell_meta.loc[:,['nCount_RNA', 'nFeature_RNA', 'pool', 'percent.mt', 'batch',
                'sex', 'age', 'indiv_barcode']])

        # make anndata object
        d = mad.MultiAnnData(d, sampleid='id')
        # Attributes set on the object before calling cna
        # (presumably read by cna's association routines -- confirm in cna)
        d.use_R2 = False
        d.scale_variance = False
        d.count_factor = 0

        # aggregate sample metadata imported per-cell
        d.obs_to_sample(['sex', 'age', 'batch'])
        d.samplem['sex_M'] = (d.samplem.sex==1)*1 # From 1 vs 2 to a 0/1 indicator (1 where sex == 1)
        d.samplem = d.samplem.drop(columns = ['sex'])

        # add other clinical metadata
        # (no clinical filtering in this cell; only the six genotype PCs are joined)
        meta = pd.read_csv(src_folder+"sample_meta.csv", index_col = 0)
        d.samplem = d.samplem.join(meta.loc[:,['gPC1', 'gPC2', 'gPC3', 'gPC4', 'gPC5', 'gPC6']])

        # Re-assign each samplem column from a plain Python list (string-cast for the categorical names).
        # NOTE(review): none of the `categorical` columns is added to samplem by the code in this cell,
        # so the second loop looks like a no-op here -- confirm.
        categorical = ['Autoimmune_Disease_Other', 'Ca_Type', 'Eye_DiseaseType', 'Other_Disease', 'Other_Meds']
        for attribute in d.samplem.columns:
            if attribute not in categorical: d.samplem[attribute] = d.samplem[attribute].values.tolist()
        for attribute in d.samplem.columns:
            if attribute in categorical: d.samplem[attribute] = d.samplem[attribute].values.astype(str).tolist()

        # build NAM, compute NAM-PCs corrected for batch and covariates
        # (ks is set to the number of samples)
        covs = ['age', 'sex_M', 'gPC1', 'gPC2', 'gPC3', 'gPC4', 'gPC5', 'gPC6']
        cna.tl.nam(d, batches=d.samplem.batch, covs=d.samplem[covs], ks=[d.samplem.shape[0]])

        # save data objects
        # (the fraction is encoded in the file name with "." replaced by "pt", e.g. 0.8 -> "0pt8")
        d.write("/data/srlab/lrumker/MCSC_Project/cna-qtl/nonnull_sims/downsampled/"+\
                celltype+"_"+"pt".join(str(frac_keep).split("."))+".h5ad")