## **To recreate study results please load package versions found in main_requirements.txt**

In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
import math
import seaborn as sns
import os
# %config IPCompleter.greedy=True
%load_ext autoreload
%autoreload 2

sc.settings.verbosity = 0
sc.logging.print_header()
sns.set_context("paper")

  from .autonotebook import tqdm as notebook_tqdm


scanpy==1.9.2 anndata==0.8.0 umap==0.5.3 numpy==1.22.3 scipy==1.8.1 pandas==1.5.3 scikit-learn==1.2.1 statsmodels==0.13.5 pynndescent==0.5.8


In [2]:
# import local module containing misc code, helps keep notebooks clean from commonly used functions
import new_misc_code as nmc

## **Load raw UMI count data**

In [3]:
# read in count matrices post nuclei and gene quality control
# input_h5ad = "data/2023-05-16_input/seurat.excitatory_select.h5ad"
# input_h5ad = "data/2023-05-16_input/seurat.inhibitory_select.h5ad"
input_h5ad = "data/2023-05-16_input/seurat.merged.h5ad"
adata = sc.read(input_h5ad)

In [34]:
adata.var

Unnamed: 0,features
Xkr4,Xkr4
Gm1992,Gm1992
Gm19938,Gm19938
Gm37381,Gm37381
Rp1,Rp1
...,...
Gm28187,Gm28187
Gm14617,Gm14617
Ghrhr,Ghrhr
Irx6,Irx6


In [4]:
# adata.obs_names.values
# scaled and variable genes
adata
# all genes
adata.raw.n_obs
adata.raw.n_vars

25440

In [8]:
sc.pp.calculate_qc_metrics( adata, inplace=True)

In [9]:
sc.pl.umap( adata, color=['age'], legend_fontsize=5, add_outline=True, size=2, legend_fontoutline=0.5)

KeyError: "Could not find 'umap' or 'X_umap' in .obsm"

## **Create batches**

In [5]:
# batches should be my.cell.type_sample
# was cluster
l_cell_type = list(np.unique(adata.obs['my.cell.type'].values))
# was age
l_sample = list(np.unique(adata.obs['sample'].values))
batches = []
 
for i in range(len(l_cell_type)):
    for j in range(len(l_sample)):
        batches.append(f"{l_cell_type[i]}--{l_sample[j]}")
batches
# len(batches)

['e-A1--e16f',
 'e-A1--e16m_apr23',
 'e-A1--e18f',
 'e-A1--e18f_may22',
 'e-A1--e18m',
 'e-A1--e18m_may22',
 'e-A1--p0f',
 'e-A1--p0f2',
 'e-A1--p0f_mar23',
 'e-A1--p0m_april',
 'e-A1--p0m_july',
 'e-A1--p10f',
 'e-A1--p10f_mar23',
 'e-A1--p10m',
 'e-A1--p10m_apr23',
 'e-A1--p18f',
 'e-A1--p18f_oct',
 'e-A1--p18m',
 'e-A1--p18m_oct',
 'e-A1--p28f',
 'e-A1--p28f_may22',
 'e-A1--p28m',
 'e-A1--p28m_may22',
 'e-A1--p4f',
 'e-A1--p4f_oct',
 'e-A1--p4m',
 'e-A1--p4m_mar23',
 'e-A1--p65f_apr23',
 'e-A1--p65f_april',
 'e-A1--p65m_april',
 'e-A1--p65m_aug',
 'e-A2--e16f',
 'e-A2--e16m_apr23',
 'e-A2--e18f',
 'e-A2--e18f_may22',
 'e-A2--e18m',
 'e-A2--e18m_may22',
 'e-A2--p0f',
 'e-A2--p0f2',
 'e-A2--p0f_mar23',
 'e-A2--p0m_april',
 'e-A2--p0m_july',
 'e-A2--p10f',
 'e-A2--p10f_mar23',
 'e-A2--p10m',
 'e-A2--p10m_apr23',
 'e-A2--p18f',
 'e-A2--p18f_oct',
 'e-A2--p18m',
 'e-A2--p18m_oct',
 'e-A2--p28f',
 'e-A2--p28f_may22',
 'e-A2--p28m',
 'e-A2--p28m_may22',
 'e-A2--p4f',
 'e-A2--p4f_oct',
 'e-


## **Bulk by cell.type - sample**

In [37]:
#for batch_itr in batches:
# ind = batches[0]
# ind
#batch_mk = adata.obs['sample'].values==batch_itr
#num_cells = batch_mk.sum()
#batch_adata = adata[batch_mk]
#bulk.loc[ind] = batch_adata.raw.X.sum(0).A1
#bulk

In [258]:
# obs_col_mk = (adata.obs.nunique()<=len(batches)).values
# obs_cols = np.array(adata.obs_keys())[obs_col_mk]
# obs_col_mk
# ind = batches[6]
# print(ind)

# batch_mk = adata.obs['age'].values==ind
# num_cells = batch_mk.sum()
# num_cells

# bulk.drop( ind, axis=0, inplace=True)
# obs.drop(  ind, axis=0, inplace=True)

#batch_adata = adata[batch_mk]
#bulk.loc[ind] = batch_adata.raw.X.sum(0).A1
#obs.loc[ind,'Num_Cells'] = num_cells
#obs.loc[ind, obs_cols] = batch_adata.obs.iloc[0, obs_col_mk]


8


In [6]:
# set minimum number of nuclei needed to make a bulk data set
min_cells = 10
for cell_type in l_cell_type:
    cell_batches = [x for x in batches if x.startswith(f"{cell_type}--")]
    # create dataframe to hold bulked data
    # when normalized is present
    # columns = list(adata.raw.var.index)
    columns = list(adata.var.index)
    bulk = pd.DataFrame(index = cell_batches, columns = columns)

    obs_col_mk = (adata.obs.nunique()<=len(cell_batches)).values
    obs_cols = np.array(adata.obs_keys())[obs_col_mk]
    obs = pd.DataFrame(index=cell_batches, columns=obs_cols)

    obs['Num_Cells'] = 0
    # loop through batches
    for s_batch in cell_batches:
        [s_cell_type, s_sample] = [s for s in s_batch.split("--")]
        batch_c = adata[adata.obs['my.cell.type'] == s_cell_type].copy()
        batch_ac = batch_c[batch_c.obs['sample'] == s_sample].copy()
        num_cells = batch_ac.n_obs
        # print(f"{s_batch} : {num_cells}")
        if(num_cells < 10):
            bulk.drop(s_batch, axis=0, inplace=True)
            obs.drop(s_batch, axis=0, inplace=True)
            continue
        else:
            batch_adata = batch_ac
            # when input is normalised with SCT assay by default, use raw
            # bulk.loc[s_batch] = batch_adata.raw.X.sum(0).A1
            # for the merged dataset use X
            bulk.loc[s_batch] = batch_adata.X.sum(0).A1
            obs.loc[s_batch,'Num_Cells'] = num_cells
            obs.loc[s_batch, obs_cols] = batch_adata.obs.iloc[0, obs_col_mk]
    # save files
    bulk.T.to_csv(f"./data/limma_voom/{cell_type}_pseudo-bulk-cts_min{min_cells}.csv")
    obs.to_csv(f"./data/limma_voom/{cell_type}_obs-cts_min{min_cells}.csv")

In [55]:
# example output, un-transposed
# obs

Unnamed: 0,orig.ident,sample,Phase,mitoFr,riboFr,PLGFr,SScoreFr,G2MScoreFr,age,sex,...,brain.region,my.cell.type,merge.dj.cell.type,merge.fnl.cell.type,merge.my.fnl.type,prediction.confidence,fnl.cell.type,n_genes_by_counts,log1p_n_genes_by_counts,Num_Cells
e-P3--e16f,e16f,e16f,G2M,Medium,High,High,Low,High,0,female,...,PVN,e-P3,,,,loose,Crh/Trh,2000,7.601402,12
e-P3--e16m_apr23,e16m,e16m_apr23,G2M,Low,High,High,Medium high,High,0,male,...,PVN,e-P3,E-13_e2:Tac1/Fezf1,,,loose,Crh/Trh,2000,7.601402,12
e-P3--e18f,e18f,e18f,S,Medium high,Medium high,High,High,High,1,female,...,PVN,e-P3,,,,confident,Crh/Trh,2000,7.601402,40
e-P3--e18m,e18m,e18m,S,Medium high,High,High,Medium high,Medium,1,male,...,PVN,e-P3,e4:Trh/Angpt1,Trh,e-P4 (Trh),confident,Crh/Trh,2000,7.601402,34
e-P3--p0f,p0f,p0f,G1,Low,High,High,Medium,Low,2,female,...,PVN,e-P3,e4:Trh/Angpt1,Trh,e-P4 (Trh),confident,Crh/Trh,2000,7.601402,46
e-P3--p0f2,p0f2,p0f2,S,Medium high,High,Low,High,Medium high,2,female,...,PVN,e-P3,e20:Crh,Crh,e-P3 (Crh),confident,Crh/Trh,2000,7.601402,38
e-P3--p0m_july,p0m,p0m_july,S,Low,Medium,Medium high,High,Medium,2,male,...,PVN,e-P3,e20:Crh,Crh,e-P3 (Crh),confident,Crh/Trh,2000,7.601402,37
e-P3--p10f,p10f,p10f,G1,High,High,High,Low,Medium,4,female,...,PVN,e-P3,e20:Crh,Crh,e-P3 (Crh),confident,Crh/Trh,2000,7.601402,12
e-P3--p10f_mar23,p10f,p10f_mar23,G1,Medium,High,High,Medium,Low,4,female,...,PVN,e-P3,e20:Crh,Crh,e-P3 (Crh),confident,Crh/Trh,2000,7.601402,23
e-P3--p10m,p10m,p10m,S,High,High,Medium high,Medium high,Medium high,4,male,...,PVN,e-P3,e20:Crh,Crh,e-P3 (Crh),confident,Crh/Trh,2000,7.601402,83


In [14]:
# example output, un-transposed
bulk.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25430,25431,25432,25433,25434,25435,25436,25437,25438,25439
i-X9--e16f,322.0,11.0,11.0,0.0,0.0,0.0,0.0,6.0,3.0,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
i-X9--e16m_apr23,500.0,23.0,12.0,0.0,0.0,0.0,0.0,4.0,4.0,29.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
i-X9--e18f,208.0,8.0,7.0,0.0,0.0,0.0,0.0,3.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
i-X9--e18f_may22,99.0,5.0,2.0,0.0,0.0,0.0,0.0,4.0,2.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
i-X9--e18m,299.0,5.0,7.0,1.0,0.0,0.0,0.0,5.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
