In [1]:
# Enable the autoreload extension in mode 2 so edited modules are re-imported
# automatically before each cell runs.
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
# Draw matplotlib figures inline in the notebook, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [2]:

import numpy as np
import scanpy as sc
import pandas as pd
import anndata as ad
import seaborn as sns
import scrublet as scr



In [3]:
# Scanpy logging level: 0 = errors, 1 = warnings, 2 = info, 3 = hints.
sc.settings.verbosity = 3

# Default figure appearance for every scanpy plot in this notebook.
sc.settings.set_figure_params(facecolor='white', dpi=80)

In [4]:
# Discrete colormap of eight colors, listed as light/dark pairs of the same hue.
mouse_colors = plt.cm.colors.ListedColormap([
    'red', 'darkred',
    'blue', 'darkblue',
    'orange', 'darkorange',
    'violet', 'darkviolet',
])

# 10x HDF5 count matrices to load; the file names are split on '_' by the
# loading cell to recover area and age.
samples = [
    "GSM6321073_PFC_4wk_1_matrix.h5",
    "GSM6321077_PFC_90wk_1_matrix.h5",
]

# Sample index (position in `samples`) -> mouse identifier.
mouse_id = {0: 1, 1: 1}

In [5]:
# Read every sample, annotate its cells with metadata parsed from the file name,
# compute per-cell QC metrics, and collect the resulting AnnData objects.
all_adata = []
for sample_idx, sample_file in enumerate(samples):
    # The file name splits on '_' into five fields; the last one is discarded.
    sample_label, area, age, sample_number, _ = sample_file.split("_")
    print(sample_label, area, age, sample_number)

    sample_adata = sc.read_10x_h5(f"/Users/cmdb/qb25project/mouse-brain-RNAseq/GSE207848_RAW/{sample_file}")
    sample_adata.var_names_make_unique()

    # Sample-level annotations, stored on every cell of this sample.
    sample_adata.obs['area'] = area
    sample_adata.obs['age'] = age
    sample_adata.obs['idx'] = sample_idx

    # Flag genes whose name starts with 'mt-' (mitochondrial) so the QC
    # metrics include their share of each cell's counts.
    sample_adata.var['mt'] = sample_adata.var_names.str.startswith('mt-')
    sc.pp.calculate_qc_metrics(
        sample_adata,
        qc_vars=['mt'],
        percent_top=None,
        log1p=False,
        inplace=True,
    )

    all_adata.append(sample_adata)


GSM6321073 PFC 4wk 1
reading /Users/cmdb/qb25project/mouse-brain-RNAseq/GSE207848_RAW/GSM6321073_PFC_4wk_1_matrix.h5


  utils.warn_names_duplicates("var")


 (0:00:01)


  utils.warn_names_duplicates("var")


GSM6321077 PFC 90wk 1
reading /Users/cmdb/qb25project/mouse-brain-RNAseq/GSE207848_RAW/GSM6321077_PFC_90wk_1_matrix.h5


  utils.warn_names_duplicates("var")


 (0:00:01)


  utils.warn_names_duplicates("var")


In [6]:
# Count the cells across all loaded samples before any filtering.
cells_per_sample = [sample.n_obs for sample in all_adata]
total_cells = np.sum(cells_per_sample)
print('total cells:', total_cells)

total cells: 28818


In [7]:
# Stack all samples along the cell axis, keeping only variables present in
# every sample (inner join, which is also the default).
adata = ad.concat(all_adata, join="inner")

  utils.warn_names_duplicates("obs")


In [8]:
# Preview the subset of cells annotated as prefrontal cortex ('PFC').
is_pfc = adata.obs['area'] == 'PFC'
adata[is_pfc]

View of AnnData object with n_obs × n_vars = 28818 × 32285
    obs: 'area', 'age', 'idx', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt'

In [9]:
# Cell names repeat across the concatenated samples; de-duplicate them by
# appending '-<n>' suffixes to the repeats.
adata.obs_names_make_unique(join='-')

In [10]:
# QC thresholds. The filters run in this exact order because the gene filter
# depends on which cells survived the steps before it.
MIN_GENES_PER_CELL = 1000
MAX_COUNTS_PER_CELL = 100000
MIN_CELLS_PER_GENE = 3
MIN_COUNTS_PER_CELL = 2500

sc.pp.filter_cells(adata, min_genes=MIN_GENES_PER_CELL)    # drop cells with too few genes
sc.pp.filter_cells(adata, max_counts=MAX_COUNTS_PER_CELL)  # drop cells with too many counts
sc.pp.filter_genes(adata, min_cells=MIN_CELLS_PER_GENE)    # drop genes seen in too few cells
sc.pp.filter_cells(adata, min_counts=MIN_COUNTS_PER_CELL)  # drop cells with too few counts

filtered out 1766 cells that have less than 1000 genes expressed
filtered out 24 cells that have more than 100000 counts
filtered out 6412 genes that are detected in less than 3 cells
filtered out 1369 cells that have less than 2500 counts


In [11]:
# Translate each cell's sample index (obs['idx']) into its mouse identifier.
sample_index_per_cell = adata.obs['idx']
adata.obs['mouse_id'] = [mouse_id[sample_idx] for sample_idx in sample_index_per_cell]

In [12]:
# Score every remaining cell for doublet likelihood with Scrublet. adata.X has
# not been normalized at this point, and the expected doublet rate is set to 9%.
scrub = scr.Scrublet(adata.X, expected_doublet_rate=0.09)
scrub_result = scrub.scrub_doublets(min_gene_variability_pctl=85, n_prin_comps=30)

# scrub_doublets returns (per-cell doublet score, per-cell doublet call).
doublet_scores, predicted_doublets = scrub_result

Preprocessing...
Simulating doublets...
Embedding transcriptomes using PCA...
Calculating doublet scores...
Automatically set threshold at doublet score = 0.20
Detected doublet rate = 9.2%
Estimated detectable doublet fraction = 66.2%
Overall doublet rate:
	Expected   = 9.0%
	Estimated  = 13.9%
Elapsed time: 22.9 seconds


In [13]:
# Fraction of scored cells that Scrublet called as doublets.
predicted_doublets.sum() / len(doublet_scores)

np.float64(0.09174168907595776)

In [14]:
# Keep only the cells Scrublet did not flag as doublets. Boolean indexing on an
# AnnData returns a view of the parent object; .copy() turns it into an
# independent AnnData so the in-place steps that follow (writing to disk,
# normalize_total, log1p) do not operate on a view and trigger
# implicit-modification warnings.
adata = adata[~predicted_doublets, :].copy()

In [15]:
# Checkpoint: QC-filtered, doublet-free counts, saved before normalization.
adata.write_h5ad("adata_combined_nodoublet.h5ad")

  df[key] = c
  df[key] = c


In [16]:
# Median of the per-cell `n_genes_by_counts` QC metric over the remaining cells.
genes_per_cell = adata.obs['n_genes_by_counts']
print(np.median(genes_per_cell))

3332.0


In [17]:
# Median of the per-cell `total_counts` QC metric over the remaining cells.
counts_per_cell = adata.obs['total_counts']
print(np.median(counts_per_cell))

9699.0


In [None]:
#adata = adata[adata.obs.n_genes_by_counts < 3000, :]
#adata = adata[adata.obs.pct_counts_mt < 5, :]

In [18]:
# Scale every cell to 10,000 total counts, then apply a log1p transform.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Flag highly variable genes using mean-expression and dispersion cutoffs;
# the results are stored in adata.var.
hvg_cutoffs = dict(min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pp.highly_variable_genes(adata, **hvg_cutoffs)

normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:00)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)


In [19]:
# Checkpoint: normalized, log-transformed data with highly-variable-gene flags.
adata.write_h5ad("adata_combined_nodoublet_normalized.h5ad")

In [20]:
import anndata as ad
# Reload the doublet-filtered checkpoint (written before normalization) so the
# export below can be run from the saved file.
adata = ad.read_h5ad('adata_combined_nodoublet.h5ad')

# Write a two-column TSV mapping each cell name to its age label.
# Names and ages are walked together with zip() instead of being looked up as
# adata.obs["age"][i]: obs is indexed by cell-name strings, so an integer key
# there only works through the deprecated positional fallback of
# Series.__getitem__ and emits a warning for it.
with open('age.tsv', "w") as f:
    f.write("idx" + "\t" + "age" + "\n")
    for cell_name, cell_age in zip(adata.obs_names, adata.obs['age']):
        f.write(f"{cell_name}\t{cell_age}\n")




  f.write(adata.obs_names[i] + "\t" + adata.obs["age"][i] + "\n")
