* ref: https://www.jianshu.com/p/24e1b2e823d2

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc

import matplotlib.pyplot as pl
from matplotlib import rcParams

import os


# Working directory for this analysis. After the chdir below, relative paths
# (such as the .h5ad files written later in this notebook) resolve against it.
wr_dir = '/Users/jplab/Desktop/DAILY_CODE_DATA/2022-5/data/5-7_singlecell_scanpy_demo'
os.chdir(wr_dir)


# verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.verbosity = 3
# Log the versions of scanpy and its dependencies for reproducibility.
sc.logging.print_versions()

In [None]:
# Path to the 10x Genomics filtered gene-barcode matrix (HDF5). The file name
# indicates sample GSM3489183 (IPF_01) from GEO series GSE122960.
filtered_matrix_h5 = "/Users/jplab/Downloads/GSE122960_RAW/GSM3489183_IPF_01_filtered_gene_bc_matrices_h5.h5" 
# Read the GRCh38 matrix; gex_only=True requests gene-expression features only.
adata = sc.read_10x_h5(filtered_matrix_h5, genome='GRCh38',gex_only=True)
# Gene symbols can be duplicated in the input; make var_names unique.
adata.var_names_make_unique()

# Display the AnnData summary as the cell output.
adata

In [None]:
# Plot the 100 most highly expressed genes (n_top=100) before any filtering.
sc.pl.highest_expr_genes(adata, n_top=100)

In [None]:
# Basic quality filtering, applied in place on `adata`.
sc.pp.filter_cells(adata, min_genes=200) # drop cells expressing fewer than 200 genes
sc.pp.filter_genes(adata, min_cells=3) # drop genes expressed in fewer than 3 cells
# Display the AnnData summary after filtering.
adata

In [None]:
# Boolean mask over genes: True for symbols starting with 'MT-'
# (mitochondrial genes in human gene nomenclature).
mito_genes = adata.var_names.str.startswith('MT-')
# Debug output: summary of the mitochondrial-gene slice of `adata`.
print('aaa',adata[:,mito_genes])

# Per-cell totals, computed once and reused for both annotations below.
# `X.sum(axis=1)` returns an (n_cells, 1) np.matrix for scipy sparse matrices
# and a 1-D array for dense X; np.asarray(...).ravel() flattens either to a
# plain 1-D ndarray, so this does not depend on the matrix-only `.A1` attribute.
total_counts = np.asarray(adata.X.sum(axis=1)).ravel()
mito_counts = np.asarray(adata[:, mito_genes].X.sum(axis=1)).ravel()

# for each cell compute fraction of counts in mito genes vs. all genes
adata.obs['percent_mito'] = mito_counts / total_counts
# add the total counts per cell as observations-annotation to adata
adata.obs['n_counts'] = total_counts

# Display the AnnData summary with the new obs columns.
adata

In [None]:
# Violin plots of the three per-cell QC metrics stored in adata.obs, one panel
# per metric (multi_panel=True), with jittered points overlaid.
# NOTE(review): 'n_genes' is presumably the column added by the earlier
# sc.pp.filter_cells(min_genes=...) call -- confirm.
sc.pl.violin(
    adata, 
    ['n_genes', 'n_counts', 'percent_mito'],
    jitter=0.4, multi_panel=True)

In [None]:
# Scatter each QC metric against the total counts per cell; these plots are
# used to pick the thresholds applied in the next filtering step.
for qc_metric in ('percent_mito', 'n_genes'):
    sc.pl.scatter(adata, x='n_counts', y=qc_metric)

In [None]:
# Keep only cells that pass both QC thresholds: fewer than 4000 detected genes
# and a mitochondrial count fraction below 0.3.
keep_cells = (adata.obs['n_genes'] < 4000) & (adata.obs['percent_mito'] < 0.3)
# Slicing an AnnData returns a view of the parent object. The following steps
# modify `adata` in place, so materialize an independent copy here instead of
# relying on implicit copy-on-write of a (chained) view.
adata = adata[keep_cells, :].copy()
# Display the AnnData summary after QC filtering.
adata

In [None]:
# Scale every cell to a total of 10,000 counts. `normalize_total` is the
# supported replacement for the deprecated `sc.pp.normalize_per_cell`.
# The per-cell totals needed later are already stored in adata.obs['n_counts'].
sc.pp.normalize_total(adata, target_sum=1e4)
# Natural-log transform: X = log(X + 1).
sc.pp.log1p(adata)

# Snapshot the normalized, log-transformed matrix (all genes) in `.raw` before
# the gene set is reduced, so later plots and tests can still use every gene.
adata.raw = adata
# Display the AnnData summary.
adata

In [None]:
# Flag highly variable genes using mean-expression and dispersion cutoffs;
# the result is stored in adata.var['highly_variable'].
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(adata)

# Restrict to the highly variable genes. Column slicing returns a view of the
# parent AnnData; the next steps (regress_out, scale) rewrite X in place, so
# take an explicit copy rather than modifying a view.
adata = adata[:, adata.var['highly_variable']].copy()
# Display the AnnData summary after gene selection.
adata

In [None]:
# Regress out the per-cell total counts and mitochondrial fraction (both
# stored in adata.obs) from the expression matrix.
sc.pp.regress_out(adata, ['n_counts', 'percent_mito'])
# Scale the expression matrix and clip values at 10 (max_value=10).
sc.pp.scale(adata, max_value=10)

In [None]:
# Principal component analysis using the ARPACK SVD solver.
sc.tl.pca(adata, svd_solver='arpack')
# PCA scatter plots, one per gene, colored by that gene's expression.
sc.pl.pca(adata, color=['SFTPC','FOXA1','LAMP3'])
# Variance explained per principal component (log scale); used to choose how
# many PCs to pass to the neighborhood graph.
sc.pl.pca_variance_ratio(adata, log=True)

In [None]:
# Checkpoint the object after PCA; the relative path resolves against the
# working directory set at the top of the notebook.
adata.write("pca_results.h5ad")

In [None]:
# Build the neighborhood graph from the first 15 principal components with
# 10 neighbors per cell.
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=15)

# Compute the UMAP embedding and plot it colored by three genes' expression.
sc.tl.umap(adata)
sc.pl.umap(adata, color=['SFTPC','FOXA1','FOXM1'])

In [None]:
# Louvain clustering with default parameters; the plot below reads the
# resulting labels from the 'louvain' annotation.
sc.tl.louvain(adata)
sc.pl.umap(adata, color=['louvain'])

# Checkpoint the object with the UMAP embedding and cluster labels.
adata.write("umap.h5ad")

In [None]:
# Compute the t-SNE embedding and plot it colored by three genes' expression.
sc.tl.tsne(adata)
sc.pl.tsne(adata, color=['SFTPC','FOXA1','FOXM1'])

# The Louvain labels come from the previous clustering cell, and t-SNE does
# not alter the neighbor graph they were computed on, so recomputing them here
# is redundant. Only cluster if the labels are missing (e.g. this cell is run
# without the clustering cell).
if 'louvain' not in adata.obs:
    sc.tl.louvain(adata)
sc.pl.tsne(adata, color=['louvain'])

# Checkpoint the object with the t-SNE embedding added.
adata.write("tsne.h5ad")

In [None]:
# Rank genes per Louvain cluster with the Wilcoxon rank-sum test; results are
# stored in adata.uns['rank_genes_groups'].
sc.tl.rank_genes_groups(adata, 'louvain', method='wilcoxon')
# Plot the top 25 ranked genes per cluster, each panel with its own y-axis.
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

# Table of ranked gene names: one column per cluster, rows ordered by rank.
df_rank_genes_groups_names = pd.DataFrame(adata.uns['rank_genes_groups']['names'])
# Stacked violin of each cluster's top-ranked gene (row 0), grouped by cluster,
# using the values stored in adata.raw.
# NOTE(review): row 0 may contain the same gene for more than one cluster --
# confirm duplicates are acceptable to the plot.
sc.pl.stacked_violin(adata,var_names=df_rank_genes_groups_names.iloc[0], groupby='louvain', use_raw=True)

In [None]:
# Heatmap of the top 20 ranked genes per Louvain cluster, using adata.raw values.
sc.pl.rank_genes_groups_heatmap(adata, n_genes=20, groupby='louvain', use_raw=True)

In [None]:
# Inspect the cluster labels (the value is not displayed because this is not
# the cell's last expression).
adata.obs['louvain'].cat.categories

# Duplicate the cluster labels into a separate annotation column; it is
# currently identical to 'louvain'.
# NOTE(review): presumably a placeholder for renaming clusters to cell types -- confirm.
adata.obs['louvain_anno'] = adata.obs['louvain']
adata.obs['louvain_anno']

# PAGA connectivity between the annotated groups, plotted with a 0.03 threshold.
sc.tl.paga(adata, groups='louvain_anno')
sc.pl.paga(adata, threshold=0.03)
adata

# Graph layout of the cells, initialized from the PAGA positions computed by
# the sc.pl.paga call above.
sc.tl.draw_graph(adata, init_pos='paga')

In [None]:
# Top-5 ranked gene names per cluster (evaluated but not displayed, since it is
# not the cell's last expression).
pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(5)

# Build a side-by-side table with, for every cluster, a "<cluster>_n" column of
# gene names followed by a "<cluster>_p" column of p-values.
result = adata.uns['rank_genes_groups']
groups = result['names'].dtype.names
marker_columns = {}
for group in groups:
    for key in ('names', 'pvals'):
        # Suffix is the first letter of the field: 'n' for names, 'p' for pvals.
        marker_columns[f'{group}_{key[:1]}'] = result[key][group]
pd.DataFrame(marker_columns).head(5)