In [1]:
# One-time data download, left commented out so a re-run does not re-fetch the files.
# Uncomment to copy the raw count matrix (.h5) and the ADT CSV from the Terra
# featured-workspaces bucket into ./mouse/.
#!gsutil -m cp gs://terra-featured-workspaces/Sc_SnRNA-seq/data/mouse_cortex/raw_feature_bc_matrix.h5 ./mouse/
#!gsutil -m cp gs://terra-featured-workspaces/Sc_SnRNA-seq/data/mouse_cortex_ADT/mouse_cortex_ADT.csv ./mouse/

In [2]:
# Demultiplexing step, left commented out. Uncomment to run scCloud demuxEM on the ADT CSV
# and the raw count matrix, writing results under the ./mouse/exp prefix.
# NOTE(review): ./mouse/exp_demux.h5sc, read in the next code cell, is presumably
# produced by this command — confirm.
#!scCloud demuxEM -p 8 --min-num-genes 200 --generate-diagnostic-plots ./mouse/mouse_cortex_ADT.csv ./mouse/raw_feature_bc_matrix.h5 ./mouse/exp

In [3]:
import scCloud as scc
import numpy as np

This means that when you install LightGBM from PyPI with the ``pip install lightgbm`` command, you no longer need to install the gcc compiler.
Instead, you need to install the OpenMP library, which is required to run LightGBM on systems that use the Apple Clang compiler.
You can install the OpenMP library with the following command: ``brew install libomp``.


## Preprocessing

In [4]:
# Load the demultiplexed data with select_singlets=True and display the resulting
# AnnData object. In the recorded run its obs table carries 'assignment' and 'Channel'.
adata = scc.read_input(
    "./mouse/exp_demux.h5sc",
    select_singlets=True,
)
adata

AnnData object with n_obs × n_vars = 3267 × 27998 
    obs: 'assignment', 'Channel'
    var: 'gene_ids'
    uns: 'genome'

In [5]:
# Count how many loaded cells carry each value of the 'assignment' column.
adata.obs['assignment'].value_counts()

Sample5M    502
Sample8M    467
Sample6M    408
Sample2F    397
Sample1F    397
Sample3F    376
Sample7M    367
Sample4F    353
Name: assignment, dtype: int64

In [6]:
# Compute QC metrics on adata, passing 'mt-' as the mitochondrial gene prefix and
# 200 as min_genes. The next two cells list the var/obs columns present afterwards
# (e.g. 'passed_qc', 'n_genes', 'n_counts', 'percent_mito' in obs).
scc.qc_metrics(adata, mito_prefix='mt-', min_genes=200)

In [7]:
# List the per-gene (var) annotation columns available after the QC step.
adata.var.keys()

Index(['gene_ids', 'n_cells', 'percent_cells', 'robust',
       'highly_variable_features'],
      dtype='object')

In [8]:
# List the per-cell (obs) annotation columns available after the QC step.
adata.obs.keys()

Index(['assignment', 'Channel', 'passed_qc', 'n_genes', 'n_counts',
       'percent_mito'],
      dtype='object')

In [9]:
# Violin plots of the three per-cell QC metrics, grouped by the 'passed_qc' column.
scc.violin(
    adata,
    keys=['n_genes', 'n_counts', 'percent_mito'],
    by='passed_qc',
)

In [10]:
# Pairwise scatter plots of the same three QC metrics, colored by 'passed_qc'.
scc.scatter_matrix(
    adata,
    ['n_genes', 'n_counts', 'percent_mito'],
    color='passed_qc',
)

In [11]:
# Violin plot of 'n_cells' (a column of adata.var, per the earlier var listing).
scc.violin(adata, keys=['n_cells'])

In [12]:
# Filter adata in place (the return value is not captured) and display the result.
# In the recorded run the shape drops from 3267 x 27998 to 3219 x 20353.
# NOTE(review): this mutates adata, so re-running the cell alone is not guaranteed
# to reproduce the displayed shape — run the notebook top to bottom.
scc.filter_data(adata)
adata

AnnData object with n_obs × n_vars = 3219 × 20353 
    obs: 'assignment', 'Channel', 'passed_qc', 'n_genes', 'n_counts', 'percent_mito'
    var: 'gene_ids', 'n_cells', 'percent_cells', 'robust', 'highly_variable_features'
    uns: 'genome'

In [13]:
# Log-normalize the filtered counts with norm_count set to 1e5.
# NOTE(review): presumably each cell is scaled to 1e5 total counts before the log
# transform — confirm against the scCloud documentation.
scc.log_norm(adata, norm_count=1e5)

In [14]:
# Select highly variable features with batch handling turned off (consider_batch=False).
scc.highly_variable_features(adata, consider_batch=False)

In [15]:
# Plot the variable-feature diagnostics for the selection made in the previous cell.
scc.variable_feature_plot(adata)

In [16]:
# Run PCA on adata using scCloud's default settings.
scc.pca(adata)

In [17]:
# Compute the neighbor graph with default settings.
# NOTE(review): presumably built on the PCA representation from the previous cell — confirm.
scc.neighbors(adata)

## Clustering

In [18]:
# Leiden clustering with n_iter=2. Later cells read the cluster labels from the
# 'leiden_labels' attribute.
scc.leiden(adata, n_iter=2)

In [19]:
# Composition plot of Leiden clusters against the demultiplexed sample assignment.
scc.composition_plot(
    adata,
    by='leiden_labels',
    condition='assignment',
)

## Dimension Reduction

In [20]:
# Compute the FIt-SNE embedding with default settings; it is plotted in the next cell
# under basis 'fitsne'.
scc.fitsne(adata)

In [21]:
# Plot the FIt-SNE embedding twice: colored by Leiden cluster and by sample assignment.
scc.embedding(
    adata,
    basis='fitsne',
    keys=['leiden_labels', 'assignment'],
)

In [None]:
# Compute the UMAP embedding with default settings; it is plotted in the next cell
# under basis 'umap'.
# NOTE(review): this cell has no recorded execution count — re-run the notebook from a
# fresh kernel before sharing so the saved outputs match the code.
scc.umap(adata)

In [23]:
# Plot the UMAP embedding twice: colored by Leiden cluster and by sample assignment.
scc.embedding(
    adata,
    basis='umap',
    keys=['leiden_labels', 'assignment'],
)

In [24]:
# Compute the diffusion map, then the force-directed layout (FLE).
# Per the recorded log, fle writes 'exp_out.net' and shells out to a Java ForceAtlas2
# jar, so a Java runtime must be available on the PATH.
scc.diffmap(adata)
scc.fle(adata, file_name='exp_out')

exp_out.net is written.
['java', '-Djava.awt.headless=true', '-Xmx8g', '-cp', '//anaconda3/envs/sccloud/lib/python3.7/site-packages/forceatlas2/ext/forceatlas2.jar://anaconda3/envs/sccloud/lib/python3.7/site-packages/forceatlas2/ext/gephi-toolkit-0.9.2-all.jar', 'kco.forceatlas2.Main', '--input', 'exp_out.net', '--output', 'exp_out.coords', '--nthreads', '16', '--seed', '0', '--targetChangePerNode', '2.0', '--targetSteps', '5000', '--2d']
Force-directed layout is generated.


In [25]:
# Plot the force-directed layout twice: colored by Leiden cluster and by sample assignment.
scc.embedding(
    adata,
    basis='fle',
    keys=['leiden_labels', 'assignment'],
)

## Differential Expression Analysis

In [26]:
# Differential expression across Leiden clusters, with the Fisher and Mann-Whitney U
# options switched on in addition to the defaults. The next cell shows the results
# land in adata.varm under 'de_res'.
scc.de_analysis(
    adata,
    cluster='leiden_labels',
    fisher=True,
    mwu=True,
)

In [27]:
# Inspect adata.varm; in the recorded run it holds a single key, 'de_res'.
adata.varm

AxisArrays with keys: de_res

## Find Markers Using Gradient Boosting (LightGBM)

In [28]:
# Train a classifier on the Leiden labels and collect the markers it finds.
# NOTE(review): the recorded training log (valid_0/valid_1 multi_error, early stopping)
# looks like LightGBM output — confirm the backend in the scCloud documentation.
markers = scc.find_markers(
    adata,
    label_attr='leiden_labels',
)

[1]	valid_0's multi_error: 0.0938902	valid_1's multi_error: 0.236025
Training until validation scores don't improve for 1 rounds.
[2]	valid_0's multi_error: 0.0552295	valid_1's multi_error: 0.18323
[3]	valid_0's multi_error: 0.0434933	valid_1's multi_error: 0.15528
[4]	valid_0's multi_error: 0.0303763	valid_1's multi_error: 0.136646
[5]	valid_0's multi_error: 0.0276148	valid_1's multi_error: 0.124224
[6]	valid_0's multi_error: 0.0210563	valid_1's multi_error: 0.130435
Early stopping, best iteration is:
[5]	valid_0's multi_error: 0.0276148	valid_1's multi_error: 0.124224


In [29]:
# Number of entries in the markers result (13 in the recorded run).
# NOTE(review): presumably one entry per Leiden cluster — confirm.
len(markers)

13

## Annotating Clusters

In [30]:
# Annotate each cluster with putative cell types, using the 'mouse_brain' marker set
# and the 't' DE test. The printed report lists, per cluster, candidate names with a
# score and supporting markers.
scc.infer_cell_types(
    adata,
    markers='mouse_brain',
    de_test='t',
)

Cluster 1:
    name: Glutamatergic neuron; score: 0.50; average marker percentage: 68.26%; strong support: (Rbfox3+,95.33%),(Slc17a7+,41.18%)
Cluster 2:
Cluster 3:
    name: Glutamatergic neuron; score: 0.50; average marker percentage: 63.53%; strong support: (Rbfox3+,87.54%),(Slc17a7+,39.51%)
        name: Glutamatergic layer 6a; score: 1.00; average marker percentage: 69.00%; strong support: (Foxp2+,75.08%),(Tle4+,62.92%)
        name: Glutamatergic layer 6b; score: 0.50; average marker percentage: 62.92%; strong support: (Tle4+,62.92%)
Cluster 4:
    name: Glutamatergic neuron; score: 0.50; average marker percentage: 69.66%; strong support: (Rbfox3+,92.57%),(Slc17a7+,46.75%)
        name: Glutamatergic layer 5; score: 1.00; average marker percentage: 28.79%; strong support: (Deptor+,28.79%)
Cluster 5:
    name: GABAergic neuron; score: 0.75; average marker percentage: 41.67%; strong support: (Reln+,38.96%),(Gad1+,38.64%),(Gad2+,47.40%)
        name: GABAergic Sst interneuron; score: