In [20]:
# One-time data download, kept commented out so that re-running the notebook
# does not re-fetch the files. Uncomment to copy the raw feature-barcode matrix
# (.h5) and the ADT CSV from the Terra bucket into ./mouse/ (requires gsutil).
#!gsutil -m cp gs://terra-featured-workspaces/Sc_SnRNA-seq/data/mouse_cortex/raw_feature_bc_matrix.h5 ./mouse/
#!gsutil -m cp gs://terra-featured-workspaces/Sc_SnRNA-seq/data/mouse_cortex_ADT/mouse_cortex_ADT.csv ./mouse/

In [21]:
# One-time demultiplexing step, kept commented out. It runs `scCloud demuxEM`
# on the ADT CSV and the raw matrix with output prefix ./mouse/exp.
# NOTE(review): the next section reads ./mouse/exp_demux.h5sc, presumably
# produced by this command -- confirm before running on a fresh checkout.
#!scCloud demuxEM -p 8 --min-num-genes 200 --generate-diagnostic-plots ./mouse/mouse_cortex_ADT.csv ./mouse/raw_feature_bc_matrix.h5 ./mouse/exp

In [22]:
import scCloud as scc
import numpy as np

## Preprocessing

In [23]:
# Load the demultiplexed matrix from ./mouse/. The select_singlets flag
# presumably restricts the load to barcodes called as singlets -- confirm
# against the scCloud docs. The bare `adata` on the last line displays the
# object summary.
adata = scc.read_input("./mouse/exp_demux.h5sc", select_singlets=True)
adata

AnnData object with n_obs × n_vars = 3267 × 27998 
    obs: 'assignment', 'Channel'
    var: 'gene_ids'
    uns: 'genome'

In [24]:
# Count cells per value of the 'assignment' column (the Sample* labels shown below).
adata.obs['assignment'].value_counts()

Sample5M    502
Sample8M    467
Sample6M    408
Sample2F    397
Sample1F    397
Sample3F    376
Sample7M    367
Sample4F    353
Name: assignment, dtype: int64

In [25]:
# Compute QC metrics on adata. The key listings in the next two cells show the
# columns present afterwards: passed_qc, n_genes, n_counts, percent_mito in obs
# and n_cells, percent_cells, robust, highly_variable_features in var.
# 'mt-' is passed as the mitochondrial gene-name prefix; 200 as min_genes.
scc.qc_metrics(adata, mito_prefix='mt-', min_genes=200)

In [26]:
# List the annotation columns currently stored in adata.var.
adata.var.keys()

Index(['gene_ids', 'n_cells', 'percent_cells', 'robust',
       'highly_variable_features'],
      dtype='object')

In [27]:
# List the annotation columns currently stored in adata.obs.
adata.obs.keys()

Index(['assignment', 'Channel', 'passed_qc', 'n_genes', 'n_counts',
       'percent_mito'],
      dtype='object')

In [28]:
# Violin plots of the three QC metrics, grouped by the passed_qc flag.
scc.violin(adata, keys=['n_genes', 'n_counts', 'percent_mito'], by='passed_qc')

In [29]:
# Scatter of n_genes (x) against n_counts (y), colored by the passed_qc flag.
scc.scatter(adata, x='n_genes', y='n_counts', color='passed_qc')

In [30]:
# Violin plot of n_cells. Note that n_cells is listed among the var columns
# above, not the obs columns.
scc.violin(adata, keys=['n_cells'])

In [31]:
# Filter adata in place (no reassignment). The summary displayed below shows
# the object going from 3267 x 27998 to 3219 x 20353.
scc.filter_data(adata)
adata

AnnData object with n_obs × n_vars = 3219 × 20353 
    obs: 'assignment', 'Channel', 'passed_qc', 'n_genes', 'n_counts', 'percent_mito'
    var: 'gene_ids', 'n_cells', 'percent_cells', 'robust', 'highly_variable_features'
    uns: 'genome'

In [32]:
# Log-normalize the filtered data in place.
# NOTE(review): norm_count=1e5 is presumably the per-cell total that counts are
# scaled to before the log transform -- confirm against the scCloud docs.
scc.log_norm(adata, norm_count=1e5)

In [33]:
# Select highly variable features, with batch handling turned off
# (consider_batch=False).
scc.highly_variable_features(adata, consider_batch=False)

In [34]:
# Display adata again: compared with the post-filter summary, the two preceding
# steps added 'mean', 'var', 'hvf_loess', 'hvf_rank' to var and 'Channels' to uns.
adata

AnnData object with n_obs × n_vars = 3219 × 20353 
    obs: 'assignment', 'Channel', 'passed_qc', 'n_genes', 'n_counts', 'percent_mito'
    var: 'gene_ids', 'n_cells', 'percent_cells', 'robust', 'highly_variable_features', 'mean', 'var', 'hvf_loess', 'hvf_rank'
    uns: 'genome', 'Channels'

In [35]:
# Run PCA on adata with scCloud's default settings.
scc.pca(adata)

In [36]:
# Build the neighbor graph with default settings.
# NOTE(review): presumably computed on the PCA representation from the
# previous cell -- confirm against the scCloud docs.
scc.neighbors(adata)

## Clustering

In [37]:
# Leiden clustering with n_iter=2. Later cells refer to the resulting labels
# by the key 'leiden_labels'.
scc.leiden(adata, n_iter=2)

In [38]:
# Composition plot grouped by 'leiden_labels' with 'assignment' as the
# condition, i.e. how the demultiplexed samples are distributed across clusters.
scc.composition_plot(adata, by='leiden_labels', condition='assignment')

## Dimension Reduction

In [39]:
# Compute the FIt-SNE embedding with default settings; the next cell plots it
# through basis='fitsne'.
scc.fitsne(adata)

In [40]:
# Plot the FIt-SNE embedding, once per key: Leiden cluster and sample assignment.
scc.embedding(adata, basis='fitsne', keys=['leiden_labels', 'assignment'])

In [41]:
# Compute the UMAP embedding with default settings. The call is verbose: it
# prints the UMAP configuration and per-epoch progress shown below.
scc.umap(adata)

UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
     learning_rate=1.0, local_connectivity=1.0, metric='euclidean',
     metric_kwds=None, min_dist=0.5, n_components=2, n_epochs=None,
     n_neighbors=15, negative_sample_rate=5, random_state=0,
     repulsion_strength=1.0, set_op_mix_ratio=1.0, spread=1.0,
     target_metric='categorical', target_metric_kwds=None,
     target_n_neighbors=-1, target_weight=0.5, transform_queue_size=4.0,
     transform_seed=42, verbose=True)
Construct fuzzy simplicial set
Thu Aug 22 14:50:52 2019 Finding Nearest Neighbors
Thu Aug 22 14:50:52 2019 Finished Nearest Neighbor Search
Thu Aug 22 14:50:54 2019 Construct embedding


  n_components


	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Thu Aug 22 14:50:59 2019 Finished embedding


In [42]:
# Plot the UMAP embedding, once per key: Leiden cluster and sample assignment.
scc.embedding(adata, basis='umap', keys=['leiden_labels', 'assignment'])

In [43]:
# Compute the diffusion map, then the force-directed layout (FLE).
# Per the log below, fle writes exp_out.net to the working directory and
# launches a Java ForceAtlas2 process that produces exp_out.coords, so a Java
# runtime must be available.
scc.diffmap(adata)
scc.fle(adata, file_name='exp_out')

exp_out.net is written.
['java', '-Djava.awt.headless=true', '-Xmx8g', '-cp', '/Users/yy939/mgh/miniconda3/envs/scCloud-test/lib/python3.7/site-packages/forceatlas2/ext/forceatlas2.jar:/Users/yy939/mgh/miniconda3/envs/scCloud-test/lib/python3.7/site-packages/forceatlas2/ext/gephi-toolkit-0.9.2-all.jar', 'kco.forceatlas2.Main', '--input', 'exp_out.net', '--output', 'exp_out.coords', '--nthreads', '12', '--seed', '0', '--targetChangePerNode', '2.0', '--targetSteps', '5000', '--2d']
Force-directed layout is generated.


In [44]:
# Plot the force-directed layout, once per key: Leiden cluster and sample assignment.
scc.embedding(adata, basis='fle', keys=['leiden_labels', 'assignment'])

## Differential Expression Analysis

In [45]:
# Differential expression analysis per Leiden cluster.
# NOTE(review): fisher=True and mwu=True presumably enable Fisher's exact and
# Mann-Whitney U tests on top of the default test -- confirm against the
# scCloud docs.
scc.de_analysis(adata, cluster='leiden_labels', fisher=True, mwu=True)

In [46]:
# Inspect varm: after the DE step it holds a single key, 'de_res'.
adata.varm

AxisArrays with keys: de_res

## Find Markers Using Gradient Boosting

In [47]:
# Train a classifier on the Leiden labels and collect marker genes from it.
# The call prints a training log with train/validation multi-class error and
# stops early once the validation error stops improving.
markers = scc.find_markers(adata, label_attr='leiden_labels')

[1]	valid_0's multi_error: 0.0938902	valid_1's multi_error: 0.236025
Training until validation scores don't improve for 1 rounds.
[2]	valid_0's multi_error: 0.0552295	valid_1's multi_error: 0.18323
[3]	valid_0's multi_error: 0.0434933	valid_1's multi_error: 0.15528
[4]	valid_0's multi_error: 0.0303763	valid_1's multi_error: 0.136646
[5]	valid_0's multi_error: 0.0276148	valid_1's multi_error: 0.124224
[6]	valid_0's multi_error: 0.0210563	valid_1's multi_error: 0.130435
Early stopping, best iteration is:
[5]	valid_0's multi_error: 0.0276148	valid_1's multi_error: 0.124224


In [48]:
# Number of entries in the marker result (13 in the recorded run).
len(markers)

13

## Annotating Clusters

In [49]:
# Annotate clusters with putative cell types using the 'mouse_brain' marker
# set; prints per-cluster candidates with score and supporting markers.
# NOTE(review): de_test='t' presumably selects the t-test DE results as the
# evidence source -- confirm against the scCloud docs.
scc.infer_cell_types(adata, markers='mouse_brain', de_test='t')

Cluster 1:
    name: Glutamatergic neuron; score: 0.50; average marker percentage: 68.26%; strong support: (Rbfox3+,95.33%),(Slc17a7+,41.18%)
Cluster 2:
Cluster 3:
    name: Glutamatergic neuron; score: 0.50; average marker percentage: 63.53%; strong support: (Rbfox3+,87.54%),(Slc17a7+,39.51%)
        name: Glutamatergic layer 6a; score: 1.00; average marker percentage: 69.00%; strong support: (Foxp2+,75.08%),(Tle4+,62.92%)
        name: Glutamatergic layer 6b; score: 0.50; average marker percentage: 62.92%; strong support: (Tle4+,62.92%)
Cluster 4:
    name: Glutamatergic neuron; score: 0.50; average marker percentage: 69.66%; strong support: (Rbfox3+,92.57%),(Slc17a7+,46.75%)
        name: Glutamatergic layer 5; score: 1.00; average marker percentage: 28.79%; strong support: (Deptor+,28.79%)
Cluster 5:
    name: GABAergic neuron; score: 0.75; average marker percentage: 41.67%; strong support: (Reln+,38.96%),(Gad1+,38.64%),(Gad2+,47.40%)
        name: GABAergic Sst interneuron; score: