In [1]:
import pandas as pd
import anndata
from scipy.sparse import csr_matrix

  from pandas.core.index import RangeIndex


## Make a gene count table

In [21]:
# The Allen Institute provides intron and exon counts as two separate tables.
# In both CSVs the first column (gene) becomes the row index and the header
# row holds the cell (sample) names.
intron_matrix = pd.read_csv('raw/mouse_MOp_nuclei_2018-10-04_intron-matrix.csv',
                            index_col=0)
exon_matrix = pd.read_csv('raw/mouse_MOp_nuclei_2018-10-04_exon-matrix.csv',
                          index_col=0)

# To keep things simple, only the whole-gene table (intron + exon counts)
# is used for further analysis.
gene_matrix = intron_matrix + exon_matrix

# `+` aligns both frames on their row and column labels and takes the union;
# a gene or cell present in only one file would show up as an extra all-NaN
# row/column. Equal shapes guarantee that no such label was introduced.
if not (gene_matrix.shape == intron_matrix.shape == exon_matrix.shape):
    raise ValueError(
        'intron and exon matrices do not share the same genes/cells: '
        'intron {}, exon {}, combined {}'.format(
            intron_matrix.shape, exon_matrix.shape, gene_matrix.shape))

gene_matrix

Unnamed: 0,SM-GE653_S113_E1-50,SM-GE653_S114_E1-50,SM-GE653_S115_E1-50,SM-GE653_S116_E1-50,SM-GE653_S117_E1-50,SM-GE653_S118_E1-50,SM-GE653_S119_E1-50,SM-GE653_S120_E1-50,SM-GE653_S121_E1-50,SM-GE653_S122_E1-50,...,SM-GE66H_S079_E1-50,SM-GE66H_S080_E1-50,SM-GE66H_S081_E1-50,SM-GE66H_S082_E1-50,SM-GE66H_S083_E1-50,SM-GE66H_S084_E1-50,SM-GE66H_S085_E1-50,SM-GE66H_S086_E1-50,SM-GE66H_S087_E1-50,SM-GE66H_S088_E1-50
0610005C13Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,70,0,0,0,0
0610006L08Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610007P14Rik,136,0,231,102,25,0,254,1,301,0,...,0,0,0,0,102,0,1,85,0,179
0610009B22Rik,0,0,136,0,0,0,0,0,0,0,...,0,0,0,1,0,74,0,0,0,63
0610009E02Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
n-R5s142,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
n-R5s143,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
n-R5s144,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
n-R5s146,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Assemble an AnnData object
- This will be explained in the book; here I just present the code.

In [8]:
# Load the gene and cell annotation tables; the first CSV column is used as the index.
gene_meta = pd.read_csv('raw/mouse_MOp_nuclei_2018-10-04_genes-rows.csv', index_col=0)
cell_meta = pd.read_csv('raw/mouse_MOp_nuclei_2018-10-04_samples-columns.csv', index_col=0)

# Store the count table in a sparse (CSR) format, which greatly decreases memory usage.
data = csr_matrix(gene_matrix.to_numpy())


In [22]:
# Preview the first five rows of the cell annotation table.
cell_meta.head(n=5)

Unnamed: 0_level_0,sample_id,sample_type,organism,donor,sex,age_days,genotype,driver_lines,reporter_lines,brain_hemisphere,...,percent_rrna_reads,percent_unique_reads,percent_synth_reads,percent_ecoli_reads,percent_aligned_reads_total,complexity_cg,genes_detected_cpm_criterion,genes_detected_fpkm_criterion,tdt_cpm,gfp_cpm
seq_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SM-GE653_S113_E1-50,646939681,Nuclei,Mus musculus,351985,M,57,Gad2-IRES-Cre/wt;Ai14(RCL-tdT)/wt,Gad2-IRES-Cre,Ai14(RCL-tdT),Right,...,0.085614,86.29012,0.012748,0.002857,93.661906,0.331667,5580,4189,11.127829,0.0
SM-GE653_S114_E1-50,646939692,Nuclei,Mus musculus,351985,M,57,Gad2-IRES-Cre/wt;Ai14(RCL-tdT)/wt,Gad2-IRES-Cre,Ai14(RCL-tdT),Right,...,0.006899,81.916949,0.020981,0.009762,88.42839,0.35863,4835,2375,0.0,0.0
SM-GE653_S115_E1-50,646939703,Nuclei,Mus musculus,351985,M,57,Gad2-IRES-Cre/wt;Ai14(RCL-tdT)/wt,Gad2-IRES-Cre,Ai14(RCL-tdT),Right,...,0.017213,86.294229,0.017966,0.004101,94.394047,0.327857,4810,3482,156.070589,0.0
SM-GE653_S116_E1-50,646939714,Nuclei,Mus musculus,351985,M,57,Gad2-IRES-Cre/wt;Ai14(RCL-tdT)/wt,Gad2-IRES-Cre,Ai14(RCL-tdT),Right,...,0.0,75.149766,0.040095,0.008792,83.081251,0.374806,3046,1358,0.0,0.0
SM-GE653_S117_E1-50,646939725,Nuclei,Mus musculus,351985,M,57,Gad2-IRES-Cre/wt;Ai14(RCL-tdT)/wt,Gad2-IRES-Cre,Ai14(RCL-tdT),Right,...,0.013598,88.286853,0.006153,0.003293,94.764682,0.326724,5653,4286,0.0,0.0


In [23]:
# Preview the first five rows of the gene annotation table.
gene_meta.head(n=5)

Unnamed: 0_level_0,chromosome,entrez_id,gene_name
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0610005C13Rik,7,71661,RIKEN cDNA 0610005C13 gene
0610006L08Rik,7,76253,RIKEN cDNA 0610006L08 gene
0610007P14Rik,12,58520,RIKEN cDNA 0610007P14 gene
0610009B22Rik,11,66050,RIKEN cDNA 0610009B22 gene
0610009E02Rik,2,100125929,RIKEN cDNA 0610009E02 gene


In [20]:
# Make the AnnData object.
# Every cell (column of gene_matrix) and every gene (row of gene_matrix) needs
# an annotation row: `reindex` does not fail on unknown labels, it silently
# inserts all-NaN metadata rows for them, so check coverage explicitly first.
cells_without_meta = gene_matrix.columns.difference(cell_meta.index)
genes_without_meta = gene_matrix.index.difference(gene_meta.index)
if len(cells_without_meta) > 0 or len(genes_without_meta) > 0:
    raise ValueError(
        '{} cell(s) and {} gene(s) in gene_matrix have no metadata entry'.format(
            len(cells_without_meta), len(genes_without_meta)))

# `data` was built from gene_matrix (genes as rows), so it is transposed to put
# cells on the rows; obs/var are reordered to follow the matrix labels.
adata = anndata.AnnData(X=data.T,
                        obs=cell_meta.reindex(gene_matrix.columns),
                        var=gene_meta.reindex(gene_matrix.index))

# The AnnData object contains rich information: the cell-by-gene counts,
# the cell metadata (obs) and the gene metadata (var).
adata

AnnData object with n_obs × n_vars = 6847 × 45768 
    obs: 'sample_id', 'sample_type', 'organism', 'donor', 'sex', 'age_days', 'genotype', 'driver_lines', 'reporter_lines', 'brain_hemisphere', 'brain_region', 'brain_subregion', 'facs_date', 'facs_container', 'sample_name', 'facs_sort_criteria', 'rna_amplification_set', 'library_prep_set', 'library_prep_avg_size_bp', 'seq_tube', 'seq_batch', 'total_reads', 'percent_exon_reads', 'percent_intron_reads', 'percent_intergenic_reads', 'percent_rrna_reads', 'percent_unique_reads', 'percent_synth_reads', 'percent_ecoli_reads', 'percent_aligned_reads_total', 'complexity_cg', 'genes_detected_cpm_criterion', 'genes_detected_fpkm_criterion', 'tdt_cpm', 'gfp_cpm'
    var: 'chromosome', 'entrez_id', 'gene_name'

In [24]:
# This is the cell-by-gene count matrix (rows = cells, columns = genes).
adata.X

<6847x45768 sparse matrix of type '<class 'numpy.float32'>'
	with 40173362 stored elements in Compressed Sparse Column format>

In [25]:
# Cell metadata stored on the AnnData object (first five rows).
adata.obs.head(n=5)

Unnamed: 0,sample_id,sample_type,organism,donor,sex,age_days,genotype,driver_lines,reporter_lines,brain_hemisphere,...,percent_rrna_reads,percent_unique_reads,percent_synth_reads,percent_ecoli_reads,percent_aligned_reads_total,complexity_cg,genes_detected_cpm_criterion,genes_detected_fpkm_criterion,tdt_cpm,gfp_cpm
SM-GE653_S113_E1-50,646939681,Nuclei,Mus musculus,351985,M,57,Gad2-IRES-Cre/wt;Ai14(RCL-tdT)/wt,Gad2-IRES-Cre,Ai14(RCL-tdT),Right,...,0.085614,86.29012,0.012748,0.002857,93.661906,0.331667,5580,4189,11.127829,0.0
SM-GE653_S114_E1-50,646939692,Nuclei,Mus musculus,351985,M,57,Gad2-IRES-Cre/wt;Ai14(RCL-tdT)/wt,Gad2-IRES-Cre,Ai14(RCL-tdT),Right,...,0.006899,81.916949,0.020981,0.009762,88.42839,0.35863,4835,2375,0.0,0.0
SM-GE653_S115_E1-50,646939703,Nuclei,Mus musculus,351985,M,57,Gad2-IRES-Cre/wt;Ai14(RCL-tdT)/wt,Gad2-IRES-Cre,Ai14(RCL-tdT),Right,...,0.017213,86.294229,0.017966,0.004101,94.394047,0.327857,4810,3482,156.070589,0.0
SM-GE653_S116_E1-50,646939714,Nuclei,Mus musculus,351985,M,57,Gad2-IRES-Cre/wt;Ai14(RCL-tdT)/wt,Gad2-IRES-Cre,Ai14(RCL-tdT),Right,...,0.0,75.149766,0.040095,0.008792,83.081251,0.374806,3046,1358,0.0,0.0
SM-GE653_S117_E1-50,646939725,Nuclei,Mus musculus,351985,M,57,Gad2-IRES-Cre/wt;Ai14(RCL-tdT)/wt,Gad2-IRES-Cre,Ai14(RCL-tdT),Right,...,0.013598,88.286853,0.006153,0.003293,94.764682,0.326724,5653,4286,0.0,0.0


In [26]:
# Gene metadata stored on the AnnData object (first five rows).
adata.var.head(n=5)

Unnamed: 0,chromosome,entrez_id,gene_name
0610005C13Rik,7,71661,RIKEN cDNA 0610005C13 gene
0610006L08Rik,7,76253,RIKEN cDNA 0610006L08 gene
0610007P14Rik,12,58520,RIKEN cDNA 0610007P14 gene
0610009B22Rik,11,66050,RIKEN cDNA 0610009B22 gene
0610009E02Rik,2,100125929,RIKEN cDNA 0610009E02 gene


## Save the AnnData object

In [31]:
# Write the AnnData object to an h5ad file using gzip compression (compression_opts=5).
adata.write_h5ad(
    'mouse_MOp_SMARTseq_gene_count.h5ad',
    compression="gzip",
    compression_opts=5,
)