In [1]:
%load_ext lab_black
%load_ext autotime
import pandas as pd
import numpy as np

time: 252 ms (started: 2022-09-11 21:02:31 -07:00)


This notebook relies on a [berenslab tutorial](https://github.com/berenslab/rna-seq-tsne/blob/398261383041f84a5b818ff243a412748fbc2f2a/demo.ipynb) for most of its code.

## Warning

This is another notebook that involves downloading large files (VISp file is 291 MB compressed and uncompresses to a folder containing 2.8 GB). This is both a test of your patience and potentially your RAM, although processing the data should not take up more than ~7 GB. 

## Download the RNAseq data

The data is stored in a CSV inside a zipped folder containing multiple files. The [pandas read_csv doc](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html) for the `compression` parameter currently states:

> If using ‘zip’, the ZIP file must contain only one data file to be read in.

so we must make use of the [python zipfile module](https://docs.python.org/3/library/zipfile.html) to fetch this data.

In [2]:
import zipfile
from io import BytesIO

import requests
from scipy import sparse


def get_remote_zipfile(url, timeout_secs=10):
    req = requests.get(url, timeout=timeout_secs)
    return zipfile.ZipFile(BytesIO(req.content))


def open_files_in_remote_zip(url, filenames_in_zip, timeout_secs=10):
    zip_file = get_remote_zipfile(url, timeout_secs=timeout_secs)
    return [zip_file.open(filename) for filename in filenames_in_zip]


def sparseload(visp_file, alm_file, csv_kwds=None):
    if csv_kwds is None:
        csv_kwds = dict(chunksize=1000, index_col=0, na_filter=False)

    genes = []
    sparseblocks = []
    areas = []
    cells = []
    for chunk1, chunk2 in zip(
        pd.read_csv(visp_file, **csv_kwds),
        pd.read_csv(alm_file, **csv_kwds),
    ):
        if len(cells) == 0:
            cells = np.concatenate((chunk1.columns, chunk2.columns))
            areas = [0] * chunk1.columns.size + [1] * chunk2.columns.size

        genes.extend(list(chunk1.index))
        sparseblock1 = sparse.csr_matrix(chunk1.values.astype(float))
        sparseblock2 = sparse.csr_matrix(chunk2.values.astype(float))
        sparseblock = sparse.hstack((sparseblock1, sparseblock2), format="csr")
        sparseblocks.append([sparseblock])
        print(".", end="", flush=True)
    print(" done")
    counts = sparse.bmat(sparseblocks)
    return (counts.T, np.array(genes), cells, np.array(areas))

time: 106 ms (started: 2022-09-11 21:02:32 -07:00)


Fetching these files over the internet will take a few minutes, but should take up around 500 MB

In [3]:
visp_zip_url = (
    "http://celltypes.brain-map.org/api/v2/well_known_file_download/694413985"
)
visp_exon_filename = "mouse_VISp_2018-06-14_exon-matrix.csv"
visp_genes_filename = "mouse_VISp_2018-06-14_genes-rows.csv"
visp_exon_file, visp_genes_file = open_files_in_remote_zip(
    visp_zip_url, [visp_exon_filename, visp_genes_filename]
)

time: 2min 50s (started: 2022-09-11 21:02:32 -07:00)


In [4]:
alm_zip_url = "http://celltypes.brain-map.org/api/v2/well_known_file_download/694413179"
alm_exon_filename = "mouse_ALM_2018-06-14_exon-matrix.csv"
alm_exon_file = open_files_in_remote_zip(alm_zip_url, [alm_exon_filename])[0]

time: 1min 59s (started: 2022-09-11 21:05:23 -07:00)


Expect the data generated in the next step to be in the 3 GB range:

In [5]:
counts, genes, cells, areas = sparseload(visp_exon_file, alm_exon_file)

.............................................. done
time: 2min 17s (started: 2022-09-11 21:07:22 -07:00)


In [6]:
counts, genes, cells, areas

(<25481x45768 sparse matrix of type '<class 'numpy.float64'>'
 	with 227422472 stored elements in Compressed Sparse Column format>,
 array([    71661,     76253,     58520, ..., 100861498, 100861500,
        100861503]),
 array(['F1S4_160108_001_A01', 'F1S4_160108_001_B01',
        'F1S4_160108_001_C01', ..., 'FJS4_170511_012_F01',
        'FJS4_170511_012_G01', 'FJS4_170511_012_H01'], dtype=object),
 array([0, 0, 0, ..., 1, 1, 1]))

time: 11.7 ms (started: 2022-09-11 21:09:40 -07:00)


### Replace the entrez ids with gene symbols

In [7]:
genesDF = pd.read_csv(visp_genes_file)
genesDF

Unnamed: 0,gene_symbol,gene_id,chromosome,gene_entrez_id,gene_name
0,0610005C13Rik,500717483,7,71661,RIKEN cDNA 0610005C13 gene
1,0610006L08Rik,500717917,7,76253,RIKEN cDNA 0610006L08 gene
2,0610007P14Rik,500730104,12,58520,RIKEN cDNA 0610007P14 gene
3,0610009B22Rik,500726890,11,66050,RIKEN cDNA 0610009B22 gene
4,0610009E02Rik,500702775,2,100125929,RIKEN cDNA 0610009E02 gene
...,...,...,...,...,...
45763,n-R5s142,500721654,8,100861496,nuclear encoded rRNA 5S 142
45764,n-R5s143,500721655,8,100861497,nuclear encoded rRNA 5S 143
45765,n-R5s144,500721656,8,100861498,nuclear encoded rRNA 5S 144
45766,n-R5s146,500721658,8,100861500,nuclear encoded rRNA 5S 146


time: 74.9 ms (started: 2022-09-11 21:09:40 -07:00)


In [8]:
gene_entrez_ids = genesDF["gene_entrez_id"].tolist()
symbols = genesDF["gene_symbol"].tolist()
id2symbol = dict(zip(gene_entrez_ids, symbols))
genes = np.array([id2symbol[g] for g in genes])

time: 32.4 ms (started: 2022-09-11 21:09:40 -07:00)


## Read cluster information

In [9]:
clusterInfo = pd.read_csv(
    "https://raw.githubusercontent.com/berenslab/rna-seq-tsne/398261383041f84a5b818ff243a412748fbc2f2a/data/tasic-sample_heatmap_plot_data.csv",
)
clusterInfo

Unnamed: 0,sample_name,cluster_id,cluster_color,cluster_label,class_id,class_color,class_label,Gad2_log10_cpm,Slc17a7_log10_cpm,Lamp5_log10_cpm,Sncg_log10_cpm,Vip_log10_cpm,Sst_log10_cpm,Pvalb_log10_cpm
0,F1S4_161216_001_A01,94,#53D385,L5 PT ALM Slco2a1,2,#27AAE1,Glutamatergic,0.000000,2.703004,2.644231,0.000000,0.194593,0.000000,0.000000
1,F1S4_180124_314_A01,73,#33A9CE,L5 IT ALM Npw,2,#27AAE1,Glutamatergic,0.000000,2.655333,3.254294,0.000000,0.000000,0.000000,0.000000
2,F1S4_180124_315_A01,2,#FF88AD,Lamp5 Fam19a1 Pax6,1,#EF4136,GABAergic,2.981714,0.000000,0.000000,0.968798,0.000000,0.000000,0.000000
3,F1S4_180124_315_B01,8,#9440F3,Sncg Slc17a8,1,#EF4136,GABAergic,2.479560,0.000000,0.000000,2.388210,0.000000,0.000000,1.685995
4,F1S4_180124_315_C01,8,#9440F3,Sncg Slc17a8,1,#EF4136,GABAergic,2.881715,0.000000,0.000000,3.005049,0.000000,0.000000,0.952222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23817,FYS4_171004_104_C01,92,#00A863,L5 PT VISp C1ql2 Cdh13,2,#27AAE1,Glutamatergic,0.000000,2.720627,3.122579,0.000000,0.000000,0.000000,0.000000
23818,FYS4_171004_104_D01,42,#BF9F00,Sst Hpse Sema3c,1,#EF4136,GABAergic,2.246059,0.000000,0.000000,0.000000,0.000000,3.863217,1.146638
23819,FYS4_171004_104_F01,89,#0000FF,L5 PT VISp Chrna6,2,#27AAE1,Glutamatergic,0.000000,2.157171,2.817028,0.000000,0.000000,0.476322,0.000000
23820,FYS4_171004_104_G01,35,#CC6D3D,Sst Calb2 Pdlim5,1,#EF4136,GABAergic,2.792404,0.000000,0.000000,0.295530,0.000000,4.184470,0.000000


time: 647 ms (started: 2022-09-11 21:09:40 -07:00)


In [10]:
goodCells = clusterInfo["sample_name"].values
clusterIds = clusterInfo["cluster_id"].values
labels = clusterInfo["cluster_label"].values
colors = clusterInfo["cluster_color"].values

clusterNames = np.array(
    [labels[clusterIds == i + 1][0] for i in range(np.max(clusterIds))]
)
clusterColors = np.array(
    [colors[clusterIds == i + 1][0] for i in range(np.max(clusterIds))]
)
clusters = np.copy(clusterIds) - 1

clusterNames[:5], clusterColors[:5], clusters[:5]

(array(['Lamp5 Krt73', 'Lamp5 Fam19a1 Pax6', 'Lamp5 Fam19a1 Tmem182',
        'Lamp5 Ntn1 Npy2r', 'Lamp5 Plch2 Dock5'], dtype='<U26'),
 array(['#DDACC9', '#FF88AD', '#FFB8CE', '#DD6091', '#FF7290'], dtype='<U7'),
 array([93, 72,  1,  7,  7]))

time: 20.1 ms (started: 2022-09-11 21:09:40 -07:00)


## Keep the good cells

In [11]:
ind = np.array([np.where(cells == c)[0][0] for c in goodCells])
counts = counts[ind, :]

time: 12.3 s (started: 2022-09-11 21:09:40 -07:00)


In [12]:
tasic2018 = {
    "counts": counts,
    "genes": genes,
    "clusters": clusters,
    "areas": areas,
    "clusterColors": clusterColors,
    "clusterNames": clusterNames,
}

time: 58.4 ms (started: 2022-09-11 21:09:53 -07:00)


In [13]:
print("Number of cells:", tasic2018["counts"].shape[0])
print("Number of cells from ALM:", np.sum(tasic2018["areas"] == 0))
print("Number of cells from VISp:", np.sum(tasic2018["areas"] == 1))
print("Number of clusters:", np.unique(tasic2018["clusters"]).size)
print("Number of genes:", tasic2018["counts"].shape[1])
print(
    "Fraction of zeros in the data matrix: {:.2f}".format(
        tasic2018["counts"].size / np.prod(tasic2018["counts"].shape)
    )
)

Number of cells: 23822
Number of cells from ALM: 15413
Number of cells from VISp: 10068
Number of clusters: 133
Number of genes: 45768
Fraction of zeros in the data matrix: 0.20
time: 10.1 ms (started: 2022-09-11 21:09:53 -07:00)


### Save just in case?

Although this will probably take longer to write than it does to read and process the data over the network:

In [14]:
# import pickle
# import drnb.io as nbio
# import gzip

# with gzip.open(
#     nbio.DATA_ROOT / "data" / "tasic2018-raw.pkl.gz",
#     "wb",
# ) as f:
#     pickle.dump(tasic2018, f, pickle.HIGHEST_PROTOCOL)

time: 819 µs (started: 2022-09-11 21:09:53 -07:00)


## Feature Selection

In [15]:
def calcNearZeroRate(data, threshold=0):
    zeroRate = 1 - np.squeeze(np.array((data > threshold).mean(axis=0)))
    return zeroRate


def calcMeanLogExpression(data, threshold=0, atleast=10):
    nonZeros = np.squeeze(np.array((data > threshold).sum(axis=0)))
    N = data.shape[0]
    A = data.multiply(data > threshold)
    A.data = np.log2(A.data)
    meanExpr = np.zeros(data.shape[1]) * np.nan
    detected = nonZeros >= atleast
    meanExpr[detected] = np.squeeze(np.array(A[:, detected].mean(axis=0))) / (
        nonZeros[detected] / N
    )
    return meanExpr


def featureSelection(meanLogExpression, nearZeroRate, yoffset=0.02, decay=1.5, n=3000):
    low = 0
    up = 10
    nonan = ~np.isnan(meanLogExpression)
    xoffset = 5
    for _ in range(100):
        selected = np.zeros_like(nearZeroRate).astype(bool)
        selected[nonan] = (
            nearZeroRate[nonan]
            > np.exp(-decay * meanLogExpression[nonan] + xoffset) + yoffset
        )

        if np.sum(selected) == n:
            break

        if np.sum(selected) < n:
            up = xoffset
            xoffset = (xoffset + low) / 2
        else:
            low = xoffset
            xoffset = (xoffset + up) / 2

    return selected

time: 9.85 ms (started: 2022-09-11 21:09:53 -07:00)


### Select 3000 genes

* Get mean log non-zero expression of each gene
* Get near-zero frequency of each gene
* Find 3000 genes based on the above

In [16]:
mle = calcMeanLogExpression(tasic2018["counts"], threshold=32)
nzr = calcNearZeroRate(tasic2018["counts"], threshold=32)
selectedGenes = featureSelection(mle, nzr, n=3000)

time: 12.4 s (started: 2022-09-11 21:09:53 -07:00)


## Convert to log CPM

In [17]:
counts3k = tasic2018["counts"][:, selectedGenes]

# Compute library sizes
librarySizes = tasic2018["counts"].sum(axis=1)
# Library size normalisation
data = counts3k / librarySizes * 1e6
# Log-transformation
data = np.log2(data + 1)

time: 3.5 s (started: 2022-09-11 21:10:05 -07:00)


Also convert the `data` numpy matrix (a data type which is likely to go away) to an 2D array:

In [18]:
data = data.A1.reshape(data.shape)

time: 927 µs (started: 2022-09-11 21:10:09 -07:00)


### Prepare the target

Create a palette for plotting:

In [19]:
tasic2018_palette = dict(ClusterName=dict(zip(clusterNames, clusterColors)))

time: 1.66 ms (started: 2022-09-11 21:10:09 -07:00)


We can use the good cell names as the index for the `target`. We'll also store the cluster id, name and the per-cell color just in case that's easier to use at some point.

In [20]:
target = pd.DataFrame(
    dict(
        ClusterId=clusters,
        ClusterColor=clusterColors[clusters],
        ClusterName=clusterNames[clusters],
    ),
    index=goodCells,
)

time: 8.48 ms (started: 2022-09-11 21:10:09 -07:00)


In [21]:
target.head()

Unnamed: 0,ClusterId,ClusterColor,ClusterName
F1S4_161216_001_A01,93,#53D385,L5 PT ALM Slco2a1
F1S4_180124_314_A01,72,#33A9CE,L5 IT ALM Npw
F1S4_180124_315_A01,1,#FF88AD,Lamp5 Fam19a1 Pax6
F1S4_180124_315_B01,7,#9440F3,Sncg Slc17a8
F1S4_180124_315_C01,7,#9440F3,Sncg Slc17a8


time: 5.15 ms (started: 2022-09-11 21:10:09 -07:00)


## Data Pipeline

In [22]:
from drnb.dataset import create_data_pipeline

data_pipe = create_data_pipeline(
    convert=dict(dtype="float32", layout="c"),
    data_export=["csv", "npy"],
    target_export=["csv", "pkl"],
    neighbors=dict(
        n_neighbors=[15, 50, 150],
        method="exact",
        metric=["euclidean"],
        file_types=["csv", "npy"],
    ),
    triplets=dict(
        n_triplets_per_point=5,
        seed=1337,
        file_types=["csv", "npy"],
    ),
    verbose=True,
)

time: 4.44 s (started: 2022-09-11 21:10:09 -07:00)


In [23]:
data_result = data_pipe.run(
    "tasic2018",
    data=data,
    target=target,
    target_palette=tasic2018_palette,
    verbose=True,
)

time: 50.6 s (started: 2022-09-11 21:10:13 -07:00)


### PCA 50 pipeline

Preprocessing as done in the berenslab notebook

In [24]:
from drnb.dataset import create_data_pipeline

data_pipe_pca50 = create_data_pipeline(
    reduce=50,
    data_export=["csv", "npy"],
    target_export=["csv", "pkl"],
    neighbors=dict(
        n_neighbors=[15, 50, 150],
        method="exact",
        metric=["euclidean"],
        file_types=["csv", "npy"],
    ),
    triplets=dict(
        n_triplets_per_point=5,
        seed=1337,
        file_types=["csv", "npy"],
    ),
    verbose=True,
)

time: 10.4 ms (started: 2022-09-11 21:11:04 -07:00)


In [25]:
data_pca50_result = data_pipe_pca50.run(
    "tasic2018-pca50",
    data=data,
    target=target,
    target_palette=tasic2018_palette,
    verbose=True,
)

time: 24.2 s (started: 2022-09-11 21:11:04 -07:00)
