In [1]:
import os
import sys
import re
from pathlib import Path

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd
import scipy.sparse as sp_sparse

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import tables

# Project level imports
from larval_gonad.notebook import Nb
from larval_gonad.io import cellranger_umi
from larval_gonad.config import memory

In [4]:
NUCS = ['A', 'C', 'G', 'T']


def decompress_seq(x: int, length=16):
    """Decode a 2-bit packed DNA sequence into a string.

    Every base takes two bits, used as an index into ``NUCS``. The
    least-significant pair holds the last base of the sequence. When the top
    bit (bit 63) of the value is set, the result is ``'N' * length``.

    Adapted from https://github.com/10XGenomics/cellranger

    cellranger/lib/python/cellranger/utils.py

    Parameters
    ----------
    x : int
        Packed sequence value; it is converted to an unsigned 64-bit integer.
    length : int
        Number of bases to unpack (at most 31). For barcodes this can be
        found in the molecular info hdf5 file from 10x genome.
        molInfo.get_node_attr('/metrics', 'chemistry_barcode_read_length')

    Returns
    -------
    str
        The decoded sequence, ``length`` characters long.

    """
    word_size = 64
    invalid_flag = 1 << (word_size - 1)
    low_pair = np.uint64(0b11)
    pair_width = np.uint64(2)

    x = np.uint64(x)
    assert length <= (word_size / 2 - 1)
    if x & invalid_flag:
        return 'N' * length

    bases = bytearray(length)
    # Fill right-to-left: the lowest two bits encode the final base.
    for pos in reversed(range(length)):
        bases[pos] = ord(NUCS[x & low_pair])
        x = x >> pair_width
    return bases.decode()


def two_bit_mapper(iterable, length=16):
    """Return a dictionary mapping 2bit encoded Seqs.

    Parameters
    ----------
    iterable : list-like
        Unique list of 2bit encoded sequences.
    length : int
        Number of bases to unpack from each value; forwarded to
        ``decompress_seq``. Defaults to 16, the same default that
        ``decompress_seq`` uses.

    Returns
    -------
    dict : Mapper from encoded to decoded

    """
    return {k: decompress_seq(k, length=length) for k in iterable}


def decode_cell_names(iterable, length=16):
    """Use two_bit_mapper to decode cell names.

    Each distinct encoded value is decoded once and the result is reused for
    every occurrence of that value.

    Parameters
    ----------
    iterable : np.array
        An array of twobit encoded cell names.
    length : int
        Number of bases to unpack from each value; forwarded to
        ``two_bit_mapper``. Defaults to 16.

    Returns
    -------
    list of str
        Decoded sequences, in the same order as ``iterable``.

    """
    mapper = two_bit_mapper(np.unique(iterable), length=length)
    return [mapper[x] for x in iterable]


def cellranger_umi(fname):
    """Read a molecule-info HDF5 file into a long-format table.

    Reads the ``barcode``, ``umi``, ``gene``, ``reads`` and ``gene_ids``
    nodes from the root of the file, decodes the 2-bit packed barcodes and
    UMIs, and replaces each gene index with its label from ``gene_ids``.

    Parameters
    ----------
    fname : str
        Path handed to ``tables.open_file``.

    Returns
    -------
    pd.DataFrame
        Columns ``cell_id``, ``umi``, ``gene`` and ``read_cnt``, one row per
        entry of the arrays read from the file.

    """
    # NOTE: this definition shadows the `cellranger_umi` imported from
    # `larval_gonad.io` in the notebook's import cell.
    with tables.open_file(fname, 'r') as f:
        root = f.get_node('/')
        cell_ids = root.barcode.read()
        umi = root.umi.read()
        gene = root.gene.read()
        read_cnts = root.reads.read()
        gene_ids = root.gene_ids.read()

    # Append a sentinel label so that a gene index equal to the number of
    # real gene ids is a valid lookup (presumably that index marks molecules
    # not assigned to any gene -- confirm against the file format).
    # Converting to object dtype gives one Python string per label; the
    # fancy-index below then yields an array whose rows share those strings.
    # Indexing element by element in a Python loop would instead allocate a
    # separate NumPy scalar for every row.
    gene_labels = np.append(gene_ids, ['Not a Gene']).astype(object)
    gene = gene_labels[gene]

    cell_names = decode_cell_names(cell_ids)
    # NOTE(review): UMIs are decoded with the default length of 16. If the
    # UMI read length is shorter than that, the decoded strings carry leading
    # 'A' padding -- confirm the UMI length for this chemistry.
    umi = decode_cell_names(umi)

    return pd.DataFrame(dict(
        cell_id=cell_names,
        umi=umi,
        gene=gene,
        read_cnt=read_cnts
    ))

In [5]:
rep = 1
# Load this replicate's molecule table, keep only rows assigned to a gene,
# and save the result as a tab-separated file.
umi = (
    cellranger_umi(f'../output/scrnaseq-wf/scrnaseq_samples/testis{rep}_force/outs/molecule_info.h5')
    .query('gene != "Not a Gene"')
)
umi.to_csv(
    f'../output/notebook/2018-11-01_make_stack_for_salsa_rep{rep}.tsv',
    sep='\t',
    index=False,
)

In [6]:
rep = 2
# If an earlier cell has run, `umi` still references that replicate's table.
# Drop the reference before loading so the old and new tables do not have to
# fit in memory together (this cell's saved output was a MemoryError).
umi = None
# Load this replicate's molecule table, keep only rows assigned to a gene,
# and save the result as a tab-separated file.
umi = (
    cellranger_umi(f'../output/scrnaseq-wf/scrnaseq_samples/testis{rep}_force/outs/molecule_info.h5')
    .query('gene != "Not a Gene"')
)
umi.to_csv(
    f'../output/notebook/2018-11-01_make_stack_for_salsa_rep{rep}.tsv',
    sep='\t',
    index=False,
)

MemoryError: 

In [None]:
rep = 3
# If an earlier cell has run, `umi` still references that replicate's table.
# Drop the reference before loading so the old and new tables do not have to
# fit in memory together (the saved output of the rep = 2 cell is a
# MemoryError).
umi = None
# Load this replicate's molecule table, keep only rows assigned to a gene,
# and save the result as a tab-separated file.
umi = (
    cellranger_umi(f'../output/scrnaseq-wf/scrnaseq_samples/testis{rep}_force/outs/molecule_info.h5')
    .query('gene != "Not a Gene"')
)
umi.to_csv(
    f'../output/notebook/2018-11-01_make_stack_for_salsa_rep{rep}.tsv',
    sep='\t',
    index=False,
)