In [1]:
import sys
import biom
from biom.util import biom_open
import pandas as pd
import numpy as np

In [2]:
# ref: https://github.com/sjanssen2/ggmap/blob/master/ggmap/snippets.py
def biom2pandas(file_biom, withTaxonomy=False, astype=int):
    """ Load a biom file as a Pandas.DataFrame (features x samples).

    Parameters
    ----------
    file_biom : str
        Path of the biom file to read.
    withTaxonomy : bool
        When True, additionally return one lineage string per feature
        (e.g. OTU or deblur-sequence). Default: False
    astype : type
        Datatype every value of the biom table is cast to. Default: int.
        Pass e.g. float if the table holds relative abundances rather than
        raw read counts.

    Returns
    -------
    A Pandas.DataFrame with one row per observation (feature) and one column
    per sample. If withTaxonomy is True, a tuple (counts, taxonomy) is
    returned instead, where taxonomy is the result of a row-wise apply over
    the observation metadata: the values of all 'taxonomy_*' columns joined
    by ';'.

    Raises
    ------
    IOError
        If file_biom cannot be read.
    ValueError
        If withTaxonomy=True but the biom file has no observation metadata
        or no 'taxonomy_*' metadata columns.
    """
    try:
        biom_table = biom.load_table(file_biom)

        # matrix_data is already laid out observations x samples, so the
        # dense matrix can be labelled directly.
        dense_values = biom_table.matrix_data.todense().astype(astype)
        counts = pd.DataFrame(dense_values,
                              index=biom_table.ids(axis='observation'),
                              columns=biom_table.ids(axis='sample'))

        if not withTaxonomy:
            return counts

        try:
            obs_metadata = biom_table.metadata_to_dataframe('observation')
            rank_columns = [name
                            for name in obs_metadata.columns
                            if name.startswith('taxonomy_')]
            if not rank_columns:
                raise ValueError(('No taxonomy information found in '
                                  'biom file.'))

            def _join_ranks(row):
                # one lineage string per feature, ranks in column order
                return ";".join([row[name] for name in rank_columns])

            return counts, obs_metadata.apply(_join_ranks, axis=1)
        except KeyError:
            raise ValueError(('Biom file does not have any '
                              'observation metadata!'))
    except IOError:
        raise IOError('Cannot read file "%s"' % file_biom)


# Taxonomic ranks, highest to lowest. The lower-cased first letter of each
# name is the rank prefix used in lineage strings (e.g. 'Kingdom' -> 'k__').
# NOTE(review): the original code read these from `settings.RANKS`, which is
# not defined anywhere in this notebook; this tuple presumably mirrors the
# upstream ggmap setting -- confirm.
_TAXONOMIC_RANKS = ('Kingdom', 'Phylum', 'Class', 'Order',
                    'Family', 'Genus', 'Species')


def _complete_lineage(lineage):
    """Return `lineage` with exactly one ';'-separated entry per rank.

    Annotations are matched to ranks by their first character; ranks without
    an annotation are filled with the bare prefix, e.g. 'g__'.
    """
    by_prefix = {}
    for annot in map(str.strip, lineage.split(';')):
        # skip empty fragments, e.g. produced by a trailing ';'
        if annot:
            by_prefix[annot[0].lower()] = annot
    return ";".join(by_prefix.get(rank[0].lower(), rank[0].lower() + '__')
                    for rank in _TAXONOMIC_RANKS)


def pandas2biom(file_biom, table, taxonomy=None, err=sys.stderr):
    """ Writes a Pandas.DataFrame into a biom file.

    Parameters
    ----------
    file_biom: str
        The filename of the BIOM file to be created.
    table: a Pandas.DataFrame
        The table that should be written as BIOM. Rows (index) are used as
        observation ids, columns as sample ids.
    taxonomy : pandas.Series
        Index is taxons corresponding to table, values are lineage strings like
        'k__Bacteria; p__Actinobacteria'. Stored as observation metadata under
        the key 'taxonomy'. Default: None, i.e. no taxonomy is stored.
    err : file-like object
        Stream onto which warnings should be printed. Pass None to silence
        warnings. Default is sys.stderr

    Raises
    ------
    IOError
        If file_biom cannot be written.
    AttributeError
        If taxonomy is given but is not a pandas.Series.
    """
    try:
        bt = biom.Table(table.values,
                        observation_ids=table.index,
                        sample_ids=table.columns)

        # add taxonomy metadata if provided, i.e. is not None
        if taxonomy is not None:
            if not isinstance(taxonomy, pd.Series):
                raise AttributeError('taxonomy must be a pandas.Series!')

            # Features of the table that have no lineage: warn and give them
            # a placeholder lineage. Sorted so warnings are deterministic and
            # so that a list (not a set) is used as the Series index.
            idx_missing_intable = sorted(
                set(table.index) - set(taxonomy.index), key=str)
            if idx_missing_intable:
                if err:
                    err.write(('Warning: following %i taxa are not in the '
                               'provided taxonomy:\n%s\n') % (
                              len(idx_missing_intable),
                              ", ".join(map(str, idx_missing_intable))))
                missing = pd.Series(
                    index=idx_missing_intable,
                    name='taxonomy',
                    data='k__missing_lineage_information')
                # Series.append was removed in pandas 2.0
                taxonomy = pd.concat([taxonomy, missing])

            # Lineages given for features that are not part of the table.
            idx_missing_intaxonomy = sorted(
                set(taxonomy.index) - set(table.index), key=str)
            if idx_missing_intaxonomy and err:
                err.write(('Warning: following %i taxa are not in the '
                           'provided count table, but in taxonomy:\n%s\n') % (
                          len(idx_missing_intaxonomy),
                          ", ".join(map(str, idx_missing_intaxonomy))))

            # fill missing rank annotations with rank__
            # (Series.iteritems was removed in pandas 2.0; items() is the
            # long-standing equivalent)
            t = {taxon: {'taxonomy': _complete_lineage(linstr)}
                 for taxon, linstr in taxonomy.items()}
            bt.add_metadata(t, axis='observation')

        with biom_open(file_biom, 'w') as f:
            bt.to_hdf5(f, "example")
    except IOError:
        raise IOError('Cannot write to file "%s"' % file_biom)

In [3]:
# Load the Qiita reference-hit BIOM table as a DataFrame of integer counts
# (rows = features, columns = samples; biom2pandas casts to int by default).
microb_unrare = biom2pandas('../Qiita_Study11274_ID57316/57316_reference-hit.biom')

In [5]:
# (number of features, number of samples)
microb_unrare.shape

(43660, 599)

In [6]:
# Peek at the first three features; column labels are the raw sample IDs.
microb_unrare.head(3)

Unnamed: 0,11274.MN1837,11274.PO7410,11274.SD8837,11274.MN2250,11274.SD8603,11274.PI5263,11274.MN2373,11274.PI5340,11274.MN1590,11274.PI4847,...,11274.BI0904,11274.PA3993,11274.PA3846,11274.PO7476,11274.BI0730,11274.PO6998,11274.BI0539,11274.MN2181,11274.PO7226,11274.BI0552
TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGGAGATTAAGCGTGTTGTGAAATGTAGACGCTCAACGTCTGCACTGCAGCGCGAACTGGTTTCCTTGAGTACGCACAAAGTGGGCGGAATTCGTGG,9178,4,2,2,6,5,2,2,0,3,...,5,0,0,7,493,3,0,0,524,0
TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGGAGATTAAGCGTGTTGTGAAATGTAGATGCTCAACATCTGCACTGCAGCGCGAACTGGTTTCCTTGAGTACGCACAAAGTGGGCGGAATTCGTGG,3054,0,4,0,0,3,0,0,0,0,...,0,0,0,0,291,2,0,0,1209,0
TACGGAAGGTCCAGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGCAGGCGGCGGCGTAAGTCAGTTGTGAAATCGTGCGGCTTAACCGTGCAATTGCAGTTGATACTGCGTCGCTTGAGTGCACACAGGGATGTTGGAATTCATGG,1608,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# correct subject ID: drop the 6-character study prefix ('11274.') so that the
# columns hold the bare subject ID. The prefix check makes this cell safe to
# re-run: IDs that were already shortened are left untouched instead of being
# sliced a second time (which would turn them into empty strings).
microb_unrare.columns = [sample_id[6:13] if sample_id.startswith('11274.') else sample_id
                         for sample_id in microb_unrare.columns]

In [8]:
# Check the relabelled columns: the study prefix should be gone.
microb_unrare.head(3)

Unnamed: 0,MN1837,PO7410,SD8837,MN2250,SD8603,PI5263,MN2373,PI5340,MN1590,PI4847,...,BI0904,PA3993,PA3846,PO7476,BI0730,PO6998,BI0539,MN2181,PO7226,BI0552
TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGGAGATTAAGCGTGTTGTGAAATGTAGACGCTCAACGTCTGCACTGCAGCGCGAACTGGTTTCCTTGAGTACGCACAAAGTGGGCGGAATTCGTGG,9178,4,2,2,6,5,2,2,0,3,...,5,0,0,7,493,3,0,0,524,0
TACGGAAGGTCCGGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCCGGAGATTAAGCGTGTTGTGAAATGTAGATGCTCAACATCTGCACTGCAGCGCGAACTGGTTTCCTTGAGTACGCACAAAGTGGGCGGAATTCGTGG,3054,0,4,0,0,3,0,0,0,0,...,0,0,0,0,291,2,0,0,1209,0
TACGGAAGGTCCAGGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGCAGGCGGCGGCGTAAGTCAGTTGTGAAATCGTGCGGCTTAACCGTGCAATTGCAGTTGATACTGCGTCGCTTGAGTGCACACAGGGATGTTGGAATTCATGG,1608,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Write the relabelled count table back to BIOM (no taxonomy is attached).
pandas2biom('../data/57316_mros_deblur_otus_unrare.biom', microb_unrare)

### Simple examination of the data

In [9]:
# Distribution of the total count per sample (column sums).
microb_unrare.sum(axis=0).describe()

count      599.000000
mean     34003.534224
std       6480.764579
min      10117.000000
25%      30310.000000
50%      33768.000000
75%      37009.500000
max      78762.000000
dtype: float64

In [10]:
# Smallest single count anywhere in the table.
microb_unrare.values.min()

0

In [11]:
# Largest single count anywhere in the table.
microb_unrare.values.max()

28840

In [12]:
# Mean over every cell of the table (zeros included).
microb_unrare.values.mean()

0.7788257953208011

In [13]:
# Population standard deviation (ddof=0) over every cell of the table.
microb_unrare.values.std()

49.61767283321315

In [14]:
# Percentage of table cells holding a non-zero count.
np.count_nonzero(microb_unrare.values) / microb_unrare.values.size * 100

0.7384845868476778