In [1]:
import os
import glob
import numpy as np
import pandas as pd
import qiime2 as q2
from biom import Table, load_table
from qiime2.plugins.feature_table.methods import rarefy
from biom.util import biom_open

In [2]:
# Load the processed feature table and the sample metadata as QIIME 2 objects.
q2bt = q2.Artifact.load('../data/processed-data/table.qza')
q2mf = q2.Metadata.load('../data/processed-data/metadata.qza')
# In-memory views used by the cells below: a biom Table and a pandas DataFrame.
bt, mf = q2bt.view(Table), q2mf.to_dataframe()
# (rows, columns) of the metadata frame
mf.shape

(12319, 52)

In [3]:
# additional mom samples
# Each (mom_baby, body_site_corrected) pair listed here is written to its own
# folder under ../data/split-data as table.qza / metadata.qza / metadata.tsv /
# table.biom.
use_ = [('Mom', 'Nose'),
        ('Mom', 'Right_Areola')]
# Metadata per (mom_baby, body_site_corrected) group; rows with no subject id
# or no sampling-day value are dropped up front.
map_per = {bs_:mfbs.dropna(subset=['subjectid_unique',
                                     'date_sampling_category_days_continuous'])
           for bs_, mfbs in mf.groupby(['mom_baby', 'body_site_corrected'])}
for mb_bs in use_:
    # get subset
    mf_mbbs = map_per[mb_bs].drop(['body_site_corrected'], axis=1)
    # keep samples with a sampling value of at most 360 (presumably days,
    # going by the column name)
    mf_mbbs = mf_mbbs[mf_mbbs.date_sampling_category_days_continuous <= 360]
    # work on a copy: Table.filter modifies the table in place by default,
    # and the full table `bt` is reused for every group
    bt_mbbs = bt.copy()
    out_ = os.path.join('../data/split-data',
                        '-'.join(mb_bs))
    # filter: keep this group's samples, then drop features left with no counts
    bt_mbbs = bt_mbbs.filter(mf_mbbs.index)
    bt_mbbs = bt_mbbs.filter(bt_mbbs.ids('observation')[bt_mbbs.sum('observation') > 0],
                             axis='observation')
    # align the metadata rows with the sample order of the filtered table
    mf_mbbs = mf_mbbs.reindex(bt_mbbs.ids())

    # make subject id
    mf_mbbs['host_subject_id'] = mf_mbbs.subjectid_unique.values

    # wrap as QIIME 2 objects
    q2bt_mbbs =  q2.Artifact.import_data('FeatureTable[Frequency]', bt_mbbs)
    q2mf_mbbs = q2.Metadata(mf_mbbs)

    # write out
    # makedirs(exist_ok=True) instead of mkdir: re-running this cell must not
    # fail because the folder already exists, and a missing parent folder is
    # created as well
    os.makedirs(out_, exist_ok=True)
    q2bt_mbbs.save(os.path.join(out_,'table.qza'))
    q2mf_mbbs.save(os.path.join(out_,'metadata.qza'))
    mf_mbbs.to_csv(os.path.join(out_,'metadata.tsv'), sep='\t')
    with biom_open(os.path.join(out_,'table.biom'), 'w') as f:
        # second argument is the "generated by" label stored in the BIOM file
        bt_mbbs.to_hdf5(f, "bs_type")
    print(mb_bs)
    print(mf_mbbs.shape)

('Mom', 'Nose')
(440, 52)
('Mom', 'Right_Areola')
(947, 52)


In [4]:
# Split-data folders to rarefy, keyed by a short body-site label.
tables_ = {
    'fecal': '../data/split-data/Mom-Feces',
    'oral': '../data/split-data/Mom-Mouth',
    'skin': '../data/split-data/Mom-Right_Forearm',
    'vagina': '../data/split-data/Mom-Vagina/',
}

# Rarefy each table in turn and save the result next to its input.
for k_, path_ in tables_.items():
    print('Starting: %s' % k_)
    # Read the BIOM file and lay it out as a dense DataFrame with one row per
    # sample id and one column per observation id.
    table_ = load_table(os.path.join(path_, 'table.biom'))
    table_ = pd.DataFrame(table_.matrix_data.toarray().T,
                          index=table_.ids(),
                          columns=table_.ids('observation'))
    tq2able_ = q2.Artifact.import_data('FeatureTable[Frequency]', table_)
    # Depth is fixed at 5000. A data-driven alternative that was left disabled:
    #   int(max(table_.sum(1).min(), 1250))
    rar_depth = 5000
    print('Rarefy-depth %i' % rar_depth)
    # Run the rarefaction and store the rarefied artifact.
    table_rar = rarefy(tq2able_, rar_depth).rarefied_table
    table_rar.save(os.path.join(path_, 'rarefy-table.qza'))


Starting: fecal
Rarefy-depth 5000
Starting: oral
Rarefy-depth 5000
Starting: skin
Rarefy-depth 5000
Starting: vagina
Rarefy-depth 5000


In [5]:
# Folders of the additional mom body sites to rarefy, keyed by a short label.
tables_ = {
    'nasal': '../data/split-data/Mom-Nose',
    'skin': '../data/split-data/Mom-Right_Areola',
}

# Same procedure for every folder: load, convert, rarefy, save.
for k_, path_ in tables_.items():
    print('Starting: %s' % k_)
    # Load the BIOM table, then build a dense DataFrame whose rows are the
    # sample ids and whose columns are the observation ids.
    table_ = load_table(os.path.join(path_, 'table.biom'))
    table_ = pd.DataFrame(table_.matrix_data.toarray().T,
                          index=table_.ids(),
                          columns=table_.ids('observation'))
    tq2able_ = q2.Artifact.import_data('FeatureTable[Frequency]', table_)
    # Hard-set depth of 5000; the disabled alternative was
    #   int(max(table_.sum(1).min(), 1250))
    rar_depth = 5000
    print('Rarefy-depth %i' % rar_depth)
    # Rarefy and write the result into the same folder as the input table.
    table_rar = rarefy(tq2able_, rar_depth).rarefied_table
    table_rar.save(os.path.join(path_, 'rarefy-table.qza'))

Starting: nasal
Rarefy-depth 5000
Starting: skin
Rarefy-depth 5000


In [25]:
# Total count per sample of the full table `bt`, written as a two-column CSV
# (sample id, summed count) with no index column.
bt_sum_count = pd.DataFrame({
    'sample_name': bt.ids(axis="sample"),
    'seqcount': bt.sum(axis="sample"),
})
bt_sum_count.to_csv("../data/processed-data/table_sum_count.csv", index=False)