In [1]:
import os
import pandas as pd
import qiime2 as q2
import numpy as np
from skbio import TreeNode
from biom import load_table, Table
from qiime2.plugins.gemelli.actions import ctf
from qiime2.plugins.deicode.actions import rpca
from qiime2.plugins.longitudinal.actions import volatility


In [2]:
# Build one (table, metadata) pair per body site and time window for CTF.
#
# Time windows: label -> the sampling days (values of
# `date_sampling_category_days_continuous`) that belong to that window.
splits_ = {'0-2': [0.0, 1.0, 2.0],
           '7-360': [7.0, 14.0, 30.0,
                     60, 120, 180, 360]}

# Column holding the sampling day of each sample.
TIME_COLUMN = 'date_sampling_category_days_continuous'
# A subject is kept when it is missing at most this many of a window's days.
# NOTE(review): for the three-day '0-2' window this means a single sampled day
# is enough for a subject to be kept -- confirm that this is intended.
MAX_MISSING_TIMEPOINTS = 2
# A feature is kept when its total count in the body-site table exceeds this.
MIN_FEATURE_COUNT = 10

# (body_site, window label) -> [biom Table, metadata DataFrame]
ctf_splits = {}

# The tree is identical for every split: load it and collect tip names once.
tree = q2.Artifact.load('../data/processed-data/tree.qza').view(TreeNode)
tree_tips = {node.name for node in tree.tips()}

for body_site in ['Baby-Feces',
                  'Baby-Mouth',
                  'Baby-Forearm']:

    # Per-body-site inputs do not depend on the time window, so they are
    # loaded (and pre-filtered) once here instead of once per window.
    mf_site = q2.Metadata.load('../data/split-data/%s/metadata.qza' % body_site).to_dataframe()
    mf_site[TIME_COLUMN] = mf_site[TIME_COLUMN].astype(float)

    bt_site = q2.Artifact.load('../data/split-data/%s/table.qza' % body_site).view(Table).copy()
    # pre-filter: drop empty samples, then low-count features.
    # Table.sum works on the sparse matrix, so nothing is densified.
    bt_site = bt_site.filter(bt_site.ids()[bt_site.sum('sample') > 0])
    freq_filts = bt_site.sum('observation') > MIN_FEATURE_COUNT
    bt_site = bt_site.filter(bt_site.ids('observation')[freq_filts], axis='observation')

    for t_range_, t_use in splits_.items():

        # keep only the samples taken on one of this window's days
        mf_tmp = mf_site[mf_site[TIME_COLUMN].isin(t_use)]

        # Count the distinct sampled days per subject. Only rows with a known
        # country other than 'PuertoRico' are considered, so subjects without
        # such rows are never selected.
        eligible = mf_tmp[~mf_tmp.country.isin(['PuertoRico'])].dropna(subset=['country'])
        n_timepoints = eligible.groupby('subjectid_unique')[TIME_COLUMN].nunique()
        use_subjects = n_timepoints.index[n_timepoints >= len(t_use) - MAX_MISSING_TIMEPOINTS]
        mf_tmp = mf_tmp[mf_tmp.subjectid_unique.isin(use_subjects)]
        print(len(use_subjects))
        print(mf_tmp.shape)

        # Table.filter works in place, so give every window its own copy.
        bt_tmp = bt_site.copy()

        # subset to the samples present in both the table and the metadata
        shared_ = set(bt_tmp.ids()) & set(mf_tmp.index)
        bt_tmp = bt_tmp.filter(shared_)
        # Order the metadata like the table: reindexing with the bare set
        # would give a row order that changes from one kernel run to the next.
        mf_tmp = mf_tmp.reindex(bt_tmp.ids())

        # keep only the features that are tips of the tree
        in_tree = [feature_id for feature_id in bt_tmp.ids('observation')
                   if feature_id in tree_tips]
        bt_tmp = bt_tmp.filter(in_tree, axis='observation')

        # save the split
        ctf_splits[(body_site, t_range_)] = [bt_tmp, mf_tmp]


143
(223, 48)
67
(402, 48)
75
(190, 48)
43
(273, 48)
69
(173, 48)
40
(256, 48)


In [3]:
# Run CTF on every (body site, time window) split and write the inputs and
# all results to ../data/ctf-results/<body_site>-<window>/.
for (body_site, t_range_), (bt_tmp, mf_tmp) in ctf_splits.items():

    # import into qiime2
    print((body_site, t_range_))
    q2bt_tmp = q2.Artifact.import_data('FeatureTable[Frequency]', bt_tmp)
    q2mf_tmp = q2.Metadata(mf_tmp)

    # run ctf: subjects are identified by `subjectid_unique`, states (time
    # points) by `date_sampling_category_days_continuous`.
    # (the name `fecal_res` is historical; it holds the results of whichever
    # body site is being processed)
    fecal_res = ctf(q2bt_tmp,
                    q2mf_tmp,
                    'subjectid_unique',
                    'date_sampling_category_days_continuous')

    # save split
    directory = '../data/ctf-results/%s-%s' % (body_site, t_range_)
    # makedirs also creates a missing parent directory and, with exist_ok,
    # does not fail when the directory is already there.
    os.makedirs(directory, exist_ok=True)
    q2bt_tmp.save(os.path.join(directory, 'table.qza'))
    q2mf_tmp.save(os.path.join(directory, 'metadata.qza'))
    # `_fields` lists the names of the action's outputs; save each output
    # under its own name.
    for name_ in fecal_res._fields:
        getattr(fecal_res, name_).save(os.path.join(directory, name_))


('Baby-Feces', '0-2')




('Baby-Feces', '7-360')




('Baby-Mouth', '0-2')




('Baby-Mouth', '7-360')




('Baby-Forearm', '0-2')




('Baby-Forearm', '7-360')


