In [1]:
import os
import glob
import numpy as np
from biom import Table
import qiime2 as q2
import pandas as pd
from biom.util import biom_open
from scipy.stats import sem


# Loading Processed Datasets

In [2]:
# Load the processed table and the sample metadata from their QIIME 2 files.
q2bt = q2.Artifact.load('../data/processed-data/table.qza')
q2mf = q2.Metadata.load('../data/processed-data/metadata.qza')
# Work with plain objects from here on: a biom Table and a metadata dataframe.
bt = q2bt.view(Table)
mf = q2mf.to_dataframe()
mf.shape # (Number of Samples, Number of Columns)

(13516, 87)

In [8]:
bt.shape # (Number of ASVs, Number of Samples)

(193216, 13516)

In [10]:
# Count the number of samples with more than 1000 reads.
# np.count_nonzero tallies the True entries of the boolean mask in a single
# vectorized call (the builtin sum() would iterate the array element by
# element) and returns a plain Python int.
np.count_nonzero(bt.sum(axis='sample') > 1000)

11585

In [11]:
# Total number of reads summed over the whole table.
bt.sum()

array(7.70779332e+08)

In [9]:
# Keep only the samples with more than 1000 reads, then drop the observations
# (ASVs) that have no reads left in the retained samples.
bt_mbbs = bt.copy()
filt_ = bt_mbbs.ids()[bt_mbbs.sum(axis='sample') > 1000]
bt_mbbs = bt_mbbs.filter(filt_)
bt_mbbs = bt_mbbs.filter(bt_mbbs.ids('observation')[bt_mbbs.sum('observation') > 0],
                         axis='observation')

# Per-sample read depth, computed once and reused by every statistic below
# instead of re-summing the whole table for each print.
depth_per_sample = bt_mbbs.sum(axis='sample')

print('demisons of data table', bt_mbbs.shape)
print('total reads', bt_mbbs.sum())
print('Average depth per sample (after filtering out <=1000 reads samples):', np.mean(depth_per_sample))
print('Min depth per sample', np.min(depth_per_sample))
print('Max depth per sample', np.max(depth_per_sample))
# NOTE: np.std defaults to ddof=0 (population SD) while scipy.stats.sem
# defaults to ddof=1, so SE is not exactly SD / sqrt(n) here.
print('SD depth per sample', np.std(depth_per_sample))
print('SE depth per sample', sem(depth_per_sample))
print('Median depth per sample', np.median(depth_per_sample))


demisons of data table (192457, 12759)
total reads 794469124.0
Average depth per sample (after filtering out <=1000 reads samples): 62267.35041931186
Min depth per sample 1005.0
Max depth per sample 765606.0
SD depth per sample 62405.85234945933
SE depth per sample 552.5022110091264
Median depth per sample 41397.0


In [10]:
# List the metadata columns that are available.
mf.columns

Index(['qiita_study', 'prep_name', 'run_name', '100nt_deblur', 'sequencing_id',
       'baby_birth_date', 'baby_sex', 'birth_mode', 'birth_mode_ms',
       'body_site_corrected', 'body_site_orig', 'body_site_type',
       'collection_timestamp', 'country', 'current_abx',
       'current_breast_feeding', 'current_formula', 'current_solids',
       'date_sampling', 'date_sampling_category',
       'date_sampling_category_days', 'date_sampling_category_days_continuous',
       'description', 'elevation', 'empo_1', 'empo_2', 'empo_3', 'env_biome',
       'env_feature', 'env_material', 'env_package', 'exclusive_breastfeed',
       'familyid', 'familyid_unique', 'geo_loc_name', 'hospital_name',
       'host_age', 'host_age_units', 'host_body_habitat',
       'host_body_mass_index', 'host_body_product', 'host_body_site',
       'host_common_name', 'host_height', 'host_height_units',
       'host_scientific_name', 'host_subject_id', 'host_taxid', 'host_weight',
       'host_weight_units', 'irb

In [11]:
# Number of samples per Qiita study.
mf.qiita_study.value_counts()

10894.0    11287
11648.0     1044
1718.0       505
Name: qiita_study, dtype: int64

In [12]:
# Number of samples per value of body_site_corrected.
mf.body_site_corrected.value_counts()

Feces            2850
Mouth            2091
Right_Forearm    1505
Forehead         1281
Nose             1199
Vagina           1064
Right_Areola      947
Right_Hand        916
Anus              307
Forearm           304
Breast_Milk       146
Right_Foot         85
Control            79
Left_Hand          54
Breast              8
Name: body_site_corrected, dtype: int64

In [13]:
# Collapse to one row per subject (taking the first birth_mode_ms value found
# for that subject), then count how many subjects fall in each category.
mf.groupby('subjectid_unique').agg({'birth_mode_ms':'first'}).birth_mode_ms.value_counts()

Vag       200
CS        100
CSseed     53
CSself      2
Name: birth_mode_ms, dtype: int64

In [14]:
# Number of samples per manuscript_use category.
# NOTE(review): the recorded output lists 'No-misc' and 'No-Misc' as separate
# categories; this looks like inconsistent casing of one label. Confirm.
mf.manuscript_use.value_counts()

Possible          8280
No-LowDepth       1367
No-Replicates      893
No-notrelevant     518
No-misc            457
No-Misc            104
No-control          20
Name: manuscript_use, dtype: int64

In [16]:
# Split the metadata by (mom_baby, manuscript_use, body_site_corrected) and,
# inside each group, keep only the rows that have both a subject id and a
# sampling day. Then show how many rows remain in every group.
map_per = {
    group_key: group_mf.dropna(
        subset=['subjectid_unique', 'date_sampling_category_days_continuous']
    )
    for group_key, group_mf in mf.groupby(
        ['mom_baby', 'manuscript_use', 'body_site_corrected']
    )
}
{group_key: len(group_mf) for group_key, group_mf in map_per.items()}


{('Baby', 'No-LowDepth', 'Anus'): 8,
 ('Baby', 'No-LowDepth', 'Control'): 0,
 ('Baby', 'No-LowDepth', 'Feces'): 159,
 ('Baby', 'No-LowDepth', 'Forehead'): 66,
 ('Baby', 'No-LowDepth', 'Mouth'): 111,
 ('Baby', 'No-LowDepth', 'Nose'): 101,
 ('Baby', 'No-LowDepth', 'Right_Foot'): 1,
 ('Baby', 'No-LowDepth', 'Right_Forearm'): 183,
 ('Baby', 'No-LowDepth', 'Right_Hand'): 60,
 ('Baby', 'No-LowDepth', 'Vagina'): 6,
 ('Baby', 'No-Misc', 'Feces'): 104,
 ('Baby', 'No-Replicates', 'Feces'): 354,
 ('Baby', 'No-Replicates', 'Forehead'): 33,
 ('Baby', 'No-Replicates', 'Left_Hand'): 32,
 ('Baby', 'No-Replicates', 'Mouth'): 193,
 ('Baby', 'No-Replicates', 'Nose'): 5,
 ('Baby', 'No-Replicates', 'Right_Forearm'): 3,
 ('Baby', 'No-Replicates', 'Right_Hand'): 36,
 ('Baby', 'No-Replicates', 'Vagina'): 27,
 ('Baby', 'No-control', 'Control'): 1,
 ('Baby', 'No-notrelevant', 'Anus'): 74,
 ('Baby', 'No-notrelevant', 'Feces'): 5,
 ('Baby', 'No-notrelevant', 'Mouth'): 50,
 ('Baby', 'No-notrelevant', 'Right_Forear

In [None]:
# Rename all Right_Forearm 

In [241]:
# use for now
use_ = [('Baby', 'Feces'),
        ('Baby', 'Mouth'),
        ('Baby', 'Right_Forearm'),
        ('Mom', 'Vagina'),
        ('Mom', 'Feces'),
        ('Mom', 'Mouth'),
        ('Mom', 'Right_Forearm')]


def _subset_table(table, sample_ids):
    """Return a copy of ``table`` limited to ``sample_ids``.

    Observations whose total count is zero within the kept samples are
    removed. The input ``table`` itself is left untouched (the filtering is
    done on a copy).
    """
    subset = table.copy()
    subset = subset.filter(sample_ids)
    subset = subset.filter(subset.ids('observation')[subset.sum('observation') > 0],
                           axis='observation')
    return subset


def _write_split(out_dir, table, metadata):
    """Write one table/metadata pair into ``out_dir``.

    Four files are produced: ``table.qza``, ``metadata.qza``,
    ``metadata.tsv`` and ``table.biom``.

    Parameters
    ----------
    out_dir : str
        Destination directory. It is created (with any missing parents) when
        absent and reused when it already exists, so the cell can be re-run.
    table : biom Table
        Count table to export.
    metadata : DataFrame
        Sample metadata to export.

    Returns
    -------
    tuple
        ``(q2_table, q2_metadata)``: the QIIME 2 objects that were saved.
    """
    q2_table = q2.Artifact.import_data('FeatureTable[Frequency]', table)
    q2_metadata = q2.Metadata(metadata)
    # os.mkdir raised FileExistsError on every re-run of this cell and failed
    # when '../data/split-data' was missing; makedirs handles both cases.
    os.makedirs(out_dir, exist_ok=True)
    q2_table.save(os.path.join(out_dir, 'table.qza'))
    q2_metadata.save(os.path.join(out_dir, 'metadata.qza'))
    metadata.to_csv(os.path.join(out_dir, 'metadata.tsv'), sep='\t')
    with biom_open(os.path.join(out_dir, 'table.biom'), 'w') as f:
        table.to_hdf5(f, "bs_type")
    return q2_table, q2_metadata


# Life-stage groupings for babies: label -> [lower, upper] sampling-day bounds,
# where the lower bound is exclusive and the upper bound is inclusive.
# NOTE(review): the labels look like week ranges; confirm the intended unit.
life_stage_days = {'0-2':[-2, 14],
                   '2-4':[14, 30],
                   '4-17':[30, 120],
                   '17-26':[120, 180],
                   '26-51':[180, 360],
                   '51-end':[360, 6000]}
# Invert the dict into a day -> label lookup. It does not depend on the loop
# variable, so it is built once here instead of once per Baby body site.
life_stage = {d:ls for ls, dr in life_stage_days.items()
              for d in range(dr[0]+1, dr[1]+1)}

# Metadata split by (mom_baby, body_site_corrected); rows lacking a subject id
# or a sampling day are dropped from every group.
map_per = {bs_:mfbs.dropna(subset=['subjectid_unique',
                                     'date_sampling_category_days_continuous'])
           for bs_, mfbs in mf.groupby(['mom_baby', 'body_site_corrected'])}

for mb_bs in use_:
    # get subset: samples taken on or before day 360
    mf_mbbs = map_per[mb_bs].drop(['body_site_corrected'], axis=1)
    mf_mbbs = mf_mbbs[mf_mbbs.date_sampling_category_days_continuous <= 360]
    out_ = os.path.join('../data/split-data',
                        '-'.join(list(mb_bs)))
    # filter the table to those samples and align the metadata to its order
    bt_mbbs = _subset_table(bt, mf_mbbs.index)
    mf_mbbs = mf_mbbs.reindex(bt_mbbs.ids())

    # make life-stage col. (babies only)
    if mb_bs[0] == 'Baby':
        mf_mbbs['date_sampling_category_days_continuous'] = (
            mf_mbbs['date_sampling_category_days_continuous'].astype(float))
        # A day outside the lookup (non-integer, or <= -2) raises KeyError.
        mf_mbbs['life_stage'] = [life_stage[d]
                                 for d in mf_mbbs['date_sampling_category_days_continuous']]
    # make subject id
    mf_mbbs['host_subject_id'] = mf_mbbs.subjectid_unique.values

    # save and write
    q2bt_mbbs, q2mf_mbbs = _write_split(out_, bt_mbbs, mf_mbbs)
    print(mb_bs)
    print(mf_mbbs.shape)

    # split by life-stage (babies only)
    if mb_bs[0] == 'Baby':
        for ls_, mf_mbbs_ls in mf_mbbs.groupby('life_stage'):
            out_ls = out_ + '-%s' % (ls_)
            # subset table
            bt_mbbs_ls = _subset_table(bt_mbbs, mf_mbbs_ls.index)
            mf_mbbs_ls = mf_mbbs_ls.reindex(bt_mbbs_ls.ids())
            # save and write
            q2bt_mbbs_ls, q2mf_mbbs_ls = _write_split(out_ls, bt_mbbs_ls, mf_mbbs_ls)
            print(mb_bs, ls_)
            print(mf_mbbs_ls.shape)


('Baby', 'Feces')
(1780, 53)
('Baby', 'Feces') 0-2
(548, 53)
('Baby', 'Feces') 17-26
(183, 53)
('Baby', 'Feces') 2-4
(287, 53)
('Baby', 'Feces') 26-51
(423, 53)
('Baby', 'Feces') 4-17
(339, 53)
('Baby', 'Mouth')
(1014, 53)
('Baby', 'Mouth') 0-2
(401, 53)
('Baby', 'Mouth') 17-26
(80, 53)
('Baby', 'Mouth') 2-4
(177, 53)
('Baby', 'Mouth') 26-51
(206, 53)
('Baby', 'Mouth') 4-17
(150, 53)
('Baby', 'Right_Forearm')
(756, 53)
('Baby', 'Right_Forearm') 0-2
(323, 53)
('Baby', 'Right_Forearm') 17-26
(45, 53)
('Baby', 'Right_Forearm') 2-4
(143, 53)
('Baby', 'Right_Forearm') 26-51
(160, 53)
('Baby', 'Right_Forearm') 4-17
(85, 53)
('Mom', 'Vagina')
(1007, 52)
('Mom', 'Feces')
(808, 52)
('Mom', 'Mouth')
(785, 52)
('Mom', 'Right_Forearm')
(747, 52)
