In [1]:
import os
import glob
import numpy as np
from biom import Table, load_table
import qiime2 as q2
import pandas as pd
from biom.util import biom_open
from scipy.stats import sem
from qiime2.plugins.feature_table.methods import rarefy


# Loading Processed Datasets

In this section, the metadata from `../data/processed-data/metadata.qza` appears to be missing some information.

In [2]:
# Load the processed feature table (artifact) and the sample metadata.
q2bt, q2mf = (q2.Artifact.load('../data/processed-data/table.qza'),
              q2.Metadata.load('../data/processed-data/metadata.qza'))
# View the artifact as a biom Table and the metadata as a pandas DataFrame.
bt, mf = q2bt.view(Table), q2mf.to_dataframe()
mf.shape  # (number of samples, number of columns)

(13516, 87)

In [3]:
# Dimensions of the full feature table.
bt.shape # (ASV, Number of Samples)

(193216, 13516)

In [4]:
# count the number of samples with more than 1000 reads
(bt.sum(axis='sample') > 1000).sum()

12759

In [5]:
# Total count summed over the whole table (all observations and samples).
bt.sum()

array(7.94702397e+08)

In [6]:
# Restrict a copy of the table to samples with more than 1000 reads, then drop
# the observations that are left without any counts.
bt_mbbs = bt.copy()
filt_ = bt_mbbs.ids()[bt_mbbs.sum(axis='sample') > 1000]
bt_mbbs = bt_mbbs.filter(filt_)
nonzero_obs = bt_mbbs.ids('observation')[bt_mbbs.sum('observation') > 0]
bt_mbbs = bt_mbbs.filter(nonzero_obs, axis='observation')

# Per-sample read depth, computed once and reused for every statistic below.
depth_per_sample = bt_mbbs.sum(axis='sample')

# NOTE: np.std defaults to ddof=0 whereas scipy.stats.sem defaults to ddof=1,
# so the SD and SE lines use different degrees-of-freedom conventions.
summary_rows = [
    ('demisons of data table', bt_mbbs.shape),
    ('total reads', bt_mbbs.sum()),
    ('Average depth per sample (after filtering out <=1000 reads samples):', np.mean(depth_per_sample)),
    ('Min depth per sample', np.min(depth_per_sample)),
    ('Max depth per sample', np.max(depth_per_sample)),
    ('SD depth per sample', np.std(depth_per_sample)),
    ('SE depth per sample', sem(depth_per_sample)),
    ('Median depth per sample', np.median(depth_per_sample)),
]
for label, value in summary_rows:
    print(label, value)


demisons of data table (192457, 12759)
total reads 794469124.0
Average depth per sample (after filtering out <=1000 reads samples): 62267.35041931186
Min depth per sample 1005.0
Max depth per sample 765606.0
SD depth per sample 62405.85234945933
SE depth per sample 552.5022110091264
Median depth per sample 41397.0


# Importing the final curated metadata

In [3]:
# Read the final curated metadata (tab-separated, first row is the header) and
# drop the columns that are empty for every row. `axis` is passed by keyword:
# the positional form `dropna(1, ...)` is deprecated and rejected by pandas 2.x.
mf_curated = pd.read_table('../data/processed-data/Metadata_Baby_Seeding_all_samples_final.txt',
                           header=0).dropna(axis=1, how='all')
# (rows, columns) of the curated metadata before any sample filtering
mf_curated.shape

  """Entry point for launching an IPython kernel.


(14784, 48)

In [5]:
# Number of curated samples per `manuscript_use` category.
mf_curated['manuscript_use'].value_counts()

Possible           11133
No-NoSeq            1200
No-Replicates       1013
No-LaneRunError      724
No-misc              635
No-control            79
Name: manuscript_use, dtype: int64

## Filter the curated metadata

In [4]:
# Filter the curated metadata: keep only samples present in the feature table
# and drop the ones flagged as replicates, lane/run errors or miscellaneous issues.
in_table = mf_curated.sample_name.isin(bt.ids())
flagged = mf_curated.manuscript_use.isin(['No-Replicates', 'No-LaneRunError', 'No-misc'])
# .copy() gives mf_curated_f its own data, so later in-place edits of this frame
# do not raise SettingWithCopyWarning or depend on pandas view/copy semantics.
mf_curated_f = mf_curated[in_table & ~flagged].copy()
mf_curated_f.shape

(11212, 48)

## Check the final filtered curated metadata

In [10]:
# Remaining `manuscript_use` categories after filtering.
mf_curated_f['manuscript_use'].value_counts()

Possible      11133
No-control       79
Name: manuscript_use, dtype: int64

In [11]:
# Column names of the processed metadata loaded from metadata.qza.
mf.columns

Index(['qiita_study', 'prep_name', 'run_name', '100nt_deblur', 'sequencing_id',
       'baby_birth_date', 'baby_sex', 'birth_mode', 'birth_mode_ms',
       'body_site_corrected', 'body_site_orig', 'body_site_type',
       'collection_timestamp', 'country', 'current_abx',
       'current_breast_feeding', 'current_formula', 'current_solids',
       'date_sampling', 'date_sampling_category',
       'date_sampling_category_days', 'date_sampling_category_days_continuous',
       'description', 'elevation', 'empo_1', 'empo_2', 'empo_3', 'env_biome',
       'env_feature', 'env_material', 'env_package', 'exclusive_breastfeed',
       'familyid', 'familyid_unique', 'geo_loc_name', 'hospital_name',
       'host_age', 'host_age_units', 'host_body_habitat',
       'host_body_mass_index', 'host_body_product', 'host_body_site',
       'host_common_name', 'host_height', 'host_height_units',
       'host_scientific_name', 'host_subject_id', 'host_taxid', 'host_weight',
       'host_weight_units', 'irb

In [12]:
# Column names of the filtered curated metadata, for comparison with `mf`.
mf_curated_f.columns

Index(['sample_name', 'seqcount', 'orig_sampleid', 'study_id', 'primer_plate',
       'well', 'lane', 'run', 'hospital_name', 'village', 'state', 'country',
       'irb_institution', 'project_name', 'body_site_orig',
       'body_site_corrected', 'body_site_type', 'familyid', 'familyid_unique',
       'mom_baby', 'subjectid', 'subjectid_unique', 'date_sampling',
       'real_sampling_time', 'date_sampling_category',
       'date_sampling_category_days', 'date_sampling_category_days_continuous',
       'baby_sex', 'birth_mode', 'seeding_method', 'baby_birth_date',
       'current_abx', 'mother_prenatal_gbs', 'mother_abx_perinatal',
       'mother_abx_perinatal_name', 'mother_abx_1st_trimester',
       'mother_abx_1st_trimester_name', 'mother_abx_2nd_trimester',
       'mother_abx_2nd_trimester_name', 'mother_abx_3rd_trimester',
       'mother_abx_3rd_trimester_name', 'mother_race',
       'current_breast_feeding', 'current_formula', 'current_solids',
       'exclusive_breastfeed', 'birt

In [13]:
# Samples per study in the processed metadata.
mf['qiita_study'].value_counts()

10894.0    11287
11648.0     1044
1718.0       505
Name: qiita_study, dtype: int64

In [14]:
# Samples per study in the filtered curated metadata.
mf_curated_f['study_id'].value_counts()

10894.0    9899
10249.0     696
12261.0     452
1718.0      165
Name: study_id, dtype: int64

In [15]:
# Samples per corrected body site in the processed metadata.
mf['body_site_corrected'].value_counts()

Feces            2850
Mouth            2091
Right_Forearm    1505
Forehead         1281
Nose             1199
Vagina           1064
Right_Areola      947
Right_Hand        916
Anus              307
Forearm           304
Breast_Milk       146
Right_Foot         85
Control            79
Left_Hand          54
Breast              8
Name: body_site_corrected, dtype: int64

In [16]:
# Samples per corrected body site in the filtered curated metadata.
mf_curated_f['body_site_corrected'].value_counts()

Feces            2323
Mouth            1801
Right_Forearm    1440
Forehead         1197
Nose             1140
Vagina            905
Right_Hand        832
Right_Areola      714
Forearm           304
Anus              221
Breast_Milk       141
Right_Foot         85
Control            79
Left_Hand          22
Breast              8
Name: body_site_corrected, dtype: int64

In [17]:
# One birth mode per family (first non-null value), then count families per mode.
mf.groupby('familyid_unique')['birth_mode_ms'].first().value_counts()

Vag       102
CS         52
CSseed     28
Name: birth_mode_ms, dtype: int64

In [18]:
print("Number of families from each birth mode:")
# One birth mode per family (first non-null value), then count families per mode.
mf_curated_f.groupby('familyid_unique')['birth_mode_ms'].first().value_counts()

Number of families from each birth mode:


Vag       101
CS         52
CSseed     28
Name: birth_mode_ms, dtype: int64

In [19]:
print("Number of babies from each birth mode:")
# Same count as for families, restricted to rows whose mom_baby value is 'Baby'.
baby_rows = mf_curated_f[mf_curated_f['mom_baby'] == 'Baby']
baby_rows.groupby('familyid_unique')['birth_mode_ms'].first().value_counts()

Number of babies from each birth mode:


Vag       99
CS        49
CSseed    28
Name: birth_mode_ms, dtype: int64

## Housekeeping on the final curated metadata

In [5]:
# change right_forearm to forearm for all samples
# mf_curated_f was produced by boolean-indexing mf_curated; rebind it to an
# explicit copy first so the assignment below modifies a frame that owns its
# data and does not trigger SettingWithCopyWarning.
mf_curated_f = mf_curated_f.copy()
mf_curated_f.loc[mf_curated_f.body_site_corrected == 'Right_Forearm', 'body_site_corrected'] = 'Forearm'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [21]:
# Group the curated metadata by (mom_baby, manuscript_use, body site) and, in
# each group, keep only rows that have both a subject id and a sampling day.
required_cols = ['subjectid_unique', 'date_sampling_category_days_continuous']
map_per = {}
for group_key, group_mf in mf_curated_f.groupby(['mom_baby', 'manuscript_use', 'body_site_corrected']):
    map_per[group_key] = group_mf.dropna(subset=required_cols)
# number of usable samples per group
{group_key: len(group_mf) for group_key, group_mf in map_per.items()}


{('Baby', 'No-control', 'Control'): 1,
 ('Baby', 'Possible', 'Anus'): 85,
 ('Baby', 'Possible', 'Feces'): 1557,
 ('Baby', 'Possible', 'Forearm'): 896,
 ('Baby', 'Possible', 'Forehead'): 638,
 ('Baby', 'Possible', 'Left_Hand'): 11,
 ('Baby', 'Possible', 'Mouth'): 928,
 ('Baby', 'Possible', 'Nose'): 611,
 ('Baby', 'Possible', 'Right_Foot'): 42,
 ('Baby', 'Possible', 'Right_Hand'): 457,
 ('Baby', 'Possible', 'Vagina'): 11,
 ('Mom', 'Possible', 'Anus'): 130,
 ('Mom', 'Possible', 'Breast'): 8,
 ('Mom', 'Possible', 'Breast_Milk'): 141,
 ('Mom', 'Possible', 'Feces'): 766,
 ('Mom', 'Possible', 'Forearm'): 848,
 ('Mom', 'Possible', 'Forehead'): 559,
 ('Mom', 'Possible', 'Left_Hand'): 11,
 ('Mom', 'Possible', 'Mouth'): 873,
 ('Mom', 'Possible', 'Nose'): 529,
 ('Mom', 'Possible', 'Right_Areola'): 714,
 ('Mom', 'Possible', 'Right_Foot'): 43,
 ('Mom', 'Possible', 'Right_Hand'): 375,
 ('Mom', 'Possible', 'Vagina'): 888}

In [6]:
# Index the curated metadata by sample name. Rebinding (rather than
# inplace=True) avoids mutating a slice of mf_curated, and the guard makes the
# cell safe to re-run: a second set_index('sample_name') would raise KeyError
# because the column has already been moved into the index.
if mf_curated_f.index.name != 'sample_name':
    mf_curated_f = mf_curated_f.set_index('sample_name')

# Split the feature tables

In [23]:
# use for now: (mom_baby, body_site_corrected) groups that get their own split
use_ = [('Baby', 'Feces'),
        ('Baby', 'Mouth'),
        ('Baby', 'Forearm'),
        ('Mom', 'Vagina'),
        ('Mom', 'Feces'),
        ('Mom', 'Mouth'),
        ('Mom', 'Forearm'),
        ('Mom', 'Nose'),
        ('Mom', 'Right_Areola')]

# Root output directory and the last sampling day (inclusive) that is exported.
split_root = '../data/split-data'
max_day = 360

# Life-stage bins for babies as (lower, upper] day ranges.
# NOTE(review): the labels look like week ranges (14 days -> '0-2') - confirm.
life_stage_bins = {'0-2': [-2, 14],
                   '2-4': [14, 30],
                   '4-17': [30, 120],
                   '17-26': [120, 180],
                   '26-51': [180, 360],
                   '51-end': [360, 6000]}
# Inverted lookup: every whole day in (lower, upper] -> life-stage label.
# Float days such as 7.0 hit the integer keys; any other value raises KeyError.
day_to_life_stage = {day: label
                     for label, (lower, upper) in life_stage_bins.items()
                     for day in range(lower + 1, upper + 1)}


def _subset_table(table, sample_ids):
    """Return a copy of `table` limited to `sample_ids`, without all-zero observations."""
    # The copy protects the caller's table: biom's Table.filter works in place by default.
    subset = table.copy()
    subset = subset.filter(sample_ids)
    nonzero_obs = subset.ids('observation')[subset.sum('observation') > 0]
    return subset.filter(nonzero_obs, axis='observation')


def _write_split(table, metadata, out_dir):
    """Write table.qza, metadata.qza, metadata.tsv and table.biom for one split into `out_dir`."""
    q2_table = q2.Artifact.import_data('FeatureTable[Frequency]', table)
    q2_metadata = q2.Metadata(metadata)
    # makedirs(exist_ok=True) also creates the parent directory and lets the
    # cell be re-run (files are overwritten) instead of raising FileExistsError.
    os.makedirs(out_dir, exist_ok=True)
    q2_table.save(os.path.join(out_dir, 'table.qza'))
    q2_metadata.save(os.path.join(out_dir, 'metadata.qza'))
    metadata.to_csv(os.path.join(out_dir, 'metadata.tsv'), sep='\t')
    with biom_open(os.path.join(out_dir, 'table.biom'), 'w') as f:
        table.to_hdf5(f, "bs_type")


# Metadata per (mom_baby, body site), restricted to rows with a subject id and a sampling day.
map_per = {bs_: mfbs.dropna(subset=['subjectid_unique',
                                    'date_sampling_category_days_continuous'])
           for bs_, mfbs in mf_curated_f.groupby(['mom_baby', 'body_site_corrected'])}

for mb_bs in use_:
    # get subset: metadata for this group, up to max_day
    mf_mbbs = map_per[mb_bs].drop(['body_site_corrected'], axis=1)
    mf_mbbs = mf_mbbs[mf_mbbs.date_sampling_category_days_continuous <= max_day]
    out_ = os.path.join(split_root, '-'.join(list(mb_bs)))

    # filter the table to these samples and align the metadata to the table's sample order
    bt_mbbs = _subset_table(bt, mf_mbbs.index)
    mf_mbbs = mf_mbbs.reindex(bt_mbbs.ids())

    # life-stage groupings (babies only)
    if mb_bs[0] == 'Baby':
        mf_mbbs['date_sampling_category_days_continuous'] = \
            mf_mbbs['date_sampling_category_days_continuous'].astype(float)
        mf_mbbs['life_stage'] = [day_to_life_stage[d]
                                 for d in mf_mbbs['date_sampling_category_days_continuous']]
    # make subject id
    mf_mbbs['host_subject_id'] = mf_mbbs.subjectid_unique.values

    # save and write
    _write_split(bt_mbbs, mf_mbbs, out_)
    print(mb_bs)
    print(mf_mbbs.shape)

    # split by life-stage
    if mb_bs[0] == 'Baby':
        for ls_, mf_mbbs_ls in mf_mbbs.groupby('life_stage'):
            out_ls = out_ + '-%s' % (ls_)
            # subset table and align metadata
            bt_mbbs_ls = _subset_table(bt_mbbs, mf_mbbs_ls.index)
            mf_mbbs_ls = mf_mbbs_ls.reindex(bt_mbbs_ls.ids())
            # export all
            _write_split(bt_mbbs_ls, mf_mbbs_ls, out_ls)
            print(mb_bs, ls_)
            print(mf_mbbs_ls.shape)


('Baby', 'Feces')
(1391, 48)
('Baby', 'Feces') 0-2
(428, 48)
('Baby', 'Feces') 17-26
(157, 48)
('Baby', 'Feces') 2-4
(212, 48)
('Baby', 'Feces') 26-51
(347, 48)
('Baby', 'Feces') 4-17
(247, 48)
('Baby', 'Mouth')
(926, 48)
('Baby', 'Mouth') 0-2
(390, 48)
('Baby', 'Mouth') 17-26
(76, 48)
('Baby', 'Mouth') 2-4
(164, 48)
('Baby', 'Mouth') 26-51
(158, 48)
('Baby', 'Mouth') 4-17
(138, 48)
('Baby', 'Forearm')
(894, 48)
('Baby', 'Forearm') 0-2
(372, 48)
('Baby', 'Forearm') 17-26
(63, 48)
('Baby', 'Forearm') 2-4
(161, 48)
('Baby', 'Forearm') 26-51
(176, 48)
('Baby', 'Forearm') 4-17
(122, 48)
('Mom', 'Vagina')
(888, 47)
('Mom', 'Feces')
(765, 47)
('Mom', 'Mouth')
(873, 47)
('Mom', 'Forearm')
(848, 47)
('Mom', 'Nose')
(529, 47)
('Mom', 'Right_Areola')
(714, 47)


# Rarefy the feature tables

In [30]:
# tables for import: label -> directory that contains table.biom
tables_ = {'mom_fecal':'../data/split-data/Mom-Feces',
            'mom_oral':'../data/split-data/Mom-Mouth',
            'mom_skin':'../data/split-data/Mom-Forearm',
            'mom_vagina':'../data/split-data/Mom-Vagina/',
          'mom_areola':'../data/split-data/Mom-Right_Areola',
          'mom_nose':'../data/split-data/Mom-Nose',
          'baby_fecal':'../data/split-data/Baby-Feces',
            'baby_oral':'../data/split-data/Baby-Mouth',
            'baby_skin':'../data/split-data/Baby-Forearm'}

# rar depth (hard set to 5000), identical for every table
rar_depth = 5000

# NOTE(review): rarefaction subsamples at random and no seed is set here, so
# repeated runs are not expected to give identical tables - confirm whether a
# fixed seed is wanted.
# get each table and run rarefication
for k_, path_ in tables_.items():
    print('Starting: %s' % k_)
    # get table(s); the biom Table is imported directly, without first
    # expanding the sparse counts into a dense DataFrame
    table_ = load_table(os.path.join(path_, 'table.biom'))
    q2table_ = q2.Artifact.import_data('FeatureTable[Frequency]', table_)
    print('Rarefy-depth %i' % rar_depth)
    # run rare and store the result next to the input table
    table_rar = rarefy(q2table_, rar_depth).rarefied_table
    table_rar.save(os.path.join(path_, 'rarefy-table.qza'))


Starting: mom_fecal
Rarefy-depth 5000
Starting: mom_oral
Rarefy-depth 5000
Starting: mom_skin
Rarefy-depth 5000
Starting: mom_vagina
Rarefy-depth 5000
Starting: mom_areola
Rarefy-depth 5000
Starting: mom_nose
Rarefy-depth 5000
Starting: baby_fecal
Rarefy-depth 5000
Starting: baby_oral
Rarefy-depth 5000
Starting: baby_skin
Rarefy-depth 5000


# Create datasets for an additional Songbird run

In [22]:
# Baby samples from the three body sites of interest, counted per
# (body site, sampling day, birth mode) group.
baby_site_mask = ((mf_curated_f['mom_baby'] == 'Baby')
                  & mf_curated_f['body_site_corrected'].isin(['Feces', 'Mouth', 'Forearm']))
{group_key: len(group_mf)
 for group_key, group_mf in mf_curated_f[baby_site_mask].groupby(
     ['body_site_corrected', 'date_sampling_category_days_continuous', 'birth_mode_ms'])}

{('Feces', 0.0, 'CS'): 12,
 ('Feces', 0.0, 'CSseed'): 13,
 ('Feces', 0.0, 'Vag'): 48,
 ('Feces', 1.0, 'CS'): 20,
 ('Feces', 1.0, 'CSseed'): 10,
 ('Feces', 1.0, 'Vag'): 45,
 ('Feces', 2.0, 'CS'): 12,
 ('Feces', 2.0, 'CSseed'): 20,
 ('Feces', 2.0, 'Vag'): 47,
 ('Feces', 7.0, 'CS'): 23,
 ('Feces', 7.0, 'CSseed'): 20,
 ('Feces', 7.0, 'Vag'): 67,
 ('Feces', 14.0, 'CS'): 17,
 ('Feces', 14.0, 'CSseed'): 17,
 ('Feces', 14.0, 'Vag'): 57,
 ('Feces', 21.0, 'CS'): 13,
 ('Feces', 21.0, 'CSseed'): 10,
 ('Feces', 21.0, 'Vag'): 48,
 ('Feces', 30.0, 'CS'): 42,
 ('Feces', 30.0, 'CSseed'): 20,
 ('Feces', 30.0, 'Vag'): 79,
 ('Feces', 60.0, 'CS'): 31,
 ('Feces', 60.0, 'CSseed'): 18,
 ('Feces', 60.0, 'Vag'): 39,
 ('Feces', 90.0, 'CS'): 28,
 ('Feces', 90.0, 'CSseed'): 8,
 ('Feces', 90.0, 'Vag'): 39,
 ('Feces', 120.0, 'CS'): 27,
 ('Feces', 120.0, 'CSseed'): 16,
 ('Feces', 120.0, 'Vag'): 41,
 ('Feces', 150.0, 'CS'): 25,
 ('Feces', 150.0, 'CSseed'): 9,
 ('Feces', 150.0, 'Vag'): 37,
 ('Feces', 180.0, 'CS'): 28,


In [9]:
# use for now: (mom_baby, body_site_corrected) groups to export
use_ = [('Baby', 'Feces'),
        ('Baby', 'Mouth'),
        ('Baby', 'Forearm')]

# Sampling days to export, given as numbers. The day column is numeric (its
# group keys print as 2.0, 30.0, ...) and current pandas does not match the
# strings '2', '30', ... against numeric values in isin(), which would leave
# every subset empty.
diff_days = [2, 30, 120, 180]
diff_root = '../data/diff-analysis-new'

map_per = {bs_: mfbs[mfbs.date_sampling_category_days_continuous.isin(diff_days)]
           for bs_, mfbs in mf_curated_f.groupby(['mom_baby', 'body_site_corrected'])}
# exist_ok=True lets the cell be re-run without raising FileExistsError
os.makedirs(diff_root, exist_ok=True)
for mb_bs in use_:
    # get subset
    mf_mbbs = map_per[mb_bs].drop(['body_site_corrected'], axis=1)
    bt_mbbs = bt.copy()
    out_ = os.path.join(diff_root, '-'.join(list(mb_bs)))
    # filter the table to these samples, then drop observations without counts
    bt_mbbs = bt_mbbs.filter(mf_mbbs.index)
    bt_mbbs = bt_mbbs.filter(bt_mbbs.ids('observation')[bt_mbbs.sum('observation') > 0],
                             axis='observation')
    # align the metadata rows to the table's sample order
    mf_mbbs = mf_mbbs.reindex(bt_mbbs.ids())

    # make subject id
    mf_mbbs['host_subject_id'] = mf_mbbs.subjectid_unique.values

    # split by sampling day (one output directory per day)
    if mb_bs[0] == 'Baby':
        for ls_, mf_mbbs_ls in mf_mbbs.groupby('date_sampling_category_days_continuous'):
            # the day is formatted as it appears in the column, e.g. '...-2.0'
            out_ls = out_ + '-%s' % (ls_)
            # subset table
            bt_mbbs_ls = bt_mbbs.copy()
            bt_mbbs_ls = bt_mbbs_ls.filter(mf_mbbs_ls.index)
            bt_mbbs_ls = bt_mbbs_ls.filter(bt_mbbs_ls.ids('observation')[bt_mbbs_ls.sum('observation') > 0],
                                           axis='observation')
            mf_mbbs_ls = mf_mbbs_ls.reindex(bt_mbbs_ls.ids())
            # save and write
            q2bt_mbbs_ls = q2.Artifact.import_data('FeatureTable[Frequency]', bt_mbbs_ls)
            q2mf_mbbs_ls = q2.Metadata(mf_mbbs_ls)
            # export all
            os.makedirs(out_ls, exist_ok=True)
            q2bt_mbbs_ls.save(os.path.join(out_ls,'table.qza'))
            q2mf_mbbs_ls.save(os.path.join(out_ls,'metadata.qza'))
            mf_mbbs_ls.to_csv(os.path.join(out_ls,'metadata.tsv'), sep='\t')
            with biom_open(os.path.join(out_ls,'table.biom'), 'w') as f:
                bt_mbbs_ls.to_hdf5(f, "bs_type")
            print(mb_bs, ls_)
            print(mf_mbbs_ls.shape)

('Baby', 'Feces') 2.0
(79, 47)
('Baby', 'Feces') 30.0
(141, 47)
('Baby', 'Feces') 120.0
(84, 47)
('Baby', 'Feces') 180.0
(86, 47)
('Baby', 'Mouth') 2.0
(76, 47)
('Baby', 'Mouth') 30.0
(92, 47)
('Baby', 'Mouth') 120.0
(47, 47)
('Baby', 'Mouth') 180.0
(45, 47)
('Baby', 'Forearm') 2.0
(69, 47)
('Baby', 'Forearm') 30.0
(89, 47)
('Baby', 'Forearm') 120.0
(43, 47)
('Baby', 'Forearm') 180.0
(43, 47)
