This notebook outlines the filtering criteria that were used to focus on the samples of interest.

In [1]:
import pandas as pd
import numpy as np
# Load the tab-separated sample metadata, using the first column as the row index.
metadata = pd.read_csv('../data/oral_metadata.txt', sep='\t', index_col=0)

First remove samples that aren't raw.

In [2]:
# Boolean mask: True for every sample ID that contains the substring 'raw'.
idx = ['raw' in sample_id for sample_id in metadata.index]
metadata = metadata.loc[idx]

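Now filter out D1/2, since there were technical issues with those samples. B1/2 and J8/9 were also flagged for technical issues, but the cell below currently excludes only D1/2 (J8/9 remain in the final sample list).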

In [3]:
# Only D1/D2 are excluded at present. The wider set
# {'B1', 'B2', 'D1', 'D2', 'J8', 'J9'} is recorded here for reference but is disabled.
exclusion_set = {'D1', 'D2'}
# Boolean mask: True for rows whose 'paricipant-timepoint' label is not excluded.
idx = [label not in exclusion_set for label in metadata['paricipant-timepoint']]
metadata = metadata.loc[idx]

Filter out samples that aren't relevant to the brushing event

In [4]:
# Keep only the rows that have a non-missing 'brushing_event' value.
metadata = metadata[metadata['brushing_event'].notna()]

Clean up metadata to remove misspellings

In [5]:
# Correct the misspelled 'paricipant-timepoint' column header.
column_fixes = {'paricipant-timepoint': 'participant-timepoint'}
metadata = metadata.rename(columns=column_fixes)

Sort samples by host subject and timepoint

In [6]:
# Order rows by 'HostSubject' first, then by 'Timepoint#' within each subject.
sort_keys = ['HostSubject', 'Timepoint#']
metadata = metadata.sort_values(by=sort_keys)

Load biom table and match with the metadata

In [7]:
from biom import load_table
table = load_table('../data/oral_deblurred.biom')

# Build the lookup set once, rather than on every predicate call.
kept_sample_ids = set(metadata.index)


def ids_to_keep(val, id_, md):
    """Return True when sample `id_` is present in the filtered metadata index.

    `val` and `md` are supplied by `Table.filter` but are not used here.
    """
    return id_ in kept_sample_ids


# Drop, in place, every sample that has no row in the filtered metadata.
table.filter(ids_to_keep, axis='sample', inplace=True)

485 x 32 <class 'biom.table.Table'> with 3940 nonzero entries (25% dense)

Filter out OTUs that don't appear in more than 5 samples (i.e. keep OTUs present in at least 6 samples)

In [8]:
def filter_fn(val, id_, md):
    """Keep an observation only if it is nonzero in strictly more than 5 samples.

    `id_` and `md` are supplied by `Table.filter` but are not used here.
    """
    n_present = np.sum(val > 0)
    return n_present > 5


table.filter(filter_fn, axis='observation', inplace=True)

203 x 32 <class 'biom.table.Table'> with 3432 nonzero entries (52% dense)

Sort the OTU table to make sure that it matches the metadata table

In [9]:
# Reorder the table so its IDs follow the row order of the metadata index.
sample_order = list(metadata.index)
table = table.sort_order(sample_order)

Save both the metadata and the OTU biom table

In [10]:
from biom.util import biom_open

# Write the filtered table in HDF5 BIOM format ('trimmed' is the second argument to to_hdf5).
with biom_open('../data/oral_trimmed_deblur.biom', 'w') as f:
    table.to_hdf5(f, 'trimmed')

# Also export a tab-separated copy of the transposed dataframe.
transposed_counts = table.to_dataframe().T
transposed_counts.to_csv('../data/oral_trimmed_deblur.txt', sep='\t')

# Save the filtered, sorted metadata alongside the table.
metadata.to_csv('../data/oral_trimmed_metadata.txt', sep='\t')

In [11]:
# Sanity check: display the dimensions of the filtered table.
table.shape

(203, 32)

In [12]:
# Display the 'participant-timepoint' label of every sample that survived filtering.
metadata['participant-timepoint']

#SampleID
A1.raw    A1
A2.raw    A2
A8.raw    A8
A9.raw    A9
C1.raw    C1
C2.raw    C2
C7.raw    C7
C8.raw    C8
D8.raw    D8
D9.raw    D9
E1.raw    E1
E2.raw    E2
E8.raw    E8
E9.raw    E9
F1.raw    F1
F2.raw    F2
F8.raw    F8
F9.raw    F9
G1.raw    G1
G2.raw    G2
G7.raw    G7
G8.raw    G8
H1.raw    H1
H2.raw    H2
I1.raw    I1
I2.raw    I2
I8.raw    I8
I9.raw    I9
J1.raw    J1
J2.raw    J2
J8.raw    J8
J9.raw    J9
Name: participant-timepoint, dtype: object