In [2]:
import io
import os
import glob
import zipfile
import subprocess
import numpy as np
import pandas as pd
import qiime2 as q2
from skbio import TreeNode 
from biom import load_table, Table
from biom.util import biom_open
from qiime2.plugins.feature_table.methods import merge
from qiime2.plugins.fragment_insertion.methods import sepp


In [231]:
# get map of qiita preps
qiita_mf = pd.read_csv('../data/qiita-tables-processing/qiita-mapping.csv').dropna(1)
qiita_mf.head(3)


Unnamed: 0,qiita_study,prep_name,run_name,100nt_deblur,sequencing_id
0,10894,2524,Run2 Lane 1,60614,lane_1_170216_D00611_0439_BCAJ1MANXX_Knight_2
1,10894,2525,Run2 Lane 2,57513,lane_2_170216_D00611_0439_BCAJ1MANXX_Knight_2
2,10894,2527,Run2 Lane 3,61023,lane_3_170216_D00611_0439_BCAJ1MANXX_Knight_2


In [233]:
# import the table ID's and import to Q2
mf = []
tbls = glob.glob('../data/qiita-tables-processing/biom-table-by-prep/*.biom')
for tbl in tbls:
    # import table
    tblid = tbl.split('/')[-1][:-5]
    tb = load_table(tbl)
    # meta-data subset
    mf_tmp = qiita_mf[qiita_mf['100nt_deblur'].isin([tblid])].copy()
    mf_tmp = pd.concat([mf_tmp for id_ in tb.ids()])
    mf_tmp.index = tb.ids()
    mf.append(mf_tmp)
    # import table to qiime2 and write
    q2tb = q2.Artifact.import_data('FeatureTable[Frequency]', tb)
    q2tb.save(os.path.join('../data/qiita-tables-processing/q2-tables', tblid))
# final metadata merged
mf = pd.concat(mf)
mf.index.name = '#SampleID'
q2.Metadata(mf).save('../data/qiita-tables-processing/qiita-mapped-metadata.qza')
mf.to_csv('../data/qiita-tables-processing/qiita-mapped-metadata.tsv', sep='\t')
mf.head(3)
    

Unnamed: 0_level_0,qiita_study,prep_name,run_name,100nt_deblur,sequencing_id
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10894.HR4234,10894,3908,Baby_2_1-6,58678,lane_1_171002_D00611_0535_BHY5LYBCXY_Knight_Gr...
10894.HR5600,10894,3908,Baby_2_1-6,58678,lane_1_171002_D00611_0535_BHY5LYBCXY_Knight_Gr...
10894.HR4183,10894,3908,Baby_2_1-6,58678,lane_1_171002_D00611_0535_BHY5LYBCXY_Knight_Gr...


In [43]:
# merge all the tables into one
!qiime feature-table merge\
    --i-tables ../data/qiita-tables-processing/q2-tables/*.qza\
    --p-overlap-method 'sum'\
    --o-merged-table ../data/qiita-tables-processing/merged-table.qza


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[32mSaved FeatureTable[Frequency] to: ../data/qiita-tables-processing/merged-table.qza[0m


In [277]:
# build the set of all rep-seqs
seqs_ = q2.Artifact.load('../data/qiita-tables-processing/merged-table.qza').view(Table).ids('observation')
seqs_ = '\n'.join(['>'+i+'\n'+i for i in seqs_])
f = open("../data/qiita-tables-processing/rep-seqs.fa", "w")
f.write(seqs_)
f.close()


In [278]:
# import the rep-seqs
!qiime tools import \
    --input-path ../data/qiita-tables-processing/rep-seqs.fa\
    --output-path ../data/qiita-tables-processing/rep-seqs.qza\
    --type 'FeatureData[Sequence]'


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[32mImported ../data/qiita-tables-processing/rep-seqs.fa as DNASequencesDirectoryFormat to ../data/qiita-tables-processing/rep-seqs.qza[0m


In [280]:
# run taxonomic classification (run on cluster - big compute step)
!qiime feature-classifier classify-sklearn \
  --i-classifier ../data/qiita-tables-processing/gg-13-8-99-515-806-nb-classifier.qza \
  --i-reads ../data/qiita-tables-processing/rep-seqs.qza \
  --o-classification ../data/qiita-tables-processing/taxonomy.qza


In [48]:
# generate sepp-insertion tree (run on cluster - big compute step)
!qiime fragment-insertion sepp\
    --i-representative-sequences ../data/qiita-tables-processing/rep-seqs.qza\
    --output-dir ../data/qiita-tables-processing/sepp-tree\


In [363]:
# table to filter
q2tb = q2.Artifact.load('../data/qiita-tables-processing/merged-table.qza').view(Table)
print(q2tb.shape)
# metadata (merged from MG on 01/21/2020 and ECAM metadata)
mf = pd.read_csv('../data/qiita-tables-processing/metadata.tsv',
                 sep='\t', index_col=0)
mf.index = [ind.replace('11648','10249') for ind in mf.index]
# filter table to match metadata
id_keep = sorted(set(mf.index) & set(q2tb.ids()))
q2tb = q2tb.filter(id_keep)
# ensure no zero sums
q2tb = q2tb.filter(q2tb.ids()[q2tb.sum('sample') > 0]) # samples
filt_ = q2tb.ids('observation')[q2tb.sum('observation') > 0] # features
q2tb = q2tb.filter(filt_, axis='observation')
# reindex metadata and add qiita prep map
mf = mf.reindex(q2tb.ids())
prepmf = pd.read_csv('../data/qiita-tables-processing/qiita-mapped-metadata.tsv',
                     sep='\t', index_col=0)
prepmf = prepmf.reindex(q2tb.ids())
mf = pd.concat([prepmf, mf],
               sort=True, axis=1)
mf.index.name = "#SampleID"
mf = mf[~mf.manuscript_use.isin(['No-LaneRunError'])]
# import the tree (get inserted seqs)
tree = q2.Artifact.load('../data/qiita-tables-processing/sepp-tree/tree.qza').view(TreeNode)
# filter out chloroplast/mitochondria hits
taxonomy = q2.Artifact.load('../data/qiita-tables-processing/taxonomy.qza').view(pd.DataFrame)
drop_ = set([t_ for t_ in taxonomy.Taxon if 'chloroplast' in t_.lower()\
                                         or 'mitochondria' in t_.lower()])
taxonomy = taxonomy[~taxonomy.Taxon.isin(drop_)]
# check shared ids for tree, table, and taxonomy
keep_ = list((set([node.name for node in tree.tips()])\
              & set(taxonomy.index))\
             & set(q2tb.ids('observation')))
#keep_ = list((set(taxonomy.index))\
#             & set(q2tb.ids('observation')))
# filter table 
q2tb = q2tb.filter(keep_, axis='observation')
# ensure no zero sums
q2tb = q2tb.filter(q2tb.ids()[q2tb.sum('sample') > 0]) # samples
filt_ = q2tb.ids('observation')[q2tb.sum('observation') > 0] # features
q2tb = q2tb.filter(filt_, axis='observation')
# match and write metadata
mf = mf.reindex(q2tb.ids())
q2.Metadata(mf).save('../data/processed-data/metadata.qza')
mf.to_csv('../data/processed-data/metadata.tsv', sep='\t')
# write table
print(q2tb.shape)
with biom_open('../data/processed-data/table.biom', 'w') as f:
    q2tb.to_hdf5(f, "example")
q2.Artifact.import_data('FeatureTable[Frequency]', q2tb).save('../data/processed-data/table.qza')
# write taxonomy
q2.Artifact.import_data('FeatureData[Taxonomy]', taxonomy).save('../data/processed-data/taxonomy.qza')
# write tree
q2.Artifact.import_data('Phylogeny[Rooted]', tree).save('../data/processed-data/tree.qza')

(189385, 12945)
(178142, 12319)


'../data/processed-data/tree.qza'

In [364]:
mf.qiita_study.value_counts()


10894.0    10090
11648.0     1044
1718.0       505
Name: qiita_study, dtype: int64