# Goal

* Merging feature tables for multiple sequencing runs (`georg` and `tony`)
* Then running standard processing:
  * dataset summary
  * taxonomy
  * phylogeny

# Var

In [None]:
# Root directory of the LLA 16S data; all outputs of this notebook land here
work_dir = '/ebio/abt3_projects/Georg_animal_feces/data/16S/LLA/'

# Per-run inputs: feature (SV) table and representative sequences
tony_sv_file = work_dir + 'tony/table.qza'
tony_rep_seq_file = work_dir + 'tony/rep-seqs.qza'
georg_sv_file = work_dir + 'georg/table.qza'
georg_rep_seq_file = work_dir + 'georg/rep-seqs.qza'

# Pre-trained naive-Bayes taxonomy classifier passed to `classify-sklearn`
classifier = '/ebio/abt3_projects/databases/leylab16s/classifiers/silva-119-99-515-806-nb-classifier.qza'

In [None]:
# Address that receives the SGE end/abort notifications.
# MAKE SURE to CHANGE this to your email!
email = 'nyoungblut@tuebingen.mpg.de'

# Slots requested for multi-threaded steps (single-threaded steps override this)
threads = 20

# Memory requested PER THREAD (e.g., 8 threads x 8G memory = 64G total memory)
memory = '3G'

# Maximum job run time (h:m:s)
job_time = '24:0:0'

# Where SGE writes job errors/warnings/info
SGE_out = '~/SGE/LLA/'

# Root of the conda installation
conda_env_path = '/ebio/abt3_projects/software/miniconda3'

# Name of the conda environment whose `bin` is put on the job's PATH
conda_env = 'qiime2'

# Init

In [3]:
import os
import time

In [4]:
# making directories (if they don't exist)

## working directory
work_dir = os.path.abspath(os.path.expanduser(work_dir))
os.makedirs(work_dir, exist_ok=True)

## SGE output
# The existence check and the creation must target the same path: testing
# only the parent directory would leave `SGE_out` missing whenever the parent
# already exists. An existing path (directory, or a log file left by earlier
# runs) is left untouched.
SGE_out = os.path.abspath(os.path.expanduser(SGE_out))
if not os.path.exists(SGE_out):
    os.makedirs(SGE_out)

In [5]:
# changing directory
%cd $work_dir

/ebio/abt3_projects/Georg_animal_feces/data/16S/LLA


## Functions

Defining python functions

In [6]:
def write_file(s, file_name):
    """Write the string `s` to `file_name`, replacing any existing content.

    `file_name` may start with `~` and may be relative; it is expanded to
    an absolute path, which is printed once the file has been written.
    """
    out_path = os.path.expanduser(file_name)
    out_path = os.path.abspath(out_path)
    with open(out_path, mode='w') as handle:
        handle.write(s)
    print('File written: {}'.format(out_path))

In [7]:
def sge_job(job_name='LLA', threads=threads, email=email, 
            memory=memory, job_time=job_time, SGE_out=SGE_out,
            conda_env_path=conda_env_path, 
            conda_env=conda_env):
    """Return the header of an SGE job script as a string.

    The header holds the `#$` scheduler directives (job name, parallel
    slots, per-slot memory, run time, output path, mail settings) followed
    by the shell lines that put the conda installation and the selected
    environment on PATH. Callers append the actual command(s) to it.

    Note: the defaults are the values the notebook-level variables had when
    this function was defined; redefining those variables later does not
    change the defaults unless this cell is re-run.
    """
    header_lines = [
        '#!/bin/bash',
        '#$ -N {}'.format(job_name),
        '#$ -pe parallel {}'.format(threads),
        '#$ -l h_vmem={}'.format(memory),
        '#$ -l h_rt={}'.format(job_time),
        '#$ -o {}'.format(SGE_out),
        '#$ -j y',
        '#$ -cwd',
        '#$ -m ea',
        '#$ -M {}'.format(email),
        '',
        'CONDA_INSTALLATION="{}"'.format(conda_env_path),
        'QIIME2_ENV="{}"'.format(conda_env),
        '',
        'export PATH="$CONDA_INSTALLATION/bin":$PATH',
        'export PATH="$CONDA_INSTALLATION/envs/$QIIME2_ENV/bin":$PATH',
        'export LC_ALL=C.UTF-8',
        'export LANG=C.UTF-8',
        # two empty entries -> the header ends with one blank line
        '',
        '',
    ]
    return '\n'.join(header_lines)

In [None]:
def qsub_wait(file_name):
    """Submit SGE job via `qsub` then wait for the job to finish (success or abort)
    """
    # submit job
    ret = !! qsub $file_name
    # get job ID
    job_ID = ret[0].split(' ')[2]
    print('SGE Job ID: {}'.format(job_ID))
    
    # query job
    while 1:
        ret = !! qstat
        IDs = [x.lstrip().split(' ')[0] for x in ret]
        if job_ID in IDs:
            time.sleep(2)
            continue
        else:
            print('SGE job finished: {}'.format(job_ID))
            break 
    time.sleep(2)

# Merging

## SV artifact

In [None]:
# Merged feature table is written into the working directory
merged_sv_file = os.path.join(work_dir, 'table_merged.qza')

# Single-slot job: merge the feature tables of both sequencing runs
cmd = sge_job(job_name='merge', threads=1)
cmd += """
qiime feature-table merge \
  --i-table1 {in_tbl1} \
  --i-table2 {in_tbl2} \
  --o-merged-table {out_tbl} \
  --p-overlap-method sum
"""
# Fill in the two input tables and the output path
cmd = cmd.format(out_tbl=merged_sv_file,
                 in_tbl1=georg_sv_file,
                 in_tbl2=tony_sv_file)
write_file(cmd, 'merge.sh')

In [None]:
# view job script
!cat merge.sh

In [None]:
# submit to the cluster and wait until job completion/abort
qsub_wait('merge.sh')

In [None]:
!sleep 2
!ls -thlc $merged_sv_file

### Summarizing SV feature table

In [None]:
# Single-slot job: summarize the merged feature table into a visualization.
# The relative file names are resolved in the directory the job is submitted
# from (the job header requests `-cwd`).
cmd = sge_job(job_name='feature_table', threads=1) + """
qiime feature-table summarize \
  --i-table table_merged.qza \
  --o-visualization table_merged.qzv
"""
write_file(cmd, 'feature_table.sh')

In [None]:
# visualize job script
!cat feature_table.sh

In [None]:
# submit to the cluster and wait until job completion/abort
qsub_wait('feature_table.sh')

In [None]:
!sleep 2; ls -thlc table_merged.qzv

## rep-seqs

In [None]:
# Merged representative sequences are written into the working directory
merged_rep_seq_file = os.path.join(work_dir, 'rep-seqs_merged.qza')

# Single-slot job: merge the representative sequences of both sequencing runs.
# NOTE: this reuses the job name and script name ('merge.sh') of the
# feature-table merge, so the earlier script file is overwritten.
cmd = sge_job(job_name='merge', threads=1)
cmd += """
qiime feature-table merge-seq-data \
  --i-data1 {in_tbl1} \
  --i-data2 {in_tbl2} \
  --o-merged-data {out_tbl} 
"""
# Fill in the two input artifacts and the output path
cmd = cmd.format(out_tbl=merged_rep_seq_file,
                 in_tbl1=georg_rep_seq_file,
                 in_tbl2=tony_rep_seq_file)
write_file(cmd, 'merge.sh')

In [None]:
# view job script
!cat merge.sh

In [None]:
# submit to the cluster and wait until job completion/abort
qsub_wait('merge.sh')

In [None]:
!sleep 2; ls -thlc $merged_rep_seq_file

### Summarizing rep-seqs

In [None]:
# Single-slot job: tabulate the merged representative sequences
cmd = sge_job(job_name='rep_seqs', threads=1) + """
qiime feature-table tabulate-seqs \
  --i-data rep-seqs_merged.qza \
  --o-visualization rep-seqs_merged.qzv
"""
write_file(cmd, 'rep_seqs.sh')

In [None]:
# view job script
!cat rep_seqs.sh

In [None]:
# submit to the cluster and wait until job completion/abort
qsub_wait('rep_seqs.sh')

In [None]:
!sleep 2; ls -thlc rep-seqs_merged.qzv

#### Notes

* Number of samples: 255
* Number of SVs: 33,591
* Total abund (frequency): 1,275,000

# Taxonomy

Taxonomic classifiers perform best when they are trained on data matching your specific sample preparation and sequencing parameters, including the primers used for amplification and the length of your sequence reads. In this case, we're using the V4 region of the 16S rRNA gene, amplified with the 515f/806r primers, which is currently the most widely used approach. This makes our life easier: QIIME2 provides a classifier trained on this same region, so there is no need to train one ourselves.

## Classification

In [12]:
# Multi-threaded job with more memory per slot than the default.
# `sge_job` is called without `threads`, so the slot count is its default;
# the same notebook-level `threads` value is passed to `--p-n-jobs`.
cmd = sge_job(job_name='taxonomy', memory='5G')
cmd += """
qiime feature-classifier classify-sklearn \
  --i-classifier {classifier} \
  --i-reads rep-seqs_merged.qza \
  --o-classification taxonomy.qza \
  --p-n-jobs {threads} \
  --p-reads-per-batch 1000 
"""
# Fill in the classifier path and the number of parallel jobs
cmd = cmd.format(threads=threads, classifier=classifier)

write_file(cmd, 'taxonomy.sh')

File written: /ebio/abt3_projects/Georg_animal_feces/data/16S/LLA/taxonomy.sh


In [13]:
# view job script
!cat taxonomy.sh

#!/bin/bash
#$ -N taxonomy
#$ -pe parallel 20
#$ -l h_vmem=5G
#$ -l h_rt=24:0:0
#$ -o /ebio/abt3/nyoungblut/SGE/LLA
#$ -j y
#$ -cwd
#$ -m ea
#$ -M nyoungblut@tuebingen.mpg.de

CONDA_INSTALLATION="/ebio/abt3_projects/software/miniconda3"
QIIME2_ENV="qiime2"

export PATH="$CONDA_INSTALLATION/bin":$PATH
export PATH="$CONDA_INSTALLATION/envs/$QIIME2_ENV/bin":$PATH
export LC_ALL=C.UTF-8
export LANG=C.UTF-8


qiime feature-classifier classify-sklearn   --i-classifier /ebio/abt3_projects/databases/leylab16s/classifiers/silva-119-99-515-806-nb-classifier.qza   --i-reads rep-seqs_merged.qza   --o-classification taxonomy.qza   --p-n-jobs 20   --p-reads-per-batch 1000 


In [None]:
# submit to the cluster and wait until job completion/abort
qsub_wait('taxonomy.sh')

SGE Job ID: 794581


In [36]:
!ls -thlc taxonomy_r5k.qza

-rw-r--r-- 1 nyoungblut abt3 1.9M Dec  5 13:06 taxonomy_r5k.qza


## Viewing taxonomy

This step will generate a table of the taxonomy

In [None]:
# Single-slot job: render the taxonomy assignments as a table visualization
cmd = sge_job(job_name='view_tax', threads=1) + """
qiime metadata tabulate \
  --m-input-file taxonomy.qza \
  --o-visualization taxonomy.qzv
"""
write_file(cmd, 'view_tax.sh')

In [None]:
# view job script
!cat view_tax.sh

In [None]:
# submit to the cluster and wait until job completion/abort
qsub_wait('view_tax.sh')

In [32]:
!ls -thlc taxonomy.qzv

-rw-r--r-- 1 nyoungblut abt3 4.3M Dec  6 10:54 taxonomy.qzv


# Filtering based on taxonomy

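Filtering out non-microbial sequences (e.g., Eukaryota, mitochondria, and chloroplasts), as well as features whose taxonomy lacks a `D_1__` (phylum-level) annotation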

## Filter feature table

In [18]:
# Single-slot job: filter the merged feature table by taxonomy string.
# Features are kept only if their taxonomy contains `D_1__`, and dropped if
# it contains any of the `--p-exclude` terms.
cmd = sge_job(job_name='tax_filter', threads=1)
cmd += """
qiime taxa filter-table \
  --i-table table_merged.qza \
  --i-taxonomy taxonomy.qza \
  --p-include D_1__ \
  --p-exclude D_0__Eukaryota,D_4__mitochondria,D_2__Chloroplast \
  --o-filtered-table table_merged_filt.qza
"""

write_file(cmd, 'tax_filter.sh')

File written: /ebio/abt3_projects/Georg_animal_feces/data/16S/LLA/tax_filter.sh


In [20]:
# view job script
!cat tax_filter.sh

#!/bin/bash
#$ -N tax_filter
#$ -pe parallel 1
#$ -l h_vmem=3G
#$ -l h_rt=24:0:0
#$ -o /ebio/abt3/nyoungblut/SGE/LLA
#$ -j y
#$ -cwd
#$ -m ea
#$ -M nyoungblut@tuebingen.mpg.de

CONDA_INSTALLATION="/ebio/abt3_projects/software/miniconda3"
QIIME2_ENV="qiime2"

export PATH="$CONDA_INSTALLATION/bin":$PATH
export PATH="$CONDA_INSTALLATION/envs/$QIIME2_ENV/bin":$PATH
export LC_ALL=C.UTF-8
export LANG=C.UTF-8


qiime taxa filter-table   --i-table table_merged.qza   --i-taxonomy taxonomy.qza   --p-include D_1__   --p-exclude D_0__Eukaryota,D_4__mitochondria,D_2__Chloroplast   --o-filtered-table table_merged_filt.qza


In [22]:
# submit to the cluster and wait until job completion/abort
qsub_wait('tax_filter.sh')

SGE Job ID: 796792
SGE job finished: 796792


In [23]:
!ls -thlc table_merged_filt.qza

-rw-r--r-- 1 nyoungblut abt3 1.6M Dec  6 10:55 table_merged_filt.qza


## Filter sequences

Filtering representative sequences based on filtered feature table

### Extracting feature IDs

In [24]:
# Single-slot job: write the feature IDs of the filtered table to a
# one-column text file with the header line "Feature"
cmd = sge_job(job_name='export', threads=1, memory='5G')
cmd += """
# export feature table
qiime tools export --output-dir . {input}

# convert biom to tsv
biom convert --to-tsv -i feature-table.biom -o feature-table.txt

# create table of just IDs (to make the table smaller)
echo "Feature" > {output}
cut -f 1 feature-table.txt | tail -n +3 >> {output}
rm -f feature-table.txt
"""
# Fill in the artifact to export and the name of the ID table
cmd = cmd.format(output='table_merged_filt.txt',
                 input='table_merged_filt.qza')

write_file(cmd, 'export.sh')

File written: /ebio/abt3_projects/Georg_animal_feces/data/16S/LLA/export.sh


In [25]:
# view job script
!cat export.sh

#!/bin/bash
#$ -N export
#$ -pe parallel 1
#$ -l h_vmem=5G
#$ -l h_rt=24:0:0
#$ -o /ebio/abt3/nyoungblut/SGE/LLA
#$ -j y
#$ -cwd
#$ -m ea
#$ -M nyoungblut@tuebingen.mpg.de

CONDA_INSTALLATION="/ebio/abt3_projects/software/miniconda3"
QIIME2_ENV="qiime2"

export PATH="$CONDA_INSTALLATION/bin":$PATH
export PATH="$CONDA_INSTALLATION/envs/$QIIME2_ENV/bin":$PATH
export LC_ALL=C.UTF-8
export LANG=C.UTF-8


# export feature table
qiime tools export --output-dir . table_merged_filt.qza

# convert biom to tsv
biom convert --to-tsv -i feature-table.biom -o feature-table.txt

# create table of just IDs (to make the table smaller)
echo "Feature" > table_merged_filt.txt
cut -f 1 feature-table.txt | tail -n +3 >> table_merged_filt.txt
rm -f feature-table.txt


In [26]:
# submit to the cluster and wait until job completion/abort
qsub_wait('export.sh')

SGE Job ID: 796793
SGE job finished: 796793


In [27]:
!ls -thlc table_merged_filt.txt

-rw-r--r-- 1 nyoungblut abt3 1.3M Dec  6 10:57 table_merged_filt.txt


### Filtering

In [28]:
# Single-slot job: filter the representative sequences using the feature-ID
# table written by the export step
cmd = sge_job(job_name='seq_filter', threads=1, memory='5G') + """
qiime feature-table filter-seqs \
  --i-data rep-seqs_merged.qza \
  --m-metadata-file table_merged_filt.txt \
  --o-filtered-data rep-seqs_merged_filt.qza
"""
write_file(cmd, 'seq_filter.sh')

File written: /ebio/abt3_projects/Georg_animal_feces/data/16S/LLA/seq_filter.sh


In [29]:
# view job script
!cat seq_filter.sh

#!/bin/bash
#$ -N seq_filter
#$ -pe parallel 1
#$ -l h_vmem=5G
#$ -l h_rt=24:0:0
#$ -o /ebio/abt3/nyoungblut/SGE/LLA
#$ -j y
#$ -cwd
#$ -m ea
#$ -M nyoungblut@tuebingen.mpg.de

CONDA_INSTALLATION="/ebio/abt3_projects/software/miniconda3"
QIIME2_ENV="qiime2"

export PATH="$CONDA_INSTALLATION/bin":$PATH
export PATH="$CONDA_INSTALLATION/envs/$QIIME2_ENV/bin":$PATH
export LC_ALL=C.UTF-8
export LANG=C.UTF-8


qiime feature-table filter-seqs   --i-data rep-seqs_merged.qza   --m-metadata-file table_merged_filt.txt   --o-filtered-data rep-seqs_merged_filt.qza


In [30]:
# submit to the cluster and wait until job completion/abort
qsub_wait('seq_filter.sh')

SGE Job ID: 796798
SGE job finished: 796798


In [31]:
!ls -thlc rep-seqs_merged_filt.qza

-rw-r--r-- 1 nyoungblut abt3 3.2M Dec  6 10:58 rep-seqs_merged_filt.qza


# checking that taxa were filtered

In [54]:
# metadata_file = '/ebio/abt3_projects/Georg_animal_feces/data/mapping/unified_metadata_20171206_r5k.txt'
# cmd = sge_job(job_name='tax_barplot', threads=1, memory='5G') 
# cmd = cmd + """
# qiime taxa barplot \
#   --i-table table_merged_filt.qza \
#   --i-taxonomy taxonomy.qza \
#   --m-metadata-file {} \
#   --o-visualization taxonomy_filt_barplot.qzv
# """
# cmd = cmd.format(metadata_file)
# write_file(cmd, 'tax_barplot.sh')

In [55]:
# submit to the cluster and wait until job completion/abort
#qsub_wait('tax_barplot.sh')

In [56]:
#!tail /ebio/abt3/nyoungblut/SGE/LLA

In [57]:
#!ls -thlc taxonomy_filt_barplot.qzv

# Generate a tree for phylogenetic diversity analyses

Downstream analyses, such as computing UniFrac distances, require a phylogenetic tree. To build the tree, we must first perform a multiple sequence alignment of the sequences in our `FeatureData[Sequence]` to create a `FeatureData[AlignedSequence]` QIIME2 artifact. Here we do this using *mafft*.

## Alignment

This step will perform *de novo* multiple sequence alignment using MAFFT

In [58]:
# Multi-threaded job: `sge_job` uses its default slot count, and the
# notebook-level `threads` value is passed to MAFFT
cmd = sge_job(job_name='align_seqs')
cmd += """
qiime alignment mafft \
  --i-sequences rep-seqs_merged_filt.qza \
  --o-alignment aligned-rep-seqs_filt.qza \
  --p-n-threads {threads}
"""
# Fill in the thread count
cmd = cmd.format(threads=threads)

write_file(cmd, 'align_seqs.sh')

File written: /ebio/abt3_projects/Georg_animal_feces/data/16S/LLA/align_seqs.sh


In [59]:
# view job script
!cat align_seqs.sh

#!/bin/bash
#$ -N align_seqs
#$ -pe parallel 20
#$ -l h_vmem=3G
#$ -l h_rt=24:0:0
#$ -o /ebio/abt3/nyoungblut/SGE/LLA
#$ -j y
#$ -cwd
#$ -m ea
#$ -M nyoungblut@tuebingen.mpg.de

CONDA_INSTALLATION="/ebio/abt3_projects/software/miniconda3"
QIIME2_ENV="qiime2"

export PATH="$CONDA_INSTALLATION/bin":$PATH
export PATH="$CONDA_INSTALLATION/envs/$QIIME2_ENV/bin":$PATH
export LC_ALL=C.UTF-8
export LANG=C.UTF-8


qiime alignment mafft   --i-sequences rep-seqs_merged_filt.qza   --o-alignment aligned-rep-seqs_filt.qza   --p-n-threads 20


In [60]:
# submit to the cluster and wait until job completion/abort
qsub_wait('align_seqs.sh')

SGE Job ID: 798373
SGE job finished: 798373


In [61]:
!ls -thlc aligned-rep-seqs_filt.qza

-rw-r--r-- 1 nyoungblut abt3 3.5M Dec  6 12:29 aligned-rep-seqs_filt.qza


## Mask alignment

This step will mask (or filter) the alignment to remove positions that are highly variable. These positions are generally considered to add noise to a resulting phylogenetic inference (tree).

In [62]:
# Single-slot job: mask the alignment produced by the MAFFT step
cmd = sge_job(job_name='mask_align', threads=1) + """
qiime alignment mask \
  --i-alignment aligned-rep-seqs_filt.qza \
  --o-masked-alignment aligned-rep-seqs_filt_masked.qza
"""

write_file(cmd, 'mask_align.sh')

File written: /ebio/abt3_projects/Georg_animal_feces/data/16S/LLA/mask_align.sh


In [63]:
# view job script
!cat mask_align.sh

#!/bin/bash
#$ -N mask_align
#$ -pe parallel 1
#$ -l h_vmem=3G
#$ -l h_rt=24:0:0
#$ -o /ebio/abt3/nyoungblut/SGE/LLA
#$ -j y
#$ -cwd
#$ -m ea
#$ -M nyoungblut@tuebingen.mpg.de

CONDA_INSTALLATION="/ebio/abt3_projects/software/miniconda3"
QIIME2_ENV="qiime2"

export PATH="$CONDA_INSTALLATION/bin":$PATH
export PATH="$CONDA_INSTALLATION/envs/$QIIME2_ENV/bin":$PATH
export LC_ALL=C.UTF-8
export LANG=C.UTF-8


qiime alignment mask   --i-alignment aligned-rep-seqs_filt.qza   --o-masked-alignment aligned-rep-seqs_filt_masked.qza


In [64]:
# submit to the cluster and wait until job completion/abort
qsub_wait('mask_align.sh')

SGE Job ID: 798399
SGE job finished: 798399


In [65]:
!ls -thlc aligned-rep-seqs_filt_masked.qza

-rw-r--r-- 1 nyoungblut abt3 3.3M Dec  6 12:41 aligned-rep-seqs_filt_masked.qza


## Infer phylogeny

### Unrooted tree

This step will use `FastTree` to infer a phylogenetic tree from the masked alignment.

In [66]:
# Multi-threaded job: `sge_job` uses its default slot count, and the
# notebook-level `threads` value is passed to FastTree
cmd = sge_job(job_name='unrooted_tree')
cmd += """
qiime phylogeny fasttree \
  --i-alignment aligned-rep-seqs_filt_masked.qza \
  --o-tree aligned-rep-seqs_filt_masked_unroot-tree.qza \
  --p-n-threads {threads}
"""
# Fill in the thread count
cmd = cmd.format(threads=threads)

write_file(cmd, 'unrooted_tree.sh')

File written: /ebio/abt3_projects/Georg_animal_feces/data/16S/LLA/unrooted_tree.sh


In [67]:
# view job script
!cat unrooted_tree.sh

#!/bin/bash
#$ -N unrooted_tree
#$ -pe parallel 20
#$ -l h_vmem=3G
#$ -l h_rt=24:0:0
#$ -o /ebio/abt3/nyoungblut/SGE/LLA
#$ -j y
#$ -cwd
#$ -m ea
#$ -M nyoungblut@tuebingen.mpg.de

CONDA_INSTALLATION="/ebio/abt3_projects/software/miniconda3"
QIIME2_ENV="qiime2"

export PATH="$CONDA_INSTALLATION/bin":$PATH
export PATH="$CONDA_INSTALLATION/envs/$QIIME2_ENV/bin":$PATH
export LC_ALL=C.UTF-8
export LANG=C.UTF-8


qiime phylogeny fasttree   --i-alignment aligned-rep-seqs_filt_masked.qza   --o-tree aligned-rep-seqs_filt_masked_unroot-tree.qza   --p-n-threads 20


In [68]:
# submit to the cluster and wait until job completion/abort
qsub_wait('unrooted_tree.sh')

SGE Job ID: 798452
SGE job finished: 798452


In [69]:
!ls -thlc aligned-rep-seqs_r5k_filt_masked_unroot-tree.qza

ls: cannot access 'aligned-rep-seqs_r5k_filt_masked_unroot-tree.qza': No such file or directory


### Rooted tree

The `FastTree` program creates an unrooted tree.

This step will apply midpoint rooting, which will place the root of the tree at the midpoint of the longest tip-to-tip distance in the unrooted tree.

In [70]:
# Single-slot job: midpoint-root the unrooted FastTree tree
cmd = sge_job(job_name='rooted_tree', threads=1) + """
qiime phylogeny midpoint-root \
  --i-tree aligned-rep-seqs_filt_masked_unroot-tree.qza \
  --o-rooted-tree aligned-rep-seqs_filt_masked_midroot-tree.qza
"""

write_file(cmd, 'rooted_tree.sh')

File written: /ebio/abt3_projects/Georg_animal_feces/data/16S/LLA/rooted_tree.sh


In [71]:
# view job script
!cat rooted_tree.sh

#!/bin/bash
#$ -N rooted_tree
#$ -pe parallel 1
#$ -l h_vmem=3G
#$ -l h_rt=24:0:0
#$ -o /ebio/abt3/nyoungblut/SGE/LLA
#$ -j y
#$ -cwd
#$ -m ea
#$ -M nyoungblut@tuebingen.mpg.de

CONDA_INSTALLATION="/ebio/abt3_projects/software/miniconda3"
QIIME2_ENV="qiime2"

export PATH="$CONDA_INSTALLATION/bin":$PATH
export PATH="$CONDA_INSTALLATION/envs/$QIIME2_ENV/bin":$PATH
export LC_ALL=C.UTF-8
export LANG=C.UTF-8


qiime phylogeny midpoint-root   --i-tree aligned-rep-seqs_filt_masked_unroot-tree.qza   --o-rooted-tree aligned-rep-seqs_filt_masked_midroot-tree.qza


In [72]:
# submit to the cluster and wait until job completion/abort
qsub_wait('rooted_tree.sh')

SGE Job ID: 798474
SGE job finished: 798474


In [73]:
!ls -thlc aligned-rep-seqs_filt_masked_midroot-tree.qza

-rw-r--r-- 1 nyoungblut abt3 1.9M Dec  6 12:57 aligned-rep-seqs_filt_masked_midroot-tree.qza
