## Goal
-  import bioms from Qiita to Qiime2 format
-  assign taxonomy to representative sequences
-  generate phylogenetic tree
-  calculate distance matrix (beta diversity) and plot PCoAs
-  generate metagenome predictions
-  find differentially abundant ASVs, KOs, and KEGG pathways

### versions
-  Qiime2 2018.6
-  PICRUSt 1.1.3

In [None]:
import os

In [None]:
#print the directory the notebook is running from; the relative metadata path (map_file) is resolved against it
!pwd

In [None]:
# Locations shared by the rest of the notebook.
# wd: root of the analysis tree; later cells read and write beneath it via {wd}
wd = "/projects/templeton/subprojects/sanguivory/analysis"

# map_file: sample metadata table, made absolute relative to the current working directory
map_file = os.path.abspath(os.path.join(".", "analysis", "TableS1_metadata.txt"))

In [None]:
#import biom files downloaded from Qiita
#for each of the ids 56, 70, 5233 and 5238, data/<id>.biom is imported as a
#FeatureTable[Frequency] artifact and written to data/<id>_biom.qza
#${i} is the shell loop variable; paths are written out in full here rather than via {wd}
!for i in 56 70 5233 5238; \
do qiime tools import \
  --input-path /projects/templeton/subprojects/sanguivory/analysis/data/${i}.biom \
  --type 'FeatureTable[Frequency]' \
  --source-format BIOMV210Format \
  --output-path /projects/templeton/subprojects/sanguivory/analysis/data/${i}_biom.qza ; done

In [None]:
#import rep seq files downloaded from Qiita
#for each of the ids 56, 70, 5233 and 5238, data/<id>.reference-hit.seqs.fa is imported as a
#FeatureData[Sequence] artifact and written to data/<id>_repseq.qza
#${i} is the shell loop variable; paths are written out in full here rather than via {wd}
!for i in 56 70 5233 5238; \
do qiime tools import \
  --input-path /projects/templeton/subprojects/sanguivory/analysis/data/${i}.reference-hit.seqs.fa \
  --type 'FeatureData[Sequence]' \
  --output-path /projects/templeton/subprojects/sanguivory/analysis/data/${i}_repseq.qza ; done

In [None]:
#merge tables into one
#inputs: the four imported <id>_biom.qza feature tables; output: data/merged-table.qza
!qiime feature-table merge \
  --i-tables {wd}/data/56_biom.qza \
  --i-tables {wd}/data/70_biom.qza \
  --i-tables {wd}/data/5233_biom.qza \
  --i-tables {wd}/data/5238_biom.qza \
  --o-merged-table {wd}/data/merged-table.qza

In [None]:
#merge rep seq files into one
#inputs: the four imported <id>_repseq.qza sequence artifacts; output: data/merged-repseqs.qza
!qiime feature-table merge-seqs \
  --i-data {wd}/data/56_repseq.qza \
  --i-data {wd}/data/70_repseq.qza \
  --i-data {wd}/data/5233_repseq.qza \
  --i-data {wd}/data/5238_repseq.qza \
  --o-merged-data {wd}/data/merged-repseqs.qza

In [None]:
#filter samples and repseqs to samples in the metadata file
#step 1: filter the merged table using the metadata file (map_file)
!qiime feature-table filter-samples \
  --i-table {wd}/data/merged-table.qza \
  --m-metadata-file {map_file} \
  --o-filtered-table {wd}/data/merged-table_metadata-matched.qza

#step 2: filter the merged representative sequences against the table produced in step 1
!qiime feature-table filter-seqs \
  --i-table {wd}/data/merged-table_metadata-matched.qza \
  --i-data {wd}/data/merged-repseqs.qza \
  --o-filtered-data {wd}/data/merged-repseqs-metadata-matched.qza


In [None]:
#download the pretrained classifier artifact used by the taxonomy classification step
#-p keeps the directory creation safe to repeat when the cell is re-run
!mkdir -p {wd}/taxonomy
#capital -O sets the download destination; lowercase -o would only send wget's log to that path
!wget -O {wd}/taxonomy/gg-13-8-99-515-806-nb-classifier.qza https://data.qiime2.org/2018.11/common/gg-13-8-99-515-806-nb-classifier.qza

In [None]:
#classify taxonomy using a pretrained classifier on the Greengenes 13_8 database
#the command is echoed into qsub, so it is submitted as a batch job (name sang_nb, 8gb memory, 24h walltime)
#instead of running inside the notebook; later cells that read taxonomy/sk_gg_tax.qza need that job to have finished
!echo "qiime feature-classifier classify-sklearn \
  --i-classifier {wd}/taxonomy/gg-13-8-99-515-806-nb-classifier.qza \
  --i-reads {wd}/data/merged-repseqs-metadata-matched.qza \
  --o-classification {wd}/taxonomy/sk_gg_tax.qza" | qsub -k eo -N sang_nb -l mem=8gb -l walltime=24:00:00 -V

In [None]:
#place 'novel' sequences into the Greengenes backbone tree
#-p keeps the directory creation safe to repeat when the cell is re-run
!mkdir -p {wd}/tree
#submitted through qsub as job sang_ins (1 node x 16 cores, 128gb memory, 100h walltime);
#writes the insertion tree and the placements under tree/
!echo "qiime fragment-insertion sepp \
  --i-representative-sequences {wd}/data/merged-repseqs-metadata-matched.qza \
  --o-tree {wd}/tree/insertion-tree.qza \
  --o-placements {wd}/tree/insertion-placements.qza \
  --p-threads 16" | qsub -k eo -N sang_ins -l nodes=1:ppn=16 -l mem=128gb -l walltime=100:00:00 -V

In [None]:
#filter out Chloroplast and mitochondrial sequences
#uses the taxonomy assignments in taxonomy/sk_gg_tax.qza with the exclusion terms "mitochondria" and "chloroplast"
#output table carries the suffix .nochloro-nomito
!qiime taxa filter-table \
  --i-table {wd}/data/merged-table_metadata-matched.qza \
  --i-taxonomy {wd}/taxonomy/sk_gg_tax.qza \
  --p-exclude mitochondria,chloroplast \
  --o-filtered-table {wd}/data/merged-table_metadata-matched.nochloro-nomito.qza

In [None]:
#rarefy the feature table to 5000 seqs/sample
#NOTE: each rarefaction will result in a slightly different dataset, which will affect all downstream analyses
#input: the chloroplast/mitochondria-filtered table; output carries the suffix .d5k
!qiime feature-table rarefy \
  --i-table {wd}/data/merged-table_metadata-matched.nochloro-nomito.qza \
  --p-sampling-depth 5000 \
  --o-rarefied-table {wd}/data/merged-table_metadata-matched.nochloro-nomito.d5k.qza

In [None]:
#calculate unifrac distance among samples
#create the output directory first: no earlier cell makes the bdiv folder the job writes into
!mkdir -p {wd}/bdiv
#unweighted UniFrac on the rarefied table and the insertion tree, submitted through qsub as job uu
#(8gb pmem, 60h walltime); the distance matrix is needed by the PCoA cells
!echo "qiime diversity beta-phylogenetic \
  --i-phylogeny {wd}/tree/insertion-tree.qza \
  --i-table {wd}/data/merged-table_metadata-matched.nochloro-nomito.d5k.qza \
  --p-metric unweighted_unifrac \
  --o-distance-matrix {wd}/bdiv/unw_unifrac.qza" \
 | qsub -k eo -N uu -l pmem=8gb -l walltime=60:00:00 -V

## PCoA plots of full dataset, and split into birds and bats

In [None]:
#subset the unweighted UniFrac distance matrix using the metadata file
#bats: samples where Taxonomy_Order is 'Chiroptera'
!qiime diversity filter-distance-matrix \
  --i-distance-matrix {wd}/bdiv/unw_unifrac.qza \
  --m-metadata-file {map_file} \
  --p-where "Taxonomy_Order='Chiroptera'" \
  --o-filtered-distance-matrix {wd}/bdiv/unw_unifrac_bats.qza

#birds: samples where Taxonomy_Genus is 'Geospiza'
!qiime diversity filter-distance-matrix \
  --i-distance-matrix {wd}/bdiv/unw_unifrac.qza \
  --m-metadata-file {map_file} \
  --p-where "Taxonomy_Genus='Geospiza'" \
  --o-filtered-distance-matrix {wd}/bdiv/unw_unifrac_birds.qza

In [None]:
#run PCoA on each distance matrix
#full dataset
!qiime diversity pcoa \
  --i-distance-matrix {wd}/bdiv/unw_unifrac.qza \
  --o-pcoa {wd}/bdiv/pcoa_unw_unifrac.qza

#bats only
!qiime diversity pcoa \
  --i-distance-matrix {wd}/bdiv/unw_unifrac_bats.qza \
  --o-pcoa {wd}/bdiv/pcoa_unw_unifrac_bats.qza

#birds only
!qiime diversity pcoa \
  --i-distance-matrix {wd}/bdiv/unw_unifrac_birds.qza \
  --o-pcoa {wd}/bdiv/pcoa_unw_unifrac_birds.qza

In [None]:
#build Emperor plots from each PCoA result, annotated with the metadata file
#outputs are visualizations, so they use the .qzv extension (same as the ANCOM results) rather than .qza
#full dataset
!qiime emperor plot \
  --i-pcoa {wd}/bdiv/pcoa_unw_unifrac.qza \
  --m-metadata-file {map_file} \
  --o-visualization {wd}/bdiv/plot_pcoa_unw_unifrac.qzv

#bats only
!qiime emperor plot \
  --i-pcoa {wd}/bdiv/pcoa_unw_unifrac_bats.qza \
  --m-metadata-file {map_file} \
  --o-visualization {wd}/bdiv/plot_pcoa_unw_unifrac_bats.qzv

#birds only
!qiime emperor plot \
  --i-pcoa {wd}/bdiv/pcoa_unw_unifrac_birds.qza \
  --m-metadata-file {map_file} \
  --o-visualization {wd}/bdiv/plot_pcoa_unw_unifrac_birds.qzv

## PICRUST

In [None]:
#filter the metadata-matched representative sequences against the chloroplast/mitochondria-filtered table
#so the sequences and the table passed to closed-reference clustering refer to the same features
!qiime feature-table filter-seqs \
  --i-data {wd}/data/merged-repseqs-metadata-matched.qza \
  --i-table {wd}/data/merged-table_metadata-matched.nochloro-nomito.qza \
  --o-filtered-data {wd}/data/merged-repseqs-metadata-matched.nochloro-nomito.qza

In [None]:
#perform closed reference OTU picking using the deblurred sequences to be used in PICRUSt
#recommended reference is Greengenes version 13_5
#submitted through qsub as job sang_cr (1 node x 16 cores, 64gb memory, 20h walltime);
#all outputs go to the closed_ref_gg13_5 directory, which the following cells read from
!echo "qiime vsearch cluster-features-closed-reference \
  --i-sequences {wd}/data/merged-repseqs-metadata-matched.nochloro-nomito.qza \
  --i-table {wd}/data/merged-table_metadata-matched.nochloro-nomito.qza \
  --i-reference-sequences /home/ssong/bin/gg_13_5_otus/rep_set/gg_13_5_otu_99.qza \
  --p-perc-identity 1 \
  --p-threads 16 \
  --output-dir {wd}/closed_ref_gg13_5" \
 | qsub -k eo -N sang_cr -l nodes=1:ppn=16 -l mem=64gb -l walltime=20:00:00 -V

In [None]:
#rarefy feature table to 5000 seqs/sample
#NOTE: each rarefaction will result in a slightly different dataset, which will affect all downstream analyses
#input is clustered_table.qza from the closed-reference output directory
!qiime feature-table rarefy \
  --i-table {wd}/closed_ref_gg13_5/clustered_table.qza \
  --p-sampling-depth 5000 \
  --o-rarefied-table {wd}/closed_ref_gg13_5/clustered_table.d5k.qza

In [None]:
#convert the Q2 artifact into a biom table (readable by PICRUSt)
#step 1: export the rarefied clustered table into closed_ref_gg13_5/biom/
!qiime tools export \
  --input-path {wd}/closed_ref_gg13_5/clustered_table.d5k.qza \
  --output-path {wd}/closed_ref_gg13_5/biom/

#step 2: convert the exported feature-table.biom to JSON with table type "OTU table"
!biom convert \
  -i {wd}/closed_ref_gg13_5/biom/feature-table.biom \
  -o {wd}/closed_ref_gg13_5/clustered_table.d5k.biom \
  --to-json \
  --table-type "OTU table"

In [None]:
#NOTE(review): a `!` line runs in a shell started just for that line, so this deactivation
#presumably does not carry over to later cells -- confirm how the environment is really switched
!source deactivate

In [None]:
#NOTE(review): a `!` line runs in a shell started just for that line, so activating the picrust
#environment here presumably does not affect the PICRUSt cells that follow -- confirm the scripts are on PATH
!source activate picrust

In [None]:
#output directory for the PICRUSt results; -p keeps this safe to repeat when the cell is re-run
!mkdir -p {wd}/picrust

In [None]:
#PICRUSt steps, run on the rarefied closed-reference biom table
#step 1: normalize_by_copy_number.py on the JSON biom table
!normalize_by_copy_number.py \
 -i {wd}/closed_ref_gg13_5/clustered_table.d5k.biom\
 -o {wd}/picrust/cr_d5k.normalized_otus.biom

#step 2: predict_metagenomes.py on the normalized table; -a additionally writes cr_d5k.nsti_per_sample.txt
!predict_metagenomes.py \
 -i {wd}/picrust/cr_d5k.normalized_otus.biom \
 -o {wd}/picrust/cr_d5k.metagenome_predictions.biom \
 -a {wd}/picrust/cr_d5k.nsti_per_sample.txt

#step 3: categorize_by_function.py on the predictions, KEGG_Pathways level 3
!categorize_by_function.py \
 -i {wd}/picrust/cr_d5k.metagenome_predictions.biom \
 -c KEGG_Pathways -l 3 \
 -o {wd}/picrust/cr_d5k.metagenome_predictions.L3.biom

#step 4: same input, KEGG_Pathways level 2
!categorize_by_function.py \
 -i {wd}/picrust/cr_d5k.metagenome_predictions.biom \
 -c KEGG_Pathways -l 2 \
 -o {wd}/picrust/cr_d5k.metagenome_predictions.L2.biom

In [None]:
#import back into Qiime2 and split into birds and bats
#NOTE(review): this cell passes --input-format while the first import cell passes --source-format;
#confirm which spelling the installed Qiime2 release accepts
#full metagenome prediction table
!qiime tools import \
  --input-path {wd}/picrust/cr_d5k.metagenome_predictions.biom \
  --output-path {wd}/picrust/cr_d5k.metagenome_predictions.qza \
  --type 'FeatureTable[Frequency]' \
  --input-format BIOMV210Format

#KEGG_Pathways level 3 table
!qiime tools import \
  --input-path {wd}/picrust/cr_d5k.metagenome_predictions.L3.biom \
  --output-path {wd}/picrust/cr_d5k.metagenome_predictions.L3.qza \
  --type 'FeatureTable[Frequency]' \
  --input-format BIOMV210Format

#split the imported prediction tables by host group using the metadata file
#full prediction table, bats (Taxonomy_Order = 'Chiroptera')
!qiime feature-table filter-samples \
  --i-table {wd}/picrust/cr_d5k.metagenome_predictions.qza \
  --m-metadata-file {map_file} \
  --p-where "Taxonomy_Order='Chiroptera'" \
  --o-filtered-table {wd}/picrust/cr_d5k.metagenome_predictions_bats.qza

#full prediction table, birds (Taxonomy_Genus = 'Geospiza')
!qiime feature-table filter-samples \
  --i-table {wd}/picrust/cr_d5k.metagenome_predictions.qza \
  --m-metadata-file {map_file} \
  --p-where "Taxonomy_Genus='Geospiza'" \
  --o-filtered-table {wd}/picrust/cr_d5k.metagenome_predictions_birds.qza

#level 3 pathway table, bats
!qiime feature-table filter-samples \
  --i-table {wd}/picrust/cr_d5k.metagenome_predictions.L3.qza \
  --m-metadata-file {map_file} \
  --p-where "Taxonomy_Order='Chiroptera'" \
  --o-filtered-table {wd}/picrust/cr_d5k.metagenome_predictions_bats.L3.qza

#level 3 pathway table, birds
#the input must be the .L3 table so the birds L3 output holds pathways, matching the bats L3 output
!qiime feature-table filter-samples \
  --i-table {wd}/picrust/cr_d5k.metagenome_predictions.L3.qza \
  --m-metadata-file {map_file} \
  --p-where "Taxonomy_Genus='Geospiza'" \
  --o-filtered-table {wd}/picrust/cr_d5k.metagenome_predictions_birds.L3.qza

## ANCOM for differential abundance testing

In [None]:
#using unrarefied data
#split the chloroplast/mitochondria-filtered table (before rarefaction) by host group
#bats: samples where Taxonomy_Order is 'Chiroptera'
!qiime feature-table filter-samples \
  --i-table {wd}/data/merged-table_metadata-matched.nochloro-nomito.qza \
  --m-metadata-file {map_file} \
  --p-where "Taxonomy_Order='Chiroptera'" \
  --o-filtered-table {wd}/data/merged-table_metadata-matched.nochloro-nomito_bats.qza
    
#birds: samples where Taxonomy_Genus is 'Geospiza'
!qiime feature-table filter-samples \
  --i-table {wd}/data/merged-table_metadata-matched.nochloro-nomito.qza \
  --m-metadata-file {map_file} \
  --p-where "Taxonomy_Genus='Geospiza'" \
  --o-filtered-table {wd}/data/merged-table_metadata-matched.nochloro-nomito_birds.qza

In [None]:
#ancom works on log ratios, and thus zeroes must not be present.
#a pseudocount of 1 is added to everything
#-p keeps the directory creation safe to repeat when the cell is re-run
!mkdir -p {wd}/ancom
#bats feature table
!qiime composition add-pseudocount \
  --i-table {wd}/data/merged-table_metadata-matched.nochloro-nomito_bats.qza \
  --o-composition-table {wd}/ancom/ancom_table_bats.qza

#birds feature table
!qiime composition add-pseudocount \
  --i-table {wd}/data/merged-table_metadata-matched.nochloro-nomito_birds.qza \
  --o-composition-table {wd}/ancom/ancom_table_birds.qza

In [None]:
#add pseudocounts to the PICRUSt prediction tables ahead of ANCOM
#-p keeps the directory creation safe to repeat and also creates the parent picrust folder if it is missing
!mkdir -p {wd}/picrust/ancom
#full prediction table, bats
!qiime composition add-pseudocount \
  --i-table {wd}/picrust/cr_d5k.metagenome_predictions_bats.qza \
  --o-composition-table {wd}/picrust/ancom/ancom_table_bats.qza

#full prediction table, birds
!qiime composition add-pseudocount \
  --i-table {wd}/picrust/cr_d5k.metagenome_predictions_birds.qza \
  --o-composition-table {wd}/picrust/ancom/ancom_table_birds.qza

#level 3 pathway table, bats
!qiime composition add-pseudocount \
  --i-table {wd}/picrust/cr_d5k.metagenome_predictions_bats.L3.qza \
  --o-composition-table {wd}/picrust/ancom/ancom_table_bats.L3.qza

#level 3 pathway table, birds
!qiime composition add-pseudocount \
  --i-table {wd}/picrust/cr_d5k.metagenome_predictions_birds.L3.qza \
  --o-composition-table {wd}/picrust/ancom/ancom_table_birds.L3.qza

In [None]:
#identify sequences that are differentially abundant between hematophagous and non-hematophagous bats and birds
#each ANCOM run is submitted through qsub (32gb pmem, 200h walltime) and tests the
#'hematophagous' metadata column; results are written as .qzv visualizations under ancom/
#bats (job bat_ancom)
!echo "qiime composition ancom \
  --i-table {wd}/ancom/ancom_table_bats.qza \
  --m-metadata-file {map_file} \
  --m-metadata-column hematophagous \
  --o-visualization {wd}/ancom/ancom-results.bats.qzv" \
 | qsub -k eo -N bat_ancom -l pmem=32gb -l walltime=200:00:00 -V
        
#birds (job bird_ancom)
!echo "qiime composition ancom \
  --i-table {wd}/ancom/ancom_table_birds.qza \
  --m-metadata-file {map_file} \
  --m-metadata-column hematophagous \
  --o-visualization {wd}/ancom/ancom-results.birds.qzv" \
 | qsub -k eo -N bird_ancom -l pmem=32gb -l walltime=200:00:00 -V

In [None]:
#identify KOs and pathways that are differentially abundant
#same ANCOM call as for the sequence tables, applied to the pseudocounted PICRUSt tables;
#every run tests the 'hematophagous' metadata column and is submitted through qsub
#full prediction table, bats (job pbat_ancom, 8gb pmem, 20h walltime)
!echo "qiime composition ancom \
  --i-table {wd}/picrust/ancom/ancom_table_bats.qza \
  --m-metadata-file {map_file} \
  --m-metadata-column hematophagous \
  --o-visualization {wd}/picrust/ancom/ancom-results.bats.qzv" \
 | qsub -k eo -N pbat_ancom -l pmem=8gb -l walltime=20:00:00 -V
        
#full prediction table, birds (job pbird_ancom, 8gb pmem, 20h walltime)
!echo "qiime composition ancom \
  --i-table {wd}/picrust/ancom/ancom_table_birds.qza \
  --m-metadata-file {map_file} \
  --m-metadata-column hematophagous \
  --o-visualization {wd}/picrust/ancom/ancom-results.birds.qzv" \
 | qsub -k eo -N pbird_ancom -l pmem=8gb -l walltime=20:00:00 -V
        
#level 3 pathway table, bats (job pbatl3_ancom, 4gb pmem, 10h walltime)
!echo "qiime composition ancom \
  --i-table {wd}/picrust/ancom/ancom_table_bats.L3.qza \
  --m-metadata-file {map_file} \
  --m-metadata-column hematophagous \
  --o-visualization {wd}/picrust/ancom/ancom-results.bats.L3.qzv" \
 | qsub -k eo -N pbatl3_ancom -l pmem=4gb -l walltime=10:00:00 -V
        
#level 3 pathway table, birds (job pbirdl3_ancom, 4gb pmem, 10h walltime)
!echo "qiime composition ancom \
  --i-table {wd}/picrust/ancom/ancom_table_birds.L3.qza \
  --m-metadata-file {map_file} \
  --m-metadata-column hematophagous \
  --o-visualization {wd}/picrust/ancom/ancom-results.birds.L3.qzv" \
 | qsub -k eo -N pbirdl3_ancom -l pmem=4gb -l walltime=10:00:00 -V

#### view resulting pcoas and ancom visualizations in qiime2 viewer (view.qiime2.org)

### use picrust to determine contributions of OTUs to the KOs identified by ANCOM

In [None]:
#run PICRUSt's metagenome_contributions.py on the copy-number-normalized OTU table,
#limited (-l) to the listed KO ids; the result is written to picrust/ko_metagenome_contributions.tsv
!metagenome_contributions.py \
  -i {wd}/picrust/cr_d5k.normalized_otus.biom \
  -l K07650,K10793,K10795,K10796,K01777,K05020,K06714,K07710,K10670,K10672,K12960,K01583,K12267,K08100,K00176,K00177,K00532,K00171,K00175,K00179,K00180,K10829,K00929,K01034,K00043,K01035,K01905,K02688,K09696,K09697,K03191,K14048 \
  -o {wd}/picrust/ko_metagenome_contributions.tsv
