# Set up notebook environment
## NOTE: Use a QIIME2 kernel

In [2]:
import os
import biom
import warnings
import pickle
import numpy as np
import pandas as pd
import qiime2 as q2
from biom import Table
from skbio import OrdinationResults
from skbio.stats import subsample_counts
from skbio.stats.distance import permanova, anosim, mantel
from skbio.stats.distance import DistanceMatrix
from qiime2.plugins.deicode.actions import rpca
from qiime2.plugins.feature_table.actions import rarefy
from qiime2.plugins.diversity.actions import beta_group_significance
from qiime2.plugins.emperor.actions import biplot, plot
from qiime2.plugins.diversity.actions import (beta,
                                              beta_phylogenetic,
                                              pcoa)
from qiime2.plugins import demux, deblur, quality_filter, \
                           metadata, feature_table, alignment, \
                           phylogeny, diversity, emperor, feature_classifier, \
                           taxa, composition

from assets.step_wise_anova import run_stepwise_anova
from qiime2.plugins.fragment_insertion.actions import filter_features
warnings.filterwarnings("ignore", category=DeprecationWarning)

# helper functions
from assets.util_updated import (mantel_matched, simulate_depth,
                        all_dists, all_dists_no_tree, nested_permanova)

# plotting
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

plt.style.use('ggplot')
%matplotlib inline


# Subset metadata to make paired files between extraction kits

In [None]:
# Load the tab-separated sample metadata sheet into a DataFrame
metadata_path = '/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/sample_metadata/12201_metadata.txt'
md = pd.read_csv(metadata_path, sep='\t')


In [None]:
# Split the sample metadata by extraction round.
# Rows whose 'round' equals 3 are dropped first; rounds 1 and 2 are then separated.
md_round1and2 = md.loc[md['round'].ne(3)]
md_round1 = md_round1and2.loc[md_round1and2['round'].eq(1)]
md_round2 = md_round1and2.loc[md_round1and2['round'].eq(2)]


In [None]:
# Build one metadata subset per extraction kit within each round
def _rows_for_kit(round_md, kit_name):
    """Return the rows of `round_md` whose 'extraction_kit' equals `kit_name`."""
    return round_md[round_md['extraction_kit'] == kit_name]


# Round 1 kits
md_round1_powersoil = _rows_for_kit(md_round1, 'PowerSoil')
md_round1_powersoil_pro = _rows_for_kit(md_round1, 'PowerSoil Pro')
md_round1_norgen = _rows_for_kit(md_round1, 'Norgen')

# Round 2 kits
md_round2_powersoil = _rows_for_kit(md_round2, 'PowerSoil')
md_round2_magmax = _rows_for_kit(md_round2, 'MagMAX Microbiome')
md_round2_nucleomag = _rows_for_kit(md_round2, 'NucleoMag Food')
md_round2_zymo = _rows_for_kit(md_round2, 'Zymo MagBead')


In [None]:
# Pair the PowerSoil rows of each round with the rows of every other kit from that round
def _stack_pair(reference_md, other_md):
    """Concatenate two metadata frames row-wise, reference kit first."""
    return pd.concat([reference_md, other_md])


# Round 1 comparisons
md_round1_ps_vs_pro = _stack_pair(md_round1_powersoil, md_round1_powersoil_pro)
md_round1_ps_vs_norgen = _stack_pair(md_round1_powersoil, md_round1_norgen)

# Round 2 comparisons
md_round2_ps_vs_magmax = _stack_pair(md_round2_powersoil, md_round2_magmax)
md_round2_ps_vs_nucleomag = _stack_pair(md_round2_powersoil, md_round2_nucleomag)
md_round2_ps_vs_zymo = _stack_pair(md_round2_powersoil, md_round2_zymo)


In [None]:
# Write each paired metadata frame to its own tab-separated file (row index omitted)
paired_exports = [
    (md_round1_ps_vs_pro,
     '/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/sample_metadata/01_paired_files/12201_metadata_round1_ps_vs_pro.txt'),
    (md_round1_ps_vs_norgen,
     '/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/sample_metadata/01_paired_files/12201_metadata_round1_ps_vs_norgen.txt'),
    (md_round2_ps_vs_magmax,
     '/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/sample_metadata/01_paired_files/12201_metadata_round2_ps_vs_magmax.txt'),
    (md_round2_ps_vs_nucleomag,
     '/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/sample_metadata/01_paired_files/12201_metadata_round2_ps_vs_nucleomag.txt'),
    (md_round2_ps_vs_zymo,
     '/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/sample_metadata/01_paired_files/12201_metadata_round2_ps_vs_zymo.txt'),
]
for paired_md, out_path in paired_exports:
    paired_md.to_csv(out_path, sep='\t', index=False)


# Mantel tests between pairs of kits

## 16S data

In [3]:
# Load inputs for the 16S Mantel tests
_load_metadata = q2.Metadata.load
_load_artifact = q2.Artifact.load

## Paired sample metadata, one file per kit comparison
md_round1_ps_vs_pro_q2 = _load_metadata(
    '/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/sample_metadata/01_paired_files/12201_metadata_round1_ps_vs_pro.txt')
md_round1_ps_vs_norgen_q2 = _load_metadata(
    '/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/sample_metadata/01_paired_files/12201_metadata_round1_ps_vs_norgen.txt')
md_round2_ps_vs_magmax_q2 = _load_metadata(
    '/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/sample_metadata/01_paired_files/12201_metadata_round2_ps_vs_magmax.txt')
md_round2_ps_vs_nucleomag_q2 = _load_metadata(
    '/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/sample_metadata/01_paired_files/12201_metadata_round2_ps_vs_nucleomag.txt')
md_round2_ps_vs_zymo_q2 = _load_metadata(
    '/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/sample_metadata/01_paired_files/12201_metadata_round2_ps_vs_zymo.txt')

## Feature tables: one artifact for the "hbm" samples and one for the "lbm" samples
table_16S_hbm = _load_artifact(
    '/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/data/16S/10_filtered_data/dna_bothPS_16S_deblur_biom_lod_noChl_noMit_sepp_gg_noNTCs_noMock_hbm.qza')
table_16S_lbm = _load_artifact(
    '/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/data/16S/10_filtered_data/dna_bothPS_16S_deblur_biom_lod_noChl_noMit_sepp_gg_noNTCs_noMock_lbm.qza')

## Tree artifact handed to all_dists in the cells below
tree_16S = _load_artifact(
    '/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/data/16S/09_fragment_insertion/dna_all_16S_deblur_seqs_noChl_noMit_tree_gg.qza')


In [4]:
# PowerSoil vs. PowerSoil Pro - High biomass samples
## Filter table
table_16S_hbm_biom = table_16S_hbm.view(Table)
md_round1_ps_vs_pro_df_hbm = md_round1_ps_vs_pro_q2.to_dataframe()
# Sort the shared sample IDs: set iteration order is not stable between
# kernel sessions, which made the metadata row order irreproducible.
shared_ = sorted(set(table_16S_hbm_biom.ids()) & set(md_round1_ps_vs_pro_df_hbm.index))
md_round1_ps_vs_pro_df_hbm = md_round1_ps_vs_pro_df_hbm.reindex(shared_)
# biom's Table.filter mutates in place by default; request copies so every
# name below refers to exactly the table its name describes.
table_16S_hbm_biom_ps_vs_pro = table_16S_hbm_biom.filter(shared_, inplace=False)
# Drop features (observations) that sum to zero across the retained samples
keep_ = table_16S_hbm_biom_ps_vs_pro.ids('observation')[table_16S_hbm_biom_ps_vs_pro.sum('observation') > 0]
table_16S_hbm_biom_ps_vs_pro = table_16S_hbm_biom_ps_vs_pro.filter(keep_, axis='observation', inplace=False)

## Import filtered table and re-indexed metadata file
table_16S_hbm_ps_vs_pro = q2.Artifact.import_data('FeatureTable[Frequency]', table_16S_hbm_biom_ps_vs_pro)
md_round1_ps_vs_pro_q2_hbm = q2.Metadata(md_round1_ps_vs_pro_df_hbm)

## Generate distance matrices using the 'all_dists' util
rare_depth_16S_hbm = 12690  # depth passed to all_dists (presumably the rarefaction depth)
dists_res_16S_hbm = all_dists(table_16S_hbm_ps_vs_pro,
                              rare_depth_16S_hbm, tree_16S)

## Make a unique ID: the sample name minus its last two dot-separated fields
md_round1_ps_vs_pro_q2_dist = md_round1_ps_vs_pro_q2_hbm.to_dataframe().copy()
md_round1_ps_vs_pro_q2_dist['unique_sample_id'] = [
    '.'.join(rn_.split('.')[:-2]) for rn_ in md_round1_ps_vs_pro_q2_dist.index]
grouping = 'extraction_kit'
ids = 'unique_sample_id'

## Run Mantel test for each distance matrix
mantel_res_16S_hbm = {}
for metric_, dist_res_ in dists_res_16S_hbm.items():
    dist_mantel = dist_res_.distance_matrix.view(DistanceMatrix)
    # align the metadata to the samples present in this distance matrix
    md_round1_ps_vs_pro_q2_dist_sub = md_round1_ps_vs_pro_q2_dist.reindex(dist_mantel.ids)
    # stored per metric; labelled corr, p, n when compiled below
    mantel_res_16S_hbm[metric_] = mantel_matched(dist_mantel,
                                                 md_round1_ps_vs_pro_q2_dist_sub,
                                                 grouping,
                                                 ids)

## Compile (rows: corr, p, n; columns: one per metric), save, and display
mantel_res_16S_hbm = pd.DataFrame(mantel_res_16S_hbm,
                                  ['corr', 'p', 'n'])
mantel_res_16S_hbm.to_csv('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/results/mantel_correlations/table_mantel_16S_hbm_ps_vs_pro.txt', sep='\t')
mantel_res_16S_hbm


Unnamed: 0,Jaccard,Unweighted UniFrac,Weighted UniFrac,RPCA
corr,0.838001,0.789207,0.769943,0.84296
p,0.0002,0.0002,0.0002,0.0002
n,45.0,45.0,45.0,45.0


In [5]:
# PowerSoil vs. PowerSoil Pro - Low biomass samples
## Filter table
table_16S_lbm_biom = table_16S_lbm.view(Table)
md_round1_ps_vs_pro_df_lbm = md_round1_ps_vs_pro_q2.to_dataframe()
# Sort the shared sample IDs: set iteration order is not stable between
# kernel sessions, which made the metadata row order irreproducible.
shared_ = sorted(set(table_16S_lbm_biom.ids()) & set(md_round1_ps_vs_pro_df_lbm.index))
md_round1_ps_vs_pro_df_lbm = md_round1_ps_vs_pro_df_lbm.reindex(shared_)
# biom's Table.filter mutates in place by default; request copies so every
# name below refers to exactly the table its name describes.
table_16S_lbm_biom_ps_vs_pro = table_16S_lbm_biom.filter(shared_, inplace=False)
# Drop features (observations) that sum to zero across the retained samples
keep_ = table_16S_lbm_biom_ps_vs_pro.ids('observation')[table_16S_lbm_biom_ps_vs_pro.sum('observation') > 0]
table_16S_lbm_biom_ps_vs_pro = table_16S_lbm_biom_ps_vs_pro.filter(keep_, axis='observation', inplace=False)

## Import filtered table and re-indexed metadata file
table_16S_lbm_ps_vs_pro = q2.Artifact.import_data('FeatureTable[Frequency]', table_16S_lbm_biom_ps_vs_pro)
md_round1_ps_vs_pro_q2_lbm = q2.Metadata(md_round1_ps_vs_pro_df_lbm)

## Generate distance matrices using the 'all_dists' util
rare_depth_16S_lbm = 3295  # depth passed to all_dists (presumably the rarefaction depth)
dists_res_16S_lbm = all_dists(table_16S_lbm_ps_vs_pro,
                              rare_depth_16S_lbm, tree_16S)

## Make a unique ID: the sample name minus its last two dot-separated fields
md_round1_ps_vs_pro_q2_dist = md_round1_ps_vs_pro_q2_lbm.to_dataframe().copy()
md_round1_ps_vs_pro_q2_dist['unique_sample_id'] = [
    '.'.join(rn_.split('.')[:-2]) for rn_ in md_round1_ps_vs_pro_q2_dist.index]
grouping = 'extraction_kit'
ids = 'unique_sample_id'

## Run Mantel test for each distance matrix
mantel_res_16S_lbm = {}
for metric_, dist_res_ in dists_res_16S_lbm.items():
    dist_mantel = dist_res_.distance_matrix.view(DistanceMatrix)
    # align the metadata to the samples present in this distance matrix
    md_round1_ps_vs_pro_q2_dist_sub = md_round1_ps_vs_pro_q2_dist.reindex(dist_mantel.ids)
    # stored per metric; labelled corr, p, n when compiled below
    mantel_res_16S_lbm[metric_] = mantel_matched(dist_mantel,
                                                 md_round1_ps_vs_pro_q2_dist_sub,
                                                 grouping,
                                                 ids)

## Compile (rows: corr, p, n; columns: one per metric), save, and display
mantel_res_16S_lbm = pd.DataFrame(mantel_res_16S_lbm,
                                  ['corr', 'p', 'n'])
mantel_res_16S_lbm.to_csv('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/results/mantel_correlations/table_mantel_16S_lbm_ps_vs_pro.txt', sep='\t')
mantel_res_16S_lbm


Unnamed: 0,Jaccard,Unweighted UniFrac,Weighted UniFrac,RPCA
corr,0.907415,0.904647,0.90518,0.883156
p,0.0002,0.0002,0.0002,0.0002
n,28.0,28.0,28.0,28.0


In [6]:
# PowerSoil vs. Norgen - High biomass samples
## Filter table
table_16S_hbm_biom = table_16S_hbm.view(Table)
md_round1_ps_vs_norgen_df_hbm = md_round1_ps_vs_norgen_q2.to_dataframe()
# Sort the shared sample IDs: set iteration order is not stable between
# kernel sessions, which made the metadata row order irreproducible.
shared_ = sorted(set(table_16S_hbm_biom.ids()) & set(md_round1_ps_vs_norgen_df_hbm.index))
md_round1_ps_vs_norgen_df_hbm = md_round1_ps_vs_norgen_df_hbm.reindex(shared_)
# biom's Table.filter mutates in place by default; request copies so every
# name below refers to exactly the table its name describes.
table_16S_hbm_biom_ps_vs_norgen = table_16S_hbm_biom.filter(shared_, inplace=False)
# Drop features (observations) that sum to zero across the retained samples
keep_ = table_16S_hbm_biom_ps_vs_norgen.ids('observation')[table_16S_hbm_biom_ps_vs_norgen.sum('observation') > 0]
table_16S_hbm_biom_ps_vs_norgen = table_16S_hbm_biom_ps_vs_norgen.filter(keep_, axis='observation', inplace=False)

## Import filtered table and re-indexed metadata file
table_16S_hbm_ps_vs_norgen = q2.Artifact.import_data('FeatureTable[Frequency]', table_16S_hbm_biom_ps_vs_norgen)
md_round1_ps_vs_norgen_q2_hbm = q2.Metadata(md_round1_ps_vs_norgen_df_hbm)

## Generate distance matrices using the 'all_dists' util
rare_depth_16S_hbm = 12690  # depth passed to all_dists (presumably the rarefaction depth)
dists_res_16S_hbm = all_dists(table_16S_hbm_ps_vs_norgen,
                              rare_depth_16S_hbm, tree_16S)

## Make a unique ID: the sample name minus its last two dot-separated fields
md_round1_ps_vs_norgen_q2_dist = md_round1_ps_vs_norgen_q2_hbm.to_dataframe().copy()
md_round1_ps_vs_norgen_q2_dist['unique_sample_id'] = [
    '.'.join(rn_.split('.')[:-2]) for rn_ in md_round1_ps_vs_norgen_q2_dist.index]
grouping = 'extraction_kit'
ids = 'unique_sample_id'

## Run Mantel test for each distance matrix
mantel_res_16S_hbm = {}
for metric_, dist_res_ in dists_res_16S_hbm.items():
    dist_mantel = dist_res_.distance_matrix.view(DistanceMatrix)
    # align the metadata to the samples present in this distance matrix
    md_round1_ps_vs_norgen_q2_dist_sub = md_round1_ps_vs_norgen_q2_dist.reindex(dist_mantel.ids)
    # stored per metric; labelled corr, p, n when compiled below
    mantel_res_16S_hbm[metric_] = mantel_matched(dist_mantel,
                                                 md_round1_ps_vs_norgen_q2_dist_sub,
                                                 grouping,
                                                 ids)

## Compile (rows: corr, p, n; columns: one per metric), save, and display
mantel_res_16S_hbm = pd.DataFrame(mantel_res_16S_hbm,
                                  ['corr', 'p', 'n'])
mantel_res_16S_hbm.to_csv('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/results/mantel_correlations/table_mantel_16S_hbm_ps_vs_norgen.txt', sep='\t')
mantel_res_16S_hbm


Unnamed: 0,Jaccard,Unweighted UniFrac,Weighted UniFrac,RPCA
corr,0.923284,0.873744,0.860568,0.860604
p,0.0002,0.0002,0.0002,0.0002
n,42.0,42.0,42.0,42.0


In [7]:
# PowerSoil vs. Norgen - Low biomass samples
## Filter table
table_16S_lbm_biom = table_16S_lbm.view(Table)
md_round1_ps_vs_norgen_df_lbm = md_round1_ps_vs_norgen_q2.to_dataframe()
# Sort the shared sample IDs: set iteration order is not stable between
# kernel sessions, which made the metadata row order irreproducible.
shared_ = sorted(set(table_16S_lbm_biom.ids()) & set(md_round1_ps_vs_norgen_df_lbm.index))
md_round1_ps_vs_norgen_df_lbm = md_round1_ps_vs_norgen_df_lbm.reindex(shared_)
# biom's Table.filter mutates in place by default; request copies so every
# name below refers to exactly the table its name describes.
table_16S_lbm_biom_ps_vs_norgen = table_16S_lbm_biom.filter(shared_, inplace=False)
# Drop features (observations) that sum to zero across the retained samples
keep_ = table_16S_lbm_biom_ps_vs_norgen.ids('observation')[table_16S_lbm_biom_ps_vs_norgen.sum('observation') > 0]
table_16S_lbm_biom_ps_vs_norgen = table_16S_lbm_biom_ps_vs_norgen.filter(keep_, axis='observation', inplace=False)

## Import filtered table and re-indexed metadata file
table_16S_lbm_ps_vs_norgen = q2.Artifact.import_data('FeatureTable[Frequency]', table_16S_lbm_biom_ps_vs_norgen)
md_round1_ps_vs_norgen_q2_lbm = q2.Metadata(md_round1_ps_vs_norgen_df_lbm)

## Generate distance matrices using the 'all_dists' util
rare_depth_16S_lbm = 3295  # depth passed to all_dists (presumably the rarefaction depth)
dists_res_16S_lbm = all_dists(table_16S_lbm_ps_vs_norgen,
                              rare_depth_16S_lbm, tree_16S)

## Make a unique ID: the sample name minus its last two dot-separated fields
md_round1_ps_vs_norgen_q2_dist = md_round1_ps_vs_norgen_q2_lbm.to_dataframe().copy()
md_round1_ps_vs_norgen_q2_dist['unique_sample_id'] = [
    '.'.join(rn_.split('.')[:-2]) for rn_ in md_round1_ps_vs_norgen_q2_dist.index]
grouping = 'extraction_kit'
ids = 'unique_sample_id'

## Run Mantel test for each distance matrix
mantel_res_16S_lbm = {}
for metric_, dist_res_ in dists_res_16S_lbm.items():
    dist_mantel = dist_res_.distance_matrix.view(DistanceMatrix)
    # align the metadata to the samples present in this distance matrix
    md_round1_ps_vs_norgen_q2_dist_sub = md_round1_ps_vs_norgen_q2_dist.reindex(dist_mantel.ids)
    # stored per metric; labelled corr, p, n when compiled below
    mantel_res_16S_lbm[metric_] = mantel_matched(dist_mantel,
                                                 md_round1_ps_vs_norgen_q2_dist_sub,
                                                 grouping,
                                                 ids)

## Compile (rows: corr, p, n; columns: one per metric), save, and display
mantel_res_16S_lbm = pd.DataFrame(mantel_res_16S_lbm,
                                  ['corr', 'p', 'n'])
mantel_res_16S_lbm.to_csv('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/results/mantel_correlations/table_mantel_16S_lbm_ps_vs_norgen.txt', sep='\t')
mantel_res_16S_lbm


Unnamed: 0,Jaccard,Unweighted UniFrac,Weighted UniFrac,RPCA
corr,0.81055,0.327616,0.850769,0.07074
p,0.0022,0.211158,0.007598,0.777045
n,7.0,7.0,7.0,7.0


In [8]:
# PowerSoil vs. MagMAX Microbiome - High biomass samples
## Filter table
table_16S_hbm_biom = table_16S_hbm.view(Table)
md_round2_ps_vs_magmax_df_hbm = md_round2_ps_vs_magmax_q2.to_dataframe()
# Sort the shared sample IDs: set iteration order is not stable between
# kernel sessions, which made the metadata row order irreproducible.
shared_ = sorted(set(table_16S_hbm_biom.ids()) & set(md_round2_ps_vs_magmax_df_hbm.index))
md_round2_ps_vs_magmax_df_hbm = md_round2_ps_vs_magmax_df_hbm.reindex(shared_)
# biom's Table.filter mutates in place by default; request copies so every
# name below refers to exactly the table its name describes.
table_16S_hbm_biom_ps_vs_magmax = table_16S_hbm_biom.filter(shared_, inplace=False)
# Drop features (observations) that sum to zero across the retained samples
keep_ = table_16S_hbm_biom_ps_vs_magmax.ids('observation')[table_16S_hbm_biom_ps_vs_magmax.sum('observation') > 0]
table_16S_hbm_biom_ps_vs_magmax = table_16S_hbm_biom_ps_vs_magmax.filter(keep_, axis='observation', inplace=False)

## Import filtered table and re-indexed metadata file
table_16S_hbm_ps_vs_magmax = q2.Artifact.import_data('FeatureTable[Frequency]', table_16S_hbm_biom_ps_vs_magmax)
md_round2_ps_vs_magmax_q2_hbm = q2.Metadata(md_round2_ps_vs_magmax_df_hbm)

## Generate distance matrices using the 'all_dists' util
rare_depth_16S_hbm = 12690  # depth passed to all_dists (presumably the rarefaction depth)
dists_res_16S_hbm = all_dists(table_16S_hbm_ps_vs_magmax,
                              rare_depth_16S_hbm, tree_16S)

## Make a unique ID: the sample name minus its last two dot-separated fields
md_round2_ps_vs_magmax_q2_dist = md_round2_ps_vs_magmax_q2_hbm.to_dataframe().copy()
md_round2_ps_vs_magmax_q2_dist['unique_sample_id'] = [
    '.'.join(rn_.split('.')[:-2]) for rn_ in md_round2_ps_vs_magmax_q2_dist.index]
grouping = 'extraction_kit'
ids = 'unique_sample_id'

## Run Mantel test for each distance matrix
mantel_res_16S_hbm = {}
for metric_, dist_res_ in dists_res_16S_hbm.items():
    dist_mantel = dist_res_.distance_matrix.view(DistanceMatrix)
    # align the metadata to the samples present in this distance matrix
    md_round2_ps_vs_magmax_q2_dist_sub = md_round2_ps_vs_magmax_q2_dist.reindex(dist_mantel.ids)
    # stored per metric; labelled corr, p, n when compiled below
    mantel_res_16S_hbm[metric_] = mantel_matched(dist_mantel,
                                                 md_round2_ps_vs_magmax_q2_dist_sub,
                                                 grouping,
                                                 ids)

## Compile (rows: corr, p, n; columns: one per metric), save, and display
mantel_res_16S_hbm = pd.DataFrame(mantel_res_16S_hbm,
                                  ['corr', 'p', 'n'])
mantel_res_16S_hbm.to_csv('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/results/mantel_correlations/table_mantel_16S_hbm_ps_vs_magmax.txt', sep='\t')
mantel_res_16S_hbm


Unnamed: 0,Jaccard,Unweighted UniFrac,Weighted UniFrac,RPCA
corr,0.954889,0.895365,0.931581,0.904735
p,0.0002,0.0002,0.0002,0.0002
n,40.0,40.0,40.0,40.0


In [9]:
# PowerSoil vs. MagMAX Microbiome - Low biomass samples
## Filter table
table_16S_lbm_biom = table_16S_lbm.view(Table)
md_round2_ps_vs_magmax_df_lbm = md_round2_ps_vs_magmax_q2.to_dataframe()
# Sort the shared sample IDs: set iteration order is not stable between
# kernel sessions, which made the metadata row order irreproducible.
shared_ = sorted(set(table_16S_lbm_biom.ids()) & set(md_round2_ps_vs_magmax_df_lbm.index))
md_round2_ps_vs_magmax_df_lbm = md_round2_ps_vs_magmax_df_lbm.reindex(shared_)
# biom's Table.filter mutates in place by default; request copies so every
# name below refers to exactly the table its name describes.
table_16S_lbm_biom_ps_vs_magmax = table_16S_lbm_biom.filter(shared_, inplace=False)
# Drop features (observations) that sum to zero across the retained samples
keep_ = table_16S_lbm_biom_ps_vs_magmax.ids('observation')[table_16S_lbm_biom_ps_vs_magmax.sum('observation') > 0]
table_16S_lbm_biom_ps_vs_magmax = table_16S_lbm_biom_ps_vs_magmax.filter(keep_, axis='observation', inplace=False)

## Import filtered table and re-indexed metadata file
table_16S_lbm_ps_vs_magmax = q2.Artifact.import_data('FeatureTable[Frequency]', table_16S_lbm_biom_ps_vs_magmax)
md_round2_ps_vs_magmax_q2_lbm = q2.Metadata(md_round2_ps_vs_magmax_df_lbm)

## Generate distance matrices using the 'all_dists' util
rare_depth_16S_lbm = 3295  # depth passed to all_dists (presumably the rarefaction depth)
dists_res_16S_lbm = all_dists(table_16S_lbm_ps_vs_magmax,
                              rare_depth_16S_lbm, tree_16S)

## Make a unique ID: the sample name minus its last two dot-separated fields
md_round2_ps_vs_magmax_q2_dist = md_round2_ps_vs_magmax_q2_lbm.to_dataframe().copy()
md_round2_ps_vs_magmax_q2_dist['unique_sample_id'] = [
    '.'.join(rn_.split('.')[:-2]) for rn_ in md_round2_ps_vs_magmax_q2_dist.index]
grouping = 'extraction_kit'
ids = 'unique_sample_id'

## Run Mantel test for each distance matrix
mantel_res_16S_lbm = {}
for metric_, dist_res_ in dists_res_16S_lbm.items():
    dist_mantel = dist_res_.distance_matrix.view(DistanceMatrix)
    # align the metadata to the samples present in this distance matrix
    md_round2_ps_vs_magmax_q2_dist_sub = md_round2_ps_vs_magmax_q2_dist.reindex(dist_mantel.ids)
    # stored per metric; labelled corr, p, n when compiled below
    mantel_res_16S_lbm[metric_] = mantel_matched(dist_mantel,
                                                 md_round2_ps_vs_magmax_q2_dist_sub,
                                                 grouping,
                                                 ids)

## Compile (rows: corr, p, n; columns: one per metric), save, and display
mantel_res_16S_lbm = pd.DataFrame(mantel_res_16S_lbm,
                                  ['corr', 'p', 'n'])
mantel_res_16S_lbm.to_csv('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/results/mantel_correlations/table_mantel_16S_lbm_ps_vs_magmax.txt', sep='\t')
mantel_res_16S_lbm


Unnamed: 0,Jaccard,Unweighted UniFrac,Weighted UniFrac,RPCA
corr,0.912166,0.922532,0.926807,0.831227
p,0.0002,0.0002,0.0002,0.0002
n,48.0,48.0,48.0,48.0


In [10]:
# PowerSoil vs. NucleoMag Food - High biomass samples
## Filter table
table_16S_hbm_biom = table_16S_hbm.view(Table)
md_round2_ps_vs_nucleomag_df_hbm = md_round2_ps_vs_nucleomag_q2.to_dataframe()
# Sort the shared sample IDs: set iteration order is not stable between
# kernel sessions, which made the metadata row order irreproducible.
shared_ = sorted(set(table_16S_hbm_biom.ids()) & set(md_round2_ps_vs_nucleomag_df_hbm.index))
md_round2_ps_vs_nucleomag_df_hbm = md_round2_ps_vs_nucleomag_df_hbm.reindex(shared_)
# biom's Table.filter mutates in place by default; request copies so every
# name below refers to exactly the table its name describes.
table_16S_hbm_biom_ps_vs_nucleomag = table_16S_hbm_biom.filter(shared_, inplace=False)
# Drop features (observations) that sum to zero across the retained samples
keep_ = table_16S_hbm_biom_ps_vs_nucleomag.ids('observation')[table_16S_hbm_biom_ps_vs_nucleomag.sum('observation') > 0]
table_16S_hbm_biom_ps_vs_nucleomag = table_16S_hbm_biom_ps_vs_nucleomag.filter(keep_, axis='observation', inplace=False)

## Import filtered table and re-indexed metadata file
table_16S_hbm_ps_vs_nucleomag = q2.Artifact.import_data('FeatureTable[Frequency]', table_16S_hbm_biom_ps_vs_nucleomag)
md_round2_ps_vs_nucleomag_q2_hbm = q2.Metadata(md_round2_ps_vs_nucleomag_df_hbm)

## Generate distance matrices using the 'all_dists' util
rare_depth_16S_hbm = 12690  # depth passed to all_dists (presumably the rarefaction depth)
dists_res_16S_hbm = all_dists(table_16S_hbm_ps_vs_nucleomag,
                              rare_depth_16S_hbm, tree_16S)

## Make a unique ID: the sample name minus its last two dot-separated fields
md_round2_ps_vs_nucleomag_q2_dist = md_round2_ps_vs_nucleomag_q2_hbm.to_dataframe().copy()
md_round2_ps_vs_nucleomag_q2_dist['unique_sample_id'] = [
    '.'.join(rn_.split('.')[:-2]) for rn_ in md_round2_ps_vs_nucleomag_q2_dist.index]
grouping = 'extraction_kit'
ids = 'unique_sample_id'

## Run Mantel test for each distance matrix
mantel_res_16S_hbm = {}
for metric_, dist_res_ in dists_res_16S_hbm.items():
    dist_mantel = dist_res_.distance_matrix.view(DistanceMatrix)
    # align the metadata to the samples present in this distance matrix
    md_round2_ps_vs_nucleomag_q2_dist_sub = md_round2_ps_vs_nucleomag_q2_dist.reindex(dist_mantel.ids)
    # stored per metric; labelled corr, p, n when compiled below
    mantel_res_16S_hbm[metric_] = mantel_matched(dist_mantel,
                                                 md_round2_ps_vs_nucleomag_q2_dist_sub,
                                                 grouping,
                                                 ids)

## Compile (rows: corr, p, n; columns: one per metric), save, and display
mantel_res_16S_hbm = pd.DataFrame(mantel_res_16S_hbm,
                                  ['corr', 'p', 'n'])
mantel_res_16S_hbm.to_csv('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/results/mantel_correlations/table_mantel_16S_hbm_ps_vs_nucleomag.txt', sep='\t')
mantel_res_16S_hbm


Unnamed: 0,Jaccard,Unweighted UniFrac,Weighted UniFrac,RPCA
corr,0.956863,0.896437,0.939377,0.928647
p,0.0002,0.0002,0.0002,0.0002
n,45.0,45.0,45.0,45.0


In [11]:
# PowerSoil vs. NucleoMag Food - Low biomass samples
## Filter table
table_16S_lbm_biom = table_16S_lbm.view(Table)
md_round2_ps_vs_nucleomag_df_lbm = md_round2_ps_vs_nucleomag_q2.to_dataframe()
# Sort the shared sample IDs: set iteration order is not stable between
# kernel sessions, which made the metadata row order irreproducible.
shared_ = sorted(set(table_16S_lbm_biom.ids()) & set(md_round2_ps_vs_nucleomag_df_lbm.index))
md_round2_ps_vs_nucleomag_df_lbm = md_round2_ps_vs_nucleomag_df_lbm.reindex(shared_)
# biom's Table.filter mutates in place by default; request copies so every
# name below refers to exactly the table its name describes.
table_16S_lbm_biom_ps_vs_nucleomag = table_16S_lbm_biom.filter(shared_, inplace=False)
# Drop features (observations) that sum to zero across the retained samples
keep_ = table_16S_lbm_biom_ps_vs_nucleomag.ids('observation')[table_16S_lbm_biom_ps_vs_nucleomag.sum('observation') > 0]
table_16S_lbm_biom_ps_vs_nucleomag = table_16S_lbm_biom_ps_vs_nucleomag.filter(keep_, axis='observation', inplace=False)

## Import filtered table and re-indexed metadata file
table_16S_lbm_ps_vs_nucleomag = q2.Artifact.import_data('FeatureTable[Frequency]', table_16S_lbm_biom_ps_vs_nucleomag)
md_round2_ps_vs_nucleomag_q2_lbm = q2.Metadata(md_round2_ps_vs_nucleomag_df_lbm)

## Generate distance matrices using the 'all_dists' util
rare_depth_16S_lbm = 3295  # depth passed to all_dists (presumably the rarefaction depth)
dists_res_16S_lbm = all_dists(table_16S_lbm_ps_vs_nucleomag,
                              rare_depth_16S_lbm, tree_16S)

## Make a unique ID: the sample name minus its last two dot-separated fields
md_round2_ps_vs_nucleomag_q2_dist = md_round2_ps_vs_nucleomag_q2_lbm.to_dataframe().copy()
md_round2_ps_vs_nucleomag_q2_dist['unique_sample_id'] = [
    '.'.join(rn_.split('.')[:-2]) for rn_ in md_round2_ps_vs_nucleomag_q2_dist.index]
grouping = 'extraction_kit'
ids = 'unique_sample_id'

## Run Mantel test for each distance matrix
mantel_res_16S_lbm = {}
for metric_, dist_res_ in dists_res_16S_lbm.items():
    dist_mantel = dist_res_.distance_matrix.view(DistanceMatrix)
    # align the metadata to the samples present in this distance matrix
    md_round2_ps_vs_nucleomag_q2_dist_sub = md_round2_ps_vs_nucleomag_q2_dist.reindex(dist_mantel.ids)
    # stored per metric; labelled corr, p, n when compiled below
    mantel_res_16S_lbm[metric_] = mantel_matched(dist_mantel,
                                                 md_round2_ps_vs_nucleomag_q2_dist_sub,
                                                 grouping,
                                                 ids)

## Compile (rows: corr, p, n; columns: one per metric), save, and display
mantel_res_16S_lbm = pd.DataFrame(mantel_res_16S_lbm,
                                  ['corr', 'p', 'n'])
mantel_res_16S_lbm.to_csv('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/results/mantel_correlations/table_mantel_16S_lbm_ps_vs_nucleomag.txt', sep='\t')
mantel_res_16S_lbm


Unnamed: 0,Jaccard,Unweighted UniFrac,Weighted UniFrac,RPCA
corr,0.891625,0.897431,0.897413,0.818539
p,0.0002,0.0002,0.0002,0.0002
n,44.0,44.0,44.0,44.0


In [12]:
# PowerSoil vs. Zymo MagBead - High biomass samples
## Filter table
table_16S_hbm_biom = table_16S_hbm.view(Table)
md_round2_ps_vs_zymo_df_hbm = md_round2_ps_vs_zymo_q2.to_dataframe()
# Sort the shared sample IDs: set iteration order is not stable between
# kernel sessions, which made the metadata row order irreproducible.
shared_ = sorted(set(table_16S_hbm_biom.ids()) & set(md_round2_ps_vs_zymo_df_hbm.index))
md_round2_ps_vs_zymo_df_hbm = md_round2_ps_vs_zymo_df_hbm.reindex(shared_)
# biom's Table.filter mutates in place by default; request copies so every
# name below refers to exactly the table its name describes.
table_16S_hbm_biom_ps_vs_zymo = table_16S_hbm_biom.filter(shared_, inplace=False)
# Drop features (observations) that sum to zero across the retained samples
keep_ = table_16S_hbm_biom_ps_vs_zymo.ids('observation')[table_16S_hbm_biom_ps_vs_zymo.sum('observation') > 0]
table_16S_hbm_biom_ps_vs_zymo = table_16S_hbm_biom_ps_vs_zymo.filter(keep_, axis='observation', inplace=False)

## Import filtered table and re-indexed metadata file
table_16S_hbm_ps_vs_zymo = q2.Artifact.import_data('FeatureTable[Frequency]', table_16S_hbm_biom_ps_vs_zymo)
md_round2_ps_vs_zymo_q2_hbm = q2.Metadata(md_round2_ps_vs_zymo_df_hbm)

## Generate distance matrices using the 'all_dists' util
rare_depth_16S_hbm = 12690  # depth passed to all_dists (presumably the rarefaction depth)
dists_res_16S_hbm = all_dists(table_16S_hbm_ps_vs_zymo,
                              rare_depth_16S_hbm, tree_16S)

## Make a unique ID: the sample name minus its last two dot-separated fields
md_round2_ps_vs_zymo_q2_dist = md_round2_ps_vs_zymo_q2_hbm.to_dataframe().copy()
md_round2_ps_vs_zymo_q2_dist['unique_sample_id'] = [
    '.'.join(rn_.split('.')[:-2]) for rn_ in md_round2_ps_vs_zymo_q2_dist.index]
grouping = 'extraction_kit'
ids = 'unique_sample_id'

## Run Mantel test for each distance matrix
mantel_res_16S_hbm = {}
for metric_, dist_res_ in dists_res_16S_hbm.items():
    dist_mantel = dist_res_.distance_matrix.view(DistanceMatrix)
    # align the metadata to the samples present in this distance matrix
    md_round2_ps_vs_zymo_q2_dist_sub = md_round2_ps_vs_zymo_q2_dist.reindex(dist_mantel.ids)
    # stored per metric; labelled corr, p, n when compiled below
    mantel_res_16S_hbm[metric_] = mantel_matched(dist_mantel,
                                                 md_round2_ps_vs_zymo_q2_dist_sub,
                                                 grouping,
                                                 ids)

## Compile (rows: corr, p, n; columns: one per metric), save, and display
mantel_res_16S_hbm = pd.DataFrame(mantel_res_16S_hbm,
                                  ['corr', 'p', 'n'])
mantel_res_16S_hbm.to_csv('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/results/mantel_correlations/table_mantel_16S_hbm_ps_vs_zymo.txt', sep='\t')
mantel_res_16S_hbm


Unnamed: 0,Jaccard,Unweighted UniFrac,Weighted UniFrac,RPCA
corr,0.964286,0.906934,0.901048,0.960285
p,0.0002,0.0002,0.0002,0.0002
n,45.0,45.0,45.0,45.0


In [13]:
# PowerSoil vs. Zymo MagBead - Low biomass samples
## Filter table
table_16S_lbm_biom = table_16S_lbm.view(Table)
md_round2_ps_vs_zymo_df_lbm = md_round2_ps_vs_zymo_q2.to_dataframe()
# Sort the shared sample IDs: set iteration order is not stable between
# kernel sessions, which made the metadata row order irreproducible.
shared_ = sorted(set(table_16S_lbm_biom.ids()) & set(md_round2_ps_vs_zymo_df_lbm.index))
md_round2_ps_vs_zymo_df_lbm = md_round2_ps_vs_zymo_df_lbm.reindex(shared_)
# biom's Table.filter mutates in place by default; request copies so every
# name below refers to exactly the table its name describes.
table_16S_lbm_biom_ps_vs_zymo = table_16S_lbm_biom.filter(shared_, inplace=False)
# Drop features (observations) that sum to zero across the retained samples
keep_ = table_16S_lbm_biom_ps_vs_zymo.ids('observation')[table_16S_lbm_biom_ps_vs_zymo.sum('observation') > 0]
table_16S_lbm_biom_ps_vs_zymo = table_16S_lbm_biom_ps_vs_zymo.filter(keep_, axis='observation', inplace=False)

## Import filtered table and re-indexed metadata file
table_16S_lbm_ps_vs_zymo = q2.Artifact.import_data('FeatureTable[Frequency]', table_16S_lbm_biom_ps_vs_zymo)
md_round2_ps_vs_zymo_q2_lbm = q2.Metadata(md_round2_ps_vs_zymo_df_lbm)

## Generate distance matrices using the 'all_dists' util
rare_depth_16S_lbm = 3295  # depth passed to all_dists (presumably the rarefaction depth)
dists_res_16S_lbm = all_dists(table_16S_lbm_ps_vs_zymo,
                              rare_depth_16S_lbm, tree_16S)

## Make a unique ID: the sample name minus its last two dot-separated fields
md_round2_ps_vs_zymo_q2_dist = md_round2_ps_vs_zymo_q2_lbm.to_dataframe().copy()
md_round2_ps_vs_zymo_q2_dist['unique_sample_id'] = [
    '.'.join(rn_.split('.')[:-2]) for rn_ in md_round2_ps_vs_zymo_q2_dist.index]
grouping = 'extraction_kit'
ids = 'unique_sample_id'

## Run Mantel test for each distance matrix
mantel_res_16S_lbm = {}
for metric_, dist_res_ in dists_res_16S_lbm.items():
    dist_mantel = dist_res_.distance_matrix.view(DistanceMatrix)
    # align the metadata to the samples present in this distance matrix
    md_round2_ps_vs_zymo_q2_dist_sub = md_round2_ps_vs_zymo_q2_dist.reindex(dist_mantel.ids)
    # stored per metric; labelled corr, p, n when compiled below
    mantel_res_16S_lbm[metric_] = mantel_matched(dist_mantel,
                                                 md_round2_ps_vs_zymo_q2_dist_sub,
                                                 grouping,
                                                 ids)

## Compile (rows: corr, p, n; columns: one per metric), save, and display
mantel_res_16S_lbm = pd.DataFrame(mantel_res_16S_lbm,
                                  ['corr', 'p', 'n'])
mantel_res_16S_lbm.to_csv('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/results/mantel_correlations/table_mantel_16S_lbm_ps_vs_zymo.txt', sep='\t')
mantel_res_16S_lbm


Unnamed: 0,Jaccard,Unweighted UniFrac,Weighted UniFrac,RPCA
corr,0.892554,0.888277,0.866754,0.810922
p,0.0002,0.0002,0.0002,0.0002
n,18.0,18.0,18.0,18.0


# Stepwise ANOVA

In [14]:
# Show the derived unique-sample-ID column (column name held in `ids`) for the
# most recently built PowerSoil vs. Zymo MagBead metadata frame
md_round2_ps_vs_zymo_q2_dist.loc[:, ids]


sample_name
12201.built.keyboard.A.1.2.zymoMA.L    12201.built.keyboard.A.1.2
12201.urine.male.C.2.powersoil.L             12201.urine.male.C.2
12201.skin.armpit.C.1.zymoMA.L              12201.skin.armpit.C.1
12201.human.milk.Z.1.powersoil.L             12201.human.milk.Z.1
12201.urine.female.D.1.powersoil.L         12201.urine.female.D.1
                                                  ...            
12201.urine.male.B.3.powersoil.L             12201.urine.male.B.3
12201.skin.armpit.B.2.powersoil.L           12201.skin.armpit.B.2
12201.urine.female.E.2.powersoil.L         12201.urine.female.E.2
12201.built.keyboard.A.1.1.zymoMA.L    12201.built.keyboard.A.1.1
12201.urine.female.F.3.zymoMA.L            12201.urine.female.F.3
Name: unique_sample_id, Length: 96, dtype: object

In [15]:
# Generate ordinations (row=samples, cols=axes)
# NOTE: this reads `dists_res_16S_lbm`, i.e. whichever low-biomass comparison
# was computed last in the kernel.
pcoa_res = {}
# The three distance-based metrics go through PCoA ...
for pcoa_metric in ('Jaccard', 'Unweighted UniFrac', 'Weighted UniFrac'):
    pcoa_out = pcoa(dists_res_16S_lbm[pcoa_metric].distance_matrix)
    pcoa_res[pcoa_metric] = pcoa_out.pcoa.view(OrdinationResults).samples
# ... whereas the RPCA result already carries a biplot ordination
rpca_biplot = dists_res_16S_lbm['RPCA'].biplot
pcoa_res['RPCA'] = rpca_biplot.view(OrdinationResults).samples


In [None]:
# Stepwise ANOVA of sample metadata against the first three ordination axes
es_all = {}
# Candidate covariates handed to run_stepwise_anova.
# NOTE(review): confirm every column listed here exists in the metadata used
# below and survives the column filter; the list looks carried over from
# another analysis.
use_ = ['sample_type', 'sample_type_2','sample_type_3','biomass_sample','sample_technical_replicate', 'bead_beating']
# `mf` was never defined in this notebook, so this cell raised NameError on a
# fresh kernel. `pcoa_res` is built from `dists_res_16S_lbm`, which on a
# top-to-bottom run holds the PowerSoil vs. Zymo MagBead low-biomass
# comparison, so use that comparison's metadata.
# NOTE(review): confirm this is the intended metadata.
mf = md_round2_ps_vs_zymo_q2_lbm
# clean up meta (only stuff to run)
mf_ord = mf.to_dataframe().copy()
# crude filter: keep columns with more than one distinct value but fewer
# distinct values than half the number of rows
keep_ = [v_ for v_ in mf_ord.columns
         if len(set(mf_ord[v_])) > 1 and
         len(set(mf_ord[v_])) < mf_ord.shape[0]//2]
mf_ord = mf_ord[keep_]
# run step-wise ANOVA for all ordinations
for metric_, ord_ in pcoa_res.items():
    # get first three axes by position, so this works whether the ordination
    # columns are labelled 0, 1, 2 or 'PC1', 'PC2', 'PC3'
    ord_ = ord_.iloc[:, :3].copy()
    ord_.columns = ['PC1','PC2','PC3']
    # subset/match: restrict both frames to shared samples, in a stable order
    mf_ord_ = mf_ord.copy()
    shared_ids = sorted(set(ord_.index) & set(mf_ord_.index))
    mf_ord_ = mf_ord_.loc[shared_ids,:]
    ord_ = ord_.loc[shared_ids,:]
    es_all[metric_] = run_stepwise_anova(ord_, mf_ord_, use_)
# concat all runs
es_alldf = pd.concat(es_all).rename({'+ sample_type_2':'Sample Type'}, axis=0)
# NOTE(review): relative output path and file name differ from the absolute
# results paths used by the Mantel cells; confirm the destination.
es_alldf.to_csv('results/tables/effect-size_2min_20min.tsv', sep='\t')
es_alldf


There appear to be differences between bead_beating, biomass, and incubation across the various metrics.