# Set up notebook environment
## NOTE: Use a QIIME2 kernel

In [5]:
import os
import biom
import warnings
import pickle
import numpy as np
import pandas as pd
import qiime2 as q2
from biom import Table
from skbio import OrdinationResults
from skbio.stats import subsample_counts
from skbio.stats.distance import permanova, anosim, mantel
from skbio.stats.distance import DistanceMatrix
from qiime2.plugins.deicode.actions import rpca
from qiime2.plugins.feature_table.actions import rarefy
from qiime2.plugins.diversity.actions import beta_group_significance
from qiime2.plugins.emperor.actions import biplot, plot
from qiime2.plugins.diversity.actions import (beta,
                                              beta_phylogenetic,
                                              pcoa)
from qiime2.plugins import demux, deblur, quality_filter, \
                           metadata, feature_table, alignment, \
                           phylogeny, diversity, emperor, feature_classifier, \
                           taxa, composition

from assets.step_wise_anova import run_stepwise_anova
from qiime2.plugins.fragment_insertion.actions import filter_features
warnings.filterwarnings("ignore", category=DeprecationWarning)

# helper functions
from assets.util_updated import (mantel_matched, simulate_depth,
                        all_dists, all_dists_no_tree, nested_permanova)

# plotting
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

plt.style.use('ggplot')
%matplotlib inline


# Subset metadata to make paired files between extraction kits

In [3]:
# Read in sample metadata
md = pd.read_csv('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/sample_metadata/12201_metadata.txt',
                sep = '\t')


In [6]:
# Subset sample metadata to make files for round 1 and round 2
md_round1and2 = md[md['round'] != 3]
md_round1 = md_round1and2[md_round1and2['round'] == 1]
md_round2 = md_round1and2[md_round1and2['round'] == 2]


In [7]:
# Subset round-specific metadata files to make files for each kit
md_round1_powersoil = md_round1[md_round1['extraction_kit'] == 'PowerSoil']
md_round1_powersoil_pro = md_round1[md_round1['extraction_kit'] == 'PowerSoil Pro']
md_round1_norgen = md_round1[md_round1['extraction_kit'] == 'Norgen']
md_round2_powersoil = md_round2[md_round2['extraction_kit'] == 'PowerSoil']
md_round2_magmax = md_round2[md_round2['extraction_kit'] == 'MagMAX Microbiome']
md_round2_nucleomag = md_round2[md_round2['extraction_kit'] == 'NucleoMag Food']
md_round2_zymo = md_round2[md_round2['extraction_kit'] == 'Zymo MagBead']


In [9]:
# Merge kit-specific files to make paired files for comparison
md_round1_ps_vs_pro = pd.concat([md_round1_powersoil, md_round1_powersoil_pro])
md_round1_ps_vs_norgen = pd.concat([md_round1_powersoil, md_round1_norgen])
md_round2_ps_vs_magmax = pd.concat([md_round2_powersoil, md_round2_magmax])
md_round2_ps_vs_nucleomag = pd.concat([md_round2_powersoil, md_round2_nucleomag])
md_round2_ps_vs_zymo = pd.concat([md_round2_powersoil, md_round2_zymo])


In [10]:
# Export paired files
md_round1_ps_vs_pro.to_csv('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/sample_metadata/01_paired_files/12201_metadata_round1_ps_vs_pro.txt',
                           sep = '\t',
                           index = False)
md_round1_ps_vs_norgen.to_csv('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/sample_metadata/01_paired_files/12201_metadata_round1_ps_vs_norgen.txt',
                           sep = '\t',
                           index = False)
md_round2_ps_vs_magmax.to_csv('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/sample_metadata/01_paired_files/12201_metadata_round2_ps_vs_magmax.txt',
                           sep = '\t',
                           index = False)
md_round2_ps_vs_nucleomag.to_csv('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/sample_metadata/01_paired_files/12201_metadata_round2_ps_vs_nucleomag.txt',
                           sep = '\t',
                           index = False)
md_round2_ps_vs_zymo.to_csv('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/sample_metadata/01_paired_files/12201_metadata_round2_ps_vs_zymo.txt',
                           sep = '\t',
                           index = False)


# Mantel tests between pairs of kits

## ITS data

In [2]:
# Import data
md_round1_ps_vs_pro_q2 = q2.Metadata.load('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/sample_metadata/01_paired_files/12201_metadata_round1_ps_vs_pro.txt')
md_round1_ps_vs_norgen_q2 = q2.Metadata.load('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/sample_metadata/01_paired_files/12201_metadata_round1_ps_vs_norgen.txt')
md_round2_ps_vs_magmax_q2 = q2.Metadata.load('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/sample_metadata/01_paired_files/12201_metadata_round2_ps_vs_magmax.txt')
md_round2_ps_vs_nucleomag_q2 = q2.Metadata.load('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/sample_metadata/01_paired_files/12201_metadata_round2_ps_vs_nucleomag.txt')
md_round2_ps_vs_zymo_q2 = q2.Metadata.load('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/sample_metadata/01_paired_files/12201_metadata_round2_ps_vs_zymo.txt')

table_its_hbm = q2.Artifact.load('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/data/ITS/08_filtered_data/dna_bothPS_ITS_deblur_biom_lod_noNTCs_noMock_hbm.qza')
table_its_lbm = q2.Artifact.load('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/data/ITS/08_filtered_data/dna_bothPS_ITS_deblur_biom_lod_noNTCs_noMock_lbm.qza')


In [6]:
# PowerSoil vs. PowerSoil Pro - High biomass samples
## Filter table
table_its_hbm_biom = table_its_hbm.view(Table)
md_round1_ps_vs_pro_df_hbm = md_round1_ps_vs_pro_q2.to_dataframe()
shared_ = list(set(table_its_hbm_biom.ids()) & set(md_round1_ps_vs_pro_df_hbm.index))
md_round1_ps_vs_pro_df_hbm = md_round1_ps_vs_pro_df_hbm.reindex(shared_)
table_its_hbm_biom_ps_vs_pro = table_its_hbm_biom.filter(shared_)
keep_ = table_its_hbm_biom_ps_vs_pro.ids('observation')[table_its_hbm_biom_ps_vs_pro.sum('observation') > 0]
table_its_hbm_biom_ps_vs_pro.filter(keep_, axis='observation')

## Import filtered table and re-indexed metadata file
table_its_hbm_ps_vs_pro = q2.Artifact.import_data('FeatureTable[Frequency]', table_its_hbm_biom_ps_vs_pro)
md_round1_ps_vs_pro_q2_hbm = q2.Metadata(md_round1_ps_vs_pro_df_hbm)

## Generate distance matrices using 'all_dissts' utils
rare_depth_its_hbm = 1491
dists_res_its_hbm = all_dists_no_tree(table_its_hbm_ps_vs_pro,
                      rare_depth_its_hbm)

## Make a unique ID
md_round1_ps_vs_pro_q2_dist = md_round1_ps_vs_pro_q2_hbm.to_dataframe().copy()
md_round1_ps_vs_pro_q2_dist['unique_sample_id'] = ['.'.join(rn_.split('.')[:-2])
                               for rn_ in md_round1_ps_vs_pro_q2_dist.index]
grouping = 'extraction_kit'
ids = 'unique_sample_id'

## Run Mantel test for each distance matrix
mantel_res_its_hbm = {}
for metric_, dist_mantel in dists_res_its_hbm.items():
    # subset mf for dist (rare)
    dist_mantel = dist_mantel.distance_matrix.view(DistanceMatrix)
    md_round1_ps_vs_pro_q2_dist_sub = md_round1_ps_vs_pro_q2_dist.reindex(dist_mantel.ids)
    # corr, p, n
    mantel_res_its_hbm[metric_] = mantel_matched(dist_mantel,
                                         md_round1_ps_vs_pro_q2_dist_sub,
                                         grouping,
                                         ids)

## Compile 
mantel_res_its_hbm = pd.DataFrame(mantel_res_its_hbm,
                          ['corr', 'p', 'n'])
mantel_res_its_hbm.to_csv('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/results/mantel_correlations/table_mantel_its_hbm_ps_vs_pro.txt', sep='\t')
mantel_res_its_hbm


Unnamed: 0,Jaccard,RPCA
corr,0.802387,0.772924
p,0.0002,0.0002
n,25.0,25.0


In [7]:
# PowerSoil vs. PowerSoil Pro - Low biomass samples
## Filter table
table_its_lbm_biom = table_its_lbm.view(Table)
md_round1_ps_vs_pro_df_lbm = md_round1_ps_vs_pro_q2.to_dataframe()
shared_ = list(set(table_its_lbm_biom.ids()) & set(md_round1_ps_vs_pro_df_lbm.index))
md_round1_ps_vs_pro_df_lbm = md_round1_ps_vs_pro_df_lbm.reindex(shared_)
table_its_lbm_biom_ps_vs_pro = table_its_lbm_biom.filter(shared_)
keep_ = table_its_lbm_biom_ps_vs_pro.ids('observation')[table_its_lbm_biom_ps_vs_pro.sum('observation') > 0]
table_its_lbm_biom_ps_vs_pro.filter(keep_, axis='observation')

## Import filtered table and re-indexed metadata file
table_its_lbm_ps_vs_pro = q2.Artifact.import_data('FeatureTable[Frequency]', table_its_lbm_biom_ps_vs_pro)
md_round1_ps_vs_pro_q2_lbm = q2.Metadata(md_round1_ps_vs_pro_df_lbm)

## Generate distance matrices using 'all_dissts' utils
rare_depth_its_lbm = 344
dists_res_its_lbm = all_dists_no_tree(table_its_lbm_ps_vs_pro,
                      rare_depth_its_lbm)

## Make a unique ID
md_round1_ps_vs_pro_q2_dist = md_round1_ps_vs_pro_q2_lbm.to_dataframe().copy()
md_round1_ps_vs_pro_q2_dist['unique_sample_id'] = ['.'.join(rn_.split('.')[:-2])
                               for rn_ in md_round1_ps_vs_pro_q2_dist.index]
grouping = 'extraction_kit'
ids = 'unique_sample_id'

## Run Mantel test for each distance matrix
mantel_res_its_lbm = {}
for metric_, dist_mantel in dists_res_its_lbm.items():
    # subset mf for dist (rare)
    dist_mantel = dist_mantel.distance_matrix.view(DistanceMatrix)
    md_round1_ps_vs_pro_q2_dist_sub = md_round1_ps_vs_pro_q2_dist.reindex(dist_mantel.ids)
    # corr, p, n
    mantel_res_its_lbm[metric_] = mantel_matched(dist_mantel,
                                         md_round1_ps_vs_pro_q2_dist_sub,
                                         grouping,
                                         ids)

## Compile 
mantel_res_its_lbm = pd.DataFrame(mantel_res_its_lbm,
                          ['corr', 'p', 'n'])
mantel_res_its_lbm.to_csv('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/results/mantel_correlations/table_mantel_its_lbm_ps_vs_pro.txt', sep='\t')
mantel_res_its_lbm


Unnamed: 0,Jaccard,RPCA
corr,0.671698,0.654427
p,0.0002,0.0004
n,26.0,26.0


In [8]:
# PowerSoil vs. Norgen - High biomass samples
## Filter table
table_its_hbm_biom = table_its_hbm.view(Table)
md_round1_ps_vs_norgen_df_hbm = md_round1_ps_vs_norgen_q2.to_dataframe()
shared_ = list(set(table_its_hbm_biom.ids()) & set(md_round1_ps_vs_norgen_df_hbm.index))
md_round1_ps_vs_norgen_df_hbm = md_round1_ps_vs_norgen_df_hbm.reindex(shared_)
table_its_hbm_biom_ps_vs_norgen = table_its_hbm_biom.filter(shared_)
keep_ = table_its_hbm_biom_ps_vs_norgen.ids('observation')[table_its_hbm_biom_ps_vs_norgen.sum('observation') > 0]
table_its_hbm_biom_ps_vs_norgen.filter(keep_, axis='observation')

## Import filtered table and re-indexed metadata file
table_its_hbm_ps_vs_norgen = q2.Artifact.import_data('FeatureTable[Frequency]', table_its_hbm_biom_ps_vs_norgen)
md_round1_ps_vs_norgen_q2_hbm = q2.Metadata(md_round1_ps_vs_norgen_df_hbm)

## Generate distance matrices using 'all_dissts' utils
rare_depth_its_hbm = 1491
dists_res_its_hbm = all_dists_no_tree(table_its_hbm_ps_vs_norgen,
                      rare_depth_its_hbm)

## Make a unique ID
md_round1_ps_vs_norgen_q2_dist = md_round1_ps_vs_norgen_q2_hbm.to_dataframe().copy()
md_round1_ps_vs_norgen_q2_dist['unique_sample_id'] = ['.'.join(rn_.split('.')[:-2])
                               for rn_ in md_round1_ps_vs_norgen_q2_dist.index]
grouping = 'extraction_kit'
ids = 'unique_sample_id'

## Run Mantel test for each distance matrix
mantel_res_its_hbm = {}
for metric_, dist_mantel in dists_res_its_hbm.items():
    # subset mf for dist (rare)
    dist_mantel = dist_mantel.distance_matrix.view(DistanceMatrix)
    md_round1_ps_vs_norgen_q2_dist_sub = md_round1_ps_vs_norgen_q2_dist.reindex(dist_mantel.ids)
    # corr, p, n
    mantel_res_its_hbm[metric_] = mantel_matched(dist_mantel,
                                         md_round1_ps_vs_norgen_q2_dist_sub,
                                         grouping,
                                         ids)

## Compile 
mantel_res_its_hbm = pd.DataFrame(mantel_res_its_hbm,
                          ['corr', 'p', 'n'])
mantel_res_its_hbm.to_csv('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/results/mantel_correlations/table_mantel_its_hbm_ps_vs_norgen.txt', sep='\t')
mantel_res_its_hbm


Unnamed: 0,Jaccard,RPCA
corr,0.684379,0.592883
p,0.020996,0.062987
n,6.0,6.0


In [9]:
# PowerSoil vs. Norgen - High biomass samples
## Filter table
table_its_lbm_biom = table_its_lbm.view(Table)
md_round1_ps_vs_norgen_df_lbm = md_round1_ps_vs_norgen_q2.to_dataframe()
shared_ = list(set(table_its_lbm_biom.ids()) & set(md_round1_ps_vs_norgen_df_lbm.index))
md_round1_ps_vs_norgen_df_lbm = md_round1_ps_vs_norgen_df_lbm.reindex(shared_)
table_its_lbm_biom_ps_vs_norgen = table_its_lbm_biom.filter(shared_)
keep_ = table_its_lbm_biom_ps_vs_norgen.ids('observation')[table_its_lbm_biom_ps_vs_norgen.sum('observation') > 0]
table_its_lbm_biom_ps_vs_norgen.filter(keep_, axis='observation')

## Import filtered table and re-indexed metadata file
table_its_lbm_ps_vs_norgen = q2.Artifact.import_data('FeatureTable[Frequency]', table_its_lbm_biom_ps_vs_norgen)
md_round1_ps_vs_norgen_q2_lbm = q2.Metadata(md_round1_ps_vs_norgen_df_lbm)

## Generate distance matrices using 'all_dissts' utils
rare_depth_its_lbm = 344
dists_res_its_lbm = all_dists_no_tree(table_its_lbm_ps_vs_norgen,
                      rare_depth_its_lbm)

## Make a unique ID
md_round1_ps_vs_norgen_q2_dist = md_round1_ps_vs_norgen_q2_lbm.to_dataframe().copy()
md_round1_ps_vs_norgen_q2_dist['unique_sample_id'] = ['.'.join(rn_.split('.')[:-2])
                               for rn_ in md_round1_ps_vs_norgen_q2_dist.index]
grouping = 'extraction_kit'
ids = 'unique_sample_id'

## Run Mantel test for each distance matrix
mantel_res_its_lbm = {}
for metric_, dist_mantel in dists_res_its_lbm.items():
    # subset mf for dist (rare)
    dist_mantel = dist_mantel.distance_matrix.view(DistanceMatrix)
    md_round1_ps_vs_norgen_q2_dist_sub = md_round1_ps_vs_norgen_q2_dist.reindex(dist_mantel.ids)
    # corr, p, n
    mantel_res_its_lbm[metric_] = mantel_matched(dist_mantel,
                                         md_round1_ps_vs_norgen_q2_dist_sub,
                                         grouping,
                                         ids)

## Compile 
mantel_res_its_lbm = pd.DataFrame(mantel_res_its_lbm,
                          ['corr', 'p', 'n'])
mantel_res_its_lbm.to_csv('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/results/mantel_correlations/table_mantel_its_lbm_ps_vs_norgen.txt', sep='\t')
mantel_res_its_lbm


ValueError: Distance matrices must have at least 3 matching IDs between them (i.e., minimum 3x3 in size).

In [10]:
# PowerSoil vs. MagMAX Microbiome - High biomass samples
## Filter table
table_its_hbm_biom = table_its_hbm.view(Table)
md_round2_ps_vs_magmax_df_hbm = md_round2_ps_vs_magmax_q2.to_dataframe()
shared_ = list(set(table_its_hbm_biom.ids()) & set(md_round2_ps_vs_magmax_df_hbm.index))
md_round2_ps_vs_magmax_df_hbm = md_round2_ps_vs_magmax_df_hbm.reindex(shared_)
table_its_hbm_biom_ps_vs_magmax = table_its_hbm_biom.filter(shared_)
keep_ = table_its_hbm_biom_ps_vs_magmax.ids('observation')[table_its_hbm_biom_ps_vs_magmax.sum('observation') > 0]
table_its_hbm_biom_ps_vs_magmax.filter(keep_, axis='observation')

## Import filtered table and re-indexed metadata file
table_its_hbm_ps_vs_magmax = q2.Artifact.import_data('FeatureTable[Frequency]', table_its_hbm_biom_ps_vs_magmax)
md_round2_ps_vs_magmax_q2_hbm = q2.Metadata(md_round2_ps_vs_magmax_df_hbm)

## Generate distance matrices using 'all_dissts' utils
rare_depth_its_hbm = 1491
dists_res_its_hbm = all_dists_no_tree(table_its_hbm_ps_vs_magmax,
                      rare_depth_its_hbm)

## Make a unique ID
md_round2_ps_vs_magmax_q2_dist = md_round2_ps_vs_magmax_q2_hbm.to_dataframe().copy()
md_round2_ps_vs_magmax_q2_dist['unique_sample_id'] = ['.'.join(rn_.split('.')[:-2])
                               for rn_ in md_round2_ps_vs_magmax_q2_dist.index]
grouping = 'extraction_kit'
ids = 'unique_sample_id'

## Run Mantel test for each distance matrix
mantel_res_its_hbm = {}
for metric_, dist_mantel in dists_res_its_hbm.items():
    # subset mf for dist (rare)
    dist_mantel = dist_mantel.distance_matrix.view(DistanceMatrix)
    md_round2_ps_vs_magmax_q2_dist_sub = md_round2_ps_vs_magmax_q2_dist.reindex(dist_mantel.ids)
    # corr, p, n
    mantel_res_its_hbm[metric_] = mantel_matched(dist_mantel,
                                         md_round2_ps_vs_magmax_q2_dist_sub,
                                         grouping,
                                         ids)

## Compile 
mantel_res_its_hbm = pd.DataFrame(mantel_res_its_hbm,
                          ['corr', 'p', 'n'])
mantel_res_its_hbm.to_csv('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/results/mantel_correlations/table_mantel_its_hbm_ps_vs_magmax.txt', sep='\t')
mantel_res_its_hbm


Unnamed: 0,Jaccard,RPCA
corr,0.704506,0.711708
p,0.0002,0.0002
n,36.0,35.0


In [11]:
# PowerSoil vs. MagMAX Microbiome - Low biomass samples
## Filter table
table_its_lbm_biom = table_its_lbm.view(Table)
md_round2_ps_vs_magmax_df_lbm = md_round2_ps_vs_magmax_q2.to_dataframe()
shared_ = list(set(table_its_lbm_biom.ids()) & set(md_round2_ps_vs_magmax_df_lbm.index))
md_round2_ps_vs_magmax_df_lbm = md_round2_ps_vs_magmax_df_lbm.reindex(shared_)
table_its_lbm_biom_ps_vs_magmax = table_its_lbm_biom.filter(shared_)
keep_ = table_its_lbm_biom_ps_vs_magmax.ids('observation')[table_its_lbm_biom_ps_vs_magmax.sum('observation') > 0]
table_its_lbm_biom_ps_vs_magmax.filter(keep_, axis='observation')

## Import filtered table and re-indexed metadata file
table_its_lbm_ps_vs_magmax = q2.Artifact.import_data('FeatureTable[Frequency]', table_its_lbm_biom_ps_vs_magmax)
md_round2_ps_vs_magmax_q2_lbm = q2.Metadata(md_round2_ps_vs_magmax_df_lbm)

## Generate distance matrices using 'all_dissts' utils
rare_depth_its_lbm = 344
dists_res_its_lbm = all_dists_no_tree(table_its_lbm_ps_vs_magmax,
                      rare_depth_its_lbm)

## Make a unique ID
md_round2_ps_vs_magmax_q2_dist = md_round2_ps_vs_magmax_q2_lbm.to_dataframe().copy()
md_round2_ps_vs_magmax_q2_dist['unique_sample_id'] = ['.'.join(rn_.split('.')[:-2])
                               for rn_ in md_round2_ps_vs_magmax_q2_dist.index]
grouping = 'extraction_kit'
ids = 'unique_sample_id'

## Run Mantel test for each distance matrix
mantel_res_its_lbm = {}
for metric_, dist_mantel in dists_res_its_lbm.items():
    # subset mf for dist (rare)
    dist_mantel = dist_mantel.distance_matrix.view(DistanceMatrix)
    md_round2_ps_vs_magmax_q2_dist_sub = md_round2_ps_vs_magmax_q2_dist.reindex(dist_mantel.ids)
    # corr, p, n
    mantel_res_its_lbm[metric_] = mantel_matched(dist_mantel,
                                         md_round2_ps_vs_magmax_q2_dist_sub,
                                         grouping,
                                         ids)

## Compile 
mantel_res_its_lbm = pd.DataFrame(mantel_res_its_lbm,
                          ['corr', 'p', 'n'])
mantel_res_its_lbm.to_csv('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/results/mantel_correlations/table_mantel_its_lbm_ps_vs_magmax.txt', sep='\t')
mantel_res_its_lbm


Unnamed: 0,Jaccard,RPCA
corr,0.493647,0.308133
p,0.0002,0.002999
n,33.0,33.0


In [12]:
# PowerSoil vs. NucleoMag Food - High biomass samples
## Filter table
table_its_hbm_biom = table_its_hbm.view(Table)
md_round2_ps_vs_nucleomag_df_hbm = md_round2_ps_vs_nucleomag_q2.to_dataframe()
shared_ = list(set(table_its_hbm_biom.ids()) & set(md_round2_ps_vs_nucleomag_df_hbm.index))
md_round2_ps_vs_nucleomag_df_hbm = md_round2_ps_vs_nucleomag_df_hbm.reindex(shared_)
table_its_hbm_biom_ps_vs_nucleomag = table_its_hbm_biom.filter(shared_)
keep_ = table_its_hbm_biom_ps_vs_nucleomag.ids('observation')[table_its_hbm_biom_ps_vs_nucleomag.sum('observation') > 0]
table_its_hbm_biom_ps_vs_nucleomag.filter(keep_, axis='observation')

## Import filtered table and re-indexed metadata file
table_its_hbm_ps_vs_nucleomag = q2.Artifact.import_data('FeatureTable[Frequency]', table_its_hbm_biom_ps_vs_nucleomag)
md_round2_ps_vs_nucleomag_q2_hbm = q2.Metadata(md_round2_ps_vs_nucleomag_df_hbm)

## Generate distance matrices using 'all_dissts' utils
rare_depth_its_hbm = 1491
dists_res_its_hbm = all_dists_no_tree(table_its_hbm_ps_vs_nucleomag,
                      rare_depth_its_hbm)

## Make a unique ID
md_round2_ps_vs_nucleomag_q2_dist = md_round2_ps_vs_nucleomag_q2_hbm.to_dataframe().copy()
md_round2_ps_vs_nucleomag_q2_dist['unique_sample_id'] = ['.'.join(rn_.split('.')[:-2])
                               for rn_ in md_round2_ps_vs_nucleomag_q2_dist.index]
grouping = 'extraction_kit'
ids = 'unique_sample_id'

## Run Mantel test for each distance matrix
mantel_res_its_hbm = {}
for metric_, dist_mantel in dists_res_its_hbm.items():
    # subset mf for dist (rare)
    dist_mantel = dist_mantel.distance_matrix.view(DistanceMatrix)
    md_round2_ps_vs_nucleomag_q2_dist_sub = md_round2_ps_vs_nucleomag_q2_dist.reindex(dist_mantel.ids)
    # corr, p, n
    mantel_res_its_hbm[metric_] = mantel_matched(dist_mantel,
                                         md_round2_ps_vs_nucleomag_q2_dist_sub,
                                         grouping,
                                         ids)

## Compile 
mantel_res_its_hbm = pd.DataFrame(mantel_res_its_hbm,
                          ['corr', 'p', 'n'])
mantel_res_its_hbm.to_csv('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/results/mantel_correlations/table_mantel_its_hbm_ps_vs_nucleomag.txt', sep='\t')
mantel_res_its_hbm


Unnamed: 0,Jaccard,RPCA
corr,0.575869,0.64412
p,0.0002,0.0002
n,33.0,32.0


In [13]:
# PowerSoil vs. NucleoMag Food - Low biomass samples
## Filter table
table_its_lbm_biom = table_its_lbm.view(Table)
md_round2_ps_vs_nucleomag_df_lbm = md_round2_ps_vs_nucleomag_q2.to_dataframe()
shared_ = list(set(table_its_lbm_biom.ids()) & set(md_round2_ps_vs_nucleomag_df_lbm.index))
md_round2_ps_vs_nucleomag_df_lbm = md_round2_ps_vs_nucleomag_df_lbm.reindex(shared_)
table_its_lbm_biom_ps_vs_nucleomag = table_its_lbm_biom.filter(shared_)
keep_ = table_its_lbm_biom_ps_vs_nucleomag.ids('observation')[table_its_lbm_biom_ps_vs_nucleomag.sum('observation') > 0]
table_its_lbm_biom_ps_vs_nucleomag.filter(keep_, axis='observation')

## Import filtered table and re-indexed metadata file
table_its_lbm_ps_vs_nucleomag = q2.Artifact.import_data('FeatureTable[Frequency]', table_its_lbm_biom_ps_vs_nucleomag)
md_round2_ps_vs_nucleomag_q2_lbm = q2.Metadata(md_round2_ps_vs_nucleomag_df_lbm)

## Generate distance matrices using 'all_dissts' utils
rare_depth_its_lbm = 344
dists_res_its_lbm = all_dists_no_tree(table_its_lbm_ps_vs_nucleomag,
                      rare_depth_its_lbm)

## Make a unique ID
md_round2_ps_vs_nucleomag_q2_dist = md_round2_ps_vs_nucleomag_q2_lbm.to_dataframe().copy()
md_round2_ps_vs_nucleomag_q2_dist['unique_sample_id'] = ['.'.join(rn_.split('.')[:-2])
                               for rn_ in md_round2_ps_vs_nucleomag_q2_dist.index]
grouping = 'extraction_kit'
ids = 'unique_sample_id'

## Run Mantel test for each distance matrix
mantel_res_its_lbm = {}
for metric_, dist_mantel in dists_res_its_lbm.items():
    # subset mf for dist (rare)
    dist_mantel = dist_mantel.distance_matrix.view(DistanceMatrix)
    md_round2_ps_vs_nucleomag_q2_dist_sub = md_round2_ps_vs_nucleomag_q2_dist.reindex(dist_mantel.ids)
    # corr, p, n
    mantel_res_its_lbm[metric_] = mantel_matched(dist_mantel,
                                         md_round2_ps_vs_nucleomag_q2_dist_sub,
                                         grouping,
                                         ids)

## Compile 
mantel_res_its_lbm = pd.DataFrame(mantel_res_its_lbm,
                          ['corr', 'p', 'n'])
mantel_res_its_lbm.to_csv('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/results/mantel_correlations/table_mantel_its_lbm_ps_vs_nucleomag.txt', sep='\t')
mantel_res_its_lbm


Unnamed: 0,Jaccard,RPCA
corr,0.497391,0.242143
p,0.0002,0.022595
n,32.0,32.0


In [14]:
# PowerSoil vs. Zymo MagBead - High biomass samples
## Filter table
table_its_hbm_biom = table_its_hbm.view(Table)
md_round2_ps_vs_zymo_df_hbm = md_round2_ps_vs_zymo_q2.to_dataframe()
shared_ = list(set(table_its_hbm_biom.ids()) & set(md_round2_ps_vs_zymo_df_hbm.index))
md_round2_ps_vs_zymo_df_hbm = md_round2_ps_vs_zymo_df_hbm.reindex(shared_)
table_its_hbm_biom_ps_vs_zymo = table_its_hbm_biom.filter(shared_)
keep_ = table_its_hbm_biom_ps_vs_zymo.ids('observation')[table_its_hbm_biom_ps_vs_zymo.sum('observation') > 0]
table_its_hbm_biom_ps_vs_zymo.filter(keep_, axis='observation')

## Import filtered table and re-indexed metadata file
table_its_hbm_ps_vs_zymo = q2.Artifact.import_data('FeatureTable[Frequency]', table_its_hbm_biom_ps_vs_zymo)
md_round2_ps_vs_zymo_q2_hbm = q2.Metadata(md_round2_ps_vs_zymo_df_hbm)

## Generate distance matrices using 'all_dissts' utils
rare_depth_its_hbm = 1491
dists_res_its_hbm = all_dists_no_tree(table_its_hbm_ps_vs_zymo,
                      rare_depth_its_hbm)

## Make a unique ID
md_round2_ps_vs_zymo_q2_dist = md_round2_ps_vs_zymo_q2_hbm.to_dataframe().copy()
md_round2_ps_vs_zymo_q2_dist['unique_sample_id'] = ['.'.join(rn_.split('.')[:-2])
                               for rn_ in md_round2_ps_vs_zymo_q2_dist.index]
grouping = 'extraction_kit'
ids = 'unique_sample_id'

## Run Mantel test for each distance matrix
mantel_res_its_hbm = {}
for metric_, dist_mantel in dists_res_its_hbm.items():
    # subset mf for dist (rare)
    dist_mantel = dist_mantel.distance_matrix.view(DistanceMatrix)
    md_round2_ps_vs_zymo_q2_dist_sub = md_round2_ps_vs_zymo_q2_dist.reindex(dist_mantel.ids)
    # corr, p, n
    mantel_res_its_hbm[metric_] = mantel_matched(dist_mantel,
                                         md_round2_ps_vs_zymo_q2_dist_sub,
                                         grouping,
                                         ids)

## Compile 
mantel_res_its_hbm = pd.DataFrame(mantel_res_its_hbm,
                          ['corr', 'p', 'n'])
mantel_res_its_hbm.to_csv('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/results/mantel_correlations/table_mantel_its_hbm_ps_vs_zymo.txt', sep='\t')
mantel_res_its_hbm


Unnamed: 0,Jaccard,RPCA
corr,0.735241,0.7349
p,0.0002,0.0002
n,45.0,45.0


In [15]:
# PowerSoil vs. Zymo MagBead - Low biomass samples
## Filter table
table_its_lbm_biom = table_its_lbm.view(Table)
md_round2_ps_vs_zymo_df_lbm = md_round2_ps_vs_zymo_q2.to_dataframe()
shared_ = list(set(table_its_lbm_biom.ids()) & set(md_round2_ps_vs_zymo_df_lbm.index))
md_round2_ps_vs_zymo_df_lbm = md_round2_ps_vs_zymo_df_lbm.reindex(shared_)
table_its_lbm_biom_ps_vs_zymo = table_its_lbm_biom.filter(shared_)
keep_ = table_its_lbm_biom_ps_vs_zymo.ids('observation')[table_its_lbm_biom_ps_vs_zymo.sum('observation') > 0]
table_its_lbm_biom_ps_vs_zymo.filter(keep_, axis='observation')

## Import filtered table and re-indexed metadata file
table_its_lbm_ps_vs_zymo = q2.Artifact.import_data('FeatureTable[Frequency]', table_its_lbm_biom_ps_vs_zymo)
md_round2_ps_vs_zymo_q2_lbm = q2.Metadata(md_round2_ps_vs_zymo_df_lbm)

## Generate distance matrices using 'all_dissts' utils
rare_depth_its_lbm = 344
dists_res_its_lbm = all_dists_no_tree(table_its_lbm_ps_vs_zymo,
                      rare_depth_its_lbm)

## Make a unique ID
md_round2_ps_vs_zymo_q2_dist = md_round2_ps_vs_zymo_q2_lbm.to_dataframe().copy()
md_round2_ps_vs_zymo_q2_dist['unique_sample_id'] = ['.'.join(rn_.split('.')[:-2])
                               for rn_ in md_round2_ps_vs_zymo_q2_dist.index]
grouping = 'extraction_kit'
ids = 'unique_sample_id'

## Run Mantel test for each distance matrix
mantel_res_its_lbm = {}
for metric_, dist_mantel in dists_res_its_lbm.items():
    # subset mf for dist (rare)
    dist_mantel = dist_mantel.distance_matrix.view(DistanceMatrix)
    md_round2_ps_vs_zymo_q2_dist_sub = md_round2_ps_vs_zymo_q2_dist.reindex(dist_mantel.ids)
    # corr, p, n
    mantel_res_its_lbm[metric_] = mantel_matched(dist_mantel,
                                         md_round2_ps_vs_zymo_q2_dist_sub,
                                         grouping,
                                         ids)

## Compile 
mantel_res_its_lbm = pd.DataFrame(mantel_res_its_lbm,
                          ['corr', 'p', 'n'])
mantel_res_its_lbm.to_csv('/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/results/mantel_correlations/table_mantel_its_lbm_ps_vs_zymo.txt', sep='\t')
mantel_res_its_lbm


Unnamed: 0,Jaccard,RPCA
corr,0.230052,0.183615
p,0.021996,0.096581
n,19.0,19.0


# Stepwise ANOVA

In [9]:
mf_dist[ids]

sample_name
skin.nares.A.2.1.microbiome2.C                    skin.nares.A.2.1
built.doorknob.D.2.microbiome20.C               built.doorknob.D.2
skin.forehead.eluent.P.1.microbiome2.C    skin.forehead.eluent.P.1
uriM.O.2.microbiome20.L                                   uriM.O.2
oral.throat.eluent.U.microbiome20.C           oral.throat.eluent.U
                                                    ...           
wat.S.A.1.microbiome20.L                                 wat.S.A.1
skin.nares.B.1.1.microbiome2.C                    skin.nares.B.1.1
oral.saline.vtm.A.3.microbiome20.C             oral.saline.vtm.A.3
skin.forehead.B.3.2.microbiome20.C             skin.forehead.B.3.2
skin.nares.D.1.1.microbiome2.C                    skin.nares.D.1.1
Name: unique_sample_id, Length: 576, dtype: object

In [10]:
# get ordinations (row=samples, cols=axes)
pcoa_res = {}
pcoa_res['Jaccard'] = pcoa(dists_res['Jaccard'].distance_matrix).pcoa.view(OrdinationResults).samples
pcoa_res['Unweighted UniFrac'] = pcoa(dists_res['Unweighted UniFrac'].distance_matrix).pcoa.view(OrdinationResults).samples
pcoa_res['Weighted UniFrac'] = pcoa(dists_res['Weighted UniFrac'].distance_matrix).pcoa.view(OrdinationResults).samples
pcoa_res['RPCA'] = dists_res['RPCA'].biplot.view(OrdinationResults).samples


In [11]:
es_all = {}
use_ = ['sample_type', 'sample_type_2','sample_type_3','biomass_sample', 'incubation','sample_technical_replicate', 'bead_beating']
# clean up meta (only stuff to run)
mf_ord = mf.to_dataframe().copy()
# shit filter but works for now
keep_ = [v_ for v_ in mf_ord.columns
         if len(set(mf_ord[v_])) > 1 and
         len(set(mf_ord[v_])) < mf_ord.shape[0]//2]
mf_ord = mf_ord[keep_]
# run stp-wise ANOVA for all ords
for metric_, ord_ in  pcoa_res.items():
    # get first three axes
    ord_ = ord_[[0,1,2]]
    ord_.columns = ['PC1','PC2','PC3']
    # subset/match
    mf_ord_ = mf_ord.copy()
    shared_ids = list(set(ord_.index)\
                      & set(mf_ord_.index))
    mf_ord_ = mf_ord_.loc[shared_ids,:]
    ord_ = ord_.loc[shared_ids,:]
    es_all[metric_] = run_stepwise_anova(ord_, mf_ord_, use_) #mf_ord_.columns)
# concat all runs
es_alldf = pd.concat(es_all).rename({'+ sample_type_2':'Sample Type'}, axis=0)
es_alldf.to_csv('results/tables/effect-size_2min_20min.tsv', sep='\t')
es_alldf


Running external command line application(s). This may print messages to stdout and/or stderr.
The command(s) being run are below. These commands cannot be manually re-run as they will depend on temporary files that no longer exist.

Command: /home/mestaki/Desktop/post-doc/magmax_comparison/assets/stepwise-rda.R /tmp/tmp1sqqp8k4/ord_.tsv /tmp/tmp1sqqp8k4/mf_.txt /tmp/tmp1sqqp8k4/output.effect.size.tsv

Running external command line application(s). This may print messages to stdout and/or stderr.
The command(s) being run are below. These commands cannot be manually re-run as they will depend on temporary files that no longer exist.

Command: /home/mestaki/Desktop/post-doc/magmax_comparison/assets/stepwise-rda.R /tmp/tmpg10q06hu/ord_.tsv /tmp/tmpg10q06hu/mf_.txt /tmp/tmpg10q06hu/output.effect.size.tsv

Running external command line application(s). This may print messages to stdout and/or stderr.
The command(s) being run are below. These commands cannot be manually re-run as they will dep

Unnamed: 0,Unnamed: 1,R2.adj,Df,AIC,F,Pr(>F)
Jaccard,+ sample_type_3,0.907956,85,-444.599389,47.768527,0.0002
Jaccard,+ incubation,0.00691,1,-475.401064,26.81167,0.0002
Unweighted UniFrac,+ sample_type_3,0.928937,85,-549.1169,62.976966,0.0002
Unweighted UniFrac,+ incubation,0.002175,1,-560.948711,11.041108,0.0002
Weighted UniFrac,+ sample_type_3,0.915989,85,-481.495494,52.694256,0.0002
Weighted UniFrac,+ biomass_sample,0.003247,1,-496.691615,13.784276,0.004399
Weighted UniFrac,+ bead_beating,0.001834,1,-505.247732,8.365581,0.0004
RPCA,+ sample_type_3,0.918215,85,-492.341302,54.229845,0.0002
RPCA,+ bead_beating,0.002746,1,-505.409262,12.046378,0.0002


There appears to be differences between bead_beating, biomass, and incubation with various metrics.