# Streamlined Extraction of Nucleic Acids and Metabolites from Low- and High-Biomass Samples Using Isopropanol and Matrix Tubes

## 16S Mantel tests

## Set up notebook environment
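### Note: This notebook must be run in an environment that has QIIME 2 and the `gemelli` package installed. It also needs a folder called `assets`, importable from the notebook's working directory, containing the assets provided in the same repository as this code (including the `util_updated_again` and `step_wise_anova` helper modules imported below).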


In [20]:
import os
import biom
import warnings
import pickle
import numpy as np
import pandas as pd
import qiime2 as q2
from biom import Table
from skbio import OrdinationResults
from skbio.stats import subsample_counts
from skbio.stats.distance import permanova, anosim, mantel
from skbio.stats.distance import DistanceMatrix
from qiime2.plugins.gemelli.actions import rpca
from qiime2.plugins.gemelli.actions import phylogenetic_rpca_with_taxonomy
from qiime2.plugins.feature_table.actions import rarefy
from qiime2.plugins.diversity.actions import beta_group_significance
from qiime2.plugins.emperor.actions import biplot, plot
from qiime2.plugins.diversity.actions import (beta,
                                              beta_phylogenetic,
                                              pcoa)
from qiime2.plugins import demux, deblur, quality_filter, \
                           metadata, feature_table, alignment, \
                           phylogeny, diversity, emperor, feature_classifier, \
                           taxa, composition

from assets.step_wise_anova import run_stepwise_anova
from qiime2.plugins.fragment_insertion.actions import filter_features
warnings.filterwarnings("ignore", category=DeprecationWarning)

# helper functions
from assets.util_updated_again import (mantel_matched, simulate_depth,
                        all_dists, all_dists_no_tree, nested_permanova)

# plotting
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

plt.style.use('ggplot')
%matplotlib inline


# Subset files and make paired files

In [5]:
# Load the tab-separated sample metadata table into a DataFrame
metadata_samples_path = '/matrix/metadata_samples/metadata_samples_qiita_20250205.txt'
md = pd.read_csv(metadata_samples_path, sep='\t')


In [7]:
# Split the sample metadata into one table per extraction protocol
# ('MagMax', 'Matrix') and one per storage solution ('etoh', 'isopropanol').
md_plate = md.loc[md['extraction_protocol'].eq('MagMax')]
md_matrix = md.loc[md['extraction_protocol'].eq('Matrix')]
md_etoh = md.loc[md['storage_solution'].eq('etoh')]
md_isop = md.loc[md['storage_solution'].eq('isopropanol')]


In [8]:
# Stack the per-group subsets row-wise into one paired table per comparison:
# MagMax rows followed by Matrix rows, and etoh rows followed by isopropanol rows.
protocol_subsets = [md_plate, md_matrix]
storage_subsets = [md_etoh, md_isop]
md_compare_protocols = pd.concat(protocol_subsets)
md_compare_storage = pd.concat(storage_subsets)


In [10]:
# Export paired files
# Both paired tables are written as tab-separated text without the pandas
# row index. The output folder is created first so that the export does not
# fail on a machine where '00_mantel' has not been made by hand yet.
os.makedirs('/matrix/data/16S/00_mantel', exist_ok=True)

md_compare_protocols.to_csv('/matrix/data/16S/00_mantel/metadata_samples_mantel_protocols.txt',
                            sep='\t',
                            index=False)

md_compare_storage.to_csv('/matrix/data/16S/00_mantel/metadata_samples_mantel_storage.txt',
                          sep='\t',
                          index=False)


# Mantel tests (extraction_protocol) Matrix vs. MagMax

In [25]:
# Load every input used by the Mantel-test cells.

# Paired metadata exported above. From here on `md_compare_protocols` and
# `md_compare_storage` are q2.Metadata objects, no longer pandas DataFrames.
protocols_md_path = '/matrix/data/16S/00_mantel/metadata_samples_mantel_protocols.txt'
storage_md_path = '/matrix/data/16S/00_mantel/metadata_samples_mantel_storage.txt'
md_compare_protocols = q2.Metadata.load(protocols_md_path)
md_compare_storage = q2.Metadata.load(storage_md_path)

# Filtered 16S feature tables, one artifact for the 'hbm' and one for the 'lbm' samples
table_16S_hbm_path = '/matrix/data/16S/matrix_16s_deblur_gg2_biom_silva_noMit_noChl_noUnassigned_noEuk_noDomain_noControls_noSpike_hbm_noSingletons.qza'
table_16S_lbm_path = '/matrix/data/16S/matrix_16s_deblur_gg2_biom_silva_noMit_noChl_noUnassigned_noEuk_noDomain_noControls_noSpike_lbm_noSingletons.qza'
table_16S_hbm = q2.Artifact.load(table_16S_hbm_path)
table_16S_lbm = q2.Artifact.load(table_16S_lbm_path)

# Phylogeny (artifact) and taxonomy (loaded as metadata) passed to `all_dists`
tree_16S_path = '/databases/gg2/2024.09/2024.09.phylogeny.asv.nwk.qza'
taxonomy_16S_path = '/matrix/data/16S/matrix_16s_deblur_gg2_seqs_taxonomy.tsv'
tree_16S = q2.Artifact.load(tree_16S_path)
taxonomy_16S = q2.Metadata.load(taxonomy_16S_path)


In [13]:
# Matrix vs. Plate
## LBM
### Filter table
# Keep only the samples present in both the LBM feature table and the
# protocol-comparison metadata, then drop features left without any counts.
table_16S_lbm_biom = table_16S_lbm.view(Table)
md_compare_protocols_lbm = md_compare_protocols.to_dataframe()
# Sorted so that the sample order does not depend on set iteration order,
# which is not guaranteed to be the same from one kernel session to the next.
shared_ = sorted(set(table_16S_lbm_biom.ids()) & set(md_compare_protocols_lbm.index))
md_compare_protocols_lbm = md_compare_protocols_lbm.reindex(shared_)
# biom's Table.filter works in place unless told otherwise; `inplace=False`
# keeps `table_16S_lbm_biom` as the unfiltered table instead of silently
# turning it into an alias of the filtered one.
table_16S_lbm_biom_protocols = table_16S_lbm_biom.filter(shared_, inplace=False)
keep_ = table_16S_lbm_biom_protocols.ids('observation')[table_16S_lbm_biom_protocols.sum('observation') > 0]
table_16S_lbm_biom_protocols = table_16S_lbm_biom_protocols.filter(keep_, axis='observation', inplace=False)

## Import filtered table and re-indexed metadata file
table_16S_lbm_protocols = q2.Artifact.import_data('FeatureTable[Frequency]', table_16S_lbm_biom_protocols)
md_compare_protocols_lbm_q2 = q2.Metadata(md_compare_protocols_lbm)

## Generate distance matrices using 'all_dists' utils
# NOTE(review): no random seed is set in this cell. If `all_dists` rarefies
# and `mantel_matched` permutes (presumably, given the rarefaction depth
# argument) the numbers will differ slightly between runs - confirm.
rare_depth_16S_lbm = 277
dists_res_16S_lbm = all_dists(table_16S_lbm_protocols, rare_depth_16S_lbm, tree_16S, taxonomy_16S)

## Column names handed to 'mantel_matched'
md_lbm_protocols_dist_q2 = md_compare_protocols_lbm_q2.to_dataframe().copy()
# NOTE(review): the `ids` column is not created here, so it has to be present
# in the metadata file already. A disabled snippet used to build
# 'sample_name_mantel_protocol' by dropping the last two '.'-separated fields
# of each sample name.
grouping = 'extraction_protocol'
ids = 'sample_name_mantel_protocol'

## Run Mantel test for each distance matrix
mantel_res_16S_lbm = {}
for metric_, dist_mantel in dists_res_16S_lbm.items():
    # Align the metadata rows with the samples in this distance matrix
    dist_mantel = dist_mantel.distance_matrix.view(DistanceMatrix)
    md_lbm_protocols_dist_q2_sub = md_lbm_protocols_dist_q2.reindex(dist_mantel.ids)
    # corr, p, n
    mantel_res_16S_lbm[metric_] = mantel_matched(dist_mantel,
                                                 md_lbm_protocols_dist_q2_sub,
                                                 grouping,
                                                 ids)

## Compile
# One column per distance metric, rows labelled 'corr', 'p' and 'n'
mantel_res_16S_lbm = pd.DataFrame(mantel_res_16S_lbm,
                                  index=['corr', 'p', 'n'])
# Create the results folder if needed so the export cannot fail on a fresh setup
mantel_out_path = '/matrix/results/mantel/mantel_16s_lbm_protocols.txt'
os.makedirs(os.path.dirname(mantel_out_path), exist_ok=True)
mantel_res_16S_lbm.to_csv(mantel_out_path, sep='\t')
mantel_res_16S_lbm


1928 x 143 <class 'biom.table.Table'> with 13577 nonzero entries (4% dense)

In [14]:
# Matrix vs. Plate
## HBM
### Filter table
# Keep only the samples present in both the HBM feature table and the
# protocol-comparison metadata, then drop features left without any counts.
table_16S_hbm_biom = table_16S_hbm.view(Table)
md_compare_protocols_hbm = md_compare_protocols.to_dataframe()
# Sorted so that the sample order does not depend on set iteration order,
# which is not guaranteed to be the same from one kernel session to the next.
shared_ = sorted(set(table_16S_hbm_biom.ids()) & set(md_compare_protocols_hbm.index))
md_compare_protocols_hbm = md_compare_protocols_hbm.reindex(shared_)
# biom's Table.filter works in place unless told otherwise; `inplace=False`
# keeps `table_16S_hbm_biom` as the unfiltered table instead of silently
# turning it into an alias of the filtered one.
table_16S_hbm_biom_protocols = table_16S_hbm_biom.filter(shared_, inplace=False)
keep_ = table_16S_hbm_biom_protocols.ids('observation')[table_16S_hbm_biom_protocols.sum('observation') > 0]
table_16S_hbm_biom_protocols = table_16S_hbm_biom_protocols.filter(keep_, axis='observation', inplace=False)

## Import filtered table and re-indexed metadata file
table_16S_hbm_protocols = q2.Artifact.import_data('FeatureTable[Frequency]', table_16S_hbm_biom_protocols)
md_compare_protocols_hbm_q2 = q2.Metadata(md_compare_protocols_hbm)

## Generate distance matrices using 'all_dists' utils
# NOTE(review): no random seed is set in this cell. If `all_dists` rarefies
# and `mantel_matched` permutes (presumably, given the rarefaction depth
# argument) the numbers will differ slightly between runs - confirm.
rare_depth_16S_hbm = 20636
dists_res_16S_hbm = all_dists(table_16S_hbm_protocols, rare_depth_16S_hbm, tree_16S, taxonomy_16S)

## Column names handed to 'mantel_matched'
md_hbm_protocols_dist_q2 = md_compare_protocols_hbm_q2.to_dataframe().copy()
# NOTE(review): the `ids` column is not created here, so it has to be present
# in the metadata file already. A disabled snippet used to build
# 'sample_name_mantel_protocol' by dropping the last two '.'-separated fields
# of each sample name.
grouping = 'extraction_protocol'
ids = 'sample_name_mantel_protocol'

## Run Mantel test for each distance matrix
mantel_res_16S_hbm = {}
for metric_, dist_mantel in dists_res_16S_hbm.items():
    # Align the metadata rows with the samples in this distance matrix
    dist_mantel = dist_mantel.distance_matrix.view(DistanceMatrix)
    md_hbm_protocols_dist_q2_sub = md_hbm_protocols_dist_q2.reindex(dist_mantel.ids)
    # corr, p, n
    mantel_res_16S_hbm[metric_] = mantel_matched(dist_mantel,
                                                 md_hbm_protocols_dist_q2_sub,
                                                 grouping,
                                                 ids)

## Compile
# One column per distance metric, rows labelled 'corr', 'p' and 'n'
mantel_res_16S_hbm = pd.DataFrame(mantel_res_16S_hbm,
                                  index=['corr', 'p', 'n'])
# Create the results folder if needed so the export cannot fail on a fresh setup
mantel_out_path = '/matrix/results/mantel/mantel_16s_hbm_protocols.txt'
os.makedirs(os.path.dirname(mantel_out_path), exist_ok=True)
mantel_res_16S_hbm.to_csv(mantel_out_path, sep='\t')
mantel_res_16S_hbm


In [None]:
# EtOH vs. Isopropanol
## LBM
### Filter table
# Keep only the samples present in both the LBM feature table and the
# storage-comparison metadata, then drop features left without any counts.
table_16S_lbm_biom = table_16S_lbm.view(Table)
md_compare_storage_lbm = md_compare_storage.to_dataframe()
# Sorted so that the sample order does not depend on set iteration order,
# which is not guaranteed to be the same from one kernel session to the next.
shared_ = sorted(set(table_16S_lbm_biom.ids()) & set(md_compare_storage_lbm.index))
md_compare_storage_lbm = md_compare_storage_lbm.reindex(shared_)
# biom's Table.filter works in place unless told otherwise; `inplace=False`
# keeps `table_16S_lbm_biom` as the unfiltered table instead of silently
# turning it into an alias of the filtered one.
table_16S_lbm_biom_storage = table_16S_lbm_biom.filter(shared_, inplace=False)
keep_ = table_16S_lbm_biom_storage.ids('observation')[table_16S_lbm_biom_storage.sum('observation') > 0]
table_16S_lbm_biom_storage = table_16S_lbm_biom_storage.filter(keep_, axis='observation', inplace=False)

## Import filtered table and re-indexed metadata file
table_16S_lbm_storage = q2.Artifact.import_data('FeatureTable[Frequency]', table_16S_lbm_biom_storage)
md_compare_storage_lbm_q2 = q2.Metadata(md_compare_storage_lbm)

## Generate distance matrices using 'all_dists' utils
# NOTE(review): no random seed is set in this cell. If `all_dists` rarefies
# and `mantel_matched` permutes (presumably, given the rarefaction depth
# argument) the numbers will differ slightly between runs - confirm.
# NOTE(review): `dists_res_16S_lbm` and `mantel_res_16S_lbm` are also assigned
# by the Matrix vs. Plate LBM cell; running this cell replaces those values.
rare_depth_16S_lbm = 277
dists_res_16S_lbm = all_dists(table_16S_lbm_storage, rare_depth_16S_lbm, tree_16S, taxonomy_16S)

## Column names handed to 'mantel_matched'
md_lbm_storage_dist_q2 = md_compare_storage_lbm_q2.to_dataframe().copy()
# NOTE(review): the `ids` column ('sample_name_mantel_solution') is not
# created here, so it has to be present in the metadata file already. A
# disabled snippet used to build a differently named column
# ('sample_name_mantel_protocol') by dropping the last two '.'-separated
# fields of each sample name.
grouping = 'storage_solution'
ids = 'sample_name_mantel_solution'

## Run Mantel test for each distance matrix
mantel_res_16S_lbm = {}
for metric_, dist_mantel in dists_res_16S_lbm.items():
    # Align the metadata rows with the samples in this distance matrix
    dist_mantel = dist_mantel.distance_matrix.view(DistanceMatrix)
    md_lbm_storage_dist_q2_sub = md_lbm_storage_dist_q2.reindex(dist_mantel.ids)
    # corr, p, n
    mantel_res_16S_lbm[metric_] = mantel_matched(dist_mantel,
                                                 md_lbm_storage_dist_q2_sub,
                                                 grouping,
                                                 ids)

## Compile
# One column per distance metric, rows labelled 'corr', 'p' and 'n'
mantel_res_16S_lbm = pd.DataFrame(mantel_res_16S_lbm,
                                  index=['corr', 'p', 'n'])
# Create the results folder if needed so the export cannot fail on a fresh setup
mantel_out_path = '/matrix/results/mantel/mantel_16s_lbm_storage.txt'
os.makedirs(os.path.dirname(mantel_out_path), exist_ok=True)
mantel_res_16S_lbm.to_csv(mantel_out_path, sep='\t')
mantel_res_16S_lbm


In [None]:
# EtOH vs. Isopropanol
## HBM
### Filter table
# Keep only the samples present in both the HBM feature table and the
# storage-comparison metadata, then drop features left without any counts.
table_16S_hbm_biom = table_16S_hbm.view(Table)
md_compare_storage_hbm = md_compare_storage.to_dataframe()
# Sorted so that the sample order does not depend on set iteration order,
# which is not guaranteed to be the same from one kernel session to the next.
shared_ = sorted(set(table_16S_hbm_biom.ids()) & set(md_compare_storage_hbm.index))
md_compare_storage_hbm = md_compare_storage_hbm.reindex(shared_)
# biom's Table.filter works in place unless told otherwise; `inplace=False`
# keeps `table_16S_hbm_biom` as the unfiltered table instead of silently
# turning it into an alias of the filtered one.
table_16S_hbm_biom_storage = table_16S_hbm_biom.filter(shared_, inplace=False)
keep_ = table_16S_hbm_biom_storage.ids('observation')[table_16S_hbm_biom_storage.sum('observation') > 0]
table_16S_hbm_biom_storage = table_16S_hbm_biom_storage.filter(keep_, axis='observation', inplace=False)

## Import filtered table and re-indexed metadata file
table_16S_hbm_storage = q2.Artifact.import_data('FeatureTable[Frequency]', table_16S_hbm_biom_storage)
md_compare_storage_hbm_q2 = q2.Metadata(md_compare_storage_hbm)

## Generate distance matrices using 'all_dists' utils
# NOTE(review): no random seed is set in this cell. If `all_dists` rarefies
# and `mantel_matched` permutes (presumably, given the rarefaction depth
# argument) the numbers will differ slightly between runs - confirm.
# NOTE(review): `dists_res_16S_hbm` and `mantel_res_16S_hbm` are also assigned
# by the Matrix vs. Plate HBM cell; running this cell replaces those values.
rare_depth_16S_hbm = 20636
dists_res_16S_hbm = all_dists(table_16S_hbm_storage, rare_depth_16S_hbm, tree_16S, taxonomy_16S)

## Column names handed to 'mantel_matched'
md_hbm_storage_dist_q2 = md_compare_storage_hbm_q2.to_dataframe().copy()
# NOTE(review): the `ids` column ('sample_name_mantel_solution') is not
# created here, so it has to be present in the metadata file already. A
# disabled snippet used to build a differently named column
# ('sample_name_mantel_protocol') by dropping the last two '.'-separated
# fields of each sample name.
grouping = 'storage_solution'
ids = 'sample_name_mantel_solution'

## Run Mantel test for each distance matrix
mantel_res_16S_hbm = {}
for metric_, dist_mantel in dists_res_16S_hbm.items():
    # Align the metadata rows with the samples in this distance matrix
    dist_mantel = dist_mantel.distance_matrix.view(DistanceMatrix)
    md_hbm_storage_dist_q2_sub = md_hbm_storage_dist_q2.reindex(dist_mantel.ids)
    # corr, p, n
    mantel_res_16S_hbm[metric_] = mantel_matched(dist_mantel,
                                                 md_hbm_storage_dist_q2_sub,
                                                 grouping,
                                                 ids)

## Compile
# One column per distance metric, rows labelled 'corr', 'p' and 'n'
mantel_res_16S_hbm = pd.DataFrame(mantel_res_16S_hbm,
                                  index=['corr', 'p', 'n'])
# Create the results folder if needed so the export cannot fail on a fresh setup
mantel_out_path = '/matrix/results/mantel/mantel_16s_hbm_storage.txt'
os.makedirs(os.path.dirname(mantel_out_path), exist_ok=True)
mantel_res_16S_hbm.to_csv(mantel_out_path, sep='\t')
mantel_res_16S_hbm
