# Streamlined Extraction of Nucleic Acids and Metabolites from Low- and High-Biomass Samples Using Isopropanol and Matrix Tubes

## Technical replicate analysis

## Set up notebook environment
### Note: This notebook should be run in an environment with QIIME2 and the package gemelli installed. A folder called 'assets' is also needed, which should contain the assets provided in the same repo as this code.

In [4]:
import pandas as pd
import qiime2 as q2
import numpy as np
import plotnine as pn
from biom import Table, load_table
from qiime2.plugins.gemelli.actions import rpca
from qiime2.plugins.gemelli.actions import phylogenetic_rpca_with_taxonomy
from qiime2.plugins.diversity.actions import beta_phylogenetic
from qiime2.plugins.diversity.actions import beta
from qiime2.plugins.diversity.actions import alpha
from qiime2.plugins.feature_table.actions import rarefy
from skbio import DistanceMatrix

# Axis names for biom Table methods. `s` is passed to Table.ids() in the
# data-import cells; `o` is not referenced by any of the visible cells.
s="sample"
o="observation"

%matplotlib inline


# 16S high biomass

## Import data

In [7]:
# Unrarefied feature table; its biom view supplies the sample IDs used below
qza = q2.Artifact.load(
    "/matrix/data/16S/matrix_16s_deblur_gg2_biom_silva_noMit_noChl_noUnassigned_noEuk_noDomain_noControls_noSpike_hbm_noSingletons.qza"
)
bt = qza.view(Table)

# Rarefied counterpart of the same table (depth is encoded in the file name)
qza_rare = q2.Artifact.load(
    "/matrix/data/16S/matrix_16s_deblur_gg2_biom_silva_noMit_noChl_noUnassigned_noEuk_noDomain_noControls_noSpike_hbm_noSingletons_rar20636.qza"
)
bt_rare = qza_rare.view(Table)

# Sample metadata: first column becomes the index, which is named "sample_name"
md = pd.read_csv(
    "/matrix/metadata_samples/metadata_samples_qiita_20250311c.txt",
    sep='\t',
    index_col=0,
).rename_axis("sample_name")

# Phylogeny passed to the UniFrac and phylo-RPCA calls
tree_q2 = q2.Artifact.load("/databases/gg2/2024.09/2024.09.phylogeny.asv.nwk.qza")

# Feature taxonomy passed to the phylo-RPCA call
taxonomy_q2 = q2.Metadata.load('/matrix/data/16S/matrix_16s_deblur_gg2_seqs_taxonomy.tsv')

# Keep only the metadata rows whose sample is present in the feature table
# (the table itself is not filtered here)
bt_samples = set(bt.ids(s))
md = md.loc[md.index.isin(bt_samples)]


## Calculate distances


In [None]:
# Per-sample read threshold; used as min_sample_count for the RPCA-based
# methods, which run on the unrarefied table
rarefaction_depth = 20636

# Distance matrices keyed by metric name (insertion order is kept downstream)
dms = {
    # Jaccard on the rarefied table
    "jaccard": beta(table=qza_rare, metric="jaccard").distance_matrix.view(DistanceMatrix),
}

# Weighted, then unweighted UniFrac on the rarefied table
for unifrac_metric in ("weighted_unifrac", "unweighted_unifrac"):
    dms[unifrac_metric] = beta_phylogenetic(
        table=qza_rare,
        phylogeny=tree_q2,
        metric=unifrac_metric,
    ).distance_matrix.view(DistanceMatrix)

# RPCA on the unrarefied table
# NOTE(review): min_feature_count is not passed here (library default applies)
# whereas the phylo-RPCA call below sets it to 0 - confirm this is intended.
bplt_rpca, dm_rpca = rpca(
    table=qza,
    n_components=3,
    min_sample_count=rarefaction_depth,
    min_feature_frequency=0,
)
dms["RPCA"] = dm_rpca.view(DistanceMatrix)

# Phylo-RPCA on the unrarefied table, using the tree and taxonomy
(
    bplt_phylo_rpca,
    dm_phylo_rpca,
    node_tree,
    node_counts,
    tree_to_taxonomy,
) = phylogenetic_rpca_with_taxonomy(
    table=qza,
    phylogeny=tree_q2,
    taxonomy=taxonomy_q2,
    min_feature_count=0,
    min_feature_frequency=0,
    min_sample_count=rarefaction_depth,
)
dms["phylo_RPCA"] = dm_phylo_rpca.view(DistanceMatrix)



## Generate dataframes for plotting


In [None]:
# Columns that define a technical-replicate group: sample type + replicate ID
md_variable = md.loc[:, ["sample_type3", "unique_sample_id_tech_rep"]]

# One long-format table of within-replicate-group distances per metric
out_dfs = {}
for metric, dm in dms.items():
    df = (
        dm.to_data_frame()
        # square matrix -> one row per ordered (sample1, sample2) pair
        .reset_index()
        .melt(id_vars="index", var_name="sample2", value_name="value")
        .rename(columns={"index": "sample1"})
        # annotate sample1 with its replicate-group columns
        .merge(md_variable, left_on="sample1", right_index=True)
        .rename(columns={"sample_type3": "sample1_type",
                         "unique_sample_id_tech_rep": "sample1_unique_sample_id"})
        # annotate sample2 with its replicate-group columns
        .merge(md_variable, left_on="sample2", right_index=True)
        .rename(columns={"sample_type3": "sample2_type",
                         "unique_sample_id_tech_rep": "sample2_unique_sample_id"})
        # descriptive metadata, taken from sample1
        .merge(
            md.loc[:, ["sample_type", "sample_type1", "sample_type2", "sample_type3",
                       "biomass", "extraction_protocol_storage_solution", "host_subject_id"]],
            left_on="sample1",
            right_index=True,
        )
        # keep pairs of different samples that share type and replicate ID
        .query('sample1_type==sample2_type & sample1_unique_sample_id==sample2_unique_sample_id & sample1!=sample2')
    )
    out_dfs[metric] = df
    

## Export data frames


In [None]:
# Output file for each metric, in export order.
export_paths = {
    "jaccard": "/matrix/results/technical_replicate_distances/matrix_tech_rep_16S_hbm_jaccard.txt",
    "RPCA": "/matrix/results/technical_replicate_distances/matrix_tech_rep_16S_hbm_rpca.txt",
    "unweighted_unifrac": "/matrix/results/technical_replicate_distances/matrix_tech_rep_16S_hbm_unifrac.txt",
    "weighted_unifrac": "/matrix/results/technical_replicate_distances/matrix_tech_rep_16S_hbm_weighted_unifrac.txt",
    "phylo_RPCA": "/matrix/results/technical_replicate_distances/matrix_tech_rep_16S_hbm_phylo_rpca.txt",
}

for metric, out_path in export_paths.items():
    pairs = out_dfs[metric]
    # Each replicate pair appears twice in the melted symmetric matrix
    # (A-B and B-A). Keep the first row per *unordered* sample pair.
    # Deduplicating on the distance value instead would also discard
    # distinct pairs whose distances happen to be equal (e.g. Jaccard ties).
    pair_key = pd.DataFrame(np.sort(pairs[["sample1", "sample2"]].to_numpy(), axis=1))
    # Positional boolean mask, so the result does not depend on index labels.
    df = pairs.loc[~pair_key.duplicated().to_numpy()]
    df.to_csv(out_path, sep='\t', index=False)


# 16S low biomass

## Import data

In [7]:
# Unrarefied feature table; its biom view supplies the sample IDs used below
qza = q2.Artifact.load(
    "/matrix/data/16S/matrix_16s_deblur_gg2_biom_silva_noMit_noChl_noUnassigned_noEuk_noDomain_noControls_noSpike_lbm_noSingletons.qza"
)
bt = qza.view(Table)

# Rarefied counterpart of the same table (depth is encoded in the file name)
qza_rare = q2.Artifact.load(
    "/matrix/data/16S/matrix_16s_deblur_gg2_biom_silva_noMit_noChl_noUnassigned_noEuk_noDomain_noControls_noSpike_lbm_noSingletons_rar277.qza"
)
bt_rare = qza_rare.view(Table)

# Sample metadata: first column becomes the index, which is named "sample_name"
md = pd.read_csv(
    "/matrix/metadata_samples/metadata_samples_qiita_20250319e.txt",
    sep='\t',
    index_col=0,
).rename_axis("sample_name")

# Phylogeny passed to the UniFrac and phylo-RPCA calls
tree_q2 = q2.Artifact.load("/databases/gg2/2024.09/2024.09.phylogeny.asv.nwk.qza")

# Feature taxonomy passed to the phylo-RPCA call
taxonomy_q2 = q2.Metadata.load('/matrix/data/16S/matrix_16s_deblur_gg2_seqs_taxonomy.tsv')

# Keep only the metadata rows whose sample is present in the feature table
# (the table itself is not filtered here)
bt_samples = set(bt.ids(s))
md = md.loc[md.index.isin(bt_samples)]


## Calculate distances


In [None]:
# Per-sample read threshold; used as min_sample_count for the RPCA-based
# methods, which run on the unrarefied table
rarefaction_depth = 277

# Distance matrices keyed by metric name (insertion order is kept downstream)
dms = {
    # Jaccard on the rarefied table
    "jaccard": beta(table=qza_rare, metric="jaccard").distance_matrix.view(DistanceMatrix),
}

# Weighted, then unweighted UniFrac on the rarefied table
for unifrac_metric in ("weighted_unifrac", "unweighted_unifrac"):
    dms[unifrac_metric] = beta_phylogenetic(
        table=qza_rare,
        phylogeny=tree_q2,
        metric=unifrac_metric,
    ).distance_matrix.view(DistanceMatrix)

# RPCA on the unrarefied table
# NOTE(review): min_feature_count is not passed here (library default applies)
# whereas the phylo-RPCA call below sets it to 0 - confirm this is intended.
bplt_rpca, dm_rpca = rpca(
    table=qza,
    n_components=3,
    min_sample_count=rarefaction_depth,
    min_feature_frequency=0,
)
dms["RPCA"] = dm_rpca.view(DistanceMatrix)

# Phylo-RPCA on the unrarefied table, using the tree and taxonomy
(
    bplt_phylo_rpca,
    dm_phylo_rpca,
    node_tree,
    node_counts,
    tree_to_taxonomy,
) = phylogenetic_rpca_with_taxonomy(
    table=qza,
    phylogeny=tree_q2,
    taxonomy=taxonomy_q2,
    min_feature_count=0,
    min_feature_frequency=0,
    min_sample_count=rarefaction_depth,
)
dms["phylo_RPCA"] = dm_phylo_rpca.view(DistanceMatrix)



## Generate dataframes for plotting


In [None]:
# Columns that define a technical-replicate group: sample type + replicate ID
md_variable = md.loc[:, ["sample_type3", "unique_sample_id_tech_rep"]]

# One long-format table of within-replicate-group distances per metric
out_dfs = {}
for metric, dm in dms.items():
    df = (
        dm.to_data_frame()
        # square matrix -> one row per ordered (sample1, sample2) pair
        .reset_index()
        .melt(id_vars="index", var_name="sample2", value_name="value")
        .rename(columns={"index": "sample1"})
        # annotate sample1 with its replicate-group columns
        .merge(md_variable, left_on="sample1", right_index=True)
        .rename(columns={"sample_type3": "sample1_type",
                         "unique_sample_id_tech_rep": "sample1_unique_sample_id"})
        # annotate sample2 with its replicate-group columns
        .merge(md_variable, left_on="sample2", right_index=True)
        .rename(columns={"sample_type3": "sample2_type",
                         "unique_sample_id_tech_rep": "sample2_unique_sample_id"})
        # descriptive metadata, taken from sample1
        .merge(
            md.loc[:, ["sample_type", "sample_type1", "sample_type2", "sample_type3",
                       "biomass", "extraction_protocol_storage_solution", "host_subject_id"]],
            left_on="sample1",
            right_index=True,
        )
        # keep pairs of different samples that share type and replicate ID
        .query('sample1_type==sample2_type & sample1_unique_sample_id==sample2_unique_sample_id & sample1!=sample2')
    )
    out_dfs[metric] = df
    

## Export data frames


In [None]:
# Output file for each metric, in export order.
export_paths = {
    "jaccard": "/matrix/results/technical_replicate_distances/matrix_tech_rep_16S_lbm_jaccard.txt",
    "RPCA": "/matrix/results/technical_replicate_distances/matrix_tech_rep_16S_lbm_rpca.txt",
    "unweighted_unifrac": "/matrix/results/technical_replicate_distances/matrix_tech_rep_16S_lbm_unifrac.txt",
    "weighted_unifrac": "/matrix/results/technical_replicate_distances/matrix_tech_rep_16S_lbm_weighted_unifrac.txt",
    "phylo_RPCA": "/matrix/results/technical_replicate_distances/matrix_tech_rep_16S_lbm_phylo_rpca.txt",
}

for metric, out_path in export_paths.items():
    pairs = out_dfs[metric]
    # Each replicate pair appears twice in the melted symmetric matrix
    # (A-B and B-A). Keep the first row per *unordered* sample pair.
    # Deduplicating on the distance value instead would also discard
    # distinct pairs whose distances happen to be equal (e.g. Jaccard ties).
    pair_key = pd.DataFrame(np.sort(pairs[["sample1", "sample2"]].to_numpy(), axis=1))
    # Positional boolean mask, so the result does not depend on index labels.
    df = pairs.loc[~pair_key.duplicated().to_numpy()]
    df.to_csv(out_path, sep='\t', index=False)


# Shotgun high biomass

## Import data

In [7]:
# Unrarefied feature table; its biom view supplies the sample IDs used below
qza = q2.Artifact.load(
    "/matrix/data/shotgun/matrix_shotgun_wolr2pe_biom_hbm_noControls_noSpike_noSingletons.qza"
)
bt = qza.view(Table)

# Rarefied counterpart of the same table (depth is encoded in the file name)
qza_rare = q2.Artifact.load(
    "/matrix/data/shotgun/matrix_shotgun_wolr2pe_biom_hbm_noControls_noSpike_noSingletons_rar1515K.qza"
)
bt_rare = qza_rare.view(Table)

# Sample metadata: first column becomes the index, which is named "sample_name"
md = pd.read_csv(
    "/matrix/metadata_samples/metadata_samples_qiita_20250307.txt",
    sep='\t',
    index_col=0,
).rename_axis("sample_name")

# Phylogeny passed to the UniFrac and phylo-RPCA calls
tree_q2 = q2.Artifact.load("/databases/wol/wol_r2/wolr2_phylogeny.qza")

# Feature taxonomy passed to the phylo-RPCA call
taxonomy_q2 = q2.Metadata.load('/databases/wol/wol_r2/wolr2_taxonomy.txt')

# Keep only the metadata rows whose sample is present in the feature table
# (the table itself is not filtered here)
bt_samples = set(bt.ids(s))
md = md.loc[md.index.isin(bt_samples)]


## Calculate distances


In [None]:
# Per-sample read threshold; used as min_sample_count for the RPCA-based
# methods, which run on the unrarefied table
rarefaction_depth = 1515275

# Distance matrices keyed by metric name (insertion order is kept downstream)
dms = {
    # Jaccard on the rarefied table
    "jaccard": beta(table=qza_rare, metric="jaccard").distance_matrix.view(DistanceMatrix),
}

# Weighted, then unweighted UniFrac on the rarefied table
for unifrac_metric in ("weighted_unifrac", "unweighted_unifrac"):
    dms[unifrac_metric] = beta_phylogenetic(
        table=qza_rare,
        phylogeny=tree_q2,
        metric=unifrac_metric,
    ).distance_matrix.view(DistanceMatrix)

# RPCA on the unrarefied table
# NOTE(review): min_feature_count is not passed here (library default applies)
# whereas the phylo-RPCA call below sets it to 0 - confirm this is intended.
bplt_rpca, dm_rpca = rpca(
    table=qza,
    n_components=3,
    min_sample_count=rarefaction_depth,
    min_feature_frequency=0,
)
dms["RPCA"] = dm_rpca.view(DistanceMatrix)

# Phylo-RPCA on the unrarefied table, using the tree and taxonomy
(
    bplt_phylo_rpca,
    dm_phylo_rpca,
    node_tree,
    node_counts,
    tree_to_taxonomy,
) = phylogenetic_rpca_with_taxonomy(
    table=qza,
    phylogeny=tree_q2,
    taxonomy=taxonomy_q2,
    min_feature_count=0,
    min_feature_frequency=0,
    min_sample_count=rarefaction_depth,
)
dms["phylo_RPCA"] = dm_phylo_rpca.view(DistanceMatrix)



## Generate dataframes for plotting


In [None]:
# Columns that define a technical-replicate group: sample type + sample ID
md_variable = md.loc[:, ["sample_type3", "unique_sample_id"]]

# One long-format table of within-replicate-group distances per metric
out_dfs = {}
for metric, dm in dms.items():
    df = (
        dm.to_data_frame()
        # square matrix -> one row per ordered (sample1, sample2) pair
        .reset_index()
        .melt(id_vars="index", var_name="sample2", value_name="value")
        .rename(columns={"index": "sample1"})
        # annotate sample1 with its replicate-group columns
        .merge(md_variable, left_on="sample1", right_index=True)
        .rename(columns={"sample_type3": "sample1_type",
                         "unique_sample_id": "sample1_unique_sample_id"})
        # annotate sample2 with its replicate-group columns
        .merge(md_variable, left_on="sample2", right_index=True)
        .rename(columns={"sample_type3": "sample2_type",
                         "unique_sample_id": "sample2_unique_sample_id"})
        # descriptive metadata, taken from sample1
        .merge(
            md.loc[:, ["sample_type", "sample_type1", "sample_type2", "sample_type3",
                       "biomass", "extraction_protocol_storage_solution", "host_subject_id"]],
            left_on="sample1",
            right_index=True,
        )
        # keep pairs of different samples that share type and sample ID
        .query('sample1_type==sample2_type & sample1_unique_sample_id==sample2_unique_sample_id & sample1!=sample2')
    )
    out_dfs[metric] = df
    

## Export data frames


In [None]:
# Output file for each metric, in export order.
export_paths = {
    "jaccard": "/matrix/results/technical_replicate_distances/matrix_tech_rep_shotgun_hbm_jaccard.txt",
    "RPCA": "/matrix/results/technical_replicate_distances/matrix_tech_rep_shotgun_hbm_rpca.txt",
    "unweighted_unifrac": "/matrix/results/technical_replicate_distances/matrix_tech_rep_shotgun_hbm_unifrac.txt",
    "weighted_unifrac": "/matrix/results/technical_replicate_distances/matrix_tech_rep_shotgun_hbm_weighted_unifrac.txt",
    "phylo_RPCA": "/matrix/results/technical_replicate_distances/matrix_tech_rep_shotgun_hbm_phylo_rpca.txt",
}

for metric, out_path in export_paths.items():
    pairs = out_dfs[metric]
    # Each replicate pair appears twice in the melted symmetric matrix
    # (A-B and B-A). Keep the first row per *unordered* sample pair.
    # Deduplicating on the distance value instead would also discard
    # distinct pairs whose distances happen to be equal (e.g. Jaccard ties).
    pair_key = pd.DataFrame(np.sort(pairs[["sample1", "sample2"]].to_numpy(), axis=1))
    # Positional boolean mask, so the result does not depend on index labels.
    df = pairs.loc[~pair_key.duplicated().to_numpy()]
    df.to_csv(out_path, sep='\t', index=False)


# Shotgun low biomass

## Import data

In [7]:
# Unrarefied feature table; its biom view supplies the sample IDs used below
qza = q2.Artifact.load(
    "/matrix/data/shotgun/matrix_shotgun_wolr2pe_biom_lbm_noControls_noSpike_noSingletons.qza"
)
bt = qza.view(Table)

# Rarefied counterpart of the same table (depth is encoded in the file name)
qza_rare = q2.Artifact.load(
    "/matrix/data/shotgun/matrix_shotgun_wolr2pe_biom_lbm_noControls_noSpike_noSingletons_rar55K.qza"
)
bt_rare = qza_rare.view(Table)

# Sample metadata: first column becomes the index, which is named "sample_name"
md = pd.read_csv(
    "/matrix/metadata_samples/metadata_samples_qiita_20250319e.txt",
    sep='\t',
    index_col=0,
).rename_axis("sample_name")

# Phylogeny passed to the UniFrac and phylo-RPCA calls
tree_q2 = q2.Artifact.load("/databases/wol/wol_r2/wolr2_phylogeny.qza")

# Feature taxonomy passed to the phylo-RPCA call
taxonomy_q2 = q2.Metadata.load('/databases/wol/wol_r2/wolr2_taxonomy.txt')

# Keep only the metadata rows whose sample is present in the feature table
# (the table itself is not filtered here)
bt_samples = set(bt.ids(s))
md = md.loc[md.index.isin(bt_samples)]


## Calculate distances


In [None]:
# Per-sample read threshold; used as min_sample_count for the RPCA-based
# methods, which run on the unrarefied table
rarefaction_depth = 55892

# Distance matrices keyed by metric name (insertion order is kept downstream)
dms = {
    # Jaccard on the rarefied table
    "jaccard": beta(table=qza_rare, metric="jaccard").distance_matrix.view(DistanceMatrix),
}

# Weighted, then unweighted UniFrac on the rarefied table
for unifrac_metric in ("weighted_unifrac", "unweighted_unifrac"):
    dms[unifrac_metric] = beta_phylogenetic(
        table=qza_rare,
        phylogeny=tree_q2,
        metric=unifrac_metric,
    ).distance_matrix.view(DistanceMatrix)

# RPCA on the unrarefied table
# NOTE(review): min_feature_count is not passed here (library default applies)
# whereas the phylo-RPCA call below sets it to 0 - confirm this is intended.
bplt_rpca, dm_rpca = rpca(
    table=qza,
    n_components=3,
    min_sample_count=rarefaction_depth,
    min_feature_frequency=0,
)
dms["RPCA"] = dm_rpca.view(DistanceMatrix)

# Phylo-RPCA on the unrarefied table, using the tree and taxonomy
(
    bplt_phylo_rpca,
    dm_phylo_rpca,
    node_tree,
    node_counts,
    tree_to_taxonomy,
) = phylogenetic_rpca_with_taxonomy(
    table=qza,
    phylogeny=tree_q2,
    taxonomy=taxonomy_q2,
    min_feature_count=0,
    min_feature_frequency=0,
    min_sample_count=rarefaction_depth,
)
dms["phylo_RPCA"] = dm_phylo_rpca.view(DistanceMatrix)



## Generate dataframes for plotting


In [None]:
# Columns that define a technical-replicate group: sample type + sample ID
md_variable = md.loc[:, ["sample_type3", "unique_sample_id"]]

# One long-format table of within-replicate-group distances per metric
out_dfs = {}
for metric, dm in dms.items():
    df = (
        dm.to_data_frame()
        # square matrix -> one row per ordered (sample1, sample2) pair
        .reset_index()
        .melt(id_vars="index", var_name="sample2", value_name="value")
        .rename(columns={"index": "sample1"})
        # annotate sample1 with its replicate-group columns
        .merge(md_variable, left_on="sample1", right_index=True)
        .rename(columns={"sample_type3": "sample1_type",
                         "unique_sample_id": "sample1_unique_sample_id"})
        # annotate sample2 with its replicate-group columns
        .merge(md_variable, left_on="sample2", right_index=True)
        .rename(columns={"sample_type3": "sample2_type",
                         "unique_sample_id": "sample2_unique_sample_id"})
        # descriptive metadata, taken from sample1
        .merge(
            md.loc[:, ["sample_type", "sample_type1", "sample_type2", "sample_type3",
                       "biomass", "extraction_protocol_storage_solution", "host_subject_id"]],
            left_on="sample1",
            right_index=True,
        )
        # keep pairs of different samples that share type and sample ID
        .query('sample1_type==sample2_type & sample1_unique_sample_id==sample2_unique_sample_id & sample1!=sample2')
    )
    out_dfs[metric] = df
    

## Export data frames


In [None]:
# Output file for each metric, in export order.
export_paths = {
    "jaccard": "/matrix/results/technical_replicate_distances/matrix_tech_rep_shotgun_lbm_jaccard.txt",
    "RPCA": "/matrix/results/technical_replicate_distances/matrix_tech_rep_shotgun_lbm_rpca.txt",
    "unweighted_unifrac": "/matrix/results/technical_replicate_distances/matrix_tech_rep_shotgun_lbm_unifrac.txt",
    "weighted_unifrac": "/matrix/results/technical_replicate_distances/matrix_tech_rep_shotgun_lbm_weighted_unifrac.txt",
    "phylo_RPCA": "/matrix/results/technical_replicate_distances/matrix_tech_rep_shotgun_lbm_phylo_rpca.txt",
}

for metric, out_path in export_paths.items():
    pairs = out_dfs[metric]
    # Each replicate pair appears twice in the melted symmetric matrix
    # (A-B and B-A). Keep the first row per *unordered* sample pair.
    # Deduplicating on the distance value instead would also discard
    # distinct pairs whose distances happen to be equal (e.g. Jaccard ties).
    pair_key = pd.DataFrame(np.sort(pairs[["sample1", "sample2"]].to_numpy(), axis=1))
    # Positional boolean mask, so the result does not depend on index labels.
    df = pairs.loc[~pair_key.duplicated().to_numpy()]
    df.to_csv(out_path, sep='\t', index=False)


# Metabolomics

## Import data

In [7]:
# Feature table; its biom view supplies the sample IDs used below
qza = q2.Artifact.load(
    "/matrix/data/lcms/matrix_lcms_merged_biom_qiita_ids_noSingletons.qza"
)
bt = qza.view(Table)

# Sample metadata: first column becomes the index, which is named "sample_name"
md = pd.read_csv(
    "/matrix/metadata_samples/metadata_samples_qiita_20250307.txt",
    sep='\t',
    index_col=0,
).rename_axis("sample_name")

# Keep only the metadata rows whose sample is present in the feature table
# (the table itself is not filtered here)
bt_samples = set(bt.ids(s))
md = md.loc[md.index.isin(bt_samples)]


## Calculate distances


In [None]:
# Distance matrices keyed by metric name (insertion order is kept downstream)
dms = {}

# Jaccard, cosine and Canberra-Adkins, all computed on the same table
for beta_metric in ("jaccard", "cosine", "canberra_adkins"):
    dms[beta_metric] = beta(
        table=qza,
        metric=beta_metric,
    ).distance_matrix.view(DistanceMatrix)

# RPCA with no minimum per-sample count and no feature-frequency filter
bplt_rpca, dm_rpca = rpca(
    table=qza,
    n_components=3,
    min_sample_count=0,
    min_feature_frequency=0,
)
dms["RPCA"] = dm_rpca.view(DistanceMatrix)



## Generate dataframes for plotting


In [None]:
# Columns that define a technical-replicate group: sample type + sample ID
md_variable = md.loc[:, ["sample_type3", "unique_sample_id"]]

# One long-format table of within-replicate-group distances per metric
out_dfs = {}
for metric, dm in dms.items():
    df = (
        dm.to_data_frame()
        # square matrix -> one row per ordered (sample1, sample2) pair
        .reset_index()
        .melt(id_vars="index", var_name="sample2", value_name="value")
        .rename(columns={"index": "sample1"})
        # annotate sample1 with its replicate-group columns
        .merge(md_variable, left_on="sample1", right_index=True)
        .rename(columns={"sample_type3": "sample1_type",
                         "unique_sample_id": "sample1_unique_sample_id"})
        # annotate sample2 with its replicate-group columns
        .merge(md_variable, left_on="sample2", right_index=True)
        .rename(columns={"sample_type3": "sample2_type",
                         "unique_sample_id": "sample2_unique_sample_id"})
        # descriptive metadata, taken from sample1
        .merge(
            md.loc[:, ["sample_type", "sample_type1", "sample_type2", "sample_type3",
                       "biomass", "extraction_protocol_storage_solution", "host_subject_id"]],
            left_on="sample1",
            right_index=True,
        )
        # keep pairs of different samples that share type and sample ID
        .query('sample1_type==sample2_type & sample1_unique_sample_id==sample2_unique_sample_id & sample1!=sample2')
    )
    out_dfs[metric] = df
    

## Export data frames


In [None]:
# Output file for each metric, in export order.
export_paths = {
    "jaccard": "/matrix/results/technical_replicate_distances/matrix_tech_rep_metab_jaccard.txt",
    "RPCA": "/matrix/results/technical_replicate_distances/matrix_tech_rep_metab_rpca.txt",
    "cosine": "/matrix/results/technical_replicate_distances/matrix_tech_rep_metab_cosine.txt",
    "canberra_adkins": "/matrix/results/technical_replicate_distances/matrix_tech_rep_metab_canberra_adkins.txt",
}

for metric, out_path in export_paths.items():
    pairs = out_dfs[metric]
    # Each replicate pair appears twice in the melted symmetric matrix
    # (A-B and B-A). Keep the first row per *unordered* sample pair.
    # Deduplicating on the distance value instead would also discard
    # distinct pairs whose distances happen to be equal (e.g. Jaccard ties).
    pair_key = pd.DataFrame(np.sort(pairs[["sample1", "sample2"]].to_numpy(), axis=1))
    # Positional boolean mask, so the result does not depend on index labels.
    df = pairs.loc[~pair_key.duplicated().to_numpy()]
    df.to_csv(out_path, sep='\t', index=False)


# Proceed to plotting in R