# Set up notebook environment
## NOTE: Use a QIIME2 kernel

In [1]:
import pandas as pd
import qiime2 as q2
import numpy as np
import plotnine as pn
from biom import Table, load_table
from qiime2.plugins.deicode.actions import rpca
from qiime2.plugins.diversity.actions import beta_phylogenetic
from qiime2.plugins.diversity.actions import beta
from qiime2.plugins.diversity.actions import alpha
from qiime2.plugins.feature_table.actions import rarefy
from skbio import DistanceMatrix

# Short aliases for the two BIOM table axis names.
# `s` is passed to Table.ids() in the "Filter samples" steps below;
# `o` is not referenced in the cells shown in this notebook.
s="sample"
o="observation"

%matplotlib inline


In [2]:
# IPython automagic: change the kernel's working directory to the project folder.
# NOTE(review): this absolute path is machine-specific.
cd /Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/


/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02


# 16S - high biomass

## Create function for making taxonomy files human readable

In [5]:
def add_taxsplit(taxdf):
    """Add one column per taxonomic rank, parsed from ``taxdf.Taxon``.

    For each rank prefix (``k__`` ... ``s__``) the text between the prefix
    and the next ``;`` is copied into the matching column (``kingdom`` ...
    ``species``). A rank column that already exists is left untouched.

    NOTE: this notebook defines ``add_taxsplit`` once per section; keep the
    copies identical (or define it a single time).

    Parameters
    ----------
    taxdf : pandas.DataFrame
        Table with a ``Taxon`` column of ``;``-separated lineage strings.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``taxdf`` object. A cell is NaN when the lineage is missing
        (NaN, None or any other non-string), lacks the rank prefix, or has
        an empty label for that rank.
    """
    rank_columns = (('k__', 'kingdom'), ('p__', 'phylum'), ('c__', 'class'),
                    ('o__', 'order'), ('f__', 'family'), ('g__', 'genus'),
                    ('s__', 'species'))

    def tax_split(tax_id, tax_level):
        # Text after the rank prefix, up to the next ';'.
        return tax_id.split(tax_level)[1].split(';')[0]

    for level, lname in rank_columns:
        if lname in taxdf.columns:
            continue
        taxonomy_tmp = []
        for tax in taxdf.Taxon:
            # isinstance() rather than `tax is not np.nan`: a None, or a NaN
            # that is not the np.nan singleton, passes the identity test and
            # then makes `level in tax` raise TypeError.
            if isinstance(tax, str) and level in tax:
                label = tax_split(tax, level)
            else:
                label = ''
            taxonomy_tmp.append(label if label else np.nan)
        taxdf[lname] = taxonomy_tmp
    return taxdf


## Import data

In [6]:
# Load every input of the 16S high-biomass analysis. All paths hang off one
# project root so the notebook can be pointed at another copy of the project
# by editing a single line.
project_dir = "/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02"
data_dir = project_dir + "/data/16S"
table_stem = "dna_bothPS_16S_deblur_biom_lod_noChl_noMit_sepp_gg_noNTCs_noMock_hbm"

# Feature tables: filtered table and its rarefied counterpart
# (the file name indicates a depth of 12690).
qza = q2.Artifact.load(data_dir + "/10_filtered_data/" + table_stem + ".qza")
bt = qza.view(Table)
qza_rare = q2.Artifact.load(data_dir + "/11_normalized_data/" + table_stem + "_rar12690.qza")
bt_rare = qza_rare.view(Table)

# Sample metadata, indexed by the first column of the file
md = pd.read_csv(project_dir + "/sample_metadata/12201_metadata.txt", sep='\t', index_col=0)
md.index.name = "sample_name"

# Taxonomy, expanded to one column per rank
# NOTE(review): add_taxsplit looks for a `k__` prefix; SILVA 138 lineages
# usually label the top rank `d__`, so `kingdom` may be all-NaN - confirm.
tax_q2 = q2.Artifact.load(data_dir + "/06_taxonomy/dna_all_16S_deblur_seqs_taxonomy_silva138.qza")
tax_df = pd.DataFrame(tax_q2.view(pd.Series))
tax_df = add_taxsplit(tax_df)

# Phylogeny used by the UniFrac metrics
tree_q2 = q2.Artifact.load(data_dir + "/09_fragment_insertion/dna_all_16S_deblur_seqs_noChl_noMit_tree_gg.qza")

# Keep only the metadata rows whose sample is present in the filtered table
bt_samples = set(bt.ids(s))
md = md[md.index.isin(bt_samples)]


## Calculate distances

In [8]:
# Read depth for this section; passed to RPCA below as its minimum
# per-sample count.
rarefaction_depth = 12690

# Jaccard on the rarefied table
dms = {
    "jaccard": beta(table=qza_rare, metric="jaccard").distance_matrix.view(DistanceMatrix),
}

# Weighted, then unweighted UniFrac on the rarefied table
for unifrac_metric in ("weighted_unifrac", "unweighted_unifrac"):
    unifrac_result = beta_phylogenetic(table=qza_rare, phylogeny=tree_q2, metric=unifrac_metric)
    dms[unifrac_metric] = unifrac_result.distance_matrix.view(DistanceMatrix)

# DEICODE RPCA on the table loaded as `qza` (not `qza_rare`)
bplt, dm = rpca(
    table=qza,
    n_components=3,
    min_sample_count=rarefaction_depth,
    min_feature_frequency=10,
)
dms["deicode"] = dm.view(DistanceMatrix)




## Generate dataframes

In [9]:
# For every distance matrix build a long table with one row per ordered
# sample pair, annotate both samples, and keep only pairs of different
# samples that share sample_type_3 and extraction_kit_round.
md_sample_type3 = md.loc[:, ["sample_type_3", "extraction_kit_round"]]
side_renames = [
    ("sample1", {"sample_type_3": "sample1_type",
                 "extraction_kit_round": "sample1_extraction_kit_round"}),
    ("sample2", {"sample_type_3": "sample2_type",
                 "extraction_kit_round": "sample2_extraction_kit_round"}),
]
within_group = 'sample1_type==sample2_type & sample1_extraction_kit_round==sample2_extraction_kit_round & sample1!=sample2'

out_dfs = {}
for metric, dm in dms.items():
    # Square matrix -> (sample1, sample2, value) rows
    df = dm.to_data_frame().reset_index().melt(id_vars="index")
    df.columns = ["sample1", "sample2", "value"]
    # Grouping columns for each side of the pair
    for side, renames in side_renames:
        df = df.merge(md_sample_type3, right_index=True, left_on=side).rename(columns=renames)
    # Descriptive columns, taken from sample1
    sample1_info = md.loc[:, ["sample_type", "sample_type_2", "biomass_plate"]]
    df = df.merge(sample1_info, right_index=True, left_on="sample1")
    df = df.query(within_group)
    out_dfs[metric] = df
    

## Export data frames

In [11]:
# Write one tab-separated table per metric.
# Each unordered sample pair occurs twice in out_dfs (A-B and B-A); only its
# first occurrence is kept. De-duplicating on "value" instead would also drop
# distinct pairs that happen to share the same distance (common for Jaccard).
out_dir = "/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/results/technical_replicates"
out_names = {
    "jaccard": "data_tech_reps_16S_hbm_jaccard.txt",
    "deicode": "data_tech_reps_16S_hbm_rpca.txt",
    "unweighted_unifrac": "data_tech_reps_16S_hbm_unifrac.txt",
    "weighted_unifrac": "data_tech_reps_16S_hbm_wunifrac.txt",
}
for metric, out_name in out_names.items():
    df = out_dfs[metric]
    # Optional exploratory filter (disabled):
    #df = df.query('sample_type_2 not in ["doorknob","mouse feces","mouse jejunum tissue", "PCR extraction control"]')
    # Sort the two IDs within each row so A-B and B-A get the same key.
    pair_ids = pd.DataFrame(np.sort(df[["sample1", "sample2"]].values, axis=1))
    # Positional boolean mask, independent of df's index labels.
    df = df[~pair_ids.duplicated().values]
    df.to_csv(out_dir + "/" + out_name, sep='\t', index=False)


# 16S - low biomass

## Create function for making taxonomy files human readable

In [15]:
def add_taxsplit(taxdf):
    """Add one column per taxonomic rank, parsed from ``taxdf.Taxon``.

    For each rank prefix (``k__`` ... ``s__``) the text between the prefix
    and the next ``;`` is copied into the matching column (``kingdom`` ...
    ``species``). A rank column that already exists is left untouched.

    NOTE: this notebook defines ``add_taxsplit`` once per section; keep the
    copies identical (or define it a single time).

    Parameters
    ----------
    taxdf : pandas.DataFrame
        Table with a ``Taxon`` column of ``;``-separated lineage strings.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``taxdf`` object. A cell is NaN when the lineage is missing
        (NaN, None or any other non-string), lacks the rank prefix, or has
        an empty label for that rank.
    """
    rank_columns = (('k__', 'kingdom'), ('p__', 'phylum'), ('c__', 'class'),
                    ('o__', 'order'), ('f__', 'family'), ('g__', 'genus'),
                    ('s__', 'species'))

    def tax_split(tax_id, tax_level):
        # Text after the rank prefix, up to the next ';'.
        return tax_id.split(tax_level)[1].split(';')[0]

    for level, lname in rank_columns:
        if lname in taxdf.columns:
            continue
        taxonomy_tmp = []
        for tax in taxdf.Taxon:
            # isinstance() rather than `tax is not np.nan`: a None, or a NaN
            # that is not the np.nan singleton, passes the identity test and
            # then makes `level in tax` raise TypeError.
            if isinstance(tax, str) and level in tax:
                label = tax_split(tax, level)
            else:
                label = ''
            taxonomy_tmp.append(label if label else np.nan)
        taxdf[lname] = taxonomy_tmp
    return taxdf


## Import data

In [16]:
# Load every input of the 16S low-biomass analysis. All paths hang off one
# project root so the notebook can be pointed at another copy of the project
# by editing a single line.
project_dir = "/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02"
data_dir = project_dir + "/data/16S"
table_stem = "dna_bothPS_16S_deblur_biom_lod_noChl_noMit_sepp_gg_noNTCs_noMock_lbm"

# Feature tables: filtered table and its rarefied counterpart
# (the file name indicates a depth of 3295).
qza = q2.Artifact.load(data_dir + "/10_filtered_data/" + table_stem + ".qza")
bt = qza.view(Table)
qza_rare = q2.Artifact.load(data_dir + "/11_normalized_data/" + table_stem + "_rar3295.qza")
bt_rare = qza_rare.view(Table)

# Sample metadata, indexed by the first column of the file
md = pd.read_csv(project_dir + "/sample_metadata/12201_metadata.txt", sep='\t', index_col=0)
md.index.name = "sample_name"

# Taxonomy, expanded to one column per rank
# NOTE(review): add_taxsplit looks for a `k__` prefix; SILVA 138 lineages
# usually label the top rank `d__`, so `kingdom` may be all-NaN - confirm.
tax_q2 = q2.Artifact.load(data_dir + "/06_taxonomy/dna_all_16S_deblur_seqs_taxonomy_silva138.qza")
tax_df = pd.DataFrame(tax_q2.view(pd.Series))
tax_df = add_taxsplit(tax_df)

# Phylogeny used by the UniFrac metrics
tree_q2 = q2.Artifact.load(data_dir + "/09_fragment_insertion/dna_all_16S_deblur_seqs_noChl_noMit_tree_gg.qza")

# Keep only the metadata rows whose sample is present in the filtered table
bt_samples = set(bt.ids(s))
md = md[md.index.isin(bt_samples)]


## Calculate distances

In [18]:
# Read depth for this section; passed to RPCA below as its minimum
# per-sample count.
rarefaction_depth = 3295

# Jaccard on the rarefied table
dms = {
    "jaccard": beta(table=qza_rare, metric="jaccard").distance_matrix.view(DistanceMatrix),
}

# Weighted, then unweighted UniFrac on the rarefied table
for unifrac_metric in ("weighted_unifrac", "unweighted_unifrac"):
    unifrac_result = beta_phylogenetic(table=qza_rare, phylogeny=tree_q2, metric=unifrac_metric)
    dms[unifrac_metric] = unifrac_result.distance_matrix.view(DistanceMatrix)

# DEICODE RPCA on the table loaded as `qza` (not `qza_rare`)
bplt, dm = rpca(
    table=qza,
    n_components=3,
    min_sample_count=rarefaction_depth,
    min_feature_frequency=10,
)
dms["deicode"] = dm.view(DistanceMatrix)




## Generate dataframes

In [19]:
# For every distance matrix build a long table with one row per ordered
# sample pair, annotate both samples, and keep only pairs of different
# samples that share sample_type_3 and extraction_kit_round.
md_sample_type3 = md.loc[:, ["sample_type_3", "extraction_kit_round"]]
side_renames = [
    ("sample1", {"sample_type_3": "sample1_type",
                 "extraction_kit_round": "sample1_extraction_kit_round"}),
    ("sample2", {"sample_type_3": "sample2_type",
                 "extraction_kit_round": "sample2_extraction_kit_round"}),
]
within_group = 'sample1_type==sample2_type & sample1_extraction_kit_round==sample2_extraction_kit_round & sample1!=sample2'

out_dfs = {}
for metric, dm in dms.items():
    # Square matrix -> (sample1, sample2, value) rows
    df = dm.to_data_frame().reset_index().melt(id_vars="index")
    df.columns = ["sample1", "sample2", "value"]
    # Grouping columns for each side of the pair
    for side, renames in side_renames:
        df = df.merge(md_sample_type3, right_index=True, left_on=side).rename(columns=renames)
    # Descriptive columns, taken from sample1
    sample1_info = md.loc[:, ["sample_type", "sample_type_2", "biomass_plate"]]
    df = df.merge(sample1_info, right_index=True, left_on="sample1")
    df = df.query(within_group)
    out_dfs[metric] = df
    

## Export data frames

In [21]:
# Write one tab-separated table per metric.
# Each unordered sample pair occurs twice in out_dfs (A-B and B-A); only its
# first occurrence is kept. De-duplicating on "value" instead would also drop
# distinct pairs that happen to share the same distance (common for Jaccard).
out_dir = "/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/results/technical_replicates"
out_names = {
    "jaccard": "data_tech_reps_16S_lbm_jaccard.txt",
    "deicode": "data_tech_reps_16S_lbm_rpca.txt",
    "unweighted_unifrac": "data_tech_reps_16S_lbm_unifrac.txt",
    "weighted_unifrac": "data_tech_reps_16S_lbm_wunifrac.txt",
}
for metric, out_name in out_names.items():
    df = out_dfs[metric]
    # Optional exploratory filter (disabled):
    #df = df.query('sample_type_2 not in ["doorknob","mouse feces","mouse jejunum tissue", "PCR extraction control"]')
    # Sort the two IDs within each row so A-B and B-A get the same key.
    pair_ids = pd.DataFrame(np.sort(df[["sample1", "sample2"]].values, axis=1))
    # Positional boolean mask, independent of df's index labels.
    df = df[~pair_ids.duplicated().values]
    df.to_csv(out_dir + "/" + out_name, sep='\t', index=False)


# ITS - high biomass

## Create function for making taxonomy files human readable

In [3]:
def add_taxsplit(taxdf):
    """Add one column per taxonomic rank, parsed from ``taxdf.Taxon``.

    For each rank prefix (``k__`` ... ``s__``) the text between the prefix
    and the next ``;`` is copied into the matching column (``kingdom`` ...
    ``species``). A rank column that already exists is left untouched.

    NOTE: this notebook defines ``add_taxsplit`` once per section; keep the
    copies identical (or define it a single time).

    Parameters
    ----------
    taxdf : pandas.DataFrame
        Table with a ``Taxon`` column of ``;``-separated lineage strings.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``taxdf`` object. A cell is NaN when the lineage is missing
        (NaN, None or any other non-string), lacks the rank prefix, or has
        an empty label for that rank.
    """
    rank_columns = (('k__', 'kingdom'), ('p__', 'phylum'), ('c__', 'class'),
                    ('o__', 'order'), ('f__', 'family'), ('g__', 'genus'),
                    ('s__', 'species'))

    def tax_split(tax_id, tax_level):
        # Text after the rank prefix, up to the next ';'.
        return tax_id.split(tax_level)[1].split(';')[0]

    for level, lname in rank_columns:
        if lname in taxdf.columns:
            continue
        taxonomy_tmp = []
        for tax in taxdf.Taxon:
            # isinstance() rather than `tax is not np.nan`: a None, or a NaN
            # that is not the np.nan singleton, passes the identity test and
            # then makes `level in tax` raise TypeError.
            if isinstance(tax, str) and level in tax:
                label = tax_split(tax, level)
            else:
                label = ''
            taxonomy_tmp.append(label if label else np.nan)
        taxdf[lname] = taxonomy_tmp
    return taxdf


## Import data

In [25]:
# Load every input of the ITS high-biomass analysis. All paths hang off one
# project root so the notebook can be pointed at another copy of the project
# by editing a single line.
project_dir = "/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02"
data_dir = project_dir + "/data/ITS"
table_stem = "dna_bothPS_its_deblur_biom_lod_noNTCs_noMock_hbm"

# Feature tables: filtered table and its rarefied counterpart
# (the file name indicates a depth of 1491).
qza = q2.Artifact.load(data_dir + "/08_filtered_data/" + table_stem + ".qza")
bt = qza.view(Table)
qza_rare = q2.Artifact.load(data_dir + "/09_normalized_data/" + table_stem + "_rar1491.qza")
bt_rare = qza_rare.view(Table)

# Sample metadata, indexed by the first column of the file
md = pd.read_csv(project_dir + "/sample_metadata/12201_metadata.txt", sep='\t', index_col=0)
md.index.name = "sample_name"

# Taxonomy, expanded to one column per rank
# NOTE(review): the ITS low-biomass section loads `dna_all_its_...`
# (lower-case "its"); confirm which spelling exists before running on a
# case-sensitive filesystem.
tax_q2 = q2.Artifact.load(data_dir + "/06_taxonomy/dna_all_ITS_deblur_seqs_taxonomy_unite8.qza")
tax_df = pd.DataFrame(tax_q2.view(pd.Series))
tax_df = add_taxsplit(tax_df)

# Keep only the metadata rows whose sample is present in the filtered table
bt_samples = set(bt.ids(s))
md = md[md.index.isin(bt_samples)]


## Calculate distances

In [26]:
# Read depth for this section; passed to RPCA below as its minimum
# per-sample count.
rarefaction_depth = 1491

# Jaccard on the rarefied table (no tree is loaded for ITS, so no UniFrac)
jaccard_result = beta(table=qza_rare, metric="jaccard")
dms = {"jaccard": jaccard_result.distance_matrix.view(DistanceMatrix)}

# DEICODE RPCA on the table loaded as `qza` (not `qza_rare`)
bplt, dm = rpca(
    table=qza,
    n_components=3,
    min_sample_count=rarefaction_depth,
    min_feature_frequency=10,
)
dms["deicode"] = dm.view(DistanceMatrix)




## Generate dataframes

In [27]:
# For every distance matrix build a long table with one row per ordered
# sample pair, annotate both samples, and keep only pairs of different
# samples that share sample_type_3 and extraction_kit_round.
md_sample_type3 = md.loc[:, ["sample_type_3", "extraction_kit_round"]]
side_renames = [
    ("sample1", {"sample_type_3": "sample1_type",
                 "extraction_kit_round": "sample1_extraction_kit_round"}),
    ("sample2", {"sample_type_3": "sample2_type",
                 "extraction_kit_round": "sample2_extraction_kit_round"}),
]
within_group = 'sample1_type==sample2_type & sample1_extraction_kit_round==sample2_extraction_kit_round & sample1!=sample2'

out_dfs = {}
for metric, dm in dms.items():
    # Square matrix -> (sample1, sample2, value) rows
    df = dm.to_data_frame().reset_index().melt(id_vars="index")
    df.columns = ["sample1", "sample2", "value"]
    # Grouping columns for each side of the pair
    for side, renames in side_renames:
        df = df.merge(md_sample_type3, right_index=True, left_on=side).rename(columns=renames)
    # Descriptive columns, taken from sample1
    sample1_info = md.loc[:, ["sample_type", "sample_type_2", "biomass_plate"]]
    df = df.merge(sample1_info, right_index=True, left_on="sample1")
    df = df.query(within_group)
    out_dfs[metric] = df
    

## Export data frames

In [28]:
# Write one tab-separated table per metric.
# Each unordered sample pair occurs twice in out_dfs (A-B and B-A); only its
# first occurrence is kept. De-duplicating on "value" instead would also drop
# distinct pairs that happen to share the same distance (common for Jaccard).
out_dir = "/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/results/technical_replicates"
out_names = {
    "jaccard": "data_tech_reps_its_hbm_jaccard.txt",
    "deicode": "data_tech_reps_its_hbm_rpca.txt",
}
for metric, out_name in out_names.items():
    df = out_dfs[metric]
    # Optional exploratory filter (disabled):
    #df = df.query('sample_type_2 not in ["doorknob","mouse feces","mouse jejunum tissue", "PCR extraction control"]')
    # Sort the two IDs within each row so A-B and B-A get the same key.
    pair_ids = pd.DataFrame(np.sort(df[["sample1", "sample2"]].values, axis=1))
    # Positional boolean mask, independent of df's index labels.
    df = df[~pair_ids.duplicated().values]
    df.to_csv(out_dir + "/" + out_name, sep='\t', index=False)


# ITS - low biomass

## Create function for making taxonomy files human readable

In [3]:
def add_taxsplit(taxdf):
    """Add one column per taxonomic rank, parsed from ``taxdf.Taxon``.

    For each rank prefix (``k__`` ... ``s__``) the text between the prefix
    and the next ``;`` is copied into the matching column (``kingdom`` ...
    ``species``). A rank column that already exists is left untouched.

    NOTE: this notebook defines ``add_taxsplit`` once per section; keep the
    copies identical (or define it a single time).

    Parameters
    ----------
    taxdf : pandas.DataFrame
        Table with a ``Taxon`` column of ``;``-separated lineage strings.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``taxdf`` object. A cell is NaN when the lineage is missing
        (NaN, None or any other non-string), lacks the rank prefix, or has
        an empty label for that rank.
    """
    rank_columns = (('k__', 'kingdom'), ('p__', 'phylum'), ('c__', 'class'),
                    ('o__', 'order'), ('f__', 'family'), ('g__', 'genus'),
                    ('s__', 'species'))

    def tax_split(tax_id, tax_level):
        # Text after the rank prefix, up to the next ';'.
        return tax_id.split(tax_level)[1].split(';')[0]

    for level, lname in rank_columns:
        if lname in taxdf.columns:
            continue
        taxonomy_tmp = []
        for tax in taxdf.Taxon:
            # isinstance() rather than `tax is not np.nan`: a None, or a NaN
            # that is not the np.nan singleton, passes the identity test and
            # then makes `level in tax` raise TypeError.
            if isinstance(tax, str) and level in tax:
                label = tax_split(tax, level)
            else:
                label = ''
            taxonomy_tmp.append(label if label else np.nan)
        taxdf[lname] = taxonomy_tmp
    return taxdf


## Import data

In [30]:
# Load every input of the ITS low-biomass analysis. All paths hang off one
# project root so the notebook can be pointed at another copy of the project
# by editing a single line.
project_dir = "/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02"
data_dir = project_dir + "/data/ITS"
table_stem = "dna_bothPS_its_deblur_biom_lod_noNTCs_noMock_lbm"

# Feature tables: filtered table and its rarefied counterpart
# (the file name indicates a depth of 344).
qza = q2.Artifact.load(data_dir + "/08_filtered_data/" + table_stem + ".qza")
bt = qza.view(Table)
qza_rare = q2.Artifact.load(data_dir + "/09_normalized_data/" + table_stem + "_rar344.qza")
bt_rare = qza_rare.view(Table)

# Sample metadata, indexed by the first column of the file
md = pd.read_csv(project_dir + "/sample_metadata/12201_metadata.txt", sep='\t', index_col=0)
md.index.name = "sample_name"

# Taxonomy, expanded to one column per rank
# NOTE(review): the ITS high-biomass section loads `dna_all_ITS_...`
# (upper-case "ITS"); confirm which spelling exists before running on a
# case-sensitive filesystem.
tax_q2 = q2.Artifact.load(data_dir + "/06_taxonomy/dna_all_its_deblur_seqs_taxonomy_unite8.qza")
tax_df = pd.DataFrame(tax_q2.view(pd.Series))
tax_df = add_taxsplit(tax_df)

# Keep only the metadata rows whose sample is present in the filtered table
bt_samples = set(bt.ids(s))
md = md[md.index.isin(bt_samples)]


## Calculate distances

In [31]:
# Read depth for this section; passed to RPCA below as its minimum
# per-sample count.
rarefaction_depth = 344

# Jaccard on the rarefied table (no tree is loaded for ITS, so no UniFrac)
jaccard_result = beta(table=qza_rare, metric="jaccard")
dms = {"jaccard": jaccard_result.distance_matrix.view(DistanceMatrix)}

# DEICODE RPCA on the table loaded as `qza` (not `qza_rare`)
bplt, dm = rpca(
    table=qza,
    n_components=3,
    min_sample_count=rarefaction_depth,
    min_feature_frequency=10,
)
dms["deicode"] = dm.view(DistanceMatrix)




## Generate dataframes

In [32]:
# For every distance matrix build a long table with one row per ordered
# sample pair, annotate both samples, and keep only pairs of different
# samples that share sample_type_3 and extraction_kit_round.
md_sample_type3 = md.loc[:, ["sample_type_3", "extraction_kit_round"]]
side_renames = [
    ("sample1", {"sample_type_3": "sample1_type",
                 "extraction_kit_round": "sample1_extraction_kit_round"}),
    ("sample2", {"sample_type_3": "sample2_type",
                 "extraction_kit_round": "sample2_extraction_kit_round"}),
]
within_group = 'sample1_type==sample2_type & sample1_extraction_kit_round==sample2_extraction_kit_round & sample1!=sample2'

out_dfs = {}
for metric, dm in dms.items():
    # Square matrix -> (sample1, sample2, value) rows
    df = dm.to_data_frame().reset_index().melt(id_vars="index")
    df.columns = ["sample1", "sample2", "value"]
    # Grouping columns for each side of the pair
    for side, renames in side_renames:
        df = df.merge(md_sample_type3, right_index=True, left_on=side).rename(columns=renames)
    # Descriptive columns, taken from sample1
    sample1_info = md.loc[:, ["sample_type", "sample_type_2", "biomass_plate"]]
    df = df.merge(sample1_info, right_index=True, left_on="sample1")
    df = df.query(within_group)
    out_dfs[metric] = df
    

## Export data frames

In [33]:
# Write one tab-separated table per metric.
# Each unordered sample pair occurs twice in out_dfs (A-B and B-A); only its
# first occurrence is kept. De-duplicating on "value" instead would also drop
# distinct pairs that happen to share the same distance (common for Jaccard).
out_dir = "/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/results/technical_replicates"
out_names = {
    "jaccard": "data_tech_reps_its_lbm_jaccard.txt",
    "deicode": "data_tech_reps_its_lbm_rpca.txt",
}
for metric, out_name in out_names.items():
    df = out_dfs[metric]
    # Optional exploratory filter (disabled):
    #df = df.query('sample_type_2 not in ["doorknob","mouse feces","mouse jejunum tissue", "PCR extraction control"]')
    # Sort the two IDs within each row so A-B and B-A get the same key.
    pair_ids = pd.DataFrame(np.sort(df[["sample1", "sample2"]].values, axis=1))
    # Positional boolean mask, independent of df's index labels.
    df = df[~pair_ids.duplicated().values]
    df.to_csv(out_dir + "/" + out_name, sep='\t', index=False)


# Metagenomic - high biomass

## Create function for making taxonomy files human readable

In [3]:
def add_taxsplit(taxdf):
    """Add one column per taxonomic rank, parsed from ``taxdf.Taxon``.

    For each rank prefix (``k__`` ... ``s__``) the text between the prefix
    and the next ``;`` is copied into the matching column (``kingdom`` ...
    ``species``). A rank column that already exists is left untouched.

    NOTE: this notebook defines ``add_taxsplit`` once per section; keep the
    copies identical (or define it a single time).

    Parameters
    ----------
    taxdf : pandas.DataFrame
        Table with a ``Taxon`` column of ``;``-separated lineage strings.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``taxdf`` object. A cell is NaN when the lineage is missing
        (NaN, None or any other non-string), lacks the rank prefix, or has
        an empty label for that rank.
    """
    rank_columns = (('k__', 'kingdom'), ('p__', 'phylum'), ('c__', 'class'),
                    ('o__', 'order'), ('f__', 'family'), ('g__', 'genus'),
                    ('s__', 'species'))

    def tax_split(tax_id, tax_level):
        # Text after the rank prefix, up to the next ';'.
        return tax_id.split(tax_level)[1].split(';')[0]

    for level, lname in rank_columns:
        if lname in taxdf.columns:
            continue
        taxonomy_tmp = []
        for tax in taxdf.Taxon:
            # isinstance() rather than `tax is not np.nan`: a None, or a NaN
            # that is not the np.nan singleton, passes the identity test and
            # then makes `level in tax` raise TypeError.
            if isinstance(tax, str) and level in tax:
                label = tax_split(tax, level)
            else:
                label = ''
            taxonomy_tmp.append(label if label else np.nan)
        taxdf[lname] = taxonomy_tmp
    return taxdf


## Import data

In [35]:
# Load every input of the shotgun high-biomass analysis. All paths hang off
# one project root so the notebook can be pointed at another copy of the
# project by editing a single line.
project_dir = "/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02"
data_dir = project_dir + "/data/shotgun"
table_stem = "dna_bothPS_shotgun_woltka_wol_biom_noNTCs_noMock_hbm"

# Feature tables: filtered table and its rarefied counterpart
# (the file name carries the suffix "rar38k").
qza = q2.Artifact.load(data_dir + "/03_filtered_data/" + table_stem + ".qza")
bt = qza.view(Table)
qza_rare = q2.Artifact.load(data_dir + "/04_normalized_data/" + table_stem + "_rar38k.qza")
bt_rare = qza_rare.view(Table)

# Sample metadata, indexed by the first column of the file
md = pd.read_csv(project_dir + "/sample_metadata/12201_metadata.txt", sep='\t', index_col=0)
md.index.name = "sample_name"

# Taxonomy, expanded to one column per rank
tax_q2 = q2.Artifact.load(data_dir + "/wol_taxonomy.qza")
tax_df = pd.DataFrame(tax_q2.view(pd.Series))
tax_df = add_taxsplit(tax_df)

# Phylogeny used by the UniFrac metrics
tree_q2 = q2.Artifact.load(data_dir + "/wol_tree.qza")

# Keep only the metadata rows whose sample is present in the filtered table
bt_samples = set(bt.ids(s))
md = md[md.index.isin(bt_samples)]


## Calculate distances

In [36]:
# Read depth for this section; passed to RPCA below as its minimum
# per-sample count.
rarefaction_depth = 38000

# Jaccard on the rarefied table
dms = {
    "jaccard": beta(table=qza_rare, metric="jaccard").distance_matrix.view(DistanceMatrix),
}

# Weighted, then unweighted UniFrac on the rarefied table
for unifrac_metric in ("weighted_unifrac", "unweighted_unifrac"):
    unifrac_result = beta_phylogenetic(table=qza_rare, phylogeny=tree_q2, metric=unifrac_metric)
    dms[unifrac_metric] = unifrac_result.distance_matrix.view(DistanceMatrix)

# DEICODE RPCA on the table loaded as `qza` (not `qza_rare`)
bplt, dm = rpca(
    table=qza,
    n_components=3,
    min_sample_count=rarefaction_depth,
    min_feature_frequency=10,
)
dms["deicode"] = dm.view(DistanceMatrix)




## Generate dataframes

In [37]:
# For every distance matrix build a long table with one row per ordered
# sample pair, annotate both samples, and keep only pairs of different
# samples that share sample_type_3 and extraction_kit_round.
md_sample_type3 = md.loc[:, ["sample_type_3", "extraction_kit_round"]]
side_renames = [
    ("sample1", {"sample_type_3": "sample1_type",
                 "extraction_kit_round": "sample1_extraction_kit_round"}),
    ("sample2", {"sample_type_3": "sample2_type",
                 "extraction_kit_round": "sample2_extraction_kit_round"}),
]
within_group = 'sample1_type==sample2_type & sample1_extraction_kit_round==sample2_extraction_kit_round & sample1!=sample2'

out_dfs = {}
for metric, dm in dms.items():
    # Square matrix -> (sample1, sample2, value) rows
    df = dm.to_data_frame().reset_index().melt(id_vars="index")
    df.columns = ["sample1", "sample2", "value"]
    # Grouping columns for each side of the pair
    for side, renames in side_renames:
        df = df.merge(md_sample_type3, right_index=True, left_on=side).rename(columns=renames)
    # Descriptive columns, taken from sample1
    sample1_info = md.loc[:, ["sample_type", "sample_type_2", "biomass_plate"]]
    df = df.merge(sample1_info, right_index=True, left_on="sample1")
    df = df.query(within_group)
    out_dfs[metric] = df
    

## Export data frames

In [38]:
# Write one tab-separated table per metric.
# Each unordered sample pair occurs twice in out_dfs (A-B and B-A); only its
# first occurrence is kept. De-duplicating on "value" instead would also drop
# distinct pairs that happen to share the same distance (common for Jaccard).
out_dir = "/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/results/technical_replicates"
out_names = {
    "jaccard": "data_tech_reps_shotgun_hbm_jaccard.txt",
    "deicode": "data_tech_reps_shotgun_hbm_rpca.txt",
    "unweighted_unifrac": "data_tech_reps_shotgun_hbm_unifrac.txt",
    "weighted_unifrac": "data_tech_reps_shotgun_hbm_wunifrac.txt",
}
for metric, out_name in out_names.items():
    df = out_dfs[metric]
    # Optional exploratory filter (disabled):
    #df = df.query('sample_type_2 not in ["doorknob","mouse feces","mouse jejunum tissue", "PCR extraction control"]')
    # Sort the two IDs within each row so A-B and B-A get the same key.
    pair_ids = pd.DataFrame(np.sort(df[["sample1", "sample2"]].values, axis=1))
    # Positional boolean mask, independent of df's index labels.
    df = df[~pair_ids.duplicated().values]
    df.to_csv(out_dir + "/" + out_name, sep='\t', index=False)


# Metagenomic - low biomass

## Create function for making taxonomy files human readable

In [3]:
def add_taxsplit(taxdf):
    """Add one column per taxonomic rank, parsed from ``taxdf.Taxon``.

    For each rank prefix (``k__`` ... ``s__``) the text between the prefix
    and the next ``;`` is copied into the matching column (``kingdom`` ...
    ``species``). A rank column that already exists is left untouched.

    NOTE: this notebook defines ``add_taxsplit`` once per section; keep the
    copies identical (or define it a single time).

    Parameters
    ----------
    taxdf : pandas.DataFrame
        Table with a ``Taxon`` column of ``;``-separated lineage strings.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``taxdf`` object. A cell is NaN when the lineage is missing
        (NaN, None or any other non-string), lacks the rank prefix, or has
        an empty label for that rank.
    """
    rank_columns = (('k__', 'kingdom'), ('p__', 'phylum'), ('c__', 'class'),
                    ('o__', 'order'), ('f__', 'family'), ('g__', 'genus'),
                    ('s__', 'species'))

    def tax_split(tax_id, tax_level):
        # Text after the rank prefix, up to the next ';'.
        return tax_id.split(tax_level)[1].split(';')[0]

    for level, lname in rank_columns:
        if lname in taxdf.columns:
            continue
        taxonomy_tmp = []
        for tax in taxdf.Taxon:
            # isinstance() rather than `tax is not np.nan`: a None, or a NaN
            # that is not the np.nan singleton, passes the identity test and
            # then makes `level in tax` raise TypeError.
            if isinstance(tax, str) and level in tax:
                label = tax_split(tax, level)
            else:
                label = ''
            taxonomy_tmp.append(label if label else np.nan)
        taxdf[lname] = taxonomy_tmp
    return taxdf


## Import data

In [42]:
# Load every input of the shotgun low-biomass analysis. All paths hang off
# one project root so the notebook can be pointed at another copy of the
# project by editing a single line.
project_dir = "/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02"
data_dir = project_dir + "/data/shotgun"
table_stem = "dna_bothPS_shotgun_woltka_wol_biom_noNTCs_noMock_lbm"

# Feature tables: filtered table and its rarefied counterpart
# (the file name indicates a depth of 600).
qza = q2.Artifact.load(data_dir + "/03_filtered_data/" + table_stem + ".qza")
bt = qza.view(Table)
qza_rare = q2.Artifact.load(data_dir + "/04_normalized_data/" + table_stem + "_rar600.qza")
bt_rare = qza_rare.view(Table)

# Sample metadata, indexed by the first column of the file
md = pd.read_csv(project_dir + "/sample_metadata/12201_metadata.txt", sep='\t', index_col=0)
md.index.name = "sample_name"

# Taxonomy, expanded to one column per rank
tax_q2 = q2.Artifact.load(data_dir + "/wol_taxonomy.qza")
tax_df = pd.DataFrame(tax_q2.view(pd.Series))
tax_df = add_taxsplit(tax_df)

# Phylogeny used by the UniFrac metrics
tree_q2 = q2.Artifact.load(data_dir + "/wol_tree.qza")

# Keep only the metadata rows whose sample is present in the filtered table
bt_samples = set(bt.ids(s))
md = md[md.index.isin(bt_samples)]


## Calculate distances

In [43]:
# Read depth for this section; passed to RPCA below as its minimum
# per-sample count.
rarefaction_depth = 600

# Jaccard on the rarefied table
dms = {
    "jaccard": beta(table=qza_rare, metric="jaccard").distance_matrix.view(DistanceMatrix),
}

# Weighted, then unweighted UniFrac on the rarefied table
for unifrac_metric in ("weighted_unifrac", "unweighted_unifrac"):
    unifrac_result = beta_phylogenetic(table=qza_rare, phylogeny=tree_q2, metric=unifrac_metric)
    dms[unifrac_metric] = unifrac_result.distance_matrix.view(DistanceMatrix)

# DEICODE RPCA on the table loaded as `qza` (not `qza_rare`)
bplt, dm = rpca(
    table=qza,
    n_components=3,
    min_sample_count=rarefaction_depth,
    min_feature_frequency=10,
)
dms["deicode"] = dm.view(DistanceMatrix)




## Generate dataframes

In [44]:
# For every distance matrix build a long table with one row per ordered
# sample pair, annotate both samples, and keep only pairs of different
# samples that share sample_type_3 and extraction_kit_round.
md_sample_type3 = md.loc[:, ["sample_type_3", "extraction_kit_round"]]
side_renames = [
    ("sample1", {"sample_type_3": "sample1_type",
                 "extraction_kit_round": "sample1_extraction_kit_round"}),
    ("sample2", {"sample_type_3": "sample2_type",
                 "extraction_kit_round": "sample2_extraction_kit_round"}),
]
within_group = 'sample1_type==sample2_type & sample1_extraction_kit_round==sample2_extraction_kit_round & sample1!=sample2'

out_dfs = {}
for metric, dm in dms.items():
    # Square matrix -> (sample1, sample2, value) rows
    df = dm.to_data_frame().reset_index().melt(id_vars="index")
    df.columns = ["sample1", "sample2", "value"]
    # Grouping columns for each side of the pair
    for side, renames in side_renames:
        df = df.merge(md_sample_type3, right_index=True, left_on=side).rename(columns=renames)
    # Descriptive columns, taken from sample1
    sample1_info = md.loc[:, ["sample_type", "sample_type_2", "biomass_plate"]]
    df = df.merge(sample1_info, right_index=True, left_on="sample1")
    df = df.query(within_group)
    out_dfs[metric] = df
    

## Export data frames

In [45]:
# Write one tab-separated table per metric.
# Each unordered sample pair occurs twice in out_dfs (A-B and B-A); only its
# first occurrence is kept. De-duplicating on "value" instead would also drop
# distinct pairs that happen to share the same distance (common for Jaccard).
out_dir = "/Users/Justin/Mycelium/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_02/results/technical_replicates"
out_names = {
    "jaccard": "data_tech_reps_shotgun_lbm_jaccard.txt",
    "deicode": "data_tech_reps_shotgun_lbm_rpca.txt",
    "unweighted_unifrac": "data_tech_reps_shotgun_lbm_unifrac.txt",
    "weighted_unifrac": "data_tech_reps_shotgun_lbm_wunifrac.txt",
}
for metric, out_name in out_names.items():
    df = out_dfs[metric]
    # Optional exploratory filter (disabled):
    #df = df.query('sample_type_2 not in ["doorknob","mouse feces","mouse jejunum tissue", "PCR extraction control"]')
    # Sort the two IDs within each row so A-B and B-A get the same key.
    pair_ids = pd.DataFrame(np.sort(df[["sample1", "sample2"]].values, axis=1))
    # Positional boolean mask, independent of df's index labels.
    df = df[~pair_ids.duplicated().values]
    df.to_csv(out_dir + "/" + out_name, sep='\t', index=False)
