# Set up notebook environment

In [12]:
import pandas as pd
import qiime2 as q2
import numpy as np
import plotnine as pn
from biom import Table, load_table
from qiime2.plugins.deicode.actions import rpca
from qiime2.plugins.diversity.actions import beta_phylogenetic
from qiime2.plugins.diversity.actions import beta
from qiime2.plugins.diversity.actions import alpha
from qiime2.plugins.feature_table.actions import rarefy
from skbio import DistanceMatrix

# Axis name passed to biom Table methods further down (e.g. bt.ids(s)).
s="sample"
# Not referenced in the visible cells; presumably the biom feature axis name -- TODO confirm.
o="observation"

%matplotlib inline


In [2]:
# Switch to the project directory; the relative data paths used in later cells are resolved against it.
# NOTE(review): absolute, machine-specific path -- adjust before running on another machine.
cd /Users/Justin/Google-Drive-UCSD/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_03_2vs20min/


/Users/Justin/Google-Drive-UCSD/UCSD/00_Knight_Lab/03_Extraction_test_12201/round_03_2vs20min


# 16S processing

In [3]:
def add_taxsplit(taxdf):
    """Add one column per taxonomic rank, parsed from ``taxdf.Taxon``.

    For every rank prefix (``k__`` ... ``s__``) the text between the first
    occurrence of the prefix and the next ``;`` is extracted from each
    ``Taxon`` string and stored in the matching column (``kingdom`` ...
    ``species``).

    Parameters
    ----------
    taxdf : pandas.DataFrame
        Frame with a ``Taxon`` column. Rank columns that already exist are
        left untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``taxdf`` object, modified in place, with the missing rank
        columns added. A cell is ``np.nan`` when the taxon value is not a
        string (NaN, None, ...), when the prefix is absent, or when the
        extracted name is empty.
    """
    ranks = (('k__', 'kingdom'), ('p__', 'phylum'), ('c__', 'class'),
             ('o__', 'order'), ('f__', 'family'), ('g__', 'genus'),
             ('s__', 'species'))

    def tax_split(tax_id, tax_level):
        # text after the first occurrence of the prefix, up to the next ';'
        return tax_id.split(tax_level)[1].split(';')[0]

    for level, lname in ranks:
        if lname in taxdf.columns:
            continue
        taxonomy_tmp = []
        for tax in taxdf.Taxon:
            # isinstance() instead of `tax is not np.nan`: the identity test
            # only recognises the np.nan singleton, so other missing values
            # (float('nan'), None, pd.NA) used to crash on `level in tax`.
            name = ''
            if isinstance(tax, str) and level in tax:
                name = tax_split(tax, level)
            taxonomy_tmp.append(name if len(name) > 0 else np.nan)
        taxdf[lname] = taxonomy_tmp
    return taxdf


## Import data

In [42]:
# ---- Feature table ----
qza = q2.Artifact.load("01_16S/data/extraction_test_round_3_biom_lod.qza")
bt = qza.view(Table)

# ---- Sample metadata ----
md = pd.read_csv(
    "01_16S/metadata/metadata_12201_round3_read_counts_alpha_diversity_2020.09.23.txt",
    sep='\t',
    index_col=0,
)
md.index.name = "sample_name"
# Built from the complete metadata table, i.e. before the filtering below.
md_q2 = q2.Metadata(md)

# ---- Keep samples that have a sample_type and occur in both table and metadata ----
md = md[md.sample_type.notna()]
bt_samples = set(bt.ids(s))
md_samples = set(md.index)
all_samples_keep = bt_samples.intersection(md_samples)
bt.filter(all_samples_keep)
md = md[md.index.isin(all_samples_keep)]

# ---- Blank-free copies of table and metadata ----
is_blank = md.sample_type == "control blank"
blank_samples = set(md.index[is_blank])
bt_no_blank = bt.filter(blank_samples, invert=True, inplace=False)
md_noblank = md[~is_blank].copy()

# ---- Wrap the filtered tables as QIIME 2 artifacts again ----
qza = q2.Artifact.import_data('FeatureTable[Frequency]', bt)
qza_no_blank = q2.Artifact.import_data('FeatureTable[Frequency]', bt_no_blank)

# ---- Taxonomy, with one extra column per rank ----
tax_q2 = q2.Artifact.load("01_16S/data/extraction_test_round_3_taxonomy.qza")
tax_df = add_taxsplit(pd.DataFrame(tax_q2.view(pd.Series)))

# ---- Rooted phylogeny ----
tree_q2 = q2.Artifact.import_data(
    'Phylogeny[Rooted]',
    "01_16S/data/extraction_test_round_3_tree.tre",
)


## Calculate distances

In [7]:
# Rarefied table (depth 5000) feeding Jaccard and both UniFrac variants
qza_rare = rarefy(qza, 5000).rarefied_table
bt_rare = qza_rare.view(Table)

# Distance matrices keyed by metric name
dms = {
    "jaccard": beta(table=qza_rare, metric="jaccard").distance_matrix.view(DistanceMatrix),
}

# weighted first, then unweighted UniFrac, both on the rarefied table and the rooted tree
for unifrac_metric in ("weighted_unifrac", "unweighted_unifrac"):
    unifrac_result = beta_phylogenetic(table=qza_rare, phylogeny=tree_q2, metric=unifrac_metric)
    dms[unifrac_metric] = unifrac_result.distance_matrix.view(DistanceMatrix)

# DEICODE RPCA is run on the unrarefied table `qza`, not on qza_rare
bplt, dm = rpca(
    table=qza,
    n_components=3,
    min_sample_count=5000,
    min_feature_count=10,
)
dms["deicode"] = dm.view(DistanceMatrix)


Use a regular DataFrame whose columns are SparseArrays instead.

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  return constructor(mat, index=index, columns=columns)
Use a Series with sparse values instead.

    >>> series = pd.Series(pd.SparseArray(...))

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  sparse_index=BlockIndex(N, blocs, blens),
Use a Series with sparse values instead.

    >>> series = pd.Series(pd.SparseArray(...))

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  return self._constructor(new_data).__finalize__(self)
Use a regular DataFrame whose columns are SparseArrays instead.

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  default_kind=self._default_kind,


In [43]:
# Annotations that both members of a pair must share to be kept
md_sample_type3 = md.loc[:, ["sample_type_3", "extraction_protocol_short"]]
# Extra descriptive columns, looked up for sample1 only
md_sample1_info = md.loc[:, ["sample_type", "sample_type_2", "biomass_sample", "biomass_sample_long"]]
# Same type, same extraction protocol, and not a sample paired with itself
pair_filter = 'sample1_type==sample2_type & sample1_extraction_protocol_short==sample2_extraction_protocol_short & sample1!=sample2'

out_dfs = {}
for metric, dm in dms.items():
    # Square distance matrix -> long table, one row per (sample1, sample2) cell
    df = dm.to_data_frame().reset_index().melt(id_vars="index")
    df.columns = ["sample1", "sample2", "value"]

    # Attach type / protocol of the first sample ...
    df = df.merge(md_sample_type3, right_index=True, left_on="sample1").rename(
        columns={"sample_type_3": "sample1_type",
                 "extraction_protocol_short": "sample1_extraction_protocol_short"})
    # ... and of the second sample
    df = df.merge(md_sample_type3, right_index=True, left_on="sample2").rename(
        columns={"sample_type_3": "sample2_type",
                 "extraction_protocol_short": "sample2_extraction_protocol_short"})
    df = df.merge(md_sample1_info, right_index=True, left_on="sample1")

    out_dfs[metric] = df.query(pair_filter)
    # Unused summary idea kept for reference:
    #.groupby(["sample1_bead_beating","sample1_type"]).agg({"value":["mean",scipy.stats.sem]})
    

## Export data frames

In [44]:
# Export within-type, within-protocol Jaccard distances for the 16S data.
metric = "jaccard"
df = out_dfs[metric]
# NOTE(review): this cell also excludes "two strains", unlike the other 16S export cells -- confirm intentional.
df = df.query('sample_type_2 not in ["doorknob","mouse feces","mouse jejunum tissue", "PCR extraction control", "two strains"]')
# The melted matrix holds every pair twice, as (A, B) and (B, A). Keep the first
# row of each unordered pair. De-duplicating on "value" (as before) also threw
# away unrelated pairs that merely happened to have the same distance.
pair_key = pd.DataFrame(np.sort(df[["sample1", "sample2"]].to_numpy(), axis=1))
df = df[~pair_key.duplicated().to_numpy()]
df.to_csv("01_16S/metadata/technical_replicate_distances_jaccard.tsv", sep = '\t', index = False)


In [48]:
# Export within-type, within-protocol DEICODE (RPCA) distances for the 16S data.
metric = "deicode"
df = out_dfs[metric]
df = df.query('sample_type_2 not in ["doorknob","mouse feces","mouse jejunum tissue", "PCR extraction control"]')
# The melted matrix holds every pair twice, as (A, B) and (B, A). Keep the first
# row of each unordered pair. De-duplicating on "value" (as before) also threw
# away unrelated pairs that merely happened to have the same distance.
pair_key = pd.DataFrame(np.sort(df[["sample1", "sample2"]].to_numpy(), axis=1))
df = df[~pair_key.duplicated().to_numpy()]
df.to_csv("01_16S/metadata/technical_replicate_distances_rpca.tsv", sep = '\t', index = False)


In [49]:
# Export within-type, within-protocol unweighted UniFrac distances for the 16S data.
metric = "unweighted_unifrac"
df = out_dfs[metric]
df = df.query('sample_type_2 not in ["doorknob","mouse feces","mouse jejunum tissue", "PCR extraction control"]')
# The melted matrix holds every pair twice, as (A, B) and (B, A). Keep the first
# row of each unordered pair. De-duplicating on "value" (as before) also threw
# away unrelated pairs that merely happened to have the same distance.
pair_key = pd.DataFrame(np.sort(df[["sample1", "sample2"]].to_numpy(), axis=1))
df = df[~pair_key.duplicated().to_numpy()]
df.to_csv("01_16S/metadata/technical_replicate_distances_unweighted_unifrac.tsv", sep = '\t', index = False)


In [50]:
# Export within-type, within-protocol weighted UniFrac distances for the 16S data.
metric = "weighted_unifrac"
df = out_dfs[metric]
df = df.query('sample_type_2 not in ["doorknob","mouse feces","mouse jejunum tissue", "PCR extraction control"]')
# The melted matrix holds every pair twice, as (A, B) and (B, A). Keep the first
# row of each unordered pair. De-duplicating on "value" (as before) also threw
# away unrelated pairs that merely happened to have the same distance.
pair_key = pd.DataFrame(np.sort(df[["sample1", "sample2"]].to_numpy(), axis=1))
df = df[~pair_key.duplicated().to_numpy()]
df.to_csv("01_16S/metadata/technical_replicate_distances_weighted_unifrac.tsv", sep = '\t', index = False)


# Metagenomics processing

In [3]:
# NOTE(review): this re-defines add_taxsplit, which is already defined in the
# "16S processing" section; the two definitions should be kept identical.
def add_taxsplit(taxdf):
    """Add one column per taxonomic rank, parsed from ``taxdf.Taxon``.

    For every rank prefix (``k__`` ... ``s__``) the text between the first
    occurrence of the prefix and the next ``;`` is extracted from each
    ``Taxon`` string and stored in the matching column (``kingdom`` ...
    ``species``).

    Parameters
    ----------
    taxdf : pandas.DataFrame
        Frame with a ``Taxon`` column. Rank columns that already exist are
        left untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``taxdf`` object, modified in place, with the missing rank
        columns added. A cell is ``np.nan`` when the taxon value is not a
        string (NaN, None, ...), when the prefix is absent, or when the
        extracted name is empty.
    """
    ranks = (('k__', 'kingdom'), ('p__', 'phylum'), ('c__', 'class'),
             ('o__', 'order'), ('f__', 'family'), ('g__', 'genus'),
             ('s__', 'species'))

    def tax_split(tax_id, tax_level):
        # text after the first occurrence of the prefix, up to the next ';'
        return tax_id.split(tax_level)[1].split(';')[0]

    for level, lname in ranks:
        if lname in taxdf.columns:
            continue
        taxonomy_tmp = []
        for tax in taxdf.Taxon:
            # isinstance() instead of `tax is not np.nan`: the identity test
            # only recognises the np.nan singleton, so other missing values
            # (float('nan'), None, pd.NA) used to crash on `level in tax`.
            name = ''
            if isinstance(tax, str) and level in tax:
                name = tax_split(tax, level)
            taxonomy_tmp.append(name if len(name) > 0 else np.nan)
        taxdf[lname] = taxonomy_tmp
    return taxdf


## Import data - high biomass

In [54]:
# ---- Feature table (high-biomass profile) ----
qza = q2.Artifact.load("01_shotgun/gotu_profile_updated_sampleIDs_highBiomass.qza")
bt = qza.view(Table)

# ---- Sample metadata ----
md = pd.read_csv(
    "01_shotgun/metadata_12201_round3_read_counts_alpha_diversity_2020.09.24.txt",
    sep='\t',
    index_col=0,
)
md.index.name = "sample_name"
# Built from the complete metadata table, i.e. before the filtering below.
md_q2 = q2.Metadata(md)

# ---- Keep samples that have a sample_type and occur in both table and metadata ----
md = md[md.sample_type.notna()]
bt_samples = set(bt.ids(s))
md_samples = set(md.index)
all_samples_keep = bt_samples.intersection(md_samples)
bt.filter(all_samples_keep)
md = md[md.index.isin(all_samples_keep)]

# ---- Blank-free copies of table and metadata ----
is_blank = md.sample_type == "control blank"
blank_samples = set(md.index[is_blank])
bt_no_blank = bt.filter(blank_samples, invert=True, inplace=False)
md_noblank = md[~is_blank].copy()

# ---- Wrap the filtered tables as QIIME 2 artifacts again ----
qza = q2.Artifact.import_data('FeatureTable[Frequency]', bt)
qza_no_blank = q2.Artifact.import_data('FeatureTable[Frequency]', bt_no_blank)

# ---- Taxonomy, with one extra column per rank ----
tax_q2 = q2.Artifact.load("../../03_Web_of_life/wol_taxonomy.qza")
tax_df = add_taxsplit(pd.DataFrame(tax_q2.view(pd.Series)))

# ---- Rooted phylogeny ----
tree_q2 = q2.Artifact.import_data(
    'Phylogeny[Rooted]',
    "../../03_Web_of_life/wol_tree.nwk",
)


## Calculate distances - high biomass

In [56]:
# Rarefied table (depth 35000) feeding Jaccard and both UniFrac variants
qza_rare = rarefy(qza, 35000).rarefied_table
bt_rare = qza_rare.view(Table)

# Distance matrices keyed by metric name
dms = {
    "jaccard": beta(table=qza_rare, metric="jaccard").distance_matrix.view(DistanceMatrix),
}

# weighted first, then unweighted UniFrac, both on the rarefied table and the rooted tree
for unifrac_metric in ("weighted_unifrac", "unweighted_unifrac"):
    unifrac_result = beta_phylogenetic(table=qza_rare, phylogeny=tree_q2, metric=unifrac_metric)
    dms[unifrac_metric] = unifrac_result.distance_matrix.view(DistanceMatrix)

# DEICODE RPCA is run on the unrarefied table `qza`, not on qza_rare
bplt, dm = rpca(
    table=qza,
    n_components=3,
    min_sample_count=35000,
    min_feature_count=10,
)
dms["deicode"] = dm.view(DistanceMatrix)


Use a regular DataFrame whose columns are SparseArrays instead.

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.



In [57]:
# Annotations that both members of a pair must share to be kept
md_sample_type3 = md.loc[:, ["sample_type_3", "extraction_protocol_short"]]
# Extra descriptive columns, looked up for sample1 only
md_sample1_info = md.loc[:, ["sample_type", "sample_type_2", "biomass_sample", "biomass_sample_long"]]
# Same type, same extraction protocol, and not a sample paired with itself
pair_filter = 'sample1_type==sample2_type & sample1_extraction_protocol_short==sample2_extraction_protocol_short & sample1!=sample2'

out_dfs = {}
for metric, dm in dms.items():
    # Square distance matrix -> long table, one row per (sample1, sample2) cell
    df = dm.to_data_frame().reset_index().melt(id_vars="index")
    df.columns = ["sample1", "sample2", "value"]

    # Attach type / protocol of the first sample ...
    df = df.merge(md_sample_type3, right_index=True, left_on="sample1").rename(
        columns={"sample_type_3": "sample1_type",
                 "extraction_protocol_short": "sample1_extraction_protocol_short"})
    # ... and of the second sample
    df = df.merge(md_sample_type3, right_index=True, left_on="sample2").rename(
        columns={"sample_type_3": "sample2_type",
                 "extraction_protocol_short": "sample2_extraction_protocol_short"})
    df = df.merge(md_sample1_info, right_index=True, left_on="sample1")

    out_dfs[metric] = df.query(pair_filter)
    # Unused summary idea kept for reference:
    #.groupby(["sample1_bead_beating","sample1_type"]).agg({"value":["mean",scipy.stats.sem]})
    

## Export data frames - high biomass

In [62]:
# Export within-type, within-protocol Jaccard distances for the high-biomass shotgun data.
metric = "jaccard"
df = out_dfs[metric]
df = df.query('sample_type_2 not in ["bare soil"]')
# The melted matrix holds every pair twice, as (A, B) and (B, A). Keep the first
# row of each unordered pair. De-duplicating on "value" (as before) also threw
# away unrelated pairs that merely happened to have the same distance.
pair_key = pd.DataFrame(np.sort(df[["sample1", "sample2"]].to_numpy(), axis=1))
df = df[~pair_key.duplicated().to_numpy()]
df.to_csv("01_shotgun/technical_replicate_distances_shotgun_high_biomass_jaccard.tsv", sep = '\t', index = False)


In [63]:
# Export within-type, within-protocol DEICODE (RPCA) distances for the high-biomass shotgun data.
metric = "deicode"
df = out_dfs[metric]
df = df.query('sample_type_2 not in ["bare soil"]')
# The melted matrix holds every pair twice, as (A, B) and (B, A). Keep the first
# row of each unordered pair. De-duplicating on "value" (as before) also threw
# away unrelated pairs that merely happened to have the same distance.
pair_key = pd.DataFrame(np.sort(df[["sample1", "sample2"]].to_numpy(), axis=1))
df = df[~pair_key.duplicated().to_numpy()]
df.to_csv("01_shotgun/technical_replicate_distances_shotgun_high_biomass_rpca.tsv", sep = '\t', index = False)


In [64]:
# Export within-type, within-protocol unweighted UniFrac distances for the high-biomass shotgun data.
metric = "unweighted_unifrac"
df = out_dfs[metric]
df = df.query('sample_type_2 not in ["bare soil"]')
# The melted matrix holds every pair twice, as (A, B) and (B, A). Keep the first
# row of each unordered pair. De-duplicating on "value" (as before) also threw
# away unrelated pairs that merely happened to have the same distance.
pair_key = pd.DataFrame(np.sort(df[["sample1", "sample2"]].to_numpy(), axis=1))
df = df[~pair_key.duplicated().to_numpy()]
df.to_csv("01_shotgun/technical_replicate_distances_shotgun_high_biomass_unweighted_unifrac.tsv", sep = '\t', index = False)


In [65]:
# Export within-type, within-protocol weighted UniFrac distances for the high-biomass shotgun data.
metric = "weighted_unifrac"
df = out_dfs[metric]
df = df.query('sample_type_2 not in ["bare soil"]')
# The melted matrix holds every pair twice, as (A, B) and (B, A). Keep the first
# row of each unordered pair. De-duplicating on "value" (as before) also threw
# away unrelated pairs that merely happened to have the same distance.
pair_key = pd.DataFrame(np.sort(df[["sample1", "sample2"]].to_numpy(), axis=1))
df = df[~pair_key.duplicated().to_numpy()]
df.to_csv("01_shotgun/technical_replicate_distances_shotgun_high_biomass_weighted_unifrac.tsv", sep = '\t', index = False)


## Import data - low biomass

In [66]:
# ---- Feature table (low-biomass profile) ----
qza = q2.Artifact.load("01_shotgun/gotu_profile_updated_sampleIDs_lowBiomass.qza")
bt = qza.view(Table)

# ---- Sample metadata ----
md = pd.read_csv(
    "01_shotgun/metadata_12201_round3_read_counts_alpha_diversity_2020.09.24.txt",
    sep='\t',
    index_col=0,
)
md.index.name = "sample_name"
# Built from the complete metadata table, i.e. before the filtering below.
md_q2 = q2.Metadata(md)

# ---- Keep samples that have a sample_type and occur in both table and metadata ----
md = md[md.sample_type.notna()]
bt_samples = set(bt.ids(s))
md_samples = set(md.index)
all_samples_keep = bt_samples.intersection(md_samples)
bt.filter(all_samples_keep)
md = md[md.index.isin(all_samples_keep)]

# ---- Blank-free copies of table and metadata ----
is_blank = md.sample_type == "control blank"
blank_samples = set(md.index[is_blank])
bt_no_blank = bt.filter(blank_samples, invert=True, inplace=False)
md_noblank = md[~is_blank].copy()

# ---- Wrap the filtered tables as QIIME 2 artifacts again ----
qza = q2.Artifact.import_data('FeatureTable[Frequency]', bt)
qza_no_blank = q2.Artifact.import_data('FeatureTable[Frequency]', bt_no_blank)

# ---- Taxonomy, with one extra column per rank ----
tax_q2 = q2.Artifact.load("../../03_Web_of_life/wol_taxonomy.qza")
tax_df = add_taxsplit(pd.DataFrame(tax_q2.view(pd.Series)))

# ---- Rooted phylogeny ----
tree_q2 = q2.Artifact.import_data(
    'Phylogeny[Rooted]',
    "../../03_Web_of_life/wol_tree.nwk",
)


## Calculate distances - low biomass

In [67]:
# Rarefied table (depth 20000) feeding Jaccard and both UniFrac variants
qza_rare = rarefy(qza, 20000).rarefied_table
bt_rare = qza_rare.view(Table)

# Distance matrices keyed by metric name
dms = {
    "jaccard": beta(table=qza_rare, metric="jaccard").distance_matrix.view(DistanceMatrix),
}

# weighted first, then unweighted UniFrac, both on the rarefied table and the rooted tree
for unifrac_metric in ("weighted_unifrac", "unweighted_unifrac"):
    unifrac_result = beta_phylogenetic(table=qza_rare, phylogeny=tree_q2, metric=unifrac_metric)
    dms[unifrac_metric] = unifrac_result.distance_matrix.view(DistanceMatrix)

# DEICODE RPCA is run on the unrarefied table `qza`, not on qza_rare
bplt, dm = rpca(
    table=qza,
    n_components=3,
    min_sample_count=20000,
    min_feature_count=10,
)
dms["deicode"] = dm.view(DistanceMatrix)


Use a regular DataFrame whose columns are SparseArrays instead.

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.



In [68]:
# Annotations that both members of a pair must share to be kept
md_sample_type3 = md.loc[:, ["sample_type_3", "extraction_protocol_short"]]
# Extra descriptive columns, looked up for sample1 only
md_sample1_info = md.loc[:, ["sample_type", "sample_type_2", "biomass_sample", "biomass_sample_long"]]
# Same type, same extraction protocol, and not a sample paired with itself
pair_filter = 'sample1_type==sample2_type & sample1_extraction_protocol_short==sample2_extraction_protocol_short & sample1!=sample2'

out_dfs = {}
for metric, dm in dms.items():
    # Square distance matrix -> long table, one row per (sample1, sample2) cell
    df = dm.to_data_frame().reset_index().melt(id_vars="index")
    df.columns = ["sample1", "sample2", "value"]

    # Attach type / protocol of the first sample ...
    df = df.merge(md_sample_type3, right_index=True, left_on="sample1").rename(
        columns={"sample_type_3": "sample1_type",
                 "extraction_protocol_short": "sample1_extraction_protocol_short"})
    # ... and of the second sample
    df = df.merge(md_sample_type3, right_index=True, left_on="sample2").rename(
        columns={"sample_type_3": "sample2_type",
                 "extraction_protocol_short": "sample2_extraction_protocol_short"})
    df = df.merge(md_sample1_info, right_index=True, left_on="sample1")

    out_dfs[metric] = df.query(pair_filter)
    # Unused summary idea kept for reference:
    #.groupby(["sample1_bead_beating","sample1_type"]).agg({"value":["mean",scipy.stats.sem]})
    

## Export data frames - low biomass

In [69]:
# Export within-type, within-protocol Jaccard distances for the low-biomass shotgun data.
metric = "jaccard"
df = out_dfs[metric]
# No sample_type_2 exclusion is applied in this cell; the "bare soil" filter is disabled:
#df = df.query('sample_type_2 not in ["bare soil"]')
# The melted matrix holds every pair twice, as (A, B) and (B, A). Keep the first
# row of each unordered pair. De-duplicating on "value" (as before) also threw
# away unrelated pairs that merely happened to have the same distance.
pair_key = pd.DataFrame(np.sort(df[["sample1", "sample2"]].to_numpy(), axis=1))
df = df[~pair_key.duplicated().to_numpy()]
df.to_csv("01_shotgun/technical_replicate_distances_shotgun_low_biomass_jaccard.tsv", sep = '\t', index = False)


In [70]:
# Export within-type, within-protocol DEICODE (RPCA) distances for the low-biomass shotgun data.
metric = "deicode"
df = out_dfs[metric]
# No sample_type_2 exclusion is applied in this cell; the "bare soil" filter is disabled:
#df = df.query('sample_type_2 not in ["bare soil"]')
# The melted matrix holds every pair twice, as (A, B) and (B, A). Keep the first
# row of each unordered pair. De-duplicating on "value" (as before) also threw
# away unrelated pairs that merely happened to have the same distance.
pair_key = pd.DataFrame(np.sort(df[["sample1", "sample2"]].to_numpy(), axis=1))
df = df[~pair_key.duplicated().to_numpy()]
df.to_csv("01_shotgun/technical_replicate_distances_shotgun_low_biomass_rpca.tsv", sep = '\t', index = False)


In [71]:
# Export within-type, within-protocol unweighted UniFrac distances for the low-biomass shotgun data.
metric = "unweighted_unifrac"
df = out_dfs[metric]
# No sample_type_2 exclusion is applied in this cell; the "bare soil" filter is disabled:
#df = df.query('sample_type_2 not in ["bare soil"]')
# The melted matrix holds every pair twice, as (A, B) and (B, A). Keep the first
# row of each unordered pair. De-duplicating on "value" (as before) also threw
# away unrelated pairs that merely happened to have the same distance.
pair_key = pd.DataFrame(np.sort(df[["sample1", "sample2"]].to_numpy(), axis=1))
df = df[~pair_key.duplicated().to_numpy()]
df.to_csv("01_shotgun/technical_replicate_distances_shotgun_low_biomass_unweighted_unifrac.tsv", sep = '\t', index = False)


In [72]:
# Export within-type, within-protocol weighted UniFrac distances for the low-biomass shotgun data.
metric = "weighted_unifrac"
df = out_dfs[metric]
# No sample_type_2 exclusion is applied in this cell; the "bare soil" filter is disabled:
#df = df.query('sample_type_2 not in ["bare soil"]')
# The melted matrix holds every pair twice, as (A, B) and (B, A). Keep the first
# row of each unordered pair. De-duplicating on "value" (as before) also threw
# away unrelated pairs that merely happened to have the same distance.
pair_key = pd.DataFrame(np.sort(df[["sample1", "sample2"]].to_numpy(), axis=1))
df = df[~pair_key.duplicated().to_numpy()]
df.to_csv("01_shotgun/technical_replicate_distances_shotgun_low_biomass_weighted_unifrac.tsv", sep = '\t', index = False)
