In [None]:
import numpy as np
import pandas as pd
import os
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna

In [None]:
# Input and output directories, relative to the notebook's working directory.
data_path = "../otu_data/dada2_outputs"
out_path = "../otu_data/tree_data"
taxa_file, abund_file = "taxa_table.tsv", "abundance_table.tsv"

# Resolve both table paths under data_path, then load each tab-separated file.
tax_f, abund_f = (os.path.join(data_path, fname) for fname in (taxa_file, abund_file))
taxa_df, abund_df = (pd.read_csv(path, sep="\t") for path in (tax_f, abund_f))

In [None]:
print("Renaming rows from the entire sequences to OTU# format")
print("\tStoring sequences in dictionary, accesible by OTU name")
# The first column of the taxonomy table holds the full sequence of each row.
seq_col = taxa_df.columns[0]
# sequence -> row label, row label -> "OTU<n>" (1-based), and "OTU<n>" -> sequence
OTU_Seqs = dict(zip(taxa_df[seq_col], taxa_df.index))
OTU_Names = {idx: "OTU{}".format(idx + 1) for idx in taxa_df.index}
OTU_name2seq = {OTU_Names[row_label]: sequence for sequence, row_label in OTU_Seqs.items()}
# Swap each sequence for its OTU name and use that column as the index.
taxa_df.loc[:, seq_col] = taxa_df[seq_col].map(lambda sequence: OTU_Names[OTU_Seqs[sequence]])
taxa_df = taxa_df.set_index(seq_col)
# The abundance table's first column becomes 'Samples'; every other column
# header is a sequence that is translated to its OTU name.
new_cols = ['Samples']
new_cols.extend(OTU_Names[OTU_Seqs[sequence]] for sequence in abund_df.columns[1:])
abund_df.columns = new_cols

In [None]:
# Index the abundance table by sample name.
abund_df2 = abund_df.set_index('Samples')
# Keep only the rows (samples) whose counts do not sum to zero.
abund_df3 = abund_df2.loc[abund_df2.sum(axis=1).ne(0)]

In [None]:
print(abund_df2.shape, abund_df3.shape)


def _read_nonblank_lines(path):
    """Return the newline-separated entries of the text file at `path`, skipping empty ones."""
    with open(path) as handle:
        return [entry for entry in handle.read().split("\n") if entry != ""]


# Both lists live in the 'cov_model_data' folder under out_path.
cov_model_dir = os.path.join(out_path, 'cov_model_data')
poor_aligners = _read_nonblank_lines(os.path.join(cov_model_dir, 'poor_aligners.txt'))
rev_strand_algn = _read_nonblank_lines(os.path.join(cov_model_dir, 'reverse_strand_aligners.txt'))


In [None]:
pa_taxa = set(poor_aligners)
print(len(pa_taxa))
# OTUs whose stored sequence is shorter than 240 or longer than 260 characters.
length_filter = {name for name, seq in OTU_name2seq.items() if not 240 <= len(seq) <= 260}
print(len(length_filter))
# OTUs whose Kingdom is Archaea or Eukaryota.
arch_euk = set(taxa_df.index[taxa_df.Kingdom.isin(['Archaea', 'Eukaryota'])])
print(len(arch_euk))
# Overlap of the archaeal/eukaryotic set with each of the other two filters.
for other_filter in (pa_taxa, length_filter):
    print(len(arch_euk & other_filter))
# Archaea/Eukaryota that are in neither the length filter nor the poor-aligner set.
to_remove = arch_euk - (length_filter | pa_taxa)
print(len(to_remove))

# Spot-check where three specific OTUs fall among the sets built above.
spot_check = {'OTU47634', 'OTU19288', 'OTU18363'}
for candidate_set in (set(abund_df3.columns), arch_euk, pa_taxa, length_filter, to_remove):
    print(spot_check.intersection(candidate_set))

In [None]:
# Load the table stored in hq_asv_table.feather and index it by sample.
abund_feather = '../otu_data/tree_data/hq_asv_table.feather'
abund_dfx = pd.read_feather(abund_feather, use_threads=True)
abund_dfx = abund_dfx.set_index('Samples')

In [None]:
# Columns present in abund_df3 but absent from the feather table ...
been_removed = set(abund_df3.columns).difference(abund_dfx.columns)
# ... and, of those, the ones that are not in to_remove.
still_removed = been_removed.difference(to_remove)
print(len(been_removed), len(still_removed))

In [None]:
# Select the still_removed columns and move the sample index back into a column.
in_still_removed = abund_df3.columns.isin(still_removed)
non_bacterial_abunds = abund_df3.loc[:, in_still_removed].reset_index()
print(non_bacterial_abunds.shape)
print(non_bacterial_abunds.iloc[:3, :3])

In [None]:
# Write the freshly built abundance table to
# '../otu_data/tree_data/non_bact_table.feather'.
abund_feather = os.path.join(out_path, 'non_bact_table.feather')
non_bacterial_abunds.to_feather(path=abund_feather)

In [None]:
# mix of bacterial and some archaeal abundances
bacterial_abunds = pd.read_csv("../otu_data/abundances_full.txt", sep="\t", index_col=0)
# everything that was prematurely dropped, restricted to (and ordered like)
# the samples of bacterial_abunds
nba_1_df = non_bacterial_abunds.set_index('Samples').loc[bacterial_abunds.index, :]
# relative abundance within each sample
nba_1_df_ra = nba_1_df.div(nba_1_df.sum(1), axis=0)
# flag features that are never above zero in these samples
low_abund = set(nba_1_df.columns[(nba_1_df_ra > 0).sum() == 0])
# presence/absence (1/0) table
presence_absence = (nba_1_df > 0).astype(int)
# only keep things that are present in more than 2 samples.
# The comparison used to be `< 0`, which can never be true for a sum of 0/1
# values, so this filter silently flagged nothing.
# NOTE(review): threshold taken from the comment above; confirm 3 is intended.
min_samples_present = 3
too_rare = presence_absence.sum() < min_samples_present
low_abund = low_abund.union(set(nba_1_df.columns[too_rare]))
print(len(low_abund))


In [None]:
rare_fn = "../otu_data/final_rarefied_table.tsv"
meta_fn = "../otu_data/final_metadata.tsv"
# DateMMDDYY is read through str() so it stays text instead of being parsed as a number.
meta_df_x = pd.read_csv(meta_fn, sep="\t", index_col=0, converters={'DateMMDDYY': str})
rare_abund = pd.read_csv(rare_fn, sep="\t", index_col=0)
env_data = pd.read_csv("../otu_data/environmental_and_mapping_data.txt", sep="\t", index_col=0)

# Samples (row labels, despite the variable name) that also appear in
# bacterial_abunds and whose DepthName is not 'LAB'.  The lookup set is built
# once here instead of once per sample.
bacterial_samples = set(bacterial_abunds.index)
matched_cols = [i for i in meta_df_x.index
                if i in bacterial_samples and meta_df_x.loc[i, 'DepthName'] != 'LAB']
meta_df_x = meta_df_x.loc[matched_cols, :]
rare_abund = rare_abund.loc[matched_cols, :]
env_data = env_data.loc[matched_cols, :]

# Depth defaults to the integer DepthName and is overridden by the measured
# 'Actual Depth (m)' wherever that value is present.
# NOTE(review): writing non-integer depths into the int column relies on
# pandas upcasting it; newer pandas versions warn about this - confirm.
meta_df_x['Actual Depth (m)'] = pd.Series({i:env_data.loc[i, 'Actual Depth (m)'] for i in meta_df_x.index})
meta_df_x['Depth'] = meta_df_x['DepthName'].astype(int)
has_actual_depth = meta_df_x['Actual Depth (m)'].notnull()
meta_df_x.loc[has_actual_depth, 'Depth'] = meta_df_x.loc[has_actual_depth, 'Actual Depth (m)']

# Side-by-side join of the recovered features and the existing abundance table.
full_df = pd.concat([nba_1_df.loc[matched_cols, :],
                     bacterial_abunds.loc[matched_cols, :]], axis=1, sort=True)
print(full_df.shape)

needed_cols = ['CollectionAgency', 'StationName', 'Month', 'Year', 'DateMMDDYY', 'enspie', 'faith_pd',
               'Depth', 'Latitude', 'observed_otus']
# Metadata columns named "SB..." grouped by their suffix.
sb_cols = [i for i in meta_df_x.columns if i.startswith("SB")]
weighted_uf_cols = [i for i in sb_cols if i.endswith("_wu")]
weighted_bc_cols = [i for i in sb_cols if i.endswith("_bc")]
weighted_clr_cols = [i for i in sb_cols if i.endswith("_clr")]


#to_csv("../masters_students/", sep='\t', index_label='Sample')

In [None]:
clean_singles_f = "../otu_data/WaterQualityData/matched_cleaned_data/all_mdata_colset_2.tsv"
cs_df = pd.read_csv(clean_singles_f, sep="\t", index_col=0).drop("depth_float", axis=1)

transect_data_f = "../otu_data/WaterQualityData/matched_cleaned_data/transect_mdata_colset_1.tsv"
td_df = pd.read_csv(transect_data_f, sep="\t", index_col=0)

# Copy over every transect column that cs_df lacks; samples that are absent
# from td_df are left as missing values.  The membership sets are built once
# rather than once per element.
existing_cols = set(cs_df.columns)
transect_samples = set(td_df.index)
for new_col in [i for i in td_df.columns if i not in existing_cols]:
    cs_df[new_col] = pd.Series({i:td_df.loc[i, new_col] for i in cs_df.index if i in transect_samples})

# Same idea for three columns taken from the sample metadata table.
meta_samples = set(meta_df_x.index)
for new_col in ['Depth', 'DateMMDDYY', 'observed_otus']:
    cs_df[new_col] = pd.Series({i:meta_df_x.loc[i, new_col] for i in cs_df.index if i in meta_samples})

print(cs_df.shape, cs_df.columns)

alpha_diversity = ['enspie', 'observed_otus', 'faith_pd']
# Per-sample distance columns in meta_df_x are named "<sample>_wu" / "<sample>_bc".
beta_div_uf = [i+"_wu" for i in list(cs_df.index)]
beta_div_bc = [i+"_bc" for i in list(cs_df.index)]
env_data_cols = ['Month', 'Year', 'DateMMDDYY', 'CollectionAgency', 'StationName', 'Latitude', 'Depth', 
                 'Depth_Percentage', 'WTEMP', 'DO', 'CHLA', 'SALINITY','NO2F', 'NH4F', 'PC', 'PHEO', 'NO3F', 
                 'Discharge_Susquehanna_14', 'day_length']

cs_df[alpha_diversity].to_csv("/Users/login/Google Drive/SiYi_Xiaotong_Materials/alpha_diversity.txt", sep='\t', index_label='Sample')
# The two beta-diversity files used to be swapped: the "_wu" columns were
# written to the Bray-Curtis file and the "_bc" columns to the UniFrac file.
meta_df_x.loc[list(cs_df.index), beta_div_bc].to_csv("/Users/login/Google Drive/SiYi_Xiaotong_Materials/bray_curtis_betadiversity.txt", sep='\t', index_label='Sample')
meta_df_x.loc[list(cs_df.index), beta_div_uf].to_csv("/Users/login/Google Drive/SiYi_Xiaotong_Materials/weighted_unifrac_betadiversity.txt", sep='\t', index_label='Sample')
cs_df[env_data_cols].to_csv("/Users/login/Google Drive/SiYi_Xiaotong_Materials/env_metadata.txt", sep='\t', index_label='Sample')


In [None]:
# Add a "Genus Species" label, then work on an all-string copy without 'Species'.
taxa_df['Species2'] = taxa_df['Genus'] + " " + taxa_df['Species']
taxas_1 = taxa_df.drop(columns='Species').astype(str)

# Rows whose Order is Chloroplast: Kingdom becomes Eukaryota and the
# Phylum/Class/Family ranks are overwritten with 'Chloroplast'.
is_chloroplast = taxas_1['Order'] == 'Chloroplast'
print(is_chloroplast.sum())
taxas_1.loc[is_chloroplast, 'Kingdom'] = 'Eukaryota'
for rank in ('Phylum', 'Class', 'Family'):
    taxas_1.loc[is_chloroplast, rank] = 'Chloroplast'

# Rows whose Family is Mitochondria: Kingdom becomes Eukaryota and the
# Phylum/Class/Order ranks are overwritten with 'Mitochondria'.
is_mitochondria = taxas_1['Family'] == 'Mitochondria'
print(is_mitochondria.sum())
taxas_1.loc[is_mitochondria, 'Kingdom'] = 'Eukaryota'
for rank in ('Phylum', 'Class', 'Order'):
    taxas_1.loc[is_mitochondria, rank] = 'Mitochondria'


In [None]:
subset_abundances = full_df.loc[cs_df.index, :]
order_labels = taxas_1['Order'].astype(str)
order_df = pd.DataFrame(index=subset_abundances.index,
                        columns=list(order_labels.unique()))
abundance_otus = set(subset_abundances.columns)

# Sum the counts of every OTU assigned to each Order; Orders with no counts
# in these samples are collected and dropped afterwards.
empty_orders = []
for ord_ in sorted(order_df.columns):
    print(ord_)
    otus_in_abund_df = set(order_labels.index[order_labels == ord_])
    print(len(otus_in_abund_df))
    otus_in_abund_df = otus_in_abund_df.intersection(abundance_otus)
    print(len(otus_in_abund_df))
    # pandas >= 2.0 raises TypeError for set indexers, so select with a list.
    aggd_data = subset_abundances[sorted(otus_in_abund_df)].sum(1)
    if aggd_data.sum() > 0:
        order_df[ord_] = aggd_data
    else:
        empty_orders.append(ord_)
order_df = order_df.drop(columns=empty_orders)

# Scale each sample so that its Order totals sum to one million.
order_df_ra = order_df.div(order_df.sum(1), axis=0)*1e6
order_df_ra.to_csv("/Users/login/Google Drive/SiYi_Xiaotong_Materials/taxa_order_counts.txt",
                   sep='\t', index_label='Sample')


In [None]:
def taxa_breakdown(abunds_, taxas_, level_, weighted=True, flatten_val=0.0):
    """Collapse a samples-by-features table to per-sample taxon fractions.

    Parameters
    ----------
    abunds_ : pandas.DataFrame
        Rows are samples, columns are feature names found in ``taxas_.index``.
    taxas_ : pandas.DataFrame
        Taxonomy table indexed by feature name.
    level_ : str
        Column of ``taxas_`` to group by
        ('Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species').
    weighted : bool, default True
        If False, counts are reduced to presence/absence (1/0) before grouping.
    flatten_val : float, default 0.0
        If truthy, fractions below this value are set to 0.0.

    Returns
    -------
    pandas.DataFrame
        Rows are samples, columns are taxa (columns named 'taxa_name'); each
        row holds that sample's fractions, before any flattening.

    Raises
    ------
    KeyError
        If a retained feature is missing from ``taxas_.index``.
    """
    # remove features that never occur, then put features on the rows
    flip_abunds = abunds_.loc[:, abunds_.sum(0) > 0].T
    # create presence or absence table if need be
    if not weighted:
        flip_abunds = (flip_abunds > 0).astype(int)
    # Look up every feature's taxon in one indexed read and group by that
    # external key, so no helper column can collide with a sample name.
    taxa_labels = pd.Series(taxas_.loc[flip_abunds.index, level_].values,
                            index=flip_abunds.index, name='taxa_name')
    ttable_raw = flip_abunds.groupby(taxa_labels).sum()
    ttable = ttable_raw.div(ttable_raw.sum(0))
    if flatten_val:
        # mask() returns a new frame; NaN entries compare False and are kept
        ttable = ttable.mask(ttable < flatten_val, 0.0)
    return ttable.T

In [None]:
abunds_1 = subset_abundances.copy()


def relativize(v):
    """Return the column sums of `v` as fractions of the grand total, largest first."""
    return (v.sum() / v.sum().sum()).sort_values(ascending=False)


# One pass per mode: (weighted?, per-level flatten thresholds, destination dict).
flattened_tables_pa, flattened_tables = {}, {}
collapse_runs = (
    (False, [0, 0.03, 0.03, 0.03, 0.03, 0.03, 0.03], flattened_tables_pa),
    (True, [0, 0.05, 0.05, 0.1, 0.08, 0.08, 0.01], flattened_tables),
)
for use_weights, cutoffs, collected in collapse_runs:
    # level_1 is intentionally a module-level loop variable; later cells read it.
    for level_1, fv in zip(taxas_1.columns, cutoffs):
        ttable_1 = taxa_breakdown(abunds_1, taxas_1, level_1, weighted=use_weights, flatten_val=fv)
        # drop taxa that are zero everywhere, then order columns by their maximum
        nonzero_taxa = ttable_1.columns[ttable_1.sum() > 0]
        ttable_1 = ttable_1.loc[:, nonzero_taxa]
        col_order = ttable_1.max().sort_values(ascending=False).index
        ttable_1 = ttable_1.loc[:, col_order]
        print("The collapsed {} taxa table is {}".format(level_1, ttable_1.shape))
        collected[level_1] = relativize(ttable_1.copy())

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.gridspec as gridspec

def plot_taxa(ttable_mix, key_order, height_rat, bar_width_mult, fignamex=None):
    """Draw one stacked bar per key with a legend panel underneath each bar.

    Parameters
    ----------
    ttable_mix : mapping
        Maps each key of `key_order` to an object whose `.index` lists the
        segment names and which returns a segment height when indexed by name.
        NOTE(review): presumably the Series produced by `relativize` - confirm.
    key_order : sequence
        Keys to plot, left to right; each gets its own grid column.
    height_rat : sequence
        Height ratios of the bar row and the legend row.
    bar_width_mult : float
        Fraction of a column's slot width used for the bar.
    fignamex : str, optional
        When truthy, the figure is also saved to this path at 140 dpi.
    """
    # start from a clean pyplot state
    plt.clf()
    plt.close()
    fig_width = 12
    n_panels = len(key_order)
    fig_t = plt.figure(figsize=(fig_width, 10), dpi=140)
    gs = gridspec.GridSpec(2, n_panels, figure=fig_t, height_ratios=height_rat,
                           hspace=.05, wspace=0.05,
                           bottom=0.075, top=0.925, right=0.925, left=0.075)

    # each panel owns an equal slot of the figure width; the bar fills part of it
    slot_width = fig_width / n_panels
    adjusted_width = slot_width * bar_width_mult
    # left anchor of each bar
    bar_locs = np.arange(n_panels) * slot_width
    # every xkcd colour whose name does not contain 'white'
    possible_colors = [hex_code for color_name, hex_code in sns.xkcd_rgb.items()
                       if 'white' not in color_name]

    for panel_idx, panel_key in enumerate(key_order):
        ttable = ttable_mix[panel_key]
        # re-seeded for every panel, so each panel draws the same colour sequence
        np.random.seed(2)
        colors_needed = np.random.choice(possible_colors, size=ttable.shape)
        print("{} colors grabbed".format(len(colors_needed)))

        ax_i = plt.subplot(gs[0, panel_idx])
        segment_names = list(ttable.index)
        for seg_idx, seg_name in enumerate(segment_names):
            seg_height = np.array([ttable[seg_name]])
            # the first segment starts the stack at zero
            if seg_idx == 0:
                running_base = seg_height * 0.0
            ax_i.bar([bar_locs[panel_idx]], seg_height, bottom=running_base,
                     color=colors_needed[seg_idx], edgecolor='white',
                     width=adjusted_width)
            for tick in ax_i.get_xticklabels():
                tick.set_rotation(45)
            # the next segment sits on top of this one
            running_base = running_base + seg_height
        ax_i.axis('off')

        # legend-only axis below the bar
        ax2 = plt.subplot(gs[1, panel_idx])
        patches = [mpatches.Patch(color=seg_color, label=seg_name)
                   for seg_name, seg_color in zip(segment_names, colors_needed)]
        ax2.legend(patches, segment_names, loc='best',
                   bbox_to_anchor=(0., 0., 1., 1.),
                   mode='expand', fontsize='x-small', ncol=1)
        ax2.axis('off')

    # Show graphic
    plt.show()
    if fignamex:
        fig_t.savefig(fignamex, dpi=140)
    return None

# The file name has no placeholder, so the former `.format(level_1)` call was
# a no-op that only made this cell depend on a loop variable from another cell.
fignamet = "AllTaxonomy_AllSamples_unweighted.png"
figpatht = os.path.join("../otu_data/pca_plots", fignamet)
plot_taxa(flattened_tables_pa, ["Kingdom", "Phylum", "Class", "Order", "Family"],
          [7,4], .9, fignamex=figpatht)


In [None]:
# The file name has no placeholder, so the former `.format(level_1)` call was
# a no-op that only made this cell depend on a loop variable from another cell.
fignamet = "AllTaxonomy_AllSamples_weighted.png"
figpatht = os.path.join("../otu_data/pca_plots", fignamet)
plot_taxa(flattened_tables, ["Kingdom", "Phylum", "Class", "Order", "Family"],
          [7,4], .9, fignamex=figpatht)

In [None]:
# One two-line record per abundance-table column: ">name" followed by the
# sequence stored for that name.
str_to_write = "".join(
    ">" + otu_name + "\n" + OTU_name2seq[otu_name] + "\n"
    for otu_name in subset_abundances.columns
)

with open("/Volumes/KeithSSD/CB_V4/otu_data/FAPROTAX_out/otu_seqs_for_rdp.fasta", 'w') as fh:
    _ = fh.write(str_to_write)


In [None]:
from biom.table import Table
from biom.util import biom_open

# Taxon names that are rewritten (underscore form -> space form) before export.
name_checks = {'SAR11_clade': 'SAR11 clade',
               'SAR86_clade': 'SAR86 clade'}


def name_corrector(x):
    """Return the replacement for `x` from `name_checks`, or `x` itself if it has none."""
    return name_checks.get(x, x)


sample_ids = list(subset_abundances.index)

_outname_ = '_silva'
# Build the lookup once; testing against list(taxa_df.index) inside the loop
# rebuilt and scanned the whole list for every column.
silva_classified = set(taxa_df.index)
observ_ids, observ_metadata = [], []
for i in list(subset_abundances.columns):
    if i.startswith("OTU") and i in silva_classified:
        observ_ids.append(i)
        # taxonomy = the non-missing entries of this OTU's row, in column order
        observ_metadata.append({'taxonomy': [name_corrector(j) for j in taxa_df.loc[i, :].dropna().values]})

# biom expects observations on the rows, hence the transpose.
_data_ = subset_abundances.loc[sample_ids, observ_ids].values.T
print(_outname_, _data_.shape, subset_abundances.shape)
table = Table(_data_, observ_ids, sample_ids, observ_metadata, None)
with biom_open('../otu_data/FAPROTAX_out/otu_taxa{}.biom'.format(_outname_), 'w') as f:  
    table.to_hdf5(f, "faith and trust")

In [None]:
# Semicolon-separated file without a header row: the first field becomes the
# index and the field labelled 1 is discarded.
rdp_taxa = pd.read_csv("/Volumes/KeithSSD/CB_V4/otu_data/FAPROTAX_out/fixrank_rdp_classifiedx.txt",
                       sep=";", index_col=0, header=None)
rdp_taxa = rdp_taxa.drop(1, axis=1)
rdp_taxa.index.name = ''
# Columns alternate between a rank name ('K', 'P', ...) and its '<rank>%' partner.
rdp_name_cols = ['K', 'P', 'C', 'O', 'F', 'G']
rdp_cols = [label for rank in rdp_name_cols for label in (rank, rank + '%')]
rdp_taxa.columns = rdp_cols
print(rdp_taxa.shape)

import numpy as np

# For every rank: blank the name where the paired '<rank>%' value is below 80,
# then discard the '%' column.
# NOTE(review): float(x[:-1]) assumes each value is a number followed by one
# trailing character (presumably '%') - confirm against the input file.
for i in rdp_name_cols:
    level_pct = rdp_taxa[i+'%'].apply(lambda x: float(x[:-1]) < 80.)
    rdp_taxa.loc[level_pct, i] = np.nan
    rdp_taxa = rdp_taxa.drop(columns=i+'%')
    print("{}: {}".format(i, rdp_taxa[i].notnull().sum()))

_outname_ = '_rdp'
# Build the lookup once; testing against list(rdp_taxa.index) inside the loop
# rebuilt and scanned the whole list for every column.
rdp_classified = set(rdp_taxa.index)
observ_ids, observ_metadata = [], []
for i in list(subset_abundances.columns):
    if i.startswith("OTU") and i in rdp_classified:
        observ_ids.append(i)
        # taxonomy = the non-missing entries of this OTU's row, in column order
        observ_metadata.append({'taxonomy': [name_corrector(j) for j in rdp_taxa.loc[i, :].dropna().values]})
        
# biom expects observations on the rows, hence the transpose.
_data_ = subset_abundances.loc[sample_ids, observ_ids].values.T
print(_outname_, _data_.shape, subset_abundances.shape)
table = Table(_data_, observ_ids, sample_ids, observ_metadata, None)
with biom_open('../otu_data/FAPROTAX_out/otu_taxa{}.biom'.format(_outname_), 'w') as f:  
    table.to_hdf5(f, "faith and trust")

In [None]:
# Split every line on ';' and strip each field; rows may differ in length.
with open("/Volumes/KeithSSD/CB_V4/otu_data/FAPROTAX_out/fixrank_gtdb_classifiedx.txt", 'r') as fh:
    gtdb_rows = [[field.strip() for field in line.split(";")] for line in fh]
largest_row = max((len(row) for row in gtdb_rows), default=0)

print(len(gtdb_rows), largest_row)

# Pad short rows with empty strings so that the table is rectangular.
gtdb_fix_width = [row + [""] * (largest_row - len(row)) for row in gtdb_rows]
gtdb_taxa = pd.DataFrame(gtdb_fix_width).set_index(0)


def fix_conf(x):
    """Return the integer ending a "(...)" field (last two characters removed), else 0.

    NOTE(review): presumably the fields look like "(name, 95%)" - confirm.
    """
    if str(x).startswith("("):
        return int(x.split(", ")[-1][:-2])
    return 0


gtdb_taxa = gtdb_taxa.drop(columns=[1, 2])

# Columns 4, 6, ..., 14 are parsed with fix_conf; where the result is below 60
# the column just before it is blanked, and the parsed column is then dropped.
for conf_col in range(4, 16, 2):
    print(conf_col)
    confidence = gtdb_taxa[conf_col].apply(fix_conf)
    gtdb_taxa.loc[confidence < 60, conf_col - 1] = np.nan
    gtdb_taxa = gtdb_taxa.drop(columns=conf_col)

_outname_ = '_gtdb'
# Build the lookup once; testing against list(gtdb_taxa.index) inside the loop
# rebuilt and scanned the whole list for every column.
gtdb_classified = set(gtdb_taxa.index)
observ_ids, observ_metadata = [], []
for i in list(subset_abundances.columns):
    if i.startswith("OTU") and i in gtdb_classified:
        observ_ids.append(i)
        # taxonomy = the non-missing entries of this OTU's row, in column order
        # NOTE(review): dropna() keeps empty-string padding values - confirm
        # that these are acceptable in the taxonomy list.
        observ_metadata.append({'taxonomy': [name_corrector(j) for j in gtdb_taxa.loc[i, :].dropna().values]})

# biom expects observations on the rows, hence the transpose.
_data_ = subset_abundances.loc[sample_ids, observ_ids].values.T
print(_outname_, _data_.shape, subset_abundances.shape)
table = Table(_data_, observ_ids, sample_ids, observ_metadata, None)
with biom_open('../otu_data/FAPROTAX_out/otu_taxa{}.biom'.format(_outname_), 'w') as f:  
    table.to_hdf5(f, "faith and trust")

In [None]:
silva_fxn_df = pd.read_csv("/Volumes/KeithSSD/CB_V4/otu_data/FAPROTAX_out/report_silva.txt", sep="\t",
                 index_col=0, comment='#')
rdp_fxn_df = pd.read_csv("/Volumes/KeithSSD/CB_V4/otu_data/FAPROTAX_out/report_rdp.txt", sep="\t",
                 index_col=0, comment='#')
gtdb_fxn_df = pd.read_csv("/Volumes/KeithSSD/CB_V4/otu_data/FAPROTAX_out/report_gtdb.txt", sep="\t",
                 index_col=0, comment='#')

print(gtdb_fxn_df.shape, rdp_fxn_df.shape, silva_fxn_df.shape)
print(gtdb_fxn_df.sum().sum(), rdp_fxn_df.sum().sum(), silva_fxn_df.sum().sum())

# Put the three reports on a common row/column grid, with 0 for anything a
# report does not contain.  Adding unaligned frames with `+` yields NaN for
# every row missing from any one report, which then compares as "not > 0" and
# silently erases assignments made by the other reports.
all_rows = gtdb_fxn_df.index.union(rdp_fxn_df.index).union(silva_fxn_df.index)
all_fxns = gtdb_fxn_df.columns.union(rdp_fxn_df.columns).union(silva_fxn_df.columns)
gtdb_al, rdp_al, silva_al = (
    report.reindex(index=all_rows, columns=all_fxns, fill_value=0)
    for report in (gtdb_fxn_df, rdp_fxn_df, silva_fxn_df)
)

# Diagnostic: how often each gtdb/rdp/silva combination occurs (e.g. "101").
fxn_df = gtdb_al.astype(str) + rdp_al.astype(str) + silva_al.astype(str)
print(np.unique(fxn_df.values, return_counts=True))

# An entry is 1 when at least one report has a value above zero, with rows
# ordered like the abundance table's columns.
fxn_df = ((gtdb_al + rdp_al + silva_al) > 0).astype(int).loc[subset_abundances.columns, :]
# (samples x OTUs) . (OTUs x functions) -> per-sample totals for each function
fapro_raw = np.dot(subset_abundances.values, fxn_df.values) 


print(fapro_raw.shape)


In [None]:
# Label the raw matrix: rows follow the abundance table's samples, columns
# follow the function table's columns.
fapro_df = pd.DataFrame(data=fapro_raw,
                        index=subset_abundances.index,
                        columns=fxn_df.columns)

sample_totals = fapro_df.sum(axis=1)
print(sample_totals.shape)
# Divide each row by its own total and scale by one million.
fapro_ra_df = fapro_df.div(sample_totals, axis=0) * 1e6

fapro_ra_df.to_csv("/Users/login/Google Drive/SiYi_Xiaotong_Materials/FAPROTAX_counts.txt",
                   sep='\t', index_label='Sample')


