In [2]:
import numpy as np; import pandas as pd; import os

In [6]:
# Tab-separated table of environmental variables.
env_file = '../data/environmental_raw_data/Transformed_WQVars_Hab_and_aDiv.txt'
env_df = pd.read_table(env_file)

# Tab-separated OTU abundance table; its first column becomes the row index.
# NOTE(review): env_df is read without index_col, so env_df.index is positional;
# the .loc below only succeeds if the abundance table's row labels match those
# values -- confirm against the input files.
abund_file = '../data/otu_tables/final_unrarefied_table.2.txt'
abund_df = pd.read_table(abund_file, index_col=0).loc[env_df.index, :]



(236, 35) (236, 27746) (27746, 7)
(236, 27746) 0


In [9]:
taxa_file = '../data/otu_tables/taxa_table_with_OTUs.txt'
# Keep the taxonomy rows in the same order as the abundance table's columns.
taxa_df = pd.read_csv(taxa_file, sep="\t", index_col=0).loc[abund_df.columns, :]

print(env_df.shape, abund_df.shape, taxa_df.shape)
# Drop rows of the abundance table whose counts sum to zero.
nonempty_rows = abund_df.sum(1) != 0
abund_df = abund_df[nonempty_rows]
n_non_otu_cols = sum(1 for col in abund_df.columns if not col.startswith("OTU"))
print(abund_df.shape, n_non_otu_cols)

# Correct the hierarchy so that chloroplasts are kicked out of Cyanobacteria and
# mitochondria out of Rickettsiales.
# taxas_1 is a string snapshot taken BEFORE the relabelling: the row masks are
# evaluated on it, and it is not itself relabelled by this cell.
taxas_1 = taxa_df.copy().astype(str)

# (level that identifies the organelle, organelle name, levels overwritten with that name)
organelle_fixes = [
    ('Order', 'Chloroplast', ['Phylum', 'Class', 'Family']),
    ('Family', 'Mitochondria', ['Phylum', 'Class', 'Order']),
]
for marker_level, organelle, relabel_levels in organelle_fixes:
    is_organelle = taxas_1[marker_level] == organelle
    # Report how many OTUs are affected and show one example row before the change.
    print(is_organelle.sum())
    print(taxa_df.loc[is_organelle, :].sample(1).values)
    taxa_df.loc[is_organelle, 'Kingdom'] = 'Eukaryota'
    for relabel_level in relabel_levels:
        taxa_df.loc[is_organelle, relabel_level] = organelle


(236, 35) (236, 27746) (27746, 7)
(236, 27746) 0
737
[['Bacteria' 'Cyanobacteria' 'Oxyphotobacteria' 'Chloroplast' nan nan nan]]
336
[['Bacteria' 'Proteobacteria' 'Alphaproteobacteria' 'Rickettsiales'
  'Mitochondria' nan nan]]


In [10]:
# This cell checks two things while building "<upper> <lower>" combined names:
# 1. that there are no gaps in the hierarchy, i.e. no lower level defined where the
#    upper one isn't (the "low only" count printed per level): this holds.
# 2. that names within a level are unique on their own (the two unique-value counts
#    printed per level): this does not hold, so combined names made of a level and
#    the level above it are stored in new '<level>2' columns.

tx_lvls = ['Species', 'Genus', 'Family', 'Order', 'Class']
# Walk adjacent (lower, upper) pairs: Species/Genus, Genus/Family, ...
for tx_lvl_lw, tx_lvl_up in zip(tx_lvls, tx_lvls[1:]):
    combined_col = tx_lvl_lw + '2'
    lower_known = taxa_df[tx_lvl_lw].notnull()
    upper_known = taxa_df[tx_lvl_up].notnull()
    both_exist = lower_known & upper_known
    lw_only = lower_known & ~upper_known

    # Start from an all-missing column, then fill the rows that have a lower-level name.
    taxa_df[combined_col] = pd.Series(index=taxa_df.index, dtype=str)
    print("upper:{}, lower:{}, both:{}, low only:{}".format(tx_lvl_up, tx_lvl_lw, both_exist.sum(), lw_only.sum()))
    taxa_df.loc[lw_only, combined_col] = taxa_df.loc[lw_only, tx_lvl_lw]
    taxa_df.loc[both_exist, combined_col] = (
        taxa_df.loc[both_exist, tx_lvl_up] + " " + taxa_df.loc[both_exist, tx_lvl_lw]
    )
    n_plain = taxa_df[tx_lvl_lw].unique().shape
    n_combined = taxa_df[combined_col].unique().shape
    print("\t", n_plain, n_combined)

print(taxa_df.columns)


upper:Genus, lower:Species, both:338, low only:0
	 (286,) (322,)
upper:Family, lower:Genus, both:7923, low only:0
	 (1064,) (1064,)
upper:Order, lower:Family, both:15015, low only:0
	 (475,) (478,)
upper:Class, lower:Order, both:18868, low only:0
	 (354,) (354,)
Index(['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species',
       'Species2', 'Genus2', 'Family2', 'Order2'],
      dtype='object')


In [6]:
from biom.table import Table
from biom.util import biom_open

# Taxonomy levels written to the BIOM observation metadata, highest level first.
official_cols = ['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']

# Taxon names that are rewritten before export (underscore -> space).
name_checks = {'SAR11_clade': 'SAR11 clade',
               'SAR86_clade': 'SAR86 clade'}


def name_corrector(x):
    """Return the replacement for taxon name ``x`` listed in ``name_checks``, else ``x`` unchanged."""
    return name_checks.get(x, x)


sample_ids = list(abund_df.index)

# Build the membership lookup once: testing `i in list(taxa_df.index)` inside the
# loop rebuilt and scanned the full index for every abundance column.
annotated_otus = set(taxa_df.index)
observ_ids = [i for i in abund_df.columns
              if i.startswith("OTU") and i in annotated_otus]
# One metadata dict per kept OTU: its non-missing taxonomy levels, in official_cols order.
observ_metadata = [
    {'taxonomy': [name_corrector(j) for j in taxa_df.loc[i, official_cols].dropna().values]}
    for i in observ_ids
]

# Table is given observations as rows and samples as columns, hence the transpose.
_data_ = abund_df.loc[sample_ids, observ_ids].values.T
table = Table(_data_, observ_ids, sample_ids, observ_metadata, None)

with biom_open('../data/faprotax_data/otu_taxa_full.biom', 'w') as f:
    table.to_hdf5(f, "faith and trust")


In [11]:
# Per-OTU function table (lines starting with '#' are skipped) and OEU cluster assignments.
silva_fxn_df = pd.read_csv("../data/faprotax_data/report_silva.2.txt", sep="\t", index_col=0, comment='#')
oeu_clusters = pd.read_csv('../data/oeu_clusters/cluster_assignments.txt', sep='\t', index_col=0)

# Taxonomy + functions for every OTU (outer join on the index) ...
ftx_df = pd.merge(taxa_df, silva_fxn_df, how='outer', left_index=True, right_index=True)
# ... and the subset of those rows that also has a cluster assignment (inner join).
fotx_df = pd.merge(ftx_df, oeu_clusters, how='inner', left_index=True, right_index=True)

print(*(frame.shape for frame in (taxa_df, silva_fxn_df, oeu_clusters, ftx_df, fotx_df)))
# NOTE(review): the file name mentions "oeu", but the frame written is ftx_df, which
# carries no cluster columns (fotx_df does) -- confirm which one is intended.
ftx_df.to_csv("../data/otu_annotations_faprotax_oeu_taxonomy.txt", sep="\t", index=True, header=True)

(27746, 11) (27746, 91) (1432, 2) (27746, 102) (1424, 104)


In [12]:
# Scale every row of the abundance table so that it sums to 1,000,000.
rel_abund = abund_df.div(abund_df.sum(1), axis=0)*1000000
# One column per cluster label; float dtype so the table is numeric rather than object.
oeu_abundances = pd.DataFrame(index=rel_abund.index,
                              columns=oeu_clusters.ward_clusters.dropna().unique(),
                              dtype=float)

for oeu in sorted(oeu_abundances.columns):
    oeu_membs = oeu_clusters.index[oeu_clusters.ward_clusters == oeu]
    # Keep only members present in the abundance table. A sorted list is used because
    # pandas >= 2.0 rejects a set as a .loc indexer, and it makes the order deterministic.
    oeu_membs = sorted(set(oeu_membs).intersection(rel_abund.columns))
    agg_col = rel_abund.loc[:, oeu_membs].sum(1)
    print(oeu, len(oeu_membs), agg_col.sum())
    oeu_abundances.loc[agg_col.index, oeu] = agg_col

# Shape of the result and number of cells left unfilled.
print(oeu_abundances.shape, oeu_abundances.isnull().sum().sum())

1.0 1 54.630479437087544
2.0 2 180.78983214323097
3.0 2 7361.819035756531
4.0 2 6324.445488861235
5.0 2 4794943.480289471
6.0 2 3060413.9773296593
7.0 2 5147491.737153316
8.0 2 41845.48707977339
9.0 2 1094096.3571927603
10.0 3 14143.912998076445
11.0 2 3240778.8804190895
12.0 3 2678.2286359969835
13.0 6 9708176.39879597
14.0 2 39620.666616009956
15.0 2 198408.53423158306
16.0 2 10377.579279712245
17.0 2 137511.991926317
18.0 2 38808.384677643815
19.0 5 6116236.544664033
20.0 2 135241.77879540803
21.0 2 75485.81104119658
22.0 3 1021078.8136748351
23.0 2 62325.30601513092
24.0 3 276625.3982575221
25.0 2 2723431.95395552
26.0 2 32423.294379907366
27.0 2 5278.681941874138
28.0 2 101303.78446265675
29.0 2 12168.734598401134
30.0 2 2227040.9867659435
31.0 2 35717.447010564945
32.0 3 2044871.306699452
33.0 3 4570769.957503373
34.0 3 68397.05712229692
35.0 3 1312472.8830585668
36.0 3 58532.66355883281
37.0 2 37465.95271805277
38.0 2 6370.750761024448
39.0 2 227614.33108755446
40.0 2 9594.38426

In [13]:
# OTUs whose row in the function table has a positive sum.
annot_pops = silva_fxn_df.index[silva_fxn_df.sum(1) > 0]
# Rescale each row to 1,000,000 using only those OTUs (slice taken once).
annot_counts = abund_df[annot_pops]
rel_abund_fx = annot_counts.div(annot_counts.sum(1), axis=0)*1000000
# One column per function; float dtype so the table is numeric rather than object.
fxn_abundances = pd.DataFrame(index=rel_abund.index, columns=silva_fxn_df.columns, dtype=float)
for fxn_i, fxn in enumerate(fxn_abundances.columns):
    fxn_membs = silva_fxn_df.index[silva_fxn_df[fxn] != 0]
    # A sorted list is used because pandas >= 2.0 rejects a set as a .loc indexer.
    fxn_membs = sorted(set(fxn_membs).intersection(rel_abund_fx.columns))
    agg_col = rel_abund_fx.loc[:, fxn_membs].sum(1)
    # Only report the first ten functions to keep the output short.
    if fxn_i < 10:
        print(fxn, len(fxn_membs), agg_col.sum())
    fxn_abundances.loc[agg_col.index, fxn] = agg_col

# Shape of the result and number of cells left unfilled.
print(fxn_abundances.shape, fxn_abundances.isnull().sum().sum())

methanotrophy 63 1221105.045798447
acetoclastic_methanogenesis 5 6113.0445467366235
methanogenesis_by_disproportionation_of_methyl_groups 6 1458.3496434521808
methanogenesis_using_formate 1 108.04678425758354
methanogenesis_by_CO2_reduction_with_H2 24 6733.290689057167
methanogenesis_by_reduction_of_methyl_compounds_with_H2 7 847.1445139941832
hydrogenotrophic_methanogenesis 31 7580.435203051349
methanogenesis 40 15362.693044200429
methanol_oxidation 54 13155641.530778315
methylotrophy 124 14377593.721090756
(236, 91) 0


In [14]:
# Scale every row of the abundance table so that it sums to 1,000,000.
rel_abund = abund_df.div(abund_df.sum(1), axis=0)*1000000
# One column per cluster label; float dtype so the table is numeric rather than object.
oeu_abundances = pd.DataFrame(index=rel_abund.index,
                              columns=oeu_clusters.ward_clusters.dropna().unique(),
                              dtype=float)

for oeu_i, oeu in enumerate(sorted(oeu_abundances.columns)):
    oeu_membs = oeu_clusters.index[oeu_clusters.ward_clusters == oeu]
    # Keep only members present in the abundance table. A sorted list is used because
    # pandas >= 2.0 rejects a set as a .loc indexer, and it makes the order deterministic.
    oeu_membs = sorted(set(oeu_membs).intersection(rel_abund.columns))
    agg_col = rel_abund.loc[:, oeu_membs].sum(1)
    # Only report the first ten clusters to keep the output short.
    if oeu_i < 10:
        print(oeu, len(oeu_membs), agg_col.sum())
    oeu_abundances.loc[agg_col.index, oeu] = agg_col

# Shape of the result and number of cells left unfilled.
print(oeu_abundances.shape, oeu_abundances.isnull().sum().sum())

1.0 1 54.630479437087544
2.0 2 180.78983214323097
3.0 2 7361.819035756531
4.0 2 6324.445488861235
5.0 2 4794943.480289471
6.0 2 3060413.9773296593
7.0 2 5147491.737153316
8.0 2 41845.48707977339
9.0 2 1094096.3571927603
10.0 3 14143.912998076445
(236, 190) 0


In [15]:
# One column per distinct "Order Family" combined name; float dtype so the table is
# numeric rather than object.
taxa_abundances = pd.DataFrame(index=rel_abund.index,
                               columns=taxa_df.Family2.dropna().unique(),
                               dtype=float)

for tax_i, tax in enumerate(sorted(taxa_abundances.columns)):
    tax_membs = taxa_df.index[taxa_df.Family2 == tax]
    # A sorted list is used because pandas >= 2.0 rejects a set as a .loc indexer.
    tax_membs = sorted(set(tax_membs).intersection(rel_abund.columns))
    agg_col = rel_abund.loc[:, tax_membs].sum(1)
    # Only report the first ten groups to keep the output short.
    if tax_i < 10:
        print(tax, len(tax_membs), agg_col.sum())
    taxa_abundances.loc[agg_col.index, tax] = agg_col

# Shape of the result and number of cells left unfilled.
print(taxa_abundances.shape, taxa_abundances.isnull().sum().sum())

ANME-1 ANME-1b 1 7.508146338777574
Acanthopleuribacterales Acanthopleuribacteraceae 15 3720.7877556071608
Acetobacterales Acetobacteraceae 44 20623.10754276356
Acetobacterales Acetobacterales_Incertae_Sedis 4 978.8103838366926
Acholeplasmatales Acholeplasmataceae 12 114298.14476038721
Acidiferrobacterales Acidiferrobacteraceae 10 7097.20644808484
Acidithiobacillales Acidithiobacillaceae 3 4559.096046053645
Acidobacteriales Acidobacteriaceae_(Subgroup_1) 2 157.36523061435372
Actinomarinales Actinomarinaceae 14 7503157.342442007
Actinomycetales Actinomycetaceae 3 218.36523144537637
(236, 477) 0


In [16]:
# One column per distinct "Class Order" combined name; float dtype so the table is
# numeric rather than object.
taxa_abundances_o = pd.DataFrame(index=rel_abund.index,
                                 columns=taxa_df.Order2.dropna().unique(),
                                 dtype=float)

for tax_i, tax in enumerate(sorted(taxa_abundances_o.columns)):
    tax_membs = taxa_df.index[taxa_df.Order2 == tax]
    # A sorted list is used because pandas >= 2.0 rejects a set as a .loc indexer.
    tax_membs = sorted(set(tax_membs).intersection(rel_abund.columns))
    agg_col = rel_abund.loc[:, tax_membs].sum(1)
    # Only report the first ten groups to keep the output short.
    if tax_i < 10:
        print(tax, len(tax_membs), agg_col.sum())
    taxa_abundances_o.loc[agg_col.index, tax] = agg_col

# Shape of the result and number of cells left unfilled.
print(taxa_abundances_o.shape, taxa_abundances_o.isnull().sum().sum())

ABY1 Candidatus_Buchananbacteria 7 163.43681816834123
ABY1 Candidatus_Falkowbacteria 33 2252.8460443037493
ABY1 Candidatus_Kerfeldbacteria 40 3951.4393356421733
ABY1 Candidatus_Komeilibacteria 18 547.3906918689572
ABY1 Candidatus_Kuenenbacteria 13 834.5872293973684
ABY1 Candidatus_Magasanikbacteria 28 13166.297981821124
ABY1 Candidatus_Uhrbacteria 30 20371.74949699819
Acidimicrobiia Actinomarinales 86 7581673.594059959
Acidimicrobiia IMCC26256 17 11752.756473620298
Acidimicrobiia Microtrichales 280 13346444.921807807
(236, 353) 0


In [17]:
# Write each aggregated abundance table as tab-separated text, keeping index and header.
abundance_outputs = [
    ("../data/otu_tables/taxa_family_abundances.txt", taxa_abundances),
    ("../data/otu_tables/taxa_order_abundances.txt", taxa_abundances_o),
    ("../data/oeu_clusters/oeu_abundances.txt", oeu_abundances),
    ("../data/faprotax_data/function_abundances.txt", fxn_abundances),
]
for out_path, abundance_table in abundance_outputs:
    abundance_table.to_csv(out_path, sep="\t", index=True, header=True)

In [26]:
# Functions whose column in the function table has a positive sum.
observed_fxns = silva_fxn_df.columns[silva_fxn_df.sum() > 0]
# Per function: column total over the cluster-assigned OTUs (fotx_df) divided by the
# column total over the whole function table.
clustered_totals = fotx_df.loc[:, observed_fxns].sum()
overall_totals = silva_fxn_df.loc[:, observed_fxns].sum()
annoation_rarity = clustered_totals / overall_totals
annoation_rarity.sort_values()

harmful_algae                                              0.000000
nitrite_ammonification                                     0.000000
nitrate_ammonification                                     0.000000
dark_sulfur_oxidation                                      0.000000
chitinolysis                                               0.000000
dark_thiosulfate_oxidation                                 0.000000
manganese_oxidation                                        0.000000
human_pathogens_gastroenteritis                            0.000000
human_gut                                                  0.000000
anammox                                                    0.000000
dissimilatory_arsenate_reduction                           0.000000
arsenate_respiration                                       0.000000
mammal_gut                                                 0.000000
plant_pathogen                                             0.000000
xylanolysis                                     

In [None]:
def taxa_breakdown(abunds_, taxas_, level_, weighted=True, flatten_val=0.0):
    """Collapse an abundance table to one taxonomic level and normalise per row.

    Parameters
    ----------
    abunds_ : pandas.DataFrame
        Abundance table whose columns are feature (OTU) labels. It is not modified.
    taxas_ : pandas.DataFrame
        Taxonomy table indexed by the same feature labels; must contain ``level_``.
    level_ : str
        Column of ``taxas_`` to group by, e.g. 'Kingdom', 'Phylum', 'Class',
        'Order', 'Family', 'Genus' or 'Species'.
    weighted : bool, default True
        If False, abundances are replaced by 1/0 presence flags before summing.
    flatten_val : float, default 0.0
        If non-zero, fractions strictly below this value are set to 0.0 after
        normalisation (the rows are not re-normalised afterwards).

    Returns
    -------
    pandas.DataFrame
        Same rows as ``abunds_``, one column per value of ``level_`` (column axis
        named 'taxa_name'), holding each group's share of the row total.
        Features whose ``level_`` value is missing (NaN) are dropped by the groupby.

    Raises
    ------
    KeyError
        If ``level_`` or a retained feature label is absent from ``taxas_``.
    """
    # Remove features that never occur, then put features on the rows.
    observed = abunds_.loc[:, abunds_.sum(0) > 0].T
    # Create a presence/absence table if need be.
    if not weighted:
        observed = (observed > 0).astype(int)
    # Vectorised lookup of every feature's name at the requested level; used as an
    # external groupby key so no helper columns are added to (and dropped from) the table.
    taxa_names = taxas_.loc[observed.index, level_].rename('taxa_name')
    ttable_raw = observed.groupby(taxa_names).sum()
    # Each original row (now a column) is divided by its own total.
    ttable = ttable_raw.div(ttable_raw.sum(0))
    if flatten_val:
        # mask() returns a new frame rather than writing into `.values`, which is a
        # read-only array under pandas Copy-on-Write. NaN cells stay NaN.
        ttable = ttable.mask(ttable < flatten_val, 0.0)
    return ttable.T

In [None]:
# NOTE(review): subset_abundances is not defined in any cell visible here -- it must
# come from kernel state or another notebook; confirm before running from a fresh kernel.
abunds_1 = subset_abundances.copy()


def relativize(v):
    """Return the column totals of ``v`` as fractions of the grand total, largest first."""
    column_totals = v.sum()
    return (column_totals / column_totals.sum()).sort_values(ascending=False)


# NOTE(review): taxas_1, as built in the taxonomy cell, is copied before the '<level>2'
# columns are added to taxa_df, so 'Species2' may be missing from it -- verify.
taxa_cols_to_use = ['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species2']

flattened_tables_pa = {}
flattened_tables = {}
# (destination dict, weighted?, per-level flatten thresholds aligned with taxa_cols_to_use)
breakdown_runs = [
    (flattened_tables_pa, False, [0, 0.03, 0.03, 0.03, 0.03, 0.03, 0.03]),
    (flattened_tables, True, [0, 0.05, 0.05, 0.1, 0.08, 0.08, 0.01]),
]
for destination, use_weights, thresholds in breakdown_runs:
    for level_1, fv in zip(taxa_cols_to_use, thresholds):
        ttable_1 = taxa_breakdown(abunds_1, taxas_1, level_1, weighted=use_weights, flatten_val=fv)
        # Discard groups that are zero everywhere (e.g. after flattening).
        ttable_1 = ttable_1.loc[:, ttable_1.sum() > 0]
        # Order the remaining groups by their largest fraction, descending.
        col_order = ttable_1.max().sort_values(ascending=False).index
        ttable_1 = ttable_1[col_order]
        print("The collapsed {} taxa table is {}".format(level_1, ttable_1.shape))
        destination[level_1] = relativize(ttable_1.copy())

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.gridspec as gridspec

def plot_taxa(ttable_mix, key_order, height_rat, bar_width_mult, fignamex=None):
    """Draw one stacked bar per key, with a legend panel underneath each bar.

    Parameters
    ----------
    ttable_mix : mapping
        Maps each key in ``key_order`` to a label-indexed table of values; every
        label becomes one bar segment and one legend entry.
        (presumably a 1-D pandas Series of fractions -- verify against the caller)
    key_order : sequence
        Keys of ``ttable_mix`` to plot, left to right.
    height_rat : sequence of two numbers
        Height ratio of the bar row to the legend row.
    bar_width_mult : float
        Fraction of each key's horizontal slot that the bar occupies.
    fignamex : str, optional
        If given, the figure is also written to this path at 140 dpi.

    Returns
    -------
    None
    """
    fig_width = 12
    fig_t = plt.figure(figsize=(fig_width, 10), dpi=140)
    gs = gridspec.GridSpec(2, len(key_order), figure=fig_t, height_ratios=height_rat, hspace=.05, wspace=0.05,
                           bottom=0.075, top=0.925, right=0.925, left=0.075)

    # Horizontal slot per key; the bar takes bar_width_mult of it.
    slot_width = fig_width / len(key_order)
    adjusted_width = slot_width * bar_width_mult
    # x position of each key's bar
    bar_locs = np.arange(len(key_order)) * slot_width
    # Candidate colours (every xkcd colour whose name lacks 'white'); loop-invariant.
    possible_colors = [j for i, j in sns.xkcd_rgb.items() if 'white' not in i]

    for ko_i, ko in enumerate(key_order):
        ttable = ttable_mix[ko]
        # A private generator seeded with 2 draws the same colours as
        # np.random.seed(2) + np.random.choice, without resetting the global RNG.
        colors_needed = np.random.RandomState(2).choice(possible_colors, size=ttable.shape)
        print("{} colors grabbed".format(len(colors_needed)))

        ax_i = fig_t.add_subplot(gs[0, ko_i])
        # loop over each taxon name, stacking its segment on top of the previous ones
        for bar_n, bar_col in enumerate(ttable.index):
            bar_x = np.array([ttable[bar_col]])
            # the first segment starts from a zero baseline
            if bar_n == 0:
                running_base = bar_x * 0.0
            ax_i.bar([bar_locs[ko_i]], bar_x, bottom=running_base,
                     color=colors_needed[bar_n], edgecolor='white',
                     width=adjusted_width)
            # increment the bottoms
            running_base = running_base + bar_x
        # Axes decorations (ticks, labels, frame) are hidden for the bar panel.
        ax_i.axis('off')

        # Legend-only panel below the bar.
        ax2 = fig_t.add_subplot(gs[1, ko_i])
        patches = [mpatches.Patch(color=color, label=label)
                   for label, color in zip(list(ttable.index), colors_needed)]
        ax2.legend(patches, list(ttable.index), loc='best',
                   bbox_to_anchor=(0., 0., 1., 1.),
                   mode='expand', fontsize='x-small', ncol=1)
        ax2.axis('off')

    # Write the file before showing, so the saved image does not depend on what the
    # backend does to the figure once it has been displayed.
    if fignamex:
        fig_t.savefig(fignamex, dpi=140)
    plt.show()
    return

# The file name is a fixed literal: the former `.format(level_1)` had no placeholder to
# fill and only made this cell depend on a loop variable leaked from an earlier cell.
fignamet = "AllTaxonomy_AllSamples_unweighted.png"
# NOTE(review): this writes under ../otu_data while the other cells use ../data -- confirm.
figpatht = os.path.join("../otu_data/pca_plots", fignamet)
# plot_taxa displays the figure itself, so no extra plt.show() is needed afterwards.
plot_taxa(flattened_tables_pa, ["Kingdom", "Phylum", "Class", "Order", "Family"],
          [7,4], .9, fignamex=figpatht)

# Per taxonomy column: share of the plotted OTUs whose value is not the string 'nan'.
taxas_plotted = taxas_1.loc[abunds_1.columns, :]
(taxas_plotted != 'nan').sum() / len(taxas_plotted)

In [None]:
# The file name is a fixed literal: the former `.format(level_1)` had no placeholder to
# fill and only made this cell depend on a loop variable leaked from an earlier cell.
fignamet = "AllTaxonomy_AllSamples_weighted.png"
# NOTE(review): this writes under ../otu_data while the other cells use ../data -- confirm.
figpatht = os.path.join("../otu_data/pca_plots", fignamet)
plot_taxa(flattened_tables, ["Kingdom", "Phylum", "Class", "Order", "Family"],
          [7,4], .9, fignamex=figpatht)