In [5]:
import numpy as np; import pandas as pd; import os

In [35]:
# --- Load the three input tables (all tab-separated) -------------------------
# Environmental table: read with pandas' default index handling.
env_file = '../data/environmental_raw_data/Transformed_WQVars_Hab_and_aDiv.txt'
env_df = pd.read_csv(env_file, sep="\t")

# Abundance table: first column is the index; rows are then selected (and
# ordered) by the labels found in env_df.index.
abund_file = '../data/otu_tables/final_unrarefied_table.txt'
abund_df = pd.read_csv(abund_file, sep="\t", index_col=0)
abund_df = abund_df.loc[env_df.index, :]

# Taxonomy table: first column is the index; rows are selected (and ordered)
# by the column labels of the abundance table.
taxa_file = '../data/otu_tables/taxa_table_with_OTUs.txt'
taxa_df = pd.read_csv(taxa_file, sep="\t", index_col=0)
taxa_df = taxa_df.loc[abund_df.columns, :]

print(env_df.shape, abund_df.shape, taxa_df.shape)

# Keep only rows whose total across columns is non-zero, then report the new
# shape and how many column labels do not carry the "OTU" prefix.
abund_df = abund_df.loc[abund_df.sum(axis=1) != 0]
print(abund_df.shape, sum(1 for col in abund_df.columns if not col.startswith("OTU")))

(236, 37) (236, 20964) (20964, 7)
(236, 20964) 0


In [36]:
# This corrects the hierarchy to kick chloroplasts out of Cyanobacteria and
# mitochondria out of Rickettsiales: matching rows get Kingdom 'Eukaryota' and
# the organelle name written into the listed ranks.
#
# The masks are computed on `taxas_1`, a string snapshot taken before any edit,
# so that relabelling one rank cannot change which rows a later mask selects.
taxas_1 = taxa_df.copy().astype(str)

# (rank that already carries the organelle label, organelle label, ranks to overwrite)
organelle_fixes = [
    ('Order', 'Chloroplast', ['Phylum', 'Class', 'Family']),
    ('Family', 'Mitochondria', ['Phylum', 'Class', 'Order']),
]

for marker_lvl, organelle, relabel_lvls in organelle_fixes:
    is_organelle = taxas_1[marker_lvl] == organelle
    print(is_organelle.sum())
    # Show one (random) affected row as it looks before the correction.
    # Guarded because DataFrame.sample(1) raises on an empty selection.
    if is_organelle.any():
        print(taxa_df.loc[is_organelle, :].sample(1).values)
    taxa_df.loc[is_organelle, 'Kingdom'] = 'Eukaryota'
    for lvl in relabel_lvls:
        taxa_df.loc[is_organelle, lvl] = organelle


562
[['Bacteria' 'Cyanobacteria' 'Oxyphotobacteria' 'Chloroplast' nan nan nan]]
694
[['Bacteria' 'Proteobacteria' 'Alphaproteobacteria' 'Rickettsiales'
  'Mitochondria' nan nan]]


In [37]:
# This checks two things for every adjacent pair of ranks:
# 1. that there are no gaps in the hierarchy, i.e. a lower level defined where
#    the upper one isn't (reported as the "low only" count);
# 2. that category labels are unique within a level. Where they are not, the
#    combined name "<upper> <lower>" stored in the new '<level>2' column is
#    used instead (compare the two unique-counts printed per level).

tx_lvls = ['Species', 'Genus', 'Family', 'Order', 'Class']
for tx_lvl_up, tx_lvl_lw in zip(tx_lvls[1:], tx_lvls[:-1]):
    combined_col = tx_lvl_lw + '2'
    # Explicit object dtype: the column is filled with strings below. Without
    # it, pandas warns about the default dtype of an empty Series and (on
    # versions that default to float64) about writing strings into a float column.
    taxa_df[combined_col] = pd.Series(index=taxa_df.index, dtype=object)
    both_exist = taxa_df[tx_lvl_lw].notnull() & taxa_df[tx_lvl_up].notnull()
    lw_only = taxa_df[tx_lvl_lw].notnull() & taxa_df[tx_lvl_up].isnull()
    print("upper:{}, lower:{}, both:{}, low only:{}".format(tx_lvl_up, tx_lvl_lw, both_exist.sum(), lw_only.sum()))
    # Lower rank without a parent: keep the bare lower-rank name.
    taxa_df.loc[lw_only, combined_col] = taxa_df.loc[lw_only, tx_lvl_lw]
    # Both ranks present: prefix the lower-rank name with its parent's name.
    taxa_df.loc[both_exist, combined_col] = taxa_df.loc[both_exist, tx_lvl_up] + " " + taxa_df.loc[both_exist, tx_lvl_lw]
    print("\t", taxa_df[tx_lvl_lw].unique().shape, taxa_df[combined_col].unique().shape)

print(taxa_df.columns)


upper:Genus, lower:Species, both:307, low only:0
	 (266,) (300,)
upper:Family, lower:Genus, both:6043, low only:0
	 (1027,) (1027,)
upper:Order, lower:Family, both:11707, low only:0
	 (449,) (452,)
upper:Class, lower:Order, both:14458, low only:0
	 (335,) (335,)
Index(['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species',
       'Species2', 'Genus2', 'Family2', 'Order2'],
      dtype='object')


In [80]:
from biom.table import Table
from biom.util import biom_open

# Ranks written into the BIOM 'taxonomy' metadata, highest rank first.
official_cols = ['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']

# Taxon names that are rewritten before export (underscore -> space).
name_checks = {'SAR11_clade': 'SAR11 clade',
               'SAR86_clade': 'SAR86 clade'}


def name_corrector(x):
    """Return the replacement listed in `name_checks` for `x`, or `x` unchanged."""
    return name_checks.get(x, x)


sample_ids = list(abund_df.index)

# Build the membership set once: testing against `list(taxa_df.index)` inside
# the loop re-created and linearly scanned the whole list for every column.
taxa_ids = set(taxa_df.index)

observ_ids, observ_metadata = [], []
for otu in abund_df.columns:
    if otu.startswith("OTU") and otu in taxa_ids:
        # Only the ranks that are actually filled in for this OTU are exported.
        lineage = taxa_df.loc[otu, official_cols].dropna().values
        observ_ids.append(otu)
        observ_metadata.append({'taxonomy': [name_corrector(rank) for rank in lineage]})

# Table is given observations as rows and samples as columns, hence the transpose.
_data_ = abund_df.loc[sample_ids, observ_ids].values.T
table = Table(_data_, observ_ids, sample_ids, observ_metadata, None)

with biom_open('../data/faprotax_data/otu_taxa_full.biom', 'w') as f:
    table.to_hdf5(f, "faith and trust")


In [38]:
# Function report (lines starting with '#' are skipped as comments) and the
# cluster assignments; both use their first column as the index.
silva_fxn_df = pd.read_csv(
    "../data/faprotax_data/report_silva.txt",
    sep="\t", index_col=0, comment='#',
)
oeu_clusters = pd.read_csv(
    '../data/oeu_clusters/cluster_assignments.txt',
    sep='\t', index_col=0,
)

# Index-on-index joins: the outer join keeps every label from either table,
# the inner join keeps only labels that also appear in the cluster table.
ftx_df = taxa_df.merge(silva_fxn_df, left_index=True, right_index=True, how='outer')
fotx_df = ftx_df.merge(oeu_clusters, left_index=True, right_index=True, how='inner')

merged_shapes = [frame.shape for frame in (taxa_df, silva_fxn_df, oeu_clusters, ftx_df, fotx_df)]
print(*merged_shapes)

# NOTE(review): the file name mentions "oeu", but the frame written here is
# ftx_df (taxonomy + functions); the cluster columns only exist in fotx_df.
# Confirm which one is intended before relying on this file.
ftx_df.to_csv("../data/otu_annotations_faprotax_oeu_taxonomy.txt", sep="\t", header=True, index=True)

(20964, 11) (20964, 91) (1432, 2) (20964, 102) (1026, 104)


In [69]:
# Per-row relative abundance: each value divided by its row total, times 1,000,000.
rel_abund = abund_df.div(abund_df.sum(1), axis=0)*1000000
# One (initially empty) column per distinct non-null cluster label.
oeu_abundances = pd.DataFrame(index=rel_abund.index, columns=oeu_clusters.ward_clusters.dropna().unique())

for oeu in sorted(oeu_abundances.columns):
    oeu_membs = oeu_clusters.index[oeu_clusters.ward_clusters == oeu]
    # Keep only members present in the abundance table. A sorted list is used
    # because pandas no longer accepts a set as a .loc indexer (deprecated in
    # 1.5, TypeError from 2.0) and it makes the column order deterministic.
    oeu_membs = sorted(set(oeu_membs).intersection(rel_abund.columns))
    # A cluster with no members present yields an all-zero column.
    agg_col = rel_abund.loc[:, oeu_membs].sum(1)
    print(oeu, len(oeu_membs), agg_col.sum())
    oeu_abundances.loc[agg_col.index, oeu] = agg_col

# Second number is the count of cells left unfilled.
print(oeu_abundances.shape, oeu_abundances.isnull().sum().sum())

1.0 1 69.02597447419464
2.0 1 265.68316102902196
3.0 2 9285.159447638493
4.0 2 11000.916100598952
5.0 2 6338123.431300601
6.0 2 4066153.4981032405
7.0 2 6794099.919228723
8.0 2 71620.34617900614
9.0 2 1417777.33920067
10.0 3 24606.452826113156
11.0 2 4427142.722910645
12.0 3 3330.020487642807
13.0 6 12536399.262601795
14.0 1 8021.12275626238
15.0 1 66875.43713831225
16.0 2 14927.991167660854
17.0 1 62427.08554864769
18.0 2 54034.86489760792
19.0 5 7918034.22880553
20.0 0 0.0
21.0 2 104356.93965868247
22.0 0 0.0
23.0 2 88310.52401549096
24.0 3 370219.4525934666
25.0 2 3409324.154698465
26.0 2 41389.726719204526
27.0 2 6732.565465386372
28.0 1 71538.1350276994
29.0 2 17568.411585050428
30.0 1 2612481.8961967137
31.0 0 0.0
32.0 3 2669768.6846798374
33.0 0 0.0
34.0 3 122601.06416045353
35.0 2 1746963.7754562853
36.0 1 11137.688432477773
37.0 2 52888.81510872743
38.0 2 13663.030337867902
39.0 2 264350.96800159384
40.0 1 6553.930502026949
41.0 3 62282.42274350138
42.0 2 103145.97795872432
43

In [74]:
# Rows of the function table whose total across all function columns is positive.
annot_pops = silva_fxn_df.index[silva_fxn_df.sum(1) > 0]
# Relative abundance computed over those columns only: each value divided by
# its row total within the subset, times 1,000,000.
annot_abund = abund_df[annot_pops]
rel_abund_fx = annot_abund.div(annot_abund.sum(1), axis=0)*1000000
# Indexed from rel_abund_fx (same row labels as abund_df) so this cell does not
# depend on `rel_abund` having been created by a different cell.
fxn_abundances = pd.DataFrame(index=rel_abund_fx.index, columns=silva_fxn_df.columns)
for fxn_i, fxn in enumerate(fxn_abundances.columns):
    fxn_membs = silva_fxn_df.index[silva_fxn_df[fxn] != 0]
    # Sorted list rather than a set: pandas no longer accepts a set as a .loc
    # indexer (deprecated in 1.5, TypeError from 2.0).
    fxn_membs = sorted(set(fxn_membs).intersection(rel_abund_fx.columns))
    agg_col = rel_abund_fx.loc[:, fxn_membs].sum(1)
    # Only the first ten functions are echoed to keep the output short.
    if fxn_i < 10:
        print(fxn, len(fxn_membs), agg_col.sum())
    fxn_abundances.loc[agg_col.index, fxn] = agg_col

# Second number is the count of cells left unfilled.
print(fxn_abundances.shape, fxn_abundances.isnull().sum().sum())

methanotrophy 55 378112.3766113111
acetoclastic_methanogenesis 3 1425.9391874695984
methanogenesis_by_disproportionation_of_methyl_groups 5 1851.2363248816384
methanogenesis_using_formate 1 189.0537858020607
methanogenesis_by_CO2_reduction_with_H2 9 5897.982309800024
methanogenesis_by_reduction_of_methyl_compounds_with_H2 1 214.96130696474634
hydrogenotrophic_methanogenesis 10 6112.943616764771
methanogenesis 19 10084.053212427658
methanol_oxidation 31 13768502.988743562
methylotrophy 87 14146830.32666184
(236, 91) 0


In [75]:
# Per-row relative abundance: each value divided by its row total, times 1,000,000.
rel_abund = abund_df.div(abund_df.sum(1), axis=0)*1000000
# One (initially empty) column per distinct non-null cluster label.
oeu_abundances = pd.DataFrame(index=rel_abund.index, columns=oeu_clusters.ward_clusters.dropna().unique())

for oeu_i, oeu in enumerate(sorted(oeu_abundances.columns)):
    oeu_membs = oeu_clusters.index[oeu_clusters.ward_clusters == oeu]
    # Keep only members present in the abundance table. A sorted list is used
    # because pandas no longer accepts a set as a .loc indexer (deprecated in
    # 1.5, TypeError from 2.0) and it makes the column order deterministic.
    oeu_membs = sorted(set(oeu_membs).intersection(rel_abund.columns))
    # A cluster with no members present yields an all-zero column.
    agg_col = rel_abund.loc[:, oeu_membs].sum(1)
    # Only the first ten clusters are echoed to keep the output short.
    if oeu_i < 10:
        print(oeu, len(oeu_membs), agg_col.sum())
    oeu_abundances.loc[agg_col.index, oeu] = agg_col

# Second number is the count of cells left unfilled.
print(oeu_abundances.shape, oeu_abundances.isnull().sum().sum())

1.0 1 69.02597447419464
2.0 1 265.68316102902196
3.0 2 9285.159447638493
4.0 2 11000.916100598952
5.0 2 6338123.431300601
6.0 2 4066153.4981032405
7.0 2 6794099.919228723
8.0 2 71620.34617900614
9.0 2 1417777.33920067
10.0 3 24606.452826113156
(236, 190) 0


In [78]:
# One (initially empty) column per distinct non-null combined name in
# taxa_df.Family2; rows follow rel_abund, which must already exist.
taxa_abundances = pd.DataFrame(index=rel_abund.index,
                               columns=taxa_df.Family2.dropna().unique())

for tax_i, tax in enumerate(sorted(taxa_abundances.columns)):
    tax_membs = taxa_df.index[taxa_df.Family2 == tax]
    # Sorted list rather than a set: pandas no longer accepts a set as a .loc
    # indexer (deprecated in 1.5, TypeError from 2.0).
    tax_membs = sorted(set(tax_membs).intersection(rel_abund.columns))
    agg_col = rel_abund.loc[:, tax_membs].sum(1)
    # Only the first ten groups are echoed to keep the output short.
    if tax_i < 10:
        print(tax, len(tax_membs), agg_col.sum())
    taxa_abundances.loc[agg_col.index, tax] = agg_col

# Second number is the count of cells left unfilled.
print(taxa_abundances.shape, taxa_abundances.isnull().sum().sum())

Acanthopleuribacterales Acanthopleuribacteraceae 11 4914.051555635606
Acetobacterales Acetobacteraceae 42 14072.558229035141
Acetobacterales Acetobacterales_Incertae_Sedis 2 854.1548074251925
Acholeplasmatales Acholeplasmataceae 8 41139.231328776776
Acidiferrobacterales Acidiferrobacteraceae 16 10850.123704711856
Acidithiobacillales Acidithiobacillaceae 4 6233.049963702999
Acidobacteriales Acidobacteriaceae_(Subgroup_1) 5 318.7500928242816
Acidobacteriales Koribacteraceae 2 0.0
Actinomarinales Actinomarinaceae 10 2997355.8090128684
Actinomycetales Actinomycetaceae 1 254.94801355293376
(236, 451) 0


In [79]:
# Write the three aggregated tables as tab-separated text, keeping both the
# row index and the header row. Files are written in the order listed.
abundance_outputs = (
    (taxa_abundances, "../data/otu_tables/taxa_family_abundances.txt"),
    (oeu_abundances, "../data/oeu_clusters/oeu_abundances.txt"),
    (fxn_abundances, "../data/faprotax_data/function_abundances.txt"),
)
for abundance_table, out_path in abundance_outputs:
    abundance_table.to_csv(out_path, sep="\t", index=True, header=True)

In [26]:
# Function columns whose total over the whole function table is positive.
observed_fxns = silva_fxn_df.columns[silva_fxn_df.sum() > 0]

# Per function: its column total within fotx_df (the rows that survived the
# inner join with the cluster table) divided by its total in the full table.
# NOTE(review): the variable name is misspelled ("annoation"); kept as-is in
# case other cells refer to it.
fxn_totals_clustered = fotx_df.loc[:, observed_fxns].sum()
fxn_totals_all = silva_fxn_df.loc[:, observed_fxns].sum()
annoation_rarity = fxn_totals_clustered / fxn_totals_all
annoation_rarity.sort_values()

harmful_algae                                              0.000000
nitrite_ammonification                                     0.000000
nitrate_ammonification                                     0.000000
dark_sulfur_oxidation                                      0.000000
chitinolysis                                               0.000000
dark_thiosulfate_oxidation                                 0.000000
manganese_oxidation                                        0.000000
human_pathogens_gastroenteritis                            0.000000
human_gut                                                  0.000000
anammox                                                    0.000000
dissimilatory_arsenate_reduction                           0.000000
arsenate_respiration                                       0.000000
mammal_gut                                                 0.000000
plant_pathogen                                             0.000000
xylanolysis                                     