In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import seaborn as sns

from scipy.stats import pearsonr

# Plot theme and a fixed seed for NumPy's legacy global random state.
sns.set_style("darkgrid")
np.random.seed(930525)

# Limits on how many columns / rows pandas renders when a frame is displayed.
for option, limit in (("display.max_columns", 20), ("display.max_rows", 200)):
    pd.set_option(option, limit)

# "once": report a given warning only the first time it is raised.
warnings.simplefilter("once")

%matplotlib inline
%load_ext watermark
%watermark --iversions

pandas    : 1.3.0
matplotlib: 3.4.2
numpy     : 1.21.0
seaborn   : 0.11.1



In [2]:
from skbio.stats.composition import clr, clr_inv

from multiplicative_replacement import multiplicative_replacement

  collections.Sequence, SkbioObject):
  from collections import Iterable


In [3]:
# Taxonomy table; the first file column becomes the row index.
# Earlier input, kept for reference:
# df_tax = pd.read_csv("../data/taxonomy_clr_s.txt", sep='\t', index_col=0)
df_tax = pd.read_csv(
    "../data/otu_taxatable/species.gtdb98.normalized.txt",
    sep="\t",
    index_col=0,
)

# Truncate every column label to its first three dot-separated fields.
df_tax.columns = [".".join(label.split(".")[:3]) for label in df_tax.columns]

In [4]:
# Nutrient table; the first file column becomes the row index.
df_nutrients = pd.read_csv(
    "../data/nutr_65.txt",
    sep="\t",
    index_col=0,
)

In [5]:
# Food totals; the first file column is read as the initial row index.
df_dhydrt = pd.read_csv("../data/diet.dhydrt.txt", sep="\t", index_col=0)

# Re-index the rows by their "taxonomy" string (the column itself is kept).
# A bare `df_dhydrt.reset_index(drop=True)` used to sit here; its result was
# never assigned, so it had no effect and has been removed.
df_dhydrt.index = df_dhydrt["taxonomy"]

In [6]:
# KEGG module table; the first file column becomes the row index.
df_kegg = pd.read_csv(
    "../data/otu_taxatable/species.gtdb98.kegg.txt",
    sep="\t",
    index_col=0,
)

# Truncate every column label to its first three dot-separated fields.
df_kegg.columns = [".".join(label.split(".")[:3]) for label in df_kegg.columns]

In [7]:
# Sample mapping table; the first file column becomes the row index.
df_mapping = pd.read_csv(
    "../data/SampleID_map.txt",
    sep="\t",
    index_col=0,
)

In [8]:
# Users to exclude from every downstream table.
soylent_ids = {"MCTs11", "MCTs12"}

# True for mapping rows whose UserName is NOT one of the excluded users.
soylent_mask = (~df_mapping["UserName"].isin(soylent_ids)).tolist()
df_mapping = df_mapping.loc[soylent_mask, :].copy()

In [9]:
# dropping soylent samples: keep only the columns whose label is present in
# df_mapping's index.
# `Index.isin` builds its lookup once; the previous comprehension rebuilt
# `set(df_mapping.index)` for every single column it tested.
col_mask = df_tax.columns.isin(df_mapping.index)
df_tax = df_tax.loc[:, col_mask].copy()

In [10]:
# Restrict the nutrient, food and KEGG tables to the columns that are still
# present in df_tax.
# `Index.isin` builds its lookup once per table; the previous comprehensions
# rebuilt `set(df_tax.columns)` for every column they tested.
col_mask = df_nutrients.columns.isin(df_tax.columns)
df_nutrients = df_nutrients.loc[:, col_mask].copy()

col_mask = df_dhydrt.columns.isin(df_tax.columns)
df_dhydrt = df_dhydrt.loc[:, col_mask].copy()

col_mask = df_kegg.columns.isin(df_tax.columns)
df_kegg = df_kegg.loc[:, col_mask].copy()

In [11]:
# from skbio.stats.composition import closure

# # rarify counts
# median_depth = df_tax.sum(axis=0).median()
# df_tax_counts_rarefied = df_tax.copy()

# for column in df_tax.columns:
#     counts = np.random.choice(df_tax.shape[0], p=closure(df_tax[column].values), size=int(median_depth))
#     col = np.bincount(counts, minlength=df_tax.shape[0])
#     df_tax_counts_rarefied[column] = col

In [12]:
# Row labels that do not contain the substring "NA" anywhere.
# NOTE(review): this is a plain substring test, so a label that merely contains
# the letters "NA" is dropped as well — confirm that matches the label format.
species = [label for label in df_tax.index if "NA" not in label]

# Independent copy restricted to those rows.
df_tax_species = df_tax.loc[species].copy()

In [13]:
def subset_top_variance(df, ratio=.8, max_n=50):
    """Keep the most variable rows of ``df`` and collapse the rest into "other".

    Rows are ranked by their standard deviation across columns (largest
    first). Starting with the single top-ranked row, rows are added one at a
    time until the kept rows account for more than ``ratio`` of the grand total
    of ``df``, or until ``max_n - 1`` rows are kept, whichever comes first.
    All remaining rows are summed column-wise into one extra row labelled
    ``"other"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table; rows are the items being ranked.
    ratio : float, default 0.8
        Share of the grand total the kept rows must exceed to stop early.
    max_n : int, default 50
        Exclusive upper bound on the number of kept rows. With ``max_n <= 1``
        no row is kept and the result consists of the ``"other"`` row only.

    Returns
    -------
    pandas.DataFrame
        The kept rows in ranking order followed by the ``"other"`` row.
    """
    # The ranking does not depend on n, so compute it once instead of per iteration.
    ranked = df.std(axis=1).sort_values(ascending=False).index

    # Start from "nothing kept" so the result is well defined even when the
    # loop below never runs (max_n <= 1).
    top_labels = ranked[:0]
    for n in range(1, max_n):
        top_labels = ranked[:n]
        top_total = df.loc[top_labels, :].sum(axis=0).sum()
        rest_total = df.loc[~df.index.isin(top_labels), :].sum(axis=0).sum()

        if top_total / (top_total + rest_total) > ratio:
            break

    df_tn = df.loc[top_labels, :].copy()
    other = df.loc[~df.index.isin(top_labels), :].sum(axis=0)
    other.name = "other"

    # DataFrame.append was removed in pandas 2.0; build the extra row explicitly
    # and concatenate. The index name is carried over, as append used to do.
    other_row = other.to_frame().T
    other_row.index.name = df.index.name

    return pd.concat([df_tn, other_row])

In [14]:
from sklearn.feature_selection import VarianceThreshold

def filter_variance(df, vt=.8 * (1 - .8)):
    """Return a copy of ``df`` keeping only rows whose variance exceeds ``vt``.

    ``df`` is transposed before fitting, so every row of ``df`` is handed to
    ``VarianceThreshold`` as one feature. The result is transposed back and has
    the same columns as ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose rows are screened.
    vt : float, default 0.8 * (1 - 0.8) = 0.16
        Variance cut-off; a row is kept when its fitted variance is strictly
        greater than this value.
        NOTE(review): the default presumably follows the Bernoulli-variance
        example in the scikit-learn user guide — confirm it suits this data.
    """
    by_feature = df.T
    selector = VarianceThreshold(threshold=vt).fit(by_feature)
    keep = selector.variances_ > vt
    return by_feature.loc[:, keep].T.copy()

In [15]:
# Drop low-variance rows first, then keep the top-ranked rows plus an
# aggregated "other" row.
df_tax_species = df_tax_species.pipe(filter_variance)

df_tax_top_species = df_tax_species.pipe(subset_top_variance)

In [16]:
# Write the filtered species table, with the row labels as a leading
# "#taxonomy" column (renamed from "#OTU ID" when the index carries that name).
(
    df_tax_species
    .reset_index()
    .rename(columns={"#OTU ID": "#taxonomy"})
    .to_csv("../data/taxonomy_s.GTDB98.txt", sep="\t", index=False)
)

In [17]:
# CLR-transform the filtered species table. The values are transposed before
# going through `multiplicative_replacement` and `clr`, and transposed back
# afterwards, so the result carries df_tax_species' own row/column labels.
# NOTE(review): `multiplicative_replacement` is a project-local helper;
# presumably it replaces zeros so the log inside CLR is defined — confirm.
df_tax_clr_species = pd.DataFrame(
    np.transpose(clr(multiplicative_replacement(df_tax_species.T.to_numpy()))),
    index=df_tax_species.index,
    columns=df_tax_species.columns,
)

In [18]:
# Write the CLR-transformed species table, with the row labels as a leading
# "#taxonomy" column (renamed from "#OTU ID" when the index carries that name).
(
    df_tax_clr_species
    .reset_index()
    .rename(columns={"#OTU ID": "#taxonomy"})
    .to_csv("../data/taxonomy_clr_s.GTDB98.txt", sep="\t", index=False)
)

In [19]:
# Reload the full sample mapping from disk.
# NOTE(review): this replaces whatever df_mapping held before, including any
# row filtering applied to it earlier in the notebook — confirm the unfiltered
# mapping is what the cells below are meant to use.
df_mapping = pd.read_csv(
    "../data/SampleID_map.txt",
    sep="\t",
    index_col=0,
)

In [20]:
# Transpose so the former columns become rows, then expose the former column
# labels as a regular "#SampleID" column.
# Caution: this rebinds df_tax_species to the transposed layout, so running the
# cell a second time transposes it back.
df_tax_species = (
    df_tax_species.T
    .reset_index()
    .rename(columns={"index": "#SampleID"})
)

In [21]:
# Attach each sample's UserName; the left join keeps every row of
# df_tax_species.
df_tax_species_users = pd.merge(
    df_tax_species,
    df_mapping.reset_index()[["#SampleID", "UserName"]],
    on="#SampleID",
    how="left",
)

# Per-user mean of every numeric column, written with one column per user.
# numeric_only=True leaves the string "#SampleID" column out explicitly; a bare
# .mean() relied on pandas dropping it silently, which raises on pandas >= 2.0.
(
    df_tax_species_users
    .groupby("UserName")
    .mean(numeric_only=True)
    .T
    .reset_index()
    .rename({"index": "#taxonomy"}, axis=1)
    .to_csv("../data/UN_tax_mean_norm_s.GTDB98.txt", index=False, sep="\t")
)

In [22]:
# Transpose so the former columns become rows, then expose the former column
# labels as a regular "#SampleID" column.
# Caution: this rebinds df_tax_clr_species to the transposed layout, so running
# the cell a second time transposes it back.
df_tax_clr_species = (
    df_tax_clr_species.T
    .reset_index()
    .rename(columns={"index": "#SampleID"})
)

In [23]:
# Attach each sample's UserName; the left join keeps every row of
# df_tax_clr_species.
df_tax_clr_species_users = pd.merge(
    df_tax_clr_species,
    df_mapping.reset_index()[["#SampleID", "UserName"]],
    on="#SampleID",
    how="left",
)

# Per-user mean of every numeric column, written with one column per user.
# numeric_only=True leaves the string "#SampleID" column out explicitly; a bare
# .mean() relied on pandas dropping it silently, which raises on pandas >= 2.0.
(
    df_tax_clr_species_users
    .groupby("UserName")
    .mean(numeric_only=True)
    .T
    .reset_index()
    .rename({"index": "#taxonomy"}, axis=1)
    .to_csv("../data/UN_tax_CLR_mean_norm_s.GTDB98.txt", index=False, sep="\t")
)

In [24]:
# CLR-transform the top-species table (including its "other" row). The values
# are transposed before going through `multiplicative_replacement` and `clr`,
# and transposed back afterwards, so the result keeps df_tax_top_species' labels.
df_tax_top_species_clr = pd.DataFrame(
    np.transpose(clr(multiplicative_replacement(df_tax_top_species.T.to_numpy()))),
    index=df_tax_top_species.index,
    columns=df_tax_top_species.columns,
)

In [25]:
# df_tax_t20_species.index.name = "#taxonomy"
# df_tax_t20_species_clr.index.name = "#taxonomy"

In [26]:
# Write the top-species table and its CLR counterpart; in both files the index
# column is headed "#taxonomy".
df_tax_top_species.to_csv(
    "../data/taxonomy_counts_s_top.txt",
    sep="\t",
    index_label="#taxonomy",
)
df_tax_top_species_clr.to_csv(
    "../data/taxonomy_clr_s_top.txt",
    sep="\t",
    index_label="#taxonomy",
)

In [27]:
# np.isfinite(clr_inv(df_tax_t20_species_clr.T))

In [28]:
# Keep only the nutrient rows whose variance exceeds filter_variance's default
# threshold.
df_nutrients_filtered = df_nutrients.pipe(filter_variance)

In [29]:
# Write the filtered nutrient table; the index column is headed "#taxonomy".
df_nutrients_filtered.to_csv(
    "../data/nutrients_top.txt",
    sep="\t",
    index_label="#taxonomy",
)

In [30]:
# subset df_dhydrt to different levels
#
# For depth i = 1..5 the ";"-separated row label is truncated to its first i
# fields, rows sharing a truncated label are summed, and the aggregated table
# is variance-filtered, reduced to its top rows plus "other", CLR-transformed
# and written to ../data/food_clr_L<i>.txt.

for i, level in enumerate(("L1", "L2", "L3", "L4", "L5"), start=1):
    df_lvl = df_dhydrt.copy()
    df_lvl["level"] = [";".join(tax.split(";")[:i]) for tax in df_dhydrt.index]

    # Aggregate once; the original ran this groupby twice and discarded the
    # first result.
    df_grouped = df_lvl.groupby("level").sum()
    df_grouped = filter_variance(df_grouped)
    df_grouped = subset_top_variance(df_grouped)

    # Values are transposed for the replacement + CLR step and transposed back,
    # so the labels of df_grouped are reused unchanged.
    df_grouped = pd.DataFrame(
        np.transpose(clr(multiplicative_replacement(df_grouped.T.values))),
        index=df_grouped.index,
        columns=df_grouped.columns,
    )
    df_grouped.to_csv(f"../data/food_clr_{level}.txt", sep="\t", index_label="#taxonomy")

In [31]:
# kegg modules: variance-filter, keep the top rows plus "other", drop repeated
# column labels, CLR-transform, and write both the plain and the CLR table.
df_kegg = filter_variance(df_kegg)

df_kegg = subset_top_variance(df_kegg)

# Keep only the first occurrence of every column label.
df_kegg_ud = df_kegg.loc[:, ~df_kegg.columns.duplicated()]

# The CLR values are computed from df_kegg_ud, so they must be labelled with
# df_kegg_ud's index/columns. Labelling them with df_kegg's raised a shape
# mismatch whenever a duplicated column had been dropped above.
df_kegg_ud_clr = pd.DataFrame(
    np.transpose(clr(multiplicative_replacement(df_kegg_ud.T.values))),
    index=df_kegg_ud.index,
    columns=df_kegg_ud.columns,
)

df_kegg_ud.to_csv("../data/kegg.txt", sep="\t", index_label="#kegg")
df_kegg_ud_clr.to_csv("../data/kegg_clr.txt", sep="\t", index_label="#kegg")