In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import seaborn as sns

from scipy.stats import pearsonr

# Plot styling, plus a fixed NumPy seed so the np.random-based resampling
# further down gives the same result on every run.
sns.set_style("darkgrid")
np.random.seed(930525)
# Show at most 20 columns / 200 rows when a DataFrame is displayed.
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 200)

# Report each distinct warning only the first time it is raised.
warnings.simplefilter('once')

%matplotlib inline
%load_ext watermark
%watermark --iversions

numpy     : 1.21.0
seaborn   : 0.11.1
pandas    : 1.3.0
matplotlib: 3.4.2



In [2]:
from skbio.stats.composition import clr, clr_inv

from multiplicative_replacement import multiplicative_replacement

  collections.Sequence, SkbioObject):
  from collections import Iterable


In [3]:
# Alternative input kept for reference (file name suggests an already CLR-transformed table):
# df_tax = pd.read_csv("../data/taxonomy_clr_s.txt", sep='\t', index_col=0)
# Tab-separated table; the first column becomes the row index. The remaining
# columns are matched against df_mapping's index further down.
df_tax = pd.read_csv("../data/taxonomy_counts_s.txt", sep="\t", index_col=0)

In [4]:
# nutrients
# Tab-separated table; the first column becomes the row index. Columns are later
# restricted to those also present in df_tax.
df_nutrients = pd.read_csv("../data/nutr_65.txt", sep="\t", index_col=0)

In [5]:
# food totals
df_dhydrt = pd.read_csv("../data/diet.dhydrt.txt", sep="\t", index_col=0)

# Index the rows by the `taxonomy` column (the column itself stays in the frame
# at this point). The bare `df_dhydrt.reset_index(drop=True)` call that used to
# precede this line was removed: its return value was never assigned, so it had
# no effect on df_dhydrt and only suggested a step that did not happen.
df_dhydrt.index = df_dhydrt["taxonomy"]

In [6]:
# kegg modules
df_kegg = pd.read_csv(
    "../data/ko_table.txt",
    sep="\t",
    index_col=0,
)

# Keep only the first three '.'-separated fields of every column name so the
# names can be compared with the column names of the other tables.
df_kegg.columns = [
    ".".join(name.split('.')[:3])
    for name in df_kegg.columns
]

In [7]:
# Sample mapping table: its index is compared with the df_tax column names below,
# and its 'UserName' column is used to exclude selected users.
df_mapping = pd.read_csv("../data/SampleID_map.txt", sep='\t', index_col=0)

In [8]:
# Drop every mapping row whose UserName is one of the listed soylent users.
soylent_ids = {"MCTs11", "MCTs12"}
is_soylent_user = df_mapping['UserName'].isin(soylent_ids)
soylent_mask = (~is_soylent_user).tolist()  # True = keep the row
df_mapping = df_mapping.loc[soylent_mask, :].copy()

In [9]:
# dropping soylent samples
# Keep only the df_tax columns that are still listed in the filtered mapping table.
# The lookup set is built once up front instead of once per column.
mapped_samples = set(df_mapping.index)
col_mask = [col in mapped_samples for col in df_tax.columns]
df_tax = df_tax.loc[:, col_mask].copy()

In [10]:
# Restrict the other tables to the columns that survived in df_tax. Columns that
# are not df_tax column names are dropped as well (presumably this includes the
# `taxonomy` column of df_dhydrt -- verify).
# The lookup set is built once instead of once per column of every table.
tax_samples = set(df_tax.columns)

col_mask = [col in tax_samples for col in df_nutrients.columns]
df_nutrients = df_nutrients.loc[:, col_mask].copy()

col_mask = [col in tax_samples for col in df_dhydrt.columns]
df_dhydrt = df_dhydrt.loc[:, col_mask].copy()

col_mask = [col in tax_samples for col in df_kegg.columns]
df_kegg = df_kegg.loc[:, col_mask].copy()

In [11]:
from skbio.stats.composition import closure

# rarefy counts
# Every column is resampled to the same total: int(median of the column sums)
# row indices are drawn (np.random.choice, i.e. with replacement) with
# probabilities given by closure() of the column, and the draws are tallied
# per row.
median_depth = df_tax.sum(axis=0).median()
df_tax_counts_rarefied = df_tax.copy()

n_rows = df_tax.shape[0]
n_draws = int(median_depth)

for column in df_tax.columns:
    proportions = closure(df_tax[column].values)
    counts = np.random.choice(n_rows, size=n_draws, p=proportions)
    col = np.bincount(counts, minlength=n_rows)
    df_tax_counts_rarefied[column] = col

In [12]:
# Keep the row labels that do not contain the substring "NA".
# NOTE(review): this is a plain substring test, so any label that merely contains
# the letters "NA" is excluded too -- confirm that is intended.
species = [tax for tax in df_tax.index if "NA" not in tax]

# Select those rows from the rarefied table (not from the raw df_tax).
df_tax_species = df_tax_counts_rarefied.loc[species, :].copy()

In [13]:
def subset_top_variance(df, ratio=.8, max_n=50):
    """Keep the most variable rows of ``df`` and pool all other rows into ``"other"``.

    Rows are ranked by their standard deviation across columns, largest first.
    The smallest top-``n`` set (``1 <= n < max_n``) whose values account for more
    than ``ratio`` of the grand total of ``df`` is kept; if no such ``n`` exists,
    the top ``max_n - 1`` rows are kept. The remaining rows are summed
    column-wise into one extra row labelled ``"other"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose rows are ranked and subset.
    ratio : float, default 0.8
        Fraction of the grand total that the kept rows must exceed.
    max_n : int, default 50
        Exclusive upper bound on the number of rows kept; must be at least 2.

    Returns
    -------
    pandas.DataFrame
        The kept rows (in ranking order) followed by the ``"other"`` row.

    Raises
    ------
    ValueError
        If ``max_n`` is smaller than 2, because no row could then be selected.
    """
    if max_n < 2:
        raise ValueError("max_n must be at least 2 so that at least one row can be selected")

    # The ranking does not depend on n, so it is computed once instead of on
    # every iteration. (An earlier variant ranked by |mean| / std instead.)
    ranked_labels = df.std(axis=1).sort_values(ascending=False).index

    for n in range(1, max_n):
        top_labels = ranked_labels[:n]
        df_tn = df.loc[top_labels, :].copy()
        other = df.loc[~df.index.isin(top_labels), :].sum(axis=0)

        kept_total = df_tn.sum(axis=0).sum()
        if kept_total / (kept_total + other.sum()) > ratio:
            break

    # DataFrame.append was removed in pandas 2.0; build the extra row explicitly
    # and concatenate it. The index name is carried over so the result keeps the
    # same index name as the input selection.
    other.name = "other"
    other_row = other.to_frame().T
    other_row.index.name = df_tn.index.name

    return pd.concat([df_tn, other_row])

In [14]:
from sklearn.feature_selection import VarianceThreshold

def filter_variance(df, vt=.8 * (1 - .8)):
    """Return a copy of ``df`` keeping only rows whose variance exceeds ``vt``.

    The variance of every row is taken across the columns of ``df`` (the
    frame is transposed before being handed to scikit-learn's
    ``VarianceThreshold``). Rows whose fitted variance is strictly greater
    than ``vt`` are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose rows are filtered.
    vt : float, default ``.8 * (1 - .8)``
        Variance threshold. The default has the form ``p * (1 - p)`` with
        ``p = .8``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` restricted to the rows that pass the threshold.
    """
    transposed = df.T
    selector = VarianceThreshold(threshold=vt).fit(transposed)
    passes = selector.variances_ > vt
    kept = transposed.loc[:, passes]
    return kept.T.copy()

In [15]:
# Drop low-variance rows first (default threshold of filter_variance) ...
df_tax_species = filter_variance(df_tax_species)

# ... then keep the highest-variance rows and pool the remainder into "other".
df_tax_top_species = subset_top_variance(df_tax_species)

In [16]:
# Replace the table by its CLR transform. The values are transposed before
# multiplicative_replacement / clr and transposed back afterwards, while the
# row and column labels are kept.
# NOTE: this overwrites df_tax_top_species; the untransformed values are no
# longer available under this name afterwards.
df_tax_top_species = pd.DataFrame(
    clr(multiplicative_replacement(df_tax_top_species.T.values)).T,
    index=df_tax_top_species.index,
    columns=df_tax_top_species.columns,
)

In [17]:
# df_tax_t20_species.index.name = "#taxonomy"
# df_tax_t20_species_clr.index.name = "#taxonomy"

In [18]:
# df_tax_top_species holds CLR-transformed values at this point (it was
# overwritten by the CLR step), so writing it to the "counts" file would store
# CLR values under a counts file name. Rebuild the untransformed table from the
# variance-filtered df_tax_species; subset_top_variance is deterministic, so
# this reproduces the table that the CLR transform was applied to.
df_tax_top_species_counts = subset_top_variance(df_tax_species)
assert df_tax_top_species_counts.index.equals(df_tax_top_species.index), \
    "count and CLR tables must describe the same rows"

df_tax_top_species_counts.to_csv("../data/taxonomy_counts_s_top.txt", sep="\t", index_label="#taxonomy")
df_tax_top_species.to_csv("../data/taxonomy_clr_s_top.txt", sep="\t", index_label="#taxonomy")

In [19]:
# np.isfinite(clr_inv(df_tax_t20_species_clr.T))

In [20]:
# Only the variance filter is applied to the nutrient table (no top-N subset,
# no CLR transform).
df_nutrients_filtered = filter_variance(df_nutrients)

In [21]:
# NOTE(review): the index column is labelled "#taxonomy" here as well, although
# this table presumably holds nutrients -- confirm downstream readers expect it.
df_nutrients_filtered.to_csv("../data/nutrients_top.txt", sep="\t", index_label="#taxonomy")

In [22]:
# subset df_dhydrt to different levels

# For each depth i, row labels are truncated to their first i ';'-separated
# fields and rows sharing a truncated label are summed. Each aggregated table
# then goes through the variance filter, the top-variance subset and the CLR
# transform before being written to its own file.
# The group-by sum is now computed once per level; it used to be computed
# twice, with the first result thrown away.
for i, level in enumerate(("L1", "L2", "L3", "L4", "L5"), start=1):
    df_lvl = df_dhydrt.assign(
        level=[";".join(tax.split(";")[:i]) for tax in df_dhydrt.index]
    )
    df_grouped = df_lvl.groupby("level").sum()
    df_grouped = filter_variance(df_grouped)
    df_grouped = subset_top_variance(df_grouped)
    df_grouped = pd.DataFrame(
        np.transpose(clr(multiplicative_replacement(df_grouped.T.values))),
        index=df_grouped.index,
        columns=df_grouped.columns,
    )
    df_grouped.to_csv(f"../data/food_clr_{level}.txt", sep="\t", index_label="#taxonomy")

In [23]:
# kegg modules
# Variance filter, then top-variance subset with the pooled "other" row.
df_kegg = subset_top_variance(filter_variance(df_kegg))

# CLR transform; values are transposed before the transform and back afterwards.
df_kegg = pd.DataFrame(
    clr(multiplicative_replacement(df_kegg.T.values)).T,
    index=df_kegg.index,
    columns=df_kegg.columns,
)

# Only the first occurrence of each repeated column name is exported.
is_repeated = df_kegg.columns.duplicated()
df_kegg_ud = df_kegg.loc[:, ~is_repeated]
df_kegg_ud.to_csv("../data/kegg_clr.txt", sep="\t", index_label="#kegg")

In [24]:
# Display the CLR-transformed table (the frame from before repeated column names
# were dropped for the export).
df_kegg

Unnamed: 0_level_0,MCT.f.0532,MCT.f.0553,MCT.f.0061,MCT.f.0319,MCT.f.0299,MCT.f.0318,MCT.f.0489,MCT.f.0311,MCT.f.0432,MCT.f.0624,...,MCT.f.0526,MCT.f.0270,MCT.f.0099,MCT.f.0005,MCT.f.0167,MCT.f.0389,MCT.f.0454,MCT.f.0242,MCT.f.0125,MCT.f.0436
#Gene ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
K02014,2.426396,3.041892,2.370004,3.216294,2.47324,3.168622,-0.069964,3.05581,2.719114,2.770341,...,2.915498,3.135258,3.07166,4.290572,3.522963,3.936031,2.379349,4.161072,5.138174,4.00471
K18138,1.35667,1.330303,0.836759,1.341033,1.498997,1.287467,-0.021955,1.326066,1.229041,1.468638,...,-2.037595,-0.039514,0.844038,2.751458,1.894866,1.703304,0.077927,3.21363,0.026507,2.360947
K03199,0.103476,0.455505,1.585122,0.816295,0.886539,0.878293,-1.543238,0.842905,0.323948,-0.594513,...,-4.154215,1.277142,-0.323567,2.011723,-2.169749,1.662392,-1.044216,-1.689388,4.367497,2.35036
K03169,0.434755,0.53105,1.466822,1.292553,0.570021,1.263382,1.221839,1.181862,0.561125,-0.26679,...,1.229795,-0.093856,-0.323567,-1.157647,-2.199179,2.3904,-0.569758,3.624875,4.370659,1.631266
K16089,0.527894,0.378142,0.478211,0.304734,0.324012,0.292256,-2.219066,0.318931,0.544089,0.780745,...,1.253591,-0.095244,0.487363,-1.426159,-2.079432,2.392916,-0.058932,-1.119061,-0.261175,1.652408
K18908,0.272619,0.225801,0.056997,0.345298,0.708858,0.334279,1.265899,0.318931,0.382176,0.274839,...,-1.049493,1.010775,0.503112,2.095434,1.487976,2.399611,0.532552,2.504742,0.395068,2.36359
K04763,1.179684,0.544025,1.10348,0.266286,0.570021,0.197804,0.095828,0.112799,0.330351,0.047341,...,1.265346,1.291156,1.314042,-0.502302,1.520611,-0.923937,0.398493,3.634156,0.387521,-0.585957
K01955,1.10264,0.104849,0.130425,-0.042653,0.289406,0.056154,0.869018,0.082216,0.116473,0.397835,...,-1.687143,-0.130838,-0.477718,2.743046,0.752667,1.661431,-0.821072,-1.817906,-1.061294,2.34938
K03046,-0.092841,0.081726,0.223318,0.313891,0.383435,0.364011,1.495916,0.491813,0.165326,0.123921,...,-2.339983,1.27686,-0.822558,-2.202871,0.741644,1.65352,-1.285378,3.624484,-1.550842,-2.587992
K03555,0.905301,0.033813,0.496787,0.131387,0.870913,-0.071186,0.201188,-0.025909,0.301207,-0.063885,...,1.249952,0.985388,-0.221784,2.018165,0.766181,1.66928,-0.626916,-1.669486,-0.781709,2.350492
