In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import seaborn as sns

from scipy.stats import pearsonr

sns.set_style("darkgrid")
np.random.seed(930525)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 200)

warnings.simplefilter('once')

%matplotlib inline
%load_ext watermark
%watermark --iversions

numpy   1.19.5
pandas  1.1.4
seaborn 0.11.1



In [2]:
from scipy import interpolate

  and should_run_async(code)


In [3]:
def linear_warp(a, b, s):
	return (s - b) / a

In [4]:
# load up the alignment
df_alignment = pd.read_csv("../results/alignment_results.txt", sep="\t")

In [5]:
# get the best person to align to
df_alignment_error = df_alignment.pivot(index="reference", columns="current", values="error").copy()
df_alignment_error.values[[np.arange(df_alignment_error.shape[0])]*2] = 0
best_aligner = df_alignment_error.mean(axis=1).sort_values().index[0]

  df_alignment_error.values[[np.arange(df_alignment_error.shape[0])]*2] = 0


In [6]:
# load up the data.frame
df_tax_counts = pd.read_csv("../data/taxonomy_clr_s_top.txt", index_col=0, sep="\t")

# load up the splines
import pickle
with open("../results/d_splines.pkl", "rb") as inf:
    d_splines = pickle.load(inf)

# load up the spline dataframe
df_tax_splines = pd.read_csv("../results/tax_clr_splines.csv")

In [7]:
# PARAMETERS
PRESENCE_THRESHOLD = .95
SAMPLING_RATE = 1
OVERLAP_THRESHOLD = .5

max_study_day_no = df_tax_splines["StudyDayNo"].max()
min_study_day_no = df_tax_splines["StudyDayNo"].min()

index_splines = np.arange(min_study_day_no, max_study_day_no + 1, SAMPLING_RATE, dtype="int")

In [8]:
# taxonomy
dfs = []
for (current, taxonomy), df in df_tax_splines.groupby(["UserName", "feature"]):
    spline_current, current_min, current_max = d_splines[current][taxonomy]
    row = df_alignment.query(f"reference == '{best_aligner}' & current == '{current}'")
    index_warp = linear_warp(row['a'].values[0], row['b'].values[0], index_splines)

    ts_current_spline = interpolate.splev(index_warp, spline_current)

    ts_current_spline = np.clip(ts_current_spline, current_min, current_max)
    
    df["temporal_warp_spline"] = ts_current_spline
    dfs.append(df)

df_temporal_warp = pd.concat(dfs)
df_temporal_warp_wide = df_temporal_warp.pivot(index=["UserName", "StudyDayNo"], columns="feature", values="temporal_warp_spline")

df_temporal_warp_wide.columns = ["tax;" + column for column in df_temporal_warp_wide.columns]

In [9]:
# load up the splines
with open("../results/d_kegg_splines.pkl", "rb") as inf:
    d_splines = pickle.load(inf)

# load up the spline dataframe
df_kegg_splines = pd.read_csv("../results/kegg_clr_splines.csv")

In [10]:
# taxonomy
dfs = []
for (current, taxonomy), df in df_kegg_splines.groupby(["UserName", "feature"]):
    spline_current, current_min, current_max = d_splines[current][taxonomy]
    row = df_alignment.query(f"reference == '{best_aligner}' & current == '{current}'")
    index_warp = linear_warp(row['a'].values[0], row['b'].values[0], index_splines)

    ts_current_spline = interpolate.splev(index_warp, spline_current)

    ts_current_spline = np.clip(ts_current_spline, current_min, current_max)
    
    df["temporal_warp_spline"] = ts_current_spline
    dfs.append(df)

df_kegg_temporal_warp = pd.concat(dfs)
df_kegg_temporal_warp_wide = df_kegg_temporal_warp.pivot(index=["UserName", "StudyDayNo"], columns="feature", values="temporal_warp_spline")

df_kegg_temporal_warp_wide.columns = ["kegg;" + column for column in df_kegg_temporal_warp_wide.columns]

In [11]:
# load up the splines
with open("../results/d_nutrient_splines.pkl", "rb") as inf:
    d_splines = pickle.load(inf)

# load up the spline dataframe
df_nutrients_splines = pd.read_csv("../results/nutrients_splines.csv")

In [12]:
# taxonomy
dfs = []
for (current, taxonomy), df in df_nutrients_splines.groupby(["UserName", "feature"]):
    spline_current, current_min, current_max = d_splines[current][taxonomy]
    row = df_alignment.query(f"reference == '{best_aligner}' & current == '{current}'")
    index_warp = linear_warp(row['a'].values[0], row['b'].values[0], index_splines)

    ts_current_spline = interpolate.splev(index_warp, spline_current)

    ts_current_spline = np.clip(ts_current_spline, current_min, current_max)
    
    df["temporal_warp_spline"] = ts_current_spline
    dfs.append(df)

df_nutrients_temporal_warp = pd.concat(dfs)
df_nutrients_temporal_warp_wide = df_nutrients_temporal_warp.pivot(index=["UserName", "StudyDayNo"], columns="feature", values="temporal_warp_spline")

df_nutrients_temporal_warp_wide.columns = ["nutrients;" + column for column in df_nutrients_temporal_warp_wide.columns]

In [13]:
# load up the splines
with open("../results/d_food_splines.pkl", "rb") as inf:
    d_splines = pickle.load(inf)

# load up the spline dataframe
df_food_splines = pd.read_csv("../results/food_L3_clr_splines.csv")

In [14]:
# taxonomy
dfs = []
for (current, taxonomy), df in df_food_splines.groupby(["UserName", "feature"]):
    spline_current, current_min, current_max = d_splines[current][taxonomy]
    row = df_alignment.query(f"reference == '{best_aligner}' & current == '{current}'")
    index_warp = linear_warp(row['a'].values[0], row['b'].values[0], index_splines)

    ts_current_spline = interpolate.splev(index_warp, spline_current)

    ts_current_spline = np.clip(ts_current_spline, current_min, current_max)
    
    df["temporal_warp_spline"] = ts_current_spline
    dfs.append(df)

df_food_temporal_warp = pd.concat(dfs)
df_food_temporal_warp_wide = df_food_temporal_warp.pivot(index=["UserName", "StudyDayNo"], columns="feature", values="temporal_warp_spline")

df_food_temporal_warp_wide.columns = ["food;" + column for column in df_food_temporal_warp_wide.columns]

In [15]:
# shift by one day
df_day_plus_1 = df_temporal_warp_wide.groupby("UserName").shift(1)

df_day_plus_1.columns = ["day_plus_one;" + column for column in df_day_plus_1.columns]

In [16]:
df_network = pd.concat([df_temporal_warp_wide, df_food_temporal_warp_wide, df_nutrients_temporal_warp_wide, df_kegg_temporal_warp_wide, df_day_plus_1], axis=1)

In [18]:
df_mapping = pd.read_csv("../data/SampleID_map.txt", sep='\t', index_col=0)

  and should_run_async(code)


In [20]:
df_mapping.columns

  and should_run_async(code)


Index(['UserName', 'StudyDayNo', 'StudyDate', 'Gender', 'Age', 'Weight',
       'Height', 'BMI', 'Supplement', 'Waist.Circumference', 'Study.Status',
       'oilGrams.assigned', 'fecal.status', 'fecal.time', 'BMI.1',
       'Weight.Change', 'Plate', 'SampleOrder', 'SampleType', 'Timing',
       'Activity.Factor', 'Medications', 'Dietary.Supp'],
      dtype='object')

In [50]:
mapping_columns = ['UserName', 'StudyDayNo', 'Gender', 'Age', 'Weight',
       'Height', 'BMI', 'Supplement',
       'oilGrams.assigned', 'Timing',
       'Activity.Factor']

  and should_run_async(code)


In [51]:
df_merged = pd.merge(df_network.reset_index(), df_mapping[mapping_columns], how="left", on=["UserName", "StudyDayNo"])

In [52]:
df_merged

Unnamed: 0,UserName,StudyDayNo,tax;k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;s__Bifidobacterium adolescentis,tax;k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;s__Bifidobacterium longum,tax;k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;s__Bifidobacterium pseudocatenulatum,tax;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__Bacteroides caccae,tax;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__Bacteroides cellulosilyticus,tax;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__Bacteroides coprocola,tax;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__Bacteroides coprophilus,tax;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__Bacteroides dorei,...,day_plus_one;tax;other,Gender,Age,Weight,Height,BMI,Supplement,oilGrams.assigned,Timing,Activity.Factor
0,MCTs01,1,-0.558717,-1.876218,-3.373464,3.351226,-0.940603,-2.827708,-0.162761,3.864673,...,,F,25.9,74.6,172.7,25.0,EVOO,0.0,Pre,1.375
1,MCTs01,2,1.678151,0.311172,-0.972404,2.639542,-2.071017,-0.751167,-1.797724,4.233223,...,3.335170,F,25.9,74.6,172.7,25.0,EVOO,0.0,Pre,1.375
2,MCTs01,3,4.889820,3.102096,1.856659,1.766990,-3.063322,0.039020,-3.533326,3.733083,...,4.571694,F,25.9,74.6,172.7,25.0,EVOO,0.0,Pre,1.375
3,MCTs01,4,2.421892,0.653380,-1.416144,1.667695,-2.611383,-1.083438,-2.568824,3.896414,...,6.189635,F,25.9,74.6,172.7,25.0,EVOO,0.0,Pre,1.375
4,MCTs01,5,2.122291,1.584790,-0.406261,2.467703,-1.069556,-0.189197,-1.312502,4.923476,...,4.716797,F,25.9,74.6,172.7,25.0,EVOO,0.0,Pre,1.375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507,MCTs37,12,-0.701550,1.552833,1.783357,0.654891,-0.895706,2.615840,-1.232180,3.475548,...,6.108935,M,45.2,84.9,176.5,27.3,MCT,14.0,Post,1.325
508,MCTs37,13,-0.346267,2.558812,2.268693,0.123735,-1.587980,2.339308,-1.513872,2.774626,...,6.334702,M,45.2,84.9,176.5,27.3,MCT,14.0,Post,1.325
509,MCTs37,14,-0.555774,2.344259,1.776265,0.160820,0.113811,0.592743,-0.862887,1.487177,...,5.496566,M,45.2,84.9,176.5,27.3,MCT,14.0,Post,1.325
510,MCTs37,15,-1.028109,1.646889,1.079863,0.587225,1.961449,-0.223215,-2.512682,1.201431,...,4.104731,M,45.2,84.9,176.5,27.3,MCT,14.0,Post,1.325


In [49]:
df_merged.dropna()

  and should_run_async(code)


Unnamed: 0,UserName,StudyDayNo,tax;k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;s__Bifidobacterium adolescentis,tax;k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;s__Bifidobacterium longum,tax;k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;s__Bifidobacterium pseudocatenulatum,tax;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__Bacteroides caccae,tax;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__Bacteroides cellulosilyticus,tax;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__Bacteroides coprocola,tax;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__Bacteroides coprophilus,tax;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__Bacteroides dorei,...,Gender,Age,Weight,Height,BMI,Supplement,Waist.Circumference,oilGrams.assigned,Timing,Activity.Factor
17,MCTs03,2,-1.600666,2.935288,2.609237,2.632534,-1.993709,-1.377523,-0.553347,0.536454,...,M,25.4,91.2,185.1,26.6,MCT,104.0,0.0,Pre,1.550
18,MCTs03,3,-2.442367,1.762326,1.926448,3.503185,-1.056072,-1.263712,-1.161433,0.802177,...,M,25.4,91.2,185.1,26.6,MCT,104.0,0.0,Pre,1.550
19,MCTs03,4,-3.492865,1.040928,0.925975,3.027756,-1.179230,-1.719798,-1.515703,0.672938,...,M,25.4,91.2,185.1,26.6,MCT,104.0,0.0,Pre,1.550
20,MCTs03,5,-2.745418,3.122667,3.090227,4.722442,-0.086158,-1.257341,-1.164968,1.609457,...,M,25.4,91.2,185.1,26.6,MCT,104.0,0.0,Pre,1.550
21,MCTs03,6,-1.613635,4.100833,4.297451,5.539751,0.331912,-0.587733,-0.600828,2.301405,...,M,25.4,91.2,185.1,26.6,MCT,104.0,0.0,Pre,1.550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507,MCTs37,12,-0.701550,1.552833,1.783357,0.654891,-0.895706,2.615840,-1.232180,3.475548,...,M,45.2,84.9,176.5,27.3,MCT,97.5,14.0,Post,1.325
508,MCTs37,13,-0.346267,2.558812,2.268693,0.123735,-1.587980,2.339308,-1.513872,2.774626,...,M,45.2,84.9,176.5,27.3,MCT,97.5,14.0,Post,1.325
509,MCTs37,14,-0.555774,2.344259,1.776265,0.160820,0.113811,0.592743,-0.862887,1.487177,...,M,45.2,84.9,176.5,27.3,MCT,97.5,14.0,Post,1.325
510,MCTs37,15,-1.028109,1.646889,1.079863,0.587225,1.961449,-0.223215,-2.512682,1.201431,...,M,45.2,84.9,176.5,27.3,MCT,97.5,14.0,Post,1.325
