In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import seaborn as sns

from scipy.stats import pearsonr

# Seed NumPy's global RNG (this does not seed the stdlib `random` module).
np.random.seed(930525)

# Plot styling.
sns.set_style("darkgrid")

# pandas display limits for wide / long frames.
for option_name, option_value in (
    ('display.max_columns', 20),
    ('display.max_rows', 200),
):
    pd.set_option(option_name, option_value)

# Show each distinct warning only the first time it is raised.
warnings.simplefilter('once')

%matplotlib inline
%load_ext watermark
%watermark --iversions

pandas  1.1.4
numpy   1.19.5
seaborn 0.11.1



In [2]:
from scipy import interpolate

  and should_run_async(code)


In [3]:
def linear_warp(a, b, s):
    """Invert the linear map ``t -> a * t + b`` at ``s``.

    Parameters
    ----------
    a : slope of the linear map (divisor).
    b : intercept of the linear map.
    s : value(s) to map back; scalars and NumPy arrays both work because only
        ``-`` and ``/`` are applied.

    Returns
    -------
    ``(s - b) / a``, with the same shape as ``s``.
    """
    offset = s - b
    return offset / a

In [4]:
# Read the tab-separated alignment results into a DataFrame.
df_alignment = pd.read_csv(
    "../results/alignment_results.txt",
    sep="\t",
)

In [5]:
import random

# The stdlib `random` module is NOT seeded by np.random.seed(), so the split is
# drawn from a dedicated, explicitly seeded generator to make it reproducible.
TRAIN_SPLIT_SEED = 930525
TRAIN_FRACTION = 0.8

user_names = list(df_alignment["reference"].unique())

# Sample (without replacement) 80% of the distinct reference names.
user_names_train = random.Random(TRAIN_SPLIT_SEED).sample(
    user_names, k=round(len(user_names) * TRAIN_FRACTION)
)

# Boolean row masks: True where the reference / current name is a training user.
mask_reference = df_alignment["reference"].isin(user_names_train).to_numpy()
mask_current = df_alignment["current"].isin(user_names_train).to_numpy()

# The restriction to training users is currently disabled; every alignment row is used.
# df_alignment_train_error = df_alignment.copy().loc[mask_reference & mask_current]
df_alignment_train_error = df_alignment.copy()

In [6]:
# get the best person to align to
# Wide error matrix: one row per "reference", one column per "current".
df_alignment_error = df_alignment_train_error.pivot(index="reference", columns="current", values="error")

# Zero the main diagonal. This is done on an explicit copy of the values with
# np.fill_diagonal: indexing with a list of index arrays (values[[idx] * 2]) is
# deprecated NumPy behaviour, and writing through `.values` only works when it
# happens to be a view of the frame's data.
error_matrix = df_alignment_error.to_numpy(copy=True)
np.fill_diagonal(error_matrix, 0)
df_alignment_error = pd.DataFrame(
    error_matrix,
    index=df_alignment_error.index,
    columns=df_alignment_error.columns,
)

  df_alignment_error.values[[np.arange(df_alignment_error.shape[0])]*2] = 0


In [7]:
# df_test = df_alignment_error.copy().loc[~mask_train]

# Row-wise mean error per reference; the reference with the smallest mean is kept.
mean_error_per_reference = df_alignment_error.mean(axis="columns")
best_aligner = mean_error_per_reference.sort_values(ascending=True).index[0]

In [8]:
import pickle

# Tab-separated table; its first column becomes the row index.
df_tax_counts = pd.read_csv(
    "../data/taxonomy_clr_s_top.txt",
    sep="\t",
    index_col=0,
)

# Unpickle the spline lookup (indexed later as d_splines[user][feature]).
# pickle can execute arbitrary code while loading: only open trusted files.
with open("../results/d_splines.pkl", "rb") as spline_file:
    d_splines = pickle.load(spline_file)

# Long-format spline table (has UserName / feature / StudyDayNo / spline columns
# according to how it is used further down).
df_tax_splines = pd.read_csv("../results/tax_clr_splines.csv")

In [9]:
# PARAMETERS
PRESENCE_THRESHOLD = 0.95
OVERLAP_THRESHOLD = 0.5
SAMPLING_RATE = 1

# Range of study days present in the taxonomy spline table.
study_day_numbers = df_tax_splines["StudyDayNo"]
min_study_day_no = study_day_numbers.min()
max_study_day_no = study_day_numbers.max()

# Integer grid from the first to the last study day (inclusive), one point
# every SAMPLING_RATE days.
index_splines = np.arange(
    min_study_day_no,
    max_study_day_no + 1,
    SAMPLING_RATE,
    dtype="int",
)

In [10]:
# taxonomy
# Alignment rows against the best aligner, selected once up front instead of
# re-running a string-built df_alignment.query(...) for every (user, feature) group.
df_alignment_to_best = df_alignment.loc[df_alignment["reference"] == best_aligner]

# The warped sampling grid depends only on the user, so it is cached per user.
warped_index_by_user = {}

dfs = []
for (current, taxonomy), df in df_tax_splines.groupby(["UserName", "feature"]):
    spline_current, current_min, current_max = d_splines[current][taxonomy]

    if current not in warped_index_by_user:
        row = df_alignment_to_best.loc[df_alignment_to_best["current"] == current]
        warped_index_by_user[current] = linear_warp(row["a"].values[0], row["b"].values[0], index_splines)
    index_warp = warped_index_by_user[current]

    # Evaluate the user's spline on the warped grid, then clamp the result to
    # the (min, max) bounds stored next to the spline.
    ts_current_spline = interpolate.splev(index_warp, spline_current)
    ts_current_spline = np.clip(ts_current_spline, current_min, current_max)

    # assign() returns a new frame, so the group handed out by groupby is not
    # modified. The group needs one row per entry of index_splines, otherwise
    # the length mismatch raises.
    dfs.append(df.assign(temporal_warp_spline=ts_current_spline))

df_temporal_warp = pd.concat(dfs)

# Wide tables indexed by (UserName, StudyDayNo) with one column per feature:
# warped values and the original "spline" values respectively.
df_temporal_warp_wide = df_temporal_warp.pivot(index=["UserName", "StudyDayNo"], columns="feature", values="temporal_warp_spline")
df_tax_wide = df_temporal_warp.pivot(index=["UserName", "StudyDayNo"], columns="feature", values="spline")

# Prefix the feature columns so their origin stays identifiable after concatenation.
df_temporal_warp_wide.columns = ["tax;" + column for column in df_temporal_warp_wide.columns]
df_tax_wide.columns = ["tax;" + column for column in df_tax_wide.columns]

In [11]:
# Unpickle the KEGG spline lookup; this rebinds `d_splines`, replacing whatever
# it held before. pickle can execute arbitrary code: only open trusted files.
with open("../results/d_kegg_splines.pkl", "rb") as spline_file:
    d_splines = pickle.load(spline_file)

# Long-format KEGG spline table.
df_kegg_splines = pd.read_csv(
    "../results/kegg_clr_splines.csv",
)

In [12]:
# KEGG features
# Alignment rows against the best aligner, selected once up front instead of
# re-running a string-built df_alignment.query(...) for every (user, feature) group.
df_alignment_to_best = df_alignment.loc[df_alignment["reference"] == best_aligner]

# The warped sampling grid depends only on the user, so it is cached per user.
warped_index_by_user = {}

dfs = []
for (current, feature), df in df_kegg_splines.groupby(["UserName", "feature"]):
    spline_current, current_min, current_max = d_splines[current][feature]

    if current not in warped_index_by_user:
        row = df_alignment_to_best.loc[df_alignment_to_best["current"] == current]
        warped_index_by_user[current] = linear_warp(row["a"].values[0], row["b"].values[0], index_splines)
    index_warp = warped_index_by_user[current]

    # Evaluate the user's spline on the warped grid, then clamp the result to
    # the (min, max) bounds stored next to the spline.
    ts_current_spline = interpolate.splev(index_warp, spline_current)
    ts_current_spline = np.clip(ts_current_spline, current_min, current_max)

    # assign() returns a new frame, so the group handed out by groupby is not
    # modified. The group needs one row per entry of index_splines, otherwise
    # the length mismatch raises.
    dfs.append(df.assign(temporal_warp_spline=ts_current_spline))

df_kegg_temporal_warp = pd.concat(dfs)

# Wide tables indexed by (UserName, StudyDayNo) with one column per feature:
# warped values and the original "spline" values respectively.
df_kegg_temporal_warp_wide = df_kegg_temporal_warp.pivot(index=["UserName", "StudyDayNo"], columns="feature", values="temporal_warp_spline")
df_kegg_wide = df_kegg_temporal_warp.pivot(index=["UserName", "StudyDayNo"], columns="feature", values="spline")

# Prefix the feature columns so their origin stays identifiable after concatenation.
df_kegg_temporal_warp_wide.columns = ["kegg;" + column for column in df_kegg_temporal_warp_wide.columns]
df_kegg_wide.columns = ["kegg;" + column for column in df_kegg_wide.columns]

In [13]:
# Unpickle the nutrient spline lookup; this rebinds `d_splines`, replacing
# whatever it held before. pickle can execute arbitrary code: only open trusted files.
with open("../results/d_nutrient_splines.pkl", "rb") as spline_file:
    d_splines = pickle.load(spline_file)

# Long-format nutrient spline table.
df_nutrients_splines = pd.read_csv(
    "../results/nutrients_splines.csv",
)

In [14]:
# nutrient features
# Alignment rows against the best aligner, selected once up front instead of
# re-running a string-built df_alignment.query(...) for every (user, feature) group.
df_alignment_to_best = df_alignment.loc[df_alignment["reference"] == best_aligner]

# The warped sampling grid depends only on the user, so it is cached per user.
warped_index_by_user = {}

dfs = []
for (current, feature), df in df_nutrients_splines.groupby(["UserName", "feature"]):
    spline_current, current_min, current_max = d_splines[current][feature]

    if current not in warped_index_by_user:
        row = df_alignment_to_best.loc[df_alignment_to_best["current"] == current]
        warped_index_by_user[current] = linear_warp(row["a"].values[0], row["b"].values[0], index_splines)
    index_warp = warped_index_by_user[current]

    # Evaluate the user's spline on the warped grid, then clamp the result to
    # the (min, max) bounds stored next to the spline.
    ts_current_spline = interpolate.splev(index_warp, spline_current)
    ts_current_spline = np.clip(ts_current_spline, current_min, current_max)

    # assign() returns a new frame, so the group handed out by groupby is not
    # modified. The group needs one row per entry of index_splines, otherwise
    # the length mismatch raises.
    dfs.append(df.assign(temporal_warp_spline=ts_current_spline))

df_nutrients_temporal_warp = pd.concat(dfs)

# Wide tables indexed by (UserName, StudyDayNo) with one column per feature:
# warped values and the original "spline" values respectively.
df_nutrients_temporal_warp_wide = df_nutrients_temporal_warp.pivot(index=["UserName", "StudyDayNo"], columns="feature", values="temporal_warp_spline")
df_nutrients_wide = df_nutrients_temporal_warp.pivot(index=["UserName", "StudyDayNo"], columns="feature", values="spline")

# Prefix the feature columns so their origin stays identifiable after concatenation.
df_nutrients_temporal_warp_wide.columns = ["nutrients;" + column for column in df_nutrients_temporal_warp_wide.columns]
df_nutrients_wide.columns = ["nutrients;" + column for column in df_nutrients_wide.columns]

In [15]:
# Unpickle the food spline lookup; this rebinds `d_splines`, replacing whatever
# it held before. pickle can execute arbitrary code: only open trusted files.
with open("../results/d_food_splines.pkl", "rb") as spline_file:
    d_splines = pickle.load(spline_file)

# Long-format food spline table.
df_food_splines = pd.read_csv(
    "../results/food_L3_clr_splines.csv",
)

In [16]:
# food features
# Alignment rows against the best aligner, selected once up front instead of
# re-running a string-built df_alignment.query(...) for every (user, feature) group.
df_alignment_to_best = df_alignment.loc[df_alignment["reference"] == best_aligner]

# The warped sampling grid depends only on the user, so it is cached per user.
warped_index_by_user = {}

dfs = []
for (current, feature), df in df_food_splines.groupby(["UserName", "feature"]):
    spline_current, current_min, current_max = d_splines[current][feature]

    if current not in warped_index_by_user:
        row = df_alignment_to_best.loc[df_alignment_to_best["current"] == current]
        warped_index_by_user[current] = linear_warp(row["a"].values[0], row["b"].values[0], index_splines)
    index_warp = warped_index_by_user[current]

    # Evaluate the user's spline on the warped grid, then clamp the result to
    # the (min, max) bounds stored next to the spline.
    ts_current_spline = interpolate.splev(index_warp, spline_current)
    ts_current_spline = np.clip(ts_current_spline, current_min, current_max)

    # assign() returns a new frame, so the group handed out by groupby is not
    # modified. The group needs one row per entry of index_splines, otherwise
    # the length mismatch raises.
    dfs.append(df.assign(temporal_warp_spline=ts_current_spline))

df_food_temporal_warp = pd.concat(dfs)

# Wide tables indexed by (UserName, StudyDayNo) with one column per feature:
# warped values and the original "spline" values respectively.
df_food_temporal_warp_wide = df_food_temporal_warp.pivot(index=["UserName", "StudyDayNo"], columns="feature", values="temporal_warp_spline")
df_food_wide = df_food_temporal_warp.pivot(index=["UserName", "StudyDayNo"], columns="feature", values="spline")

# Prefix the feature columns so their origin stays identifiable after concatenation.
df_food_temporal_warp_wide.columns = ["food;" + column for column in df_food_temporal_warp_wide.columns]
df_food_wide.columns = ["food;" + column for column in df_food_wide.columns]

In [17]:
# Within each user, shift(-1) moves the following row's values onto the current
# row, so every row carries the values of that user's next row (the next study
# day when rows are consecutive days). The last row of each user becomes NaN.
warp_by_user = df_temporal_warp_wide.groupby("UserName")
df_day_plus_1_warp = warp_by_user.shift(-1)
df_day_plus_1_warp.columns = [
    "day_plus_one;" + column
    for column in df_day_plus_1_warp.columns
]

# Same shift for the unwarped taxonomy table.
tax_by_user = df_tax_wide.groupby("UserName")
df_day_plus_1 = tax_by_user.shift(-1)
df_day_plus_1.columns = [
    "day_plus_one;" + column
    for column in df_day_plus_1.columns
]

In [18]:
# Column-wise concatenation of the warped tables (taxonomy, food, nutrients,
# KEGG, next-row taxonomy), aligned on their shared index.
warped_tables = [
    df_temporal_warp_wide,
    df_food_temporal_warp_wide,
    df_nutrients_temporal_warp_wide,
    df_kegg_temporal_warp_wide,
    df_day_plus_1_warp,
]
df_warp_network = pd.concat(warped_tables, axis=1)

# Same layout for the unwarped tables.
unwarped_tables = [
    df_tax_wide,
    df_food_wide,
    df_nutrients_wide,
    df_kegg_wide,
    df_day_plus_1,
]
df_network = pd.concat(unwarped_tables, axis=1)

In [19]:
# Tab-separated mapping table; its first column becomes the row index.
df_mapping = pd.read_csv(
    "../data/SampleID_map.txt",
    index_col=0,
    sep='\t',
)

In [20]:
# Show the column labels available in df_mapping.
df_mapping.columns

Index(['UserName', 'StudyDayNo', 'StudyDate', 'Gender', 'Age', 'Weight',
       'Height', 'BMI', 'Supplement', 'Waist.Circumference', 'Study.Status',
       'oilGrams.assigned', 'fecal.status', 'fecal.time', 'BMI.1',
       'Weight.Change', 'Plate', 'SampleOrder', 'SampleType', 'Timing',
       'Activity.Factor', 'Medications', 'Dietary.Supp'],
      dtype='object')

In [21]:
# Subset of df_mapping columns that is merged onto the feature tables.
# 'UserName' and 'StudyDayNo' are the merge keys.
mapping_columns = [
    'UserName',
    'StudyDayNo',
    'Gender',
    'Age',
    'Weight',
    'Height',
    'BMI',
    'Supplement',
    'oilGrams.assigned',
    'Timing',
    'Activity.Factor',
]

In [22]:
# Left-join the selected mapping columns onto both feature tables, matching on
# user and study day; feature rows without a mapping row keep NaN metadata.
merge_keys = ["UserName", "StudyDayNo"]
df_mapping_selected = df_mapping[mapping_columns]

df_merged_warp = df_warp_network.merge(df_mapping_selected, how="left", on=merge_keys)
df_merged = df_network.merge(df_mapping_selected, how="left", on=merge_keys)

In [23]:
# Keep complete cases only: any row with at least one missing value is dropped.
df_merged_complete_cases_warp = df_merged_warp.dropna(axis=0, how="any")
df_merged_complete_cases = df_merged.dropna(axis=0, how="any")

In [24]:
# Rewrite column labels character by character:
# " " -> "_", ";" -> ".", "-" -> "_".
_column_char_map = str.maketrans({" ": "_", ";": ".", "-": "_"})

df_merged_complete_cases_warp.columns = [
    col.translate(_column_char_map)
    for col in df_merged_complete_cases_warp.columns
]
df_merged_complete_cases.columns = [
    col.translate(_column_char_map)
    for col in df_merged_complete_cases.columns
]

  and should_run_async(code)


In [25]:
# Study days 1..12 (inclusive) form the training window.
index_splines_train = np.arange(1, 13)

In [26]:
# Boolean row masks: True where the row's StudyDayNo is a training day.
# Series.isin is vectorised and always yields a boolean array (also for an empty
# frame), unlike a per-row Python `in` test against the ndarray.
mask_warp = df_merged_complete_cases_warp["StudyDayNo"].isin(index_splines_train).to_numpy()
mask = df_merged_complete_cases["StudyDayNo"].isin(index_splines_train).to_numpy()

# Training rows are the masked days; every other row goes to the test set.
df_train_warp = df_merged_complete_cases_warp.iloc[mask_warp]
df_test_warp = df_merged_complete_cases_warp.iloc[~mask_warp]
df_train = df_merged_complete_cases.iloc[mask]
df_test = df_merged_complete_cases.iloc[~mask]

In [27]:
# Remove the user identifier from the training frames only; df_test and
# df_test_warp keep their UserName column.
df_train = df_train.drop("UserName", axis=1)
df_train_warp = df_train_warp.drop("UserName", axis=1)

In [28]:
# Plain Python list of the training-frame column labels.
column_names = df_train.columns.tolist()

In [29]:
food_names, nutrient_names, day_plus_one_names = [], [], []
tax_names, kegg_names, other_names = [], [], []

# (prefix, destination list) pairs, tested in this order; the first matching
# prefix wins and columns matching none of them land in other_names.
_names_by_prefix = (
    ("day_plus_one.", day_plus_one_names),
    ("tax.", tax_names),
    ("food.", food_names),
    ("nutrients.", nutrient_names),
    ("kegg.", kegg_names),
)

for column in column_names:
    for prefix, matching_names in _names_by_prefix:
        if column.startswith(prefix):
            matching_names.append(column)
            break
    else:
        other_names.append(column)

In [30]:
from itertools import product

# Each blacklist is the full cross product of one source group with a list of
# target groups, as (source, target) pairs. Pairs that are NOT generated here:
# food -> nutrients, tax -> kegg, and any non-day_plus_one column -> day_plus_one.
same_day_names = food_names + nutrient_names + tax_names + kegg_names + other_names
food_targets = food_names + tax_names + kegg_names + other_names
tax_targets = food_names + nutrient_names + tax_names + other_names

food_names_blacklist = list(product(food_names, food_targets))

nutrient_names_blacklist = list(product(nutrient_names, same_day_names))

# day_plus_one columns are paired with every column, themselves included.
day_plus_one_blacklist = list(product(day_plus_one_names, column_names))

tax_names_blacklist = list(product(tax_names, tax_targets))

kegg_names_blacklist = list(product(kegg_names, same_day_names))

other_names_blacklist = list(product(other_names, same_day_names))

In [31]:
# Single flat list of all (source, target) pairs, group after group.
blacklist = [
    *food_names_blacklist,
    *nutrient_names_blacklist,
    *day_plus_one_blacklist,
    *tax_names_blacklist,
    *kegg_names_blacklist,
    *other_names_blacklist,
]

In [32]:
# Two-column frame of blacklisted pairs; any ";" left in a name is turned into ".".
df_blacklist = pd.DataFrame(blacklist, columns=["from", "to"])
for endpoint in ("from", "to"):
    df_blacklist[endpoint] = df_blacklist[endpoint].str.replace(";", ".")

In [33]:
# Write every table as tab-separated text, row index included.
_outputs = (
    ("../data/blacklist.txt", df_blacklist),
    ("../data/train.txt", df_train),
    ("../data/train.warp.txt", df_train_warp),
    ("../data/test.txt", df_test),
    ("../data/test.warp.txt", df_test_warp),
)
for output_path, output_frame in _outputs:
    output_frame.to_csv(output_path, index=True, sep="\t")

In [34]:
# Display df_test_warp: the rows of df_merged_complete_cases_warp whose
# StudyDayNo is not in index_splines_train.
df_test_warp

Unnamed: 0,UserName,StudyDayNo,tax.k__Bacteria.p__Actinobacteriota.c__Actinomycetia.o__Actinomycetales.f__Bifidobacteriaceae.g__Bifidobacterium.s__Bifidobacterium_adolescentis,tax.k__Bacteria.p__Actinobacteriota.c__Actinomycetia.o__Actinomycetales.f__Bifidobacteriaceae.g__Bifidobacterium.s__Bifidobacterium_pseudocatenulatum,tax.k__Bacteria.p__Actinobacteriota.c__Coriobacteriia.o__Coriobacteriales.f__Coriobacteriaceae.g__Collinsella.s__Collinsella_sp900541285,tax.k__Bacteria.p__Actinobacteriota.c__Coriobacteriia.o__Coriobacteriales.f__Coriobacteriaceae.g__Collinsella.s__Collinsella_sp900544865,tax.k__Bacteria.p__Actinobacteriota.c__Coriobacteriia.o__Coriobacteriales.f__Coriobacteriaceae.g__Collinsella.s__Collinsella_sp900555225,tax.k__Bacteria.p__Actinobacteriota.c__Coriobacteriia.o__Coriobacteriales.f__Eggerthellaceae.g__Eggerthella.s__Eggerthella_lenta,tax.k__Bacteria.p__Actinobacteriota.c__Coriobacteriia.o__Coriobacteriales.f__Eggerthellaceae.g__Enteroscipio.s__Enteroscipio_rubneri,tax.k__Bacteria.p__Bacteroidota.c__Bacteroidia.o__Bacteroidales.f__Bacteroidaceae.g__Bacteroides.s__Bacteroides_caccae,...,day_plus_one.tax.other,Gender,Age,Weight,Height,BMI,Supplement,oilGrams.assigned,Timing,Activity.Factor
12,MCTs01,13,4.36597,1.421462,-1.300162,-2.217049,0.268758,-1.117071,-11.426473,3.976477,...,8.054453,F,25.9,74.6,172.7,25.0,EVOO,12.0,Post,1.375
13,MCTs01,14,5.252952,1.948645,-0.175338,-0.433261,1.010371,-0.782025,-11.090063,3.417838,...,7.222057,F,25.9,74.6,172.7,25.0,EVOO,12.0,Post,1.375
14,MCTs01,15,4.91992,1.543707,-1.14935,-1.783254,0.144256,0.456297,-12.074742,2.321714,...,6.780971,F,25.9,74.6,172.7,25.0,EVOO,12.0,Post,1.375
44,MCTs03,13,0.113914,2.924407,-3.322223,-11.1918,0.054213,0.300096,-11.191711,5.378906,...,8.253447,M,25.4,91.2,185.1,26.6,MCT,18.0,Post,1.55
45,MCTs03,14,0.314877,3.587501,-2.73042,-11.246343,-0.090197,0.565805,-11.246343,4.979034,...,8.698225,M,25.4,91.2,185.1,26.6,MCT,18.0,Post,1.55
46,MCTs03,15,0.993164,4.912389,-1.259222,-10.873939,0.505093,1.584744,-10.873939,4.142905,...,8.530108,M,25.4,91.2,185.1,26.6,MCT,18.0,Post,1.55
60,MCTs04,13,-1.479773,1.156246,-1.17996,-1.098363,0.479892,1.884149,-13.311357,5.739443,...,7.862988,F,24.0,50.6,163.8,18.9,MCT,12.0,Post,1.55
61,MCTs04,14,-1.10004,1.690261,-0.310723,-0.693058,0.967791,2.006227,-11.22772,6.185727,...,6.893555,F,24.0,50.6,163.8,18.9,MCT,12.0,Post,1.55
62,MCTs04,15,-1.10093,2.256676,-1.579128,-2.198384,-0.10143,0.817301,-11.967073,5.195961,...,7.159089,F,24.0,50.6,163.8,18.9,MCT,12.0,Post,1.55
76,MCTs05,13,3.237006,1.861763,1.490021,-10.271469,2.220845,4.062614,-8.109152,1.723278,...,9.293687,F,23.1,57.2,169.5,19.9,EVOO,10.0,Post,1.375
