In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import seaborn as sns

from scipy.stats import pearsonr

# Reproducibility: seed NumPy's legacy global random generator.
np.random.seed(930525)

# Plot styling for every figure drawn in this notebook.
sns.set_style("darkgrid")

# Display limits used when rendering DataFrames in cell outputs.
for option_name, option_value in (("display.max_columns", 20), ("display.max_rows", 200)):
    pd.set_option(option_name, option_value)

# Install the "once" warning filter.
warnings.simplefilter("once")

%matplotlib inline
%load_ext watermark
%watermark --iversions

seaborn 0.11.1
numpy   1.19.5
pandas  1.1.4



In [2]:
from scipy import interpolate

  and should_run_async(code)


In [3]:
def linear_warp(a, b, s):
    """Invert the linear map ``s = a * t + b`` and return ``t``.

    Parameters
    ----------
    a : scale (slope) of the linear map; used as the divisor.
    b : offset (intercept) of the linear map.
    s : value(s) to map back; anything supporting ``-`` and ``/``
        (scalars and NumPy arrays are what this notebook passes).

    Returns
    -------
    ``(s - b) / a``, computed element-wise when ``s`` is an array.
    """
    shifted = s - b
    return shifted / a

In [4]:
# Read the tab-separated alignment results; later cells use its
# "reference", "current", "error", "a" and "b" columns.
df_alignment = pd.read_csv(
    "../results/alignment_results.txt",
    sep="\t",
)

In [5]:
import random

user_names = list(df_alignment["reference"].unique())

# The stdlib `random` module is NOT covered by np.random.seed(...), so draw the
# 80% training users from a dedicated, explicitly seeded generator. This keeps
# the sample identical across kernel restarts.
user_names_train = random.Random(930525).sample(user_names, k=round(len(user_names) * 0.8))

# Row masks: True where the reference / current user is one of the training users.
# isin() replaces a per-row Python scan over the list of training users.
mask_reference = df_alignment["reference"].isin(user_names_train).to_numpy()
mask_current = df_alignment["current"].isin(user_names_train).to_numpy()

# NOTE: the masks above are currently not applied; every alignment row is kept.
# The restricted variant is left here for reference:
# df_alignment_train_error = df_alignment.copy().loc[mask_reference & mask_current]
df_alignment_train_error = df_alignment.copy()

In [6]:
# get the best person to align to
# Wide error matrix: one row per "reference" user, one column per "current" user.
df_alignment_error = df_alignment_train_error.pivot(index="reference", columns="current", values="error")

# Zero the positional diagonal (a user aligned to itself).
# The previous `values[[np.arange(n)] * 2] = 0` form indexes with a list of arrays,
# which NumPy deprecated (it raised a FutureWarning here) and which also depended
# on `.values` being a writable view. Work on an explicit copy instead.
error_matrix = df_alignment_error.to_numpy(copy=True)
np.fill_diagonal(error_matrix, 0)
df_alignment_error = pd.DataFrame(
    error_matrix,
    index=df_alignment_error.index,
    columns=df_alignment_error.columns,
)

  df_alignment_error.values[[np.arange(df_alignment_error.shape[0])]*2] = 0


In [7]:
# Pick the reference user whose row of the error matrix has the lowest mean.
mean_error_per_reference = df_alignment_error.mean(axis=1)
ranked_references = mean_error_per_reference.sort_values()
best_aligner = ranked_references.index[0]

In [8]:
# Flag users whose error in the `best_aligner` column exceeds mean + 2 * std
# of that column.
# NOTE(review): this reads the *column* for best_aligner (best_aligner as "current"),
# whereas best_aligner was chosen from row means - confirm the column is intended.
errors_vs_best = df_alignment_error[best_aligner]
outlier_cutoff = errors_vs_best.mean() + (2 * errors_vs_best.std())
drop_users = df_alignment_error.index[errors_vs_best > outlier_cutoff]
drop_users

Index(['MCTs30'], dtype='object', name='reference')

In [9]:
# Taxonomy table (tab-separated, first column used as the index).
df_tax_counts = pd.read_csv("../data/taxonomy_clr_s_top.txt", sep="\t", index_col=0)

# Fitted taxonomy splines, indexed as d_splines[user][feature] by the warp cell below.
# NOTE: pickle.load can execute arbitrary code - only load files you produced yourself.
import pickle
with open("../results/d_splines.pkl", mode="rb") as spline_file:
    d_splines = pickle.load(spline_file)

# Long-format spline table for taxonomy features.
df_tax_splines = pd.read_csv(filepath_or_buffer="../results/tax_clr_splines.csv")

  and should_run_async(code)


In [10]:
# PARAMETERS
# (PRESENCE_THRESHOLD and OVERLAP_THRESHOLD are not referenced in the visible
# part of this notebook.)
PRESENCE_THRESHOLD = .95
SAMPLING_RATE = 1
OVERLAP_THRESHOLD = .5

# Integer grid of study days covering the full range found in the taxonomy
# spline table, stepping by SAMPLING_RATE (upper bound inclusive).
study_days = df_tax_splines["StudyDayNo"]
min_study_day_no = study_days.min()
max_study_day_no = study_days.max()

index_splines = np.arange(
    min_study_day_no,
    max_study_day_no + 1,
    SAMPLING_RATE,
    dtype="int",
)

In [11]:
# taxonomy: warp every user's taxonomy splines onto the time axis of `best_aligner`

# Warp coefficients (a, b) that align each user to the reference user. Built once,
# instead of re-running a string-built query on df_alignment for every
# (user, feature) pair. drop_duplicates keeps the first matching row, which is
# the row the previous `.values[0]` lookup used.
alignment_to_best = (
    df_alignment.loc[df_alignment["reference"] == best_aligner]
    .drop_duplicates(subset="current")
    .set_index("current")
)

dfs = []
for (current, feature), df in df_tax_splines.groupby(["UserName", "feature"]):
    if current not in alignment_to_best.index:
        raise KeyError(f"no alignment row for reference={best_aligner!r}, current={current!r}")

    spline_current, current_min, current_max = d_splines[current][feature]

    # Warp the study days of this group's own rows so the evaluated values line up
    # row-for-row with `df` (rather than relying on the rows matching the global
    # `index_splines` grid in both length and order).
    index_warp = linear_warp(
        alignment_to_best.at[current, "a"],
        alignment_to_best.at[current, "b"],
        df["StudyDayNo"].to_numpy(),
    )

    # Evaluate the fitted spline at the warped days and clamp to the stored bounds.
    ts_current_spline = interpolate.splev(index_warp, spline_current)
    ts_current_spline = np.clip(ts_current_spline, current_min, current_max)

    # Blank out warped days that fall outside the span where the un-warped
    # spline has finite values.
    study_day_num = df.loc[np.isfinite(df["spline"]), "StudyDayNo"]
    index_min = np.min(study_day_num)
    index_max = np.max(study_day_num)
    ts_current_spline[index_warp < index_min] = np.nan
    ts_current_spline[index_warp > index_max] = np.nan

    # assign() returns a new frame, so the groupby slice itself is never mutated.
    dfs.append(df.assign(temporal_warp_spline=ts_current_spline, index_warp=index_warp))

df_temporal_warp = pd.concat(dfs)

# Split on warped time with a hard-coded cutoff of 12: rows with index_warp <= 12
# are flagged train=True, the rest train=False.
df_temporal_warp_wide_train = df_temporal_warp.query("index_warp <= 12").pivot(
    index=["UserName", "StudyDayNo"], columns="feature", values="temporal_warp_spline"
)
df_temporal_warp_wide_train["train"] = True
df_temporal_warp_wide_test = df_temporal_warp.query("index_warp > 12").pivot(
    index=["UserName", "StudyDayNo"], columns="feature", values="temporal_warp_spline"
)
df_temporal_warp_wide_test["train"] = False

df_temporal_warp_wide = pd.concat([df_temporal_warp_wide_train, df_temporal_warp_wide_test])

# Un-warped spline values in the same wide layout.
df_tax_wide = df_temporal_warp.pivot(index=["UserName", "StudyDayNo"], columns="feature", values="spline")

# Prefix feature columns with "tax;" and restore the plain "train" flag name.
df_temporal_warp_wide.columns = ["tax;" + column for column in df_temporal_warp_wide.columns]
df_tax_wide.columns = ["tax;" + column for column in df_tax_wide.columns]
df_temporal_warp_wide = df_temporal_warp_wide.rename({"tax;train": "train"}, axis=1)

In [12]:
# Fitted KEGG splines. This rebinds `d_splines`, which the KEGG warp cell that
# follows reads as d_splines[user][feature].
# NOTE: pickle.load can execute arbitrary code - only load files you produced yourself.
with open("../results/d_kegg_splines.pkl", mode="rb") as kegg_spline_file:
    d_splines = pickle.load(kegg_spline_file)

# Long-format spline table for KEGG features.
df_kegg_splines = pd.read_csv(filepath_or_buffer="../results/kegg_clr_splines.csv")

  and should_run_async(code)


In [13]:
# kegg: warp every user's KEGG splines onto the time axis of `best_aligner`

# Warp coefficients (a, b) that align each user to the reference user. Built once,
# instead of re-running a string-built query on df_alignment for every
# (user, feature) pair. drop_duplicates keeps the first matching row, which is
# the row the previous `.values[0]` lookup used.
alignment_to_best = (
    df_alignment.loc[df_alignment["reference"] == best_aligner]
    .drop_duplicates(subset="current")
    .set_index("current")
)

dfs = []
for (current, feature), df in df_kegg_splines.groupby(["UserName", "feature"]):
    if current not in alignment_to_best.index:
        raise KeyError(f"no alignment row for reference={best_aligner!r}, current={current!r}")

    spline_current, current_min, current_max = d_splines[current][feature]

    # Warp the study days of this group's own rows so the evaluated values line up
    # row-for-row with `df` (rather than relying on the rows matching the global
    # `index_splines` grid in both length and order).
    index_warp = linear_warp(
        alignment_to_best.at[current, "a"],
        alignment_to_best.at[current, "b"],
        df["StudyDayNo"].to_numpy(),
    )

    # Evaluate the fitted spline at the warped days and clamp to the stored bounds.
    ts_current_spline = interpolate.splev(index_warp, spline_current)
    ts_current_spline = np.clip(ts_current_spline, current_min, current_max)

    # Blank out warped days that fall outside the span where the un-warped
    # spline has finite values.
    study_day_num = df.loc[np.isfinite(df["spline"]), "StudyDayNo"]
    index_min = np.min(study_day_num)
    index_max = np.max(study_day_num)
    ts_current_spline[index_warp < index_min] = np.nan
    ts_current_spline[index_warp > index_max] = np.nan

    # assign() returns a new frame, so the groupby slice itself is never mutated.
    dfs.append(df.assign(temporal_warp_spline=ts_current_spline))

df_kegg_temporal_warp = pd.concat(dfs)

# Wide layout: one row per (UserName, StudyDayNo), one column per feature.
df_kegg_temporal_warp_wide = df_kegg_temporal_warp.pivot(
    index=["UserName", "StudyDayNo"], columns="feature", values="temporal_warp_spline"
)
df_kegg_wide = df_kegg_temporal_warp.pivot(index=["UserName", "StudyDayNo"], columns="feature", values="spline")

df_kegg_temporal_warp_wide.columns = ["kegg;" + column for column in df_kegg_temporal_warp_wide.columns]
df_kegg_wide.columns = ["kegg;" + column for column in df_kegg_wide.columns]

In [14]:
# Fitted nutrient splines. This rebinds `d_splines`, which the nutrient warp cell
# that follows reads as d_splines[user][feature].
# NOTE: pickle.load can execute arbitrary code - only load files you produced yourself.
with open("../results/d_nutrient_splines.pkl", mode="rb") as nutrient_spline_file:
    d_splines = pickle.load(nutrient_spline_file)

# Long-format spline table for nutrient features.
df_nutrients_splines = pd.read_csv(filepath_or_buffer="../results/nutrients_splines.csv")

In [15]:
# nutrients: warp every user's nutrient splines onto the time axis of `best_aligner`

# Warp coefficients (a, b) that align each user to the reference user. Built once,
# instead of re-running a string-built query on df_alignment for every
# (user, feature) pair. drop_duplicates keeps the first matching row, which is
# the row the previous `.values[0]` lookup used.
alignment_to_best = (
    df_alignment.loc[df_alignment["reference"] == best_aligner]
    .drop_duplicates(subset="current")
    .set_index("current")
)

dfs = []
for (current, feature), df in df_nutrients_splines.groupby(["UserName", "feature"]):
    if current not in alignment_to_best.index:
        raise KeyError(f"no alignment row for reference={best_aligner!r}, current={current!r}")

    spline_current, current_min, current_max = d_splines[current][feature]

    # Warp the study days of this group's own rows so the evaluated values line up
    # row-for-row with `df` (rather than relying on the rows matching the global
    # `index_splines` grid in both length and order).
    index_warp = linear_warp(
        alignment_to_best.at[current, "a"],
        alignment_to_best.at[current, "b"],
        df["StudyDayNo"].to_numpy(),
    )

    # Evaluate the fitted spline at the warped days and clamp to the stored bounds.
    ts_current_spline = interpolate.splev(index_warp, spline_current)
    ts_current_spline = np.clip(ts_current_spline, current_min, current_max)

    # Blank out warped days that fall outside the span where the un-warped
    # spline has finite values.
    study_day_num = df.loc[np.isfinite(df["spline"]), "StudyDayNo"]
    index_min = np.min(study_day_num)
    index_max = np.max(study_day_num)
    ts_current_spline[index_warp < index_min] = np.nan
    ts_current_spline[index_warp > index_max] = np.nan

    # assign() returns a new frame, so the groupby slice itself is never mutated.
    dfs.append(df.assign(temporal_warp_spline=ts_current_spline))

df_nutrients_temporal_warp = pd.concat(dfs)

# Wide layout: one row per (UserName, StudyDayNo), one column per feature.
df_nutrients_temporal_warp_wide = df_nutrients_temporal_warp.pivot(
    index=["UserName", "StudyDayNo"], columns="feature", values="temporal_warp_spline"
)
df_nutrients_wide = df_nutrients_temporal_warp.pivot(index=["UserName", "StudyDayNo"], columns="feature", values="spline")


df_nutrients_temporal_warp_wide.columns = ["nutrients;" + column for column in df_nutrients_temporal_warp_wide.columns]
df_nutrients_wide.columns = ["nutrients;" + column for column in df_nutrients_wide.columns]

In [16]:
# Fitted food splines. This rebinds `d_splines`, which the food warp cell that
# follows reads as d_splines[user][feature].
# NOTE: pickle.load can execute arbitrary code - only load files you produced yourself.
with open("../results/d_food_splines.pkl", mode="rb") as food_spline_file:
    d_splines = pickle.load(food_spline_file)

# Long-format spline table for food features.
df_food_splines = pd.read_csv(filepath_or_buffer="../results/food_L3_clr_splines.csv")

In [17]:
# food: warp every user's food splines onto the time axis of `best_aligner`

# Warp coefficients (a, b) that align each user to the reference user. Built once,
# instead of re-running a string-built query on df_alignment for every
# (user, feature) pair. drop_duplicates keeps the first matching row, which is
# the row the previous `.values[0]` lookup used.
alignment_to_best = (
    df_alignment.loc[df_alignment["reference"] == best_aligner]
    .drop_duplicates(subset="current")
    .set_index("current")
)

dfs = []
for (current, feature), df in df_food_splines.groupby(["UserName", "feature"]):
    if current not in alignment_to_best.index:
        raise KeyError(f"no alignment row for reference={best_aligner!r}, current={current!r}")

    spline_current, current_min, current_max = d_splines[current][feature]

    # Warp the study days of this group's own rows so the evaluated values line up
    # row-for-row with `df` (rather than relying on the rows matching the global
    # `index_splines` grid in both length and order).
    index_warp = linear_warp(
        alignment_to_best.at[current, "a"],
        alignment_to_best.at[current, "b"],
        df["StudyDayNo"].to_numpy(),
    )

    # Evaluate the fitted spline at the warped days and clamp to the stored bounds.
    ts_current_spline = interpolate.splev(index_warp, spline_current)
    ts_current_spline = np.clip(ts_current_spline, current_min, current_max)

    # Blank out warped days that fall outside the span where the un-warped
    # spline has finite values.
    study_day_num = df.loc[np.isfinite(df["spline"]), "StudyDayNo"]
    index_min = np.min(study_day_num)
    index_max = np.max(study_day_num)
    ts_current_spline[index_warp < index_min] = np.nan
    ts_current_spline[index_warp > index_max] = np.nan

    # assign() returns a new frame, so the groupby slice itself is never mutated.
    dfs.append(df.assign(temporal_warp_spline=ts_current_spline))

df_food_temporal_warp = pd.concat(dfs)

# Wide layout: one row per (UserName, StudyDayNo), one column per feature.
df_food_temporal_warp_wide = df_food_temporal_warp.pivot(
    index=["UserName", "StudyDayNo"], columns="feature", values="temporal_warp_spline"
)
df_food_wide = df_food_temporal_warp.pivot(index=["UserName", "StudyDayNo"], columns="feature", values="spline")


df_food_temporal_warp_wide.columns = ["food;" + column for column in df_food_temporal_warp_wide.columns]
df_food_wide.columns = ["food;" + column for column in df_food_wide.columns]

In [18]:
# Per user, pull each row's values from the *next* row (shift(-1) is positional
# within the user's rows), then tag the columns with a "day_plus_one;" prefix.
# Warped frame first (this also shifts its "train" column) ...
shifted_warp = df_temporal_warp_wide.groupby(level="UserName").shift(-1)
shifted_warp.columns = [f"day_plus_one;{column}" for column in shifted_warp.columns]
df_day_plus_1_warp = shifted_warp

# ... then the un-warped taxonomy frame.
shifted_plain = df_tax_wide.groupby(level="UserName").shift(-1)
shifted_plain.columns = [f"day_plus_one;{column}" for column in shifted_plain.columns]
df_day_plus_1 = shifted_plain

In [19]:
# Column-wise concatenation of every feature family into one wide table:
# first the warped variants, then the un-warped ones.
warp_parts = [
    df_temporal_warp_wide,
    df_food_temporal_warp_wide,
    df_nutrients_temporal_warp_wide,
    df_kegg_temporal_warp_wide,
    df_day_plus_1_warp,
]
df_warp_network = pd.concat(warp_parts, axis=1)

plain_parts = [
    df_tax_wide,
    df_food_wide,
    df_nutrients_wide,
    df_kegg_wide,
    df_day_plus_1,
]
df_network = pd.concat(plain_parts, axis=1)

In [20]:
# Sample metadata (tab-separated, first column used as the index).
df_mapping = pd.read_csv(
    "../data/SampleID_map.txt",
    index_col=0,
    sep="\t",
)

In [21]:
# Metadata columns carried into the merged tables; the first two are the merge keys.
mapping_columns = [
    "UserName",
    "StudyDayNo",
    "Gender",
    "Age",
    "Weight",
    "Height",
    "BMI",
    "Supplement",
    "oilGrams.assigned",
    "Timing",
    "Activity.Factor",
]

In [22]:
# Left-join the selected metadata columns onto both feature tables,
# matching on user and study day.
merge_keys = ["UserName", "StudyDayNo"]
metadata = df_mapping[mapping_columns]

df_merged_warp = df_warp_network.merge(metadata, how="left", on=merge_keys)
df_merged = df_network.merge(metadata, how="left", on=merge_keys)

In [23]:
# Remove every row that belongs to a user listed in `drop_users`.
keep_plain = ~df_merged["UserName"].isin(drop_users)
df_merged = df_merged[keep_plain]

keep_warp = ~df_merged_warp["UserName"].isin(drop_users)
df_merged_warp = df_merged_warp[keep_warp]

In [24]:
# Complete cases only: discard any row containing at least one missing value.
df_merged_complete_cases_warp = df_merged_warp.dropna(axis=0, how="any")
df_merged_complete_cases = df_merged.dropna(axis=0, how="any")

In [25]:
# Display the UserName column of the complete-case (un-warped) table.
df_merged_complete_cases["UserName"]

  and should_run_async(code)


1      MCTs01
2      MCTs01
3      MCTs01
4      MCTs01
5      MCTs01
        ...  
588    MCTs37
589    MCTs37
590    MCTs37
591    MCTs37
592    MCTs37
Name: UserName, Length: 476, dtype: object

In [26]:
# Normalise column names: spaces and hyphens become "_", semicolons become ".".
column_char_map = str.maketrans({" ": "_", ";": ".", "-": "_"})

df_merged_complete_cases_warp.columns = [
    name.translate(column_char_map) for name in df_merged_complete_cases_warp.columns
]
df_merged_complete_cases.columns = [
    name.translate(column_char_map) for name in df_merged_complete_cases.columns
]

In [27]:
# Study days 1 through 12 (inclusive) make up the training window.
index_splines_train = np.arange(1, 13)

In [28]:
# Warped rows carry an explicit "train" flag. Cast it to bool so that `~mask_warp`
# below is a logical NOT even if the column ended up with object dtype after the
# concat/merge steps (on an object array `~True` evaluates to -2, not False).
mask_warp = df_merged_complete_cases_warp["train"].astype(bool).values

# Un-warped rows are split on the raw study day: True when the day is one of
# `index_splines_train`. isin() replaces a per-row Python membership test.
mask = df_merged_complete_cases["StudyDayNo"].isin(index_splines_train).values

df_train_warp = df_merged_complete_cases_warp.iloc[mask_warp]
df_test_warp = df_merged_complete_cases_warp.iloc[~mask_warp]
df_train = df_merged_complete_cases.iloc[mask]
df_test = df_merged_complete_cases.iloc[~mask]

In [29]:
# Give each warped frame exactly the columns (and column order) of its un-warped
# counterpart. The test line previously selected `df_test_warp.columns`, which is
# a no-op and left the warped test frame with columns the other frames lack.
df_train_warp = df_train_warp[df_train.columns]
df_test_warp = df_test_warp[df_test.columns]

In [30]:
# Build one "<UserName>-<StudyDayNo>" key per row of each test frame.
test_keys = [
    '-'.join(pair)
    for pair in zip(df_test["UserName"].values.astype(str), df_test["StudyDayNo"].values.astype(str))
]
test_warp_keys = [
    '-'.join(pair)
    for pair in zip(df_test_warp["UserName"].values.astype(str), df_test_warp["StudyDayNo"].values.astype(str))
]

s = set(test_keys)
s_warp = set(test_warp_keys)

# Keep only the (user, day) rows that are present in BOTH test frames.
mask = [key in s_warp for key in test_keys]
mask_warp = [key in s for key in test_warp_keys]

df_test = df_test.loc[mask]
df_test_warp = df_test_warp.loc[mask_warp]

In [31]:
# Display (rows, columns) of the un-warped test frame.
df_test.shape

(108, 232)

In [32]:
# Display (rows, columns) of the warped training frame.
df_train_warp.shape

(357, 232)

In [33]:
# Display (rows, columns) of the un-warped training frame.
df_train.shape

(361, 232)

In [34]:
# The training frames are written out without the user identifier column.
df_train = df_train.drop("UserName", axis=1)
df_train_warp = df_train_warp.drop("UserName", axis=1)

In [35]:
# Plain Python list of the training-frame column labels.
column_names = df_train.columns.tolist()

In [36]:
# One bucket per column family; a column's family is decided by its name prefix.
food_names = []
nutrient_names = []
day_plus_one_names = []
tax_names = []
kegg_names = []
other_names = []

# Prefixes are checked in this order and the first match wins.
prefix_to_bucket = (
    ("day_plus_one.", day_plus_one_names),
    ("tax.", tax_names),
    ("food.", food_names),
    ("nutrients.", nutrient_names),
    ("kegg.", kegg_names),
)

for column in column_names:
    for prefix, bucket in prefix_to_bucket:
        if column.startswith(prefix):
            bucket.append(column)
            break
    else:
        # No known prefix matched.
        other_names.append(column)

In [37]:
from itertools import product

# Each blacklist is the full cross product (source column, target column) of one
# column family with the families listed as targets. Pairs not produced here
# (food -> nutrients, tax -> kegg, and any non-day_plus_one column -> day_plus_one)
# are left out of the blacklist.
all_but_day_plus_one = food_names + nutrient_names + tax_names + kegg_names + other_names

food_targets = food_names + tax_names + kegg_names + other_names
food_names_blacklist = list(product(food_names, food_targets))

nutrient_names_blacklist = list(product(nutrient_names, all_but_day_plus_one))

# day_plus_one columns are blacklisted as a source for every column.
day_plus_one_blacklist = list(product(day_plus_one_names, column_names))

tax_targets = food_names + nutrient_names + tax_names + other_names
tax_names_blacklist = list(product(tax_names, tax_targets))

kegg_names_blacklist = list(product(kegg_names, all_but_day_plus_one))

other_names_blacklist = list(product(other_names, all_but_day_plus_one))

In [38]:
# Single flat list of (from, to) pairs, in the same family order as before.
blacklist = [
    *food_names_blacklist,
    *nutrient_names_blacklist,
    *day_plus_one_blacklist,
    *tax_names_blacklist,
    *kegg_names_blacklist,
    *other_names_blacklist,
]

In [39]:
# Two-column table of blacklisted pairs; any ";" in a name is rewritten to ".".
df_blacklist = pd.DataFrame(blacklist, columns=["from", "to"])
for endpoint in ("from", "to"):
    df_blacklist[endpoint] = df_blacklist[endpoint].str.replace(";", ".")

In [40]:
# Write the blacklist and the four train/test tables as tab-separated files,
# index included, in the same order as before.
outputs = (
    ("../data/blacklist.txt", df_blacklist),
    ("../data/train.txt", df_train),
    ("../data/train.warp.txt", df_train_warp),
    ("../data/test.txt", df_test),
    ("../data/test.warp.txt", df_test_warp),
)
for output_path, frame in outputs:
    frame.to_csv(output_path, index=True, sep="\t")