In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import seaborn as sns

from scipy.stats import pearsonr

# Seaborn plot style and a fixed seed for NumPy's global random generator.
sns.set_style("darkgrid")
np.random.seed(930525)
# Limits on how many columns/rows pandas shows when a DataFrame is displayed.
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 200)

# Report each distinct warning only the first time it is raised.
warnings.simplefilter('once')

# IPython magics: inline figures, then load the watermark extension and
# print the versions of the imported packages.
%matplotlib inline
%load_ext watermark
%watermark --iversions

pandas    : 1.3.0
seaborn   : 0.11.1
matplotlib: 3.4.2
numpy     : 1.21.0



In [2]:
from scipy import interpolate

# https://github.com/DaniRuizPerez/PALM-Public-Respository/blob/master/Alignment/getAlignmentsIBD_Taxa.py

# Use a B-spline to extrapolate values. NOTE: The parameter s must be adjusted appropriately to avoid over-fitting.
# tck = interpolate.splrep(timepoints, relativeAbundances, k=3, s=0.001, xb=weekFirstSample, xe=weekLastSample)

In [3]:
# Step (in StudyDayNo units) passed to np.arange when building the resampling grids below.
SAMPLING_RATE = 1.0

In [4]:
# Tab-separated sample metadata; the first column becomes the row index.
df_mapping = pd.read_csv(
    "../data/SampleID_map.txt",
    index_col=0,
    sep="\t",
)

In [5]:
# Display the loaded sample metadata table.
df_mapping

Unnamed: 0_level_0,UserName,StudyDayNo,StudyDate,Gender,Age,Weight,Height,BMI,Supplement,Waist.Circumference,...,fecal.time,BMI.1,Weight.Change,Plate,SampleOrder,SampleType,Timing,Activity.Factor,Medications,Dietary.Supp
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MCT.f.0001,MCTs01,1,1/31/2017,F,25.9,74.6,172.7,25.0,EVOO,,...,12:00 PM,25.0,-0.4,5.0,54.0,Sample,Pre,1.375,Nexplanon (bc implant),Multivitamin;VitD;VitC;Zinc
MCT.f.0018,MCTs02,1,1/31/2017,F,32.7,89.6,162.8,33.8,,116.5,...,2:30 PM,33.8,,2.0,7.0,Sample,Pre,1.550,,Multivitamin;VitD
MCT.f.0035,MCTs03,1,1/31/2017,M,25.4,91.2,185.1,26.6,MCT,104.0,...,5:21 PM,26.6,2.9,2.0,10.0,Sample,Pre,1.550,Allegra;Flonase;Asthma inhaler,VitD
MCT.f.0052,MCTs04,1,1/31/2017,F,24.0,50.6,163.8,18.9,MCT,78.0,...,11:30 AM,18.9,0.6,3.0,9.0,Sample,Pre,1.550,,Multivitamin
MCT.f.0069,MCTs05,1,1/31/2017,F,23.1,57.2,169.5,19.9,EVOO,85.5,...,7:45 PM,19.9,-0.6,4.0,4.0,Sample,Pre,1.375,Hormonal IUD;ibuprofen,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MCT.f.0561,MCTs33,17,2/16/2017,M,37.9,64.5,166.0,23.4,MCT,84.6,...,No Time,23.4,-0.6,2.0,88.0,Sample,Post,1.325,,
MCT.f.0578,MCTs34,17,2/16/2017,F,41.4,53.9,161.3,20.7,EVOO,85.5,...,No Time,20.7,,4.0,86.0,Sample,Post,1.325,Thyroid hormone;lexapro,Multivitamin;VitD;fish oil
MCT.f.0595,MCTs35,17,2/16/2017,F,29.5,64.5,163.2,24.2,EVOO,84.5,...,5:15 AM,24.2,1.0,6.0,53.0,Sample,Post,1.550,ibuprofen,Multivitamin;folate
MCT.f.0612,MCTs36,17,2/16/2017,M,61.6,74.9,164.1,27.8,EVOO,100.4,...,6:15 AM,27.8,0.2,2.0,53.0,Sample,Post,1.550,lisinopril,


In [6]:
# Tab-separated taxonomy table; the first column becomes the row index.
df_tax = pd.read_csv(
    "../data/taxonomy_clr_s.txt",
    index_col=0,
    sep="\t",
)

In [7]:
# Wide -> long: one row per (#taxonomy, #SampleID) pair, with the cell
# content stored in the default "value" column.
df_tax_long = df_tax.reset_index().melt(
    id_vars=["#taxonomy"],
    value_vars=df_tax.columns,
    var_name="#SampleID",
)

In [8]:
# Attach study day and subject to every long-format row; the left join keeps
# rows whose sample ID has no entry in the mapping table.
df_tax_metadata_long = df_tax_long.merge(
    df_mapping.reset_index()[["#SampleID", "StudyDayNo", "UserName"]],
    how="left",
    on="#SampleID",
)

In [9]:
# Order rows by subject, then taxon, then study day, so each
# (subject, taxon) series is chronologically ordered.
df_tax_metadata_long = df_tax_metadata_long.sort_values(
    by=["UserName", "#taxonomy", "StudyDayNo"],
)

In [10]:
# Study-wide day range across all subjects and taxa.
max_study_day_no = df_tax_metadata_long["StudyDayNo"].max()
min_study_day_no = df_tax_metadata_long["StudyDayNo"].min()

# Shared resampling grid that every series is reindexed to.
# np.arange excludes its stop value, so the stop is pushed one day past the
# maximum; otherwise the final study day would be silently dropped from every
# output frame. This matches the per-subject grid in yield_df_spline, which
# also uses max + 1.
index_splines = np.arange(min_study_day_no, max_study_day_no + 1, SAMPLING_RATE, dtype="int")

In [12]:
def yield_df_spline(username, taxonomy, df, index_splines, sampling_rate, s=0.001, k=3):
    """Resample one series onto a shared day grid using a smoothing B-spline.

    Parameters
    ----------
    username, taxonomy
        Labels written to the ``UserName`` and ``#taxonomy`` columns of every
        output row, including rows for days that had no sample.
    df : pandas.DataFrame
        Rows of a single series. Must contain ``StudyDayNo`` (x) and ``value``
        (y). Days are expected to be increasing and unique: ``splrep`` needs
        ordered x values and the reindex below fails on duplicate labels.
    index_splines : array-like
        Day labels the returned frame is reindexed to. Days of ``df`` that are
        not in this grid are dropped.
    sampling_rate
        Step between the days at which the fitted spline is evaluated. The
        evaluation days are cast to integers.
    s : float, default 0.001
        Smoothing factor forwarded to ``scipy.interpolate.splrep``. Increase it
        if SciPy warns that ``s`` is too small or the fit over-fits.
    k : int, default 3
        Spline degree forwarded to ``splrep``; the series needs more than
        ``k`` samples.

    Returns
    -------
    pandas.DataFrame
        ``df`` indexed by ``StudyDayNo`` and reindexed to ``index_splines``,
        with an added ``spline`` column. Grid days outside the observed day
        range take the nearest evaluated value (forward fill, then backward
        fill), and all spline values are clipped to the observed value range.
    """
    days = df["StudyDayNo"]
    observed = df["value"]

    first_day, last_day = days.min(), days.max()
    lowest, highest = observed.min(), observed.max()

    # Fit on the subject's own day range only.
    tck = interpolate.splrep(days.values, observed.values, k=k, s=s, xb=first_day, xe=last_day)

    # +1 because np.arange excludes the stop value; keeps the last sampled day.
    eval_days = np.arange(first_day, last_day + 1, sampling_rate, dtype='int')
    fitted = pd.Series(interpolate.splev(eval_days, tck), index=eval_days)

    # set_index already returns a new frame, so the caller's df is not modified.
    df_out = df.set_index("StudyDayNo").reindex(index_splines)

    # Assignment aligns on the day index; days outside the evaluated range are
    # NaN at this point and are filled from the nearest evaluated day.
    df_out['spline'] = fitted
    df_out['spline'] = df_out['spline'].ffill().bfill()

    # Keep the spline from overshooting the range of the observed values.
    df_out['spline'] = df_out['spline'].clip(lower=lowest, upper=highest)

    # Rows created by the reindex have no labels yet; set them on every row.
    df_out["UserName"] = username
    df_out["#taxonomy"] = taxonomy
    return df_out

In [13]:
import multiprocessing

# Fit one spline per (UserName, #taxonomy) series across 8 worker processes.
# starmap blocks until every task has finished, and leaving the with-block
# then shuts the workers down, so no processes are left running in the kernel.
with multiprocessing.Pool(8) as pool:
    results = pool.starmap(
        yield_df_spline,
        [
            (username, taxonomy, df, index_splines, SAMPLING_RATE)
            for (username, taxonomy), df in df_tax_metadata_long.groupby(["UserName", "#taxonomy"])
        ],
    )

spline with fp=s has been reached. Probable cause: s too small.
(abs(fp-s)/s>0.001)
spline with fp=s has been reached. Probable cause: s too small.
(abs(fp-s)/s>0.001)
spline with fp=s has been reached. Probable cause: s too small.
(abs(fp-s)/s>0.001)
spline with fp=s has been reached. Probable cause: s too small.
(abs(fp-s)/s>0.001)
spline with fp=s has been reached. Probable cause: s too small.
(abs(fp-s)/s>0.001)
spline with fp=s has been reached. Probable cause: s too small.
(abs(fp-s)/s>0.001)
spline with fp=s has been reached. Probable cause: s too small.
(abs(fp-s)/s>0.001)
spline with fp=s has been reached. Probable cause: s too small.
(abs(fp-s)/s>0.001)


In [15]:
# Stack the per-series frames from `results` into a single long table.
df_tax_splines = pd.concat(results)

In [17]:
# reset_index() moves the day index back into a regular column; to_csv then
# also writes the fresh integer index as an unnamed first column.
df_tax_splines.reset_index().to_csv("../results/tax_clr_splines.csv")