In [43]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import re
import seaborn as sns
import statsmodels.api as sm
import warnings

from scipy.stats import pearsonr, spearmanr
warnings.simplefilter(action='ignore', category=FutureWarning)


# Filepaths

In [61]:
fp_biomarkersWide = "/Data/LS_Biomarkers_Wide.csv"
fp_biomarkersLong = "/Data/LS_Biomarkers_Long.csv"
fp_microbiomeWide = "Data/LS_Biomarkers_Wide.csv"
fp_microbiomeLong = "Data/LS_Gut_Microbiome_Long.csv"
fp_microbiomeSummary = "/Data/LS_Gut_Microbiome_Summary.csv"


In [None]:
df_wideBioDairyLS = pd.read_csv(fp_wide)
df_wideBioDairyLSb

In [52]:
# Summary of columns in the with number of values, completion, max, average, min, std,, and data type
df_colSummary = pd.DataFrame({
                                "Column": df_wideBioDairyLS.columns,
                                "# Values": df_wideBioDairyLS.notna().sum().values,
                                "Completion": (df_wideBioDairyLS.notna().sum().values / len(df_wideBioDairyLS)).round(2),
                                "Max": df_wideBioDairyLS.max().values,
                                "Average": df_wideBioDairyLS.mean().values,
                                "Min": df_wideBioDairyLS.min().values,
                                "Std": df_wideBioDairyLS.std().values,
                                "Data Type": df_wideBioDairyLS.dtypes.values
})

# Display all rows only within this cell
with pd.option_context("display.max_rows", None):
    display(df_colSummary.sort_values(by="Completion", ascending=False))

TypeError: '>=' not supported between instances of 'float' and 'str'

In [53]:
# numeric-only view for stats (strings -> NaN; datetimes left alone)
num = df_wideBioDairyLS.copy()
for c in num.columns:
    if not np.issubdtype(num[c].dtype, np.number) and not np.issubdtype(num[c].dtype, np.datetime64):
        num[c] = pd.to_numeric(num[c], errors="coerce")

df_colSummary = pd.DataFrame({
    "Column": df_wideBioDairyLS.columns,
    "# Values": num.notna().sum().values,                                  # count of numeric/non-NaN
    "Completion": (num.notna().sum().values / len(num)).round(2),
    "Max": num.max().values,
    "Average": num.mean().values,
    "Min": num.min().values,
    "Std": num.std().values,
    "Data Type": df_wideBioDairyLS.dtypes.values                           # original dtypes for reference
})

# (Optional) view most-complete first
with pd.option_context("display.max_rows", None):
    display(df_colSummary.sort_values("Completion", ascending=False))

Unnamed: 0,Column,# Values,Completion,Max,Average,Min,Std,Data Type
181,Weight_lbs,322,0.97,202.0,185.604348,166.1,7.40941,float64
183,pH_None,183,0.55,7.6,6.537158,5.1,0.312622,float64
139,Stool Lactoferrin_None,184,0.55,899.0,43.888043,0.5,101.947156,float64
133,Staphylococcus aureus_None,182,0.55,3.0,0.098901,0.0,0.394855,float64
134,Staphylococcus epidermidis_None,182,0.55,1.0,0.032967,0.0,0.179043,float64
135,Staphylococcus haemolyticus_None,181,0.55,2.0,0.01105,0.0,0.148659,float64
63,Exophiala dermatitidis_None,182,0.55,2.0,0.021978,0.0,0.18073,float64
136,Staphylococcus lugdunensis_None,182,0.55,1.0,0.027473,0.0,0.163907,float64
60,Enterococcus spp._None,182,0.55,4.0,1.747253,0.0,1.326262,float64
59,Enterobacter cloacae_None,182,0.55,4.0,0.148352,0.0,0.635047,float64


In [None]:
# Put all 5

In [47]:
# Prepare data
df_plot = df_wideBioDairyLS.dropna(subset=["Total SCFA_mg/mL"]).copy()
df_plot["Date"] = pd.to_datetime(df_plot["Date"])
df_plot = df_plot.set_index("Date").sort_index()

# --- YEARLY INTERVAL WINDOWS ---
# resample into 12-month bins, take mean within each bin
df_1y = df_plot[["Weight_lbs", "Total SCFA_mg/mL"]].resample("12M").mean().reset_index()

# --- variables ---
x_time_1y   = df_1y["Date"]
y_weight_1y = df_1y["Weight_lbs"]
y_scfa_1y   = df_1y["Total SCFA_mg/mL"]

# --- plot ---
fig = go.Figure()

# Weight (yearly interval mean)
fig.add_trace(go.Scatter(
    x=x_time_1y,
    y=y_weight_1y,
    mode="lines+markers",
    name="Weight (yearly interval mean)",
    line=dict(width=3, color="black"),
))

# Total SCFA (yearly interval mean)
fig.add_trace(go.Scatter(
    x=x_time_1y,
    y=y_scfa_1y,
    mode="lines+markers",
    name="Total SCFA (yearly interval mean)",
    line=dict(width=2, color="blue"),
))

# layout
fig.update_layout(
    title="Weight vs Total SCFA (Yearly Interval Means)",
    xaxis_title="Date",
    yaxis=dict(title="Weight (lbs)", side="left"),
    yaxis2=dict(title="Total SCFA (mg/mL)", overlaying="y", side="right"),
    legend=dict(x=0.01, y=0.99, bordercolor="lightgray", borderwidth=1),
    template="plotly_white"
)

fig.show()

In [48]:
# --- prep ---
df_plot = df_wideBioDairyLS.dropna(subset=["Total SCFA_mg/mL"]).copy()
df_plot["Date"] = pd.to_datetime(df_plot["Date"], errors="coerce")
df_plot = df_plot.sort_values("Date").set_index("Date")

# choose interval: None (raw), "1M", "3M", "6M", "12M"
rule = "3M"  # <- change to None / "1M" / "3M" / "6M" / "12M"

if rule is None:
    df_bin = df_plot[["Weight_lbs", "Total SCFA_mg/mL"]].dropna().reset_index()
else:
    df_bin = df_plot[["Weight_lbs", "Total SCFA_mg/mL"]].resample(rule).mean().dropna().reset_index()

x_time   = df_bin["Date"]
y_weight = df_bin["Weight_lbs"]
y_scfa   = df_bin["Total SCFA_mg/mL"]

# --- quick stats (optional) ---
if len(df_bin) >= 3:
    pr, pp = pearsonr(y_weight, y_scfa)
    sr, sp = spearmanr(y_weight, y_scfa)
    print(f"{rule or 'RAW'}  |  Pearson r={pr:.3f} (p={pp:.3g})  |  Spearman r={sr:.3f} (p={sp:.3g})")

# --- plot ---
fig = go.Figure()

# Weight (left axis)
fig.add_trace(go.Scatter(
    x=x_time, y=y_weight,
    mode="lines+markers",
    name="Weight (lbs)",
    line=dict(width=3, color="black"),
))

# SCFA (right axis)  -> attach to y2
fig.add_trace(go.Scatter(
    x=x_time, y=y_scfa,
    mode="lines+markers",
    name="Total SCFA (mg/mL)",
    line=dict(width=2, color="blue"),
    yaxis="y2"
))

fig.update_layout(
    title=f"Weight vs Total SCFA Over Time ({rule or 'Raw'})",
    xaxis=dict(title="Date", dtick="M3", tickformat="%b %Y"),
    yaxis=dict(title="Weight (lbs)", side="left"),
    yaxis2=dict(title="Total SCFA (mg/mL)", overlaying="y", side="right"),
    legend=dict(x=0.01, y=0.99, bordercolor="lightgray", borderwidth=1),
    template="plotly_white"
)

fig.show()

3M  |  Pearson r=0.464 (p=0.000165)  |  Spearman r=0.501 (p=3.91e-05)


In [49]:
# prep 
df_plot = df_wideBioDairyLS.dropna(subset=["Total SCFA_mg/mL"]).copy()
df_plot["Date"] = pd.to_datetime(df_plot["Date"], errors="coerce")
df_plot = df_plot.sort_values("Date").set_index("Date")

# interval order: 1M, 3M, 6M, 12M
intervals = [
                ("1M",  "1-Month"),
                ("3M",  "3-Month"),
                ("6M",  "6-Month"),
                ("12M", "Yearly"),
]

results = []

for rule, label in intervals:
    # resample to interval bins (non-overlapping), take mean within each bin
    df_bin = df_plot[["Weight_lbs", "Total SCFA_mg/mL"]].resample(rule).mean().dropna().reset_index()

    x_time   = df_bin["Date"]
    y_weight = df_bin["Weight_lbs"]
    y_scfa   = df_bin["Total SCFA_mg/mL"]

    # quick stats
    if len(df_bin) >= 3:
        pr, pp = pearsonr(y_weight, y_scfa)
        sr, sp = spearmanr(y_weight, y_scfa)
    else:
        pr = pp = sr = sp = float("nan")

    results.append({
                    "Interval": label,
                    "N bins": len(df_bin),
                    "Pearson r": round(pr, 3) if pd.notna(pr) else pr,
                    "Pearson p": f"{pp:.3g}" if pd.notna(pp) else pp,
                    "Spearman r": round(sr, 3) if pd.notna(sr) else sr,
                    "Spearman p": f"{sp:.3g}" if pd.notna(sp) else sp,
    })

    # plot 
    fig = go.Figure()

    # Weight (left axis)
    fig.add_trace(go.Scatter(
                                x=x_time, 
                                y=y_weight,
                                mode="lines+markers",
                                name="Weight (lbs)",
                                line=dict(width=3, color="black"),
    ))

    # SCFA (right axis)
    fig.add_trace(go.Scatter(
                                x=x_time, 
                                y=y_scfa,
                                mode="lines+markers",
                                name="Total SCFA (mg/mL)",
                                line=dict(width=2, color="blue"),
                                yaxis="y2"
    ))

    fig.update_layout(
                        title=f"Weight vs Total SCFA Over Time ({label} Interval Means)",
                        xaxis=dict(title="Date", dtick="M3", tickformat="%b %Y"),
                        yaxis=dict(title="Weight (lbs)", side="left"),
                        yaxis2=dict(title="Total SCFA (mg/mL)", overlaying="y", side="right"),
                        legend=dict(x=0.01, y=0.99, bordercolor="lightgray", borderwidth=1),
                        template="plotly_white"
    )

    fig.show()

# summary table
df_results = pd.DataFrame(results, columns=["Interval","N bins","Pearson r","Pearson p","Spearman r","Spearman p"])
df_results

Unnamed: 0,Interval,N bins,Pearson r,Pearson p,Spearman r,Spearman p
0,1-Month,133,0.412,8.15e-07,0.453,4.45e-08
1,3-Month,61,0.464,0.000165,0.501,3.91e-05
2,6-Month,33,0.492,0.00362,0.585,0.000354
3,Yearly,18,0.412,0.0896,0.55,0.018


In [None]:
# Sho the long format biomarker data

Now that we have seen the biomarkers data lets take a look at the microbiome data

In [60]:
df_wideMicrobiome = pd.read_csv(fp_microbiomeWide)
df_wideMicrobiome

Unnamed: 0,Species,12/28/11,4/3/12,8/7/12,11/6/12,1/26/13,2/8/13,2/5/13,3/24/13,4/7/13,...,4/14/19,4/28/19,5/12/19,5/19/19,6/2/19,6/16/19,6/23/19,7/14/19,7/28/19,8/11/19
0,Adlercreutzia_equolifaciens,0.031,0.035,0.143,0.064,0.041,0.036,,0.008,0.006,...,,,,,,,,,,
1,Aeromonas_unclassified,0.000,0.000,0.000,0.000,0.000,0.000,,0.000,0.000,...,,,,,,,,,,
2,Aggregatibacter_segnis,0.000,0.000,0.000,0.000,0.000,0.014,,0.015,0.562,...,,,,,,,,,,
3,Aggregatibacter_unclassified,0.000,0.000,0.000,0.000,0.000,0.000,,0.000,0.137,...,,,,,,,,,,
4,Akkermansia_muciniphila,12.328,0.950,3.842,5.579,3.518,3.212,,1.063,0.892,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,p__Proteobacteria;f__Pasteurellaceae;g__Aggreg...,0.000,0.000,0.000,0.000,0.000,0.040,0.29,0.030,1.030,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
225,p__Proteobacteria;f__Pseudomonadaceae;g__Pseud...,0.000,0.010,0.000,0.010,0.010,0.010,0.01,0.010,0.030,...,0.02,0.00,0.10,0.10,0.33,0.09,0.00,0.09,0.08,0.06
226,p__Synergistetes;f__Synergistaceae;g__Synergistes,0.100,0.000,0.020,0.050,0.110,0.040,0.12,0.010,0.020,...,0.07,0.03,0.06,0.01,0.00,0.00,0.03,0.05,0.11,0.03
227,p__Tenericutes;f__;g__,0.990,0.000,0.010,0.030,0.160,0.110,0.05,0.350,0.160,...,0.64,0.50,0.22,0.33,0.39,0.39,0.40,0.27,0.16,0.58
