In [43]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import re
import seaborn as sns
import statsmodels.api as sm
import warnings

from scipy.stats import pearsonr, spearmanr
warnings.simplefilter(action='ignore', category=FutureWarning)


# Filepaths

In [50]:
# Relative paths to the CSV inputs used by this notebook.
# NOTE(review): the directory segment is spelled "Data." (with a trailing dot) —
# confirm that this matches the on-disk folder name.
fp_biomarkersWide = "../Data./LS_Biomarkers_Wide.csv"
fp_biomarkersLong = "../Data./LS_Biomarkers_Long.csv"
fp_microbiome = "../Data./LS_Gut_Microbiome.csv"
fp_microbiomeSummary = "../Data./LS_Gut_Microbiome_Summary.csv"


Unnamed: 0,Date,% SCFA Acetate,% SCFA Butyrate,% SCFA Propionate,% SCFA Valerate,237_mg/dL,A/G ratio_None,ALT_U/L,APO A1_mg/dL,APOLIPOPROTEIN-(B100)_mg/dL,...,Uric Acid_None,VLDL Cholesterol_mg/dL,Vitamin B12_pg/mL,"Vitamin D, 25-Hydroxy_ng/mL",Vitamin E Alpha-Tocopherol_mg/L,Vitamin E Gamma-Tocopherol_mg/L,WBC-_10**3/mL,Weight_lbs,Zinc_µg/dL,pH_None
0,1993-05-01,,,,,,,,,,...,,,,,,,,,,
1,1996-11-27,,,,,,1.3,,,,...,6.0,,,,,,8.7,,,
2,1997-05-08,,,,,,,,,,...,6.3,,,,,,6.3,,,
3,1998-04-15,,,,,,1.4,,,,...,,,,,,,6.4,,,
4,1999-04-01,,,,,,,,,,...,,,,,,,8.7,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327,2025-02-05,47.0,24.0,22.0,7.2,,,22.0,,,...,,,,,,,6.7,178.0,,6.5
328,2025-02-12,,,,,,,14.0,,,...,,,,49.0,,,4.0,179.0,,
329,2025-02-28,,,,,,,,,,...,,,,,,,5.5,179.6,,
330,2025-05-20,,,,,,,17.0,,,...,,,,64.0,,,4.0,182.5,,


In [None]:
# Load the wide-format table that every later cell reads from.
# NOTE(review): fp_biomarkersWide is the only wide-format path defined in this
# notebook — confirm it is the intended input file.
df_wideBioDairyLS = pd.read_csv(fp_biomarkersWide)
df_wideBioDairyLS

In [51]:
# Per-column overview of df_wideBioDairyLS: number of non-null values,
# fraction of rows that are non-null (rounded to 2 decimals), and dtype.
n_present = df_wideBioDairyLS.notna().sum().values
df_colSummary = pd.DataFrame({
    "Column": df_wideBioDairyLS.columns,
    "# Values": n_present,
    "Completion": (n_present / len(df_wideBioDairyLS)).round(2),
    "Data Type": df_wideBioDairyLS.dtypes.values,
})

# Most complete columns first; the row-display limit is lifted for this cell only
df_colSummarySorted = df_colSummary.sort_values(by="Completion", ascending=False)
with pd.option_context("display.max_rows", None):
    display(df_colSummarySorted)

Unnamed: 0,Column,# Values,Completion,Data Type
0,Date,332,1.0,object
181,Weight_lbs,322,0.97,float64
90,"Kocuria spp, salsicia, rhizophila, kristinae_None",182,0.55,float64
118,Pseudomonas aeruginosa_None,182,0.55,float64
117,Providencia stuartii_None,183,0.55,object
116,Providencia rettgeri_None,182,0.55,float64
113,Pantoea spp_None,182,0.55,float64
107,Mucoid Escherichia coli_None,182,0.55,float64
106,Morganellla morganii_None,182,0.55,float64
98,Lysinibacillus spp_None,182,0.55,float64


In [None]:
# Put all 5

In [47]:
# Prepare data: keep only rows that have a Total SCFA value and index them by date
df_plot = df_wideBioDairyLS.dropna(subset=["Total SCFA_mg/mL"]).copy()
df_plot["Date"] = pd.to_datetime(df_plot["Date"])
df_plot = df_plot.set_index("Date").sort_index()

# --- YEARLY INTERVAL WINDOWS ---
# resample into 12-month bins, take mean within each bin
df_1y = df_plot[["Weight_lbs", "Total SCFA_mg/mL"]].resample("12M").mean().reset_index()

# --- variables ---
x_time_1y   = df_1y["Date"]
y_weight_1y = df_1y["Weight_lbs"]
y_scfa_1y   = df_1y["Total SCFA_mg/mL"]

# --- plot ---
fig = go.Figure()

# Weight (yearly interval mean) -> left axis (default "y")
fig.add_trace(go.Scatter(
    x=x_time_1y,
    y=y_weight_1y,
    mode="lines+markers",
    name="Weight (yearly interval mean)",
    line=dict(width=3, color="black"),
))

# Total SCFA (yearly interval mean) -> right axis.
# The trace must be bound to "y2" explicitly; otherwise it is drawn on the
# weight axis and the yaxis2 declared in the layout below stays unused.
fig.add_trace(go.Scatter(
    x=x_time_1y,
    y=y_scfa_1y,
    mode="lines+markers",
    name="Total SCFA (yearly interval mean)",
    line=dict(width=2, color="blue"),
    yaxis="y2",
))

# layout: two y-axes sharing the same date axis
fig.update_layout(
    title="Weight vs Total SCFA (Yearly Interval Means)",
    xaxis_title="Date",
    yaxis=dict(title="Weight (lbs)", side="left"),
    yaxis2=dict(title="Total SCFA (mg/mL)", overlaying="y", side="right"),
    legend=dict(x=0.01, y=0.99, bordercolor="lightgray", borderwidth=1),
    template="plotly_white"
)

fig.show()

In [48]:
# --- prep: rows with a Total SCFA value, ordered and indexed by parsed date ---
df_plot = df_wideBioDairyLS.dropna(subset=["Total SCFA_mg/mL"]).copy()
df_plot["Date"] = pd.to_datetime(df_plot["Date"], errors="coerce")  # unparseable dates become NaT
df_plot = df_plot.sort_values("Date").set_index("Date")

# Binning rule: None keeps the raw rows, otherwise a pandas resample rule
rule = "3M"  # <- change to None / "1M" / "3M" / "6M" / "12M"

# Average within each bin when a rule is set, then keep only rows where
# both series are present
pair_cols = ["Weight_lbs", "Total SCFA_mg/mL"]
df_pair = df_plot[pair_cols]
if rule is not None:
    df_pair = df_pair.resample(rule).mean()
df_bin = df_pair.dropna().reset_index()

x_time, y_weight, y_scfa = df_bin["Date"], df_bin["Weight_lbs"], df_bin["Total SCFA_mg/mL"]

# --- quick stats (optional): only reported when there are at least 3 points ---
if len(df_bin) >= 3:
    pr, pp = pearsonr(y_weight, y_scfa)
    sr, sp = spearmanr(y_weight, y_scfa)
    print(f"{rule or 'RAW'}  |  Pearson r={pr:.3f} (p={pp:.3g})  |  Spearman r={sr:.3f} (p={sp:.3g})")

# --- plot: weight on the left axis, SCFA on the right axis (y2) ---
trace_weight = go.Scatter(
    x=x_time,
    y=y_weight,
    mode="lines+markers",
    name="Weight (lbs)",
    line={"width": 3, "color": "black"},
)
trace_scfa = go.Scatter(
    x=x_time,
    y=y_scfa,
    mode="lines+markers",
    name="Total SCFA (mg/mL)",
    line={"width": 2, "color": "blue"},
    yaxis="y2",
)
fig = go.Figure(data=[trace_weight, trace_scfa])

fig.update_layout(
    title=f"Weight vs Total SCFA Over Time ({rule or 'Raw'})",
    xaxis={"title": "Date", "dtick": "M3", "tickformat": "%b %Y"},
    yaxis={"title": "Weight (lbs)", "side": "left"},
    yaxis2={"title": "Total SCFA (mg/mL)", "overlaying": "y", "side": "right"},
    legend={"x": 0.01, "y": 0.99, "bordercolor": "lightgray", "borderwidth": 1},
    template="plotly_white",
)

fig.show()

3M  |  Pearson r=0.464 (p=0.000165)  |  Spearman r=0.501 (p=3.91e-05)


In [49]:
# prep: rows with a Total SCFA value, ordered and indexed by parsed date
df_plot = df_wideBioDairyLS.dropna(subset=["Total SCFA_mg/mL"]).copy()
df_plot["Date"] = pd.to_datetime(df_plot["Date"], errors="coerce")
df_plot = df_plot.sort_values("Date").set_index("Date")

# (resample rule, display label), from finest to coarsest interval
intervals = [
    ("1M", "1-Month"),
    ("3M", "3-Month"),
    ("6M", "6-Month"),
    ("12M", "Yearly"),
]


def _fmt_r(value):
    """Round a correlation coefficient to 3 decimals; pass missing values through."""
    return round(value, 3) if pd.notna(value) else value


def _fmt_p(value):
    """Render a p-value with 3 significant digits; pass missing values through."""
    return f"{value:.3g}" if pd.notna(value) else value


results = []

for rule, label in intervals:
    # Mean of each series inside non-overlapping bins; drop bins missing either series
    df_bin = (
        df_plot[["Weight_lbs", "Total SCFA_mg/mL"]]
        .resample(rule)
        .mean()
        .dropna()
        .reset_index()
    )

    x_time, y_weight, y_scfa = df_bin["Date"], df_bin["Weight_lbs"], df_bin["Total SCFA_mg/mL"]

    # quick stats: NaN unless at least 3 bins are available
    pr = pp = sr = sp = float("nan")
    if len(df_bin) >= 3:
        pr, pp = pearsonr(y_weight, y_scfa)
        sr, sp = spearmanr(y_weight, y_scfa)

    results.append({
        "Interval": label,
        "N bins": len(df_bin),
        "Pearson r": _fmt_r(pr),
        "Pearson p": _fmt_p(pp),
        "Spearman r": _fmt_r(sr),
        "Spearman p": _fmt_p(sp),
    })

    # plot: weight on the left axis, SCFA on the right axis (y2)
    trace_weight = go.Scatter(
        x=x_time,
        y=y_weight,
        mode="lines+markers",
        name="Weight (lbs)",
        line={"width": 3, "color": "black"},
    )
    trace_scfa = go.Scatter(
        x=x_time,
        y=y_scfa,
        mode="lines+markers",
        name="Total SCFA (mg/mL)",
        line={"width": 2, "color": "blue"},
        yaxis="y2",
    )
    fig = go.Figure(data=[trace_weight, trace_scfa])

    fig.update_layout(
        title=f"Weight vs Total SCFA Over Time ({label} Interval Means)",
        xaxis={"title": "Date", "dtick": "M3", "tickformat": "%b %Y"},
        yaxis={"title": "Weight (lbs)", "side": "left"},
        yaxis2={"title": "Total SCFA (mg/mL)", "overlaying": "y", "side": "right"},
        legend={"x": 0.01, "y": 0.99, "bordercolor": "lightgray", "borderwidth": 1},
        template="plotly_white",
    )

    fig.show()

# summary table: one row per interval, in the order listed above
df_results = pd.DataFrame(results, columns=["Interval","N bins","Pearson r","Pearson p","Spearman r","Spearman p"])
df_results

Unnamed: 0,Interval,N bins,Pearson r,Pearson p,Spearman r,Spearman p
0,1-Month,133,0.412,8.15e-07,0.453,4.45e-08
1,3-Month,61,0.464,0.000165,0.501,3.91e-05
2,6-Month,33,0.492,0.00362,0.585,0.000354
3,Yearly,18,0.412,0.0896,0.55,0.018
