In [7]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [12]:
# Load the offline reference measurements from the "Reference Val" sheet
df = pd.read_excel(
    io="Raman offline measurements.xlsx",
    sheet_name="Reference Val",
)
df.head(n=5)

Unnamed: 0,batch_id,time,batch_time_h,solution_type,titer_mg_mL,acetate_mM,glucose_g_L,mg_mM,nh3_mM,phosphate_mM
0,210923 run 1,2021-09-23 09:33:05,0.0,1,0.003471,36.66,0.02,12.495,21.79,12.99
1,210923 run 1,2021-09-23 11:03:03,1.499444,1,0.105501,60.97,0.02,11.743,24.31,15.03
2,210923 run 1,2021-09-23 13:18:03,3.749444,1,0.216834,68.0,0.02,11.893,29.94,15.96
3,210923 run 1,2021-09-23 15:03:03,5.499444,1,0.301073,72.01,0.02,11.973,35.45,15.78
4,210923 run 1,2021-09-23 16:18:03,6.749444,1,0.330496,78.86,0.02,12.438,39.81,17.13


In [13]:

# Sort so every batch's points are connected in time order along the x-axis
df_plot = df.sort_values(by=["solution_type", "batch_id", "batch_time_h"])

# One line per batch_id (no aggregation), colored by solution type;
# batch_id is shown as the hover title.
fig = px.line(
    data_frame=df_plot,
    x="batch_time_h",
    y="titer_mg_mL",
    line_group="batch_id",
    color="solution_type",
    hover_name="batch_id",
    labels={
        name: name
        for name in ("batch_time_h", "titer_mg_mL", "solution_type")
    },
    title="Titer vs Batch Time — colored by solution type",
)

# Fixed-size white-themed canvas with explicit margins
fig.update_layout(
    template="plotly_white",
    width=1000,
    height=600,
    margin={"l": 60, "r": 20, "t": 60, "b": 60},
    legend_title_text="solution_type",
)

fig.show()

# Optional export to a standalone interactive HTML file:
# fig.write_html("titer_vs_time_by_solution.html")

# Optional export to a static image (requires: pip install -U kaleido):
# fig.write_image("titer_vs_time_by_solution.png", width=1000, height=600, scale=2)



In [16]:
# ---- Tunable configuration ----

# Column names used throughout the synthesis code
TIME_COL = "batch_time_h"
STYPE_COL = "solution_type"
BATCH_COL = "batch_id"
MEAS_COLS = [
    "titer_mg_mL",
    "acetate_mM",
    "glucose_g_L",
    "mg_mM",
    "nh3_mM",
    "phosphate_mM",
]

# Desired total number of runs per solution type (key = solution type)
TARGETS = {1: 20, 2: 25, 3: 20, 4: 20}

# Seed for the random generator that drives the synthesis
RANDOM_STATE = 7

# Noise hyperparameters consumed by _synth_series
HP = dict(
    intercept_std_mult=0.25,  # std of the additive offset, as a multiple of the residual std
    scale_std=0.08,           # std of the multiplicative gain drawn around 1.0
    curvature_std=0.01,       # std of the coefficient on the centered-quadratic tweak
    rw_std_mult=0.18,         # std of each random-walk step, as a multiple of the residual std
    meas_std_mult=0.15,       # std of the per-point noise, as a multiple of the residual std
    min_nonneg=0.0,           # lower clip applied to every synthetic value
)

def _polyfit_baseline(x, y):
    x = np.asarray(x, float)
    y = np.asarray(y, float)
    uniq = np.unique(x)
    deg = 3 if len(uniq) > 3 else max(1, min(2, len(uniq) - 1))
    coeffs = np.polyfit(x, y, deg)
    return np.poly1d(coeffs)

def _make_hourly_grid(df_t, time_col):
    tmin = int(np.floor(df_t[time_col].min()))
    tmax = int(np.ceil(df_t[time_col].max()))
    if tmax <= tmin:  # safety
        tmax = tmin + 1
    return np.arange(tmin, tmax + 1, 1, dtype=float)

def _estimate_residual_std(df_t, time_col, meas_col, poly):
    res = df_t[meas_col].values - poly(df_t[time_col].values)
    if np.all(np.isfinite(res)) and len(res) > 1:
        return max(np.nanstd(res), 1e-6)
    return max(np.nanstd(df_t[meas_col].values) * 0.2, 1e-6)

def _synth_series(poly, t, res_std, rng, hp):
    base = poly(t)
    intercept = rng.normal(0.0, res_std * hp["intercept_std_mult"])
    scale = rng.normal(1.0, hp["scale_std"])

    # gentle curvature tweak across time
    t_center = (t - t.min()) / (t.max() - t.min() + 1e-9) - 0.5
    curvature = rng.normal(0.0, hp["curvature_std"]) * (t_center**2 - np.mean(t_center**2))

    # smooth random walk + measurement noise
    steps = rng.normal(0.0, res_std * hp["rw_std_mult"], size=len(t))
    rw = np.cumsum(steps); rw -= rw.mean()
    meas = rng.normal(0.0, res_std * hp["meas_std_mult"], size=len(t))

    y = intercept + scale * base + curvature * base + rw + meas
    return np.maximum(y, hp["min_nonneg"])

def generate_synthetic_trends_hourly_from_df(
    df,
    time_col=TIME_COL,
    stype_col=STYPE_COL,
    batch_col=BATCH_COL,
    meas_cols=MEAS_COLS,
    targets=TARGETS,
    random_state=RANDOM_STATE,
    hp=HP,
):
    """Augment measured runs with synthetic runs until each solution type hits its target.

    For every solution type a polynomial baseline is fitted per measurement
    column (pooling all of that type's batches), and synthetic runs are drawn
    around those baselines on a whole-number time grid spanning the observed
    time range of that type.

    Parameters
    ----------
    df : pandas.DataFrame
        Measured data; must contain ``time_col``, ``stype_col`` and
        ``batch_col``. The input frame is not modified.
    time_col, stype_col, batch_col : str
        Names of the time, solution-type and batch-identifier columns.
    meas_cols : sequence of str
        Measurement columns to synthesize; names absent from ``df`` are skipped.
    targets : dict
        Desired total number of runs per solution type. Types not listed get
        no synthetic runs.
    random_state : int
        Seed for ``np.random.default_rng``.
    hp : dict
        Noise hyperparameters forwarded to ``_synth_series``.

    Returns
    -------
    pandas.DataFrame
        Original rows (``is_synthetic`` = False) followed by synthetic rows
        (``is_synthetic`` = True), sorted by solution type, batch and time.
        Rows lacking a time, solution type or batch id are dropped. Columns
        that exist only in the original data are NaN on synthetic rows.
    """
    rng = np.random.default_rng(random_state)

    # Work on a private copy and coerce the key columns
    df = df.copy()
    df[time_col] = pd.to_numeric(df[time_col], errors="coerce")
    df[stype_col] = pd.to_numeric(df[stype_col], errors="coerce").astype("Int64")
    df = df.dropna(subset=[time_col, stype_col, batch_col])

    # Fit baselines per (solution type, measurement) and build the time grid
    baselines = {}  # stype -> {"grid": grid, "meas": {meas: {"poly":..., "res_std":...}}}
    for stype, df_t in df.groupby(stype_col):
        stype = int(stype)
        baselines[stype] = {"grid": _make_hourly_grid(df_t, time_col), "meas": {}}
        for m in meas_cols:
            if m not in df.columns:
                continue
            # Fit only on rows where this measurement is present, so a single
            # missing value cannot poison the baseline or its noise estimate.
            fit_rows = df_t[[time_col, m]].dropna()
            if fit_rows.empty:
                # Nothing measured for this type; the column stays NaN on synthetics.
                continue
            poly = _polyfit_baseline(fit_rows[time_col].values, fit_rows[m].values)
            res_std = _estimate_residual_std(fit_rows, time_col, m, poly)
            baselines[stype]["meas"][m] = {"poly": poly, "res_std": res_std}

    # Count existing runs and synthesize the shortfall per solution type
    existing_counts = df.groupby(stype_col)[batch_col].nunique().to_dict()
    synthetic_frames = []
    for stype, info in baselines.items():
        n_exist = existing_counts.get(stype, 0)
        n_to_make = max(0, targets.get(stype, n_exist) - n_exist)

        grid = info["grid"]
        for i in range(1, n_to_make + 1):
            syn_df = pd.DataFrame({
                batch_col: f"S{stype}_syn_{i:03d}",
                stype_col: stype,
                time_col: grid,
            })
            for m, obj in info["meas"].items():
                syn_df[m] = _synth_series(obj["poly"], grid, obj["res_std"], rng, hp)
            syn_df["is_synthetic"] = True
            synthetic_frames.append(syn_df)

    # Originals keep their own time stamps; synthetics live on the grid
    df["is_synthetic"] = False
    if synthetic_frames:
        combined = pd.concat([df, *synthetic_frames], ignore_index=True)
    else:
        # Avoid concatenating an empty placeholder frame, which would only
        # trigger a pandas FutureWarning and risk dtype upcasts.
        combined = df.reset_index(drop=True)

    return combined.sort_values([stype_col, batch_col, time_col])

# Build the combined table of measured and synthetic runs using the default configuration
combined_df = generate_synthetic_trends_hourly_from_df(df=df)
combined_df.head(n=5)


Unnamed: 0,batch_id,time,batch_time_h,solution_type,titer_mg_mL,acetate_mM,glucose_g_L,mg_mM,nh3_mM,phosphate_mM,is_synthetic
0,210923 run 1,2021-09-23 09:33:05,0.0,1,0.003471,36.66,0.02,12.495,21.79,12.99,False
1,210923 run 1,2021-09-23 11:03:03,1.499444,1,0.105501,60.97,0.02,11.743,24.31,15.03,False
2,210923 run 1,2021-09-23 13:18:03,3.749444,1,0.216834,68.0,0.02,11.893,29.94,15.96,False
3,210923 run 1,2021-09-23 15:03:03,5.499444,1,0.301073,72.01,0.02,11.973,35.45,15.78,False
4,210923 run 1,2021-09-23 16:18:03,6.749444,1,0.330496,78.86,0.02,12.438,39.81,17.13,False


In [17]:

# Ensure proper ordering along the x-axis
combined_plot = combined_df.sort_values(["solution_type", "batch_id", "batch_time_h"])

# Same view as the measured-only plot, but this frame also contains synthetic
# runs: the dash pattern separates them from measured runs so the two can be
# compared visually.
fig = px.line(
    combined_plot,
    x="batch_time_h",
    y="titer_mg_mL",
    color="solution_type",     # color by solution type (legend groups)
    line_dash="is_synthetic",  # dash pattern distinguishes synthetic from measured runs
    line_group="batch_id",     # separate line per batch_id (no aggregation)
    hover_name="batch_id",     # show batch_id on hover
    labels={
        "batch_time_h": "batch_time_h",
        "titer_mg_mL": "titer_mg_mL",
        "solution_type": "solution_type",
        "is_synthetic": "is_synthetic"
    },
    title="Titer vs Batch Time — measured and synthetic runs, colored by solution type"
)

fig.update_layout(
    width=1000,
    height=600,
    template="plotly_white",
    legend_title_text="solution_type, is_synthetic",
    margin=dict(l=60, r=20, t=60, b=60)
)

fig.show()

# Optional: save to an interactive HTML file
# fig.write_html("titer_vs_time_by_solution.html")

# Optional: save a static image (requires: pip install -U kaleido)
# fig.write_image("titer_vs_time_by_solution.png", width=1000, height=600, scale=2)



In [64]:
# Export only the synthetic rows to Excel.
# Note: plots are organized by solution type, so solution_type is removed here
# (together with the time and is_synthetic columns).
(
    combined_plot
    .query("is_synthetic == 1")
    .drop(columns=["time", "solution_type", "is_synthetic"])
    .to_excel("synthesized_data.xlsx", index=False)
)