In [38]:
import pandas as pd
import numpy as np
import sys
import os
sys.path.append(os.path.abspath(".."))
from core.viz import plot_line, create_subplot_grid, plot_bar, plot_corr_triangle
from core.s3 import S3AssetManager


import seaborn as sns
import re, unicodedata
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from typing import Optional, List, Tuple


In [39]:
# Identifier of this notebook, handed to the S3 asset manager.
notebook_name = "costa_rica_microbiologia"
s3 = S3AssetManager(notebook_name=notebook_name)

# General-purpose colour palette (hex codes).
PALETTE = [
    "#1c8074",
    "#1a494c",
    "#94af92",
    "#666666",
    "#f9ee77",
    "#f5ad68",
    "#c76931",
]

In [40]:

def scatter_1x2_corporativo(
    df: pd.DataFrame,
    *,
    x_col: str = "log_coliformes",
    y1_col: str = "log_hongos",
    y2_col: str = "log_levaduras",
    color_by: str | None = None,   # e.g. "Especie"
    width: int = 1000,
    height: int = 500,
    suptitle: str = "Tendencias microbianas (subplot 1×2)",
    output_html: str | None = None,
):
    """Draw two side-by-side scatter plots that share the same x variable.

    The left panel plots ``x_col`` against ``y1_col`` and the right panel
    ``x_col`` against ``y2_col``. Each panel only keeps rows where both of its
    plotted values are strictly positive.

    Parameters
    ----------
    df : source data; it is copied, never modified.
    x_col, y1_col, y2_col : columns to plot (coerced to numeric, bad values -> NaN).
    color_by : optional column used to split the points into coloured traces.
    width, height : figure size in pixels.
    suptitle : figure title (centred).
    output_html : if given, the figure is also written to this path as an
        HTML fragment that loads plotly.js from the CDN.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    d = df.copy()

    # Coerce the plotted columns to numeric; unparseable values become NaN.
    for c in [x_col, y1_col, y2_col]:
        d[c] = pd.to_numeric(d[c], errors="coerce")

    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=(
            f"{x_col} vs {y1_col}",
            f"{x_col} vs {y2_col}"
        ),
        horizontal_spacing=0.08
    )

    # (subplot column, y column, palette index used when color_by is not set)
    paneles = [(1, y1_col, 0), (2, y2_col, 2)]
    subconjuntos = {
        col: d[(d[x_col] > 0) & (d[y_col] > 0)].dropna(subset=[x_col, y_col])
        for col, y_col, _ in paneles
    }

    # Build ONE category -> colour mapping from the data of both panels so a
    # category has the same colour left and right, even when it is missing
    # from one of the two subsets.
    color_map = None
    if color_by:
        categorias = pd.concat(
            [sub[color_by] for sub in subconjuntos.values()]
        ).dropna().unique()
        color_map = {
            cat: CORP_PALETTE[i % len(CORP_PALETTE)]
            for i, cat in enumerate(categorias)
        }

    # Known columns get a descriptive axis title; any other column is labelled
    # with its own name instead of a misleading hard-coded text.
    etiquetas = {
        "log_coliformes": "log10 Coliformes (UFC/g)",
        "log_hongos": "log10 Hongos (UFC/g)",
        "log_levaduras": "log10 Levaduras (UFC/g)",
    }

    ya_en_leyenda = set()
    for col, y_col, idx_color in paneles:
        datos = subconjuntos[col]
        if color_by:
            parcial = px.scatter(
                datos, x=x_col, y=y_col, color=color_by,
                color_discrete_sequence=CORP_PALETTE,
                color_discrete_map=color_map,
                opacity=0.85
            )
            for tr in parcial.data:
                # Every category appears in the legend exactly once: on the
                # first panel where it occurs. Unnamed traces are never listed.
                tr.showlegend = bool(tr.name) and tr.name not in ya_en_leyenda
                ya_en_leyenda.add(tr.name)
                fig.add_trace(tr, row=1, col=col)
        else:
            fig.add_trace(
                go.Scatter(
                    x=datos[x_col], y=datos[y_col],
                    mode="markers",
                    marker=dict(size=7, opacity=0.85, color=CORP_PALETTE[idx_color]),
                    showlegend=False,
                    name=f"{x_col} vs {y_col}"
                ),
                row=1, col=col
            )

        fig.update_xaxes(title_text=etiquetas.get(x_col, x_col), row=1, col=col)
        fig.update_yaxes(title_text=etiquetas.get(y_col, y_col), row=1, col=col)

    # Corporate look: fixed size, transparent background, black text and axes.
    fig.update_layout(
        width=width, height=height,
        title=suptitle, title_x=0.5,
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
        font=dict(color="black"),
        margin=dict(l=70, r=30, t=80, b=60),
        legend=dict(font=dict(color="black"))
    )
    fig.update_xaxes(showline=True, linecolor="black",
                     tickfont=dict(color="black"), title_font=dict(color="black"))
    fig.update_yaxes(showline=True, linecolor="black",
                     tickfont=dict(color="black"), title_font=dict(color="black"))

    if output_html:
        fig.write_html(output_html, include_plotlyjs="cdn", full_html=False)

    return fig



# Corporate colours
CORP_MARKER = "#1A494C"   # scatter points
CORP_BAND   = "rgba(28, 128, 116, 0.08)"  # shaded band marking the fit range

def scatter_regresion_rango_corporativo(
    df: pd.DataFrame,
    x: str,
    y: str,
    *,
    fit_range_x: tuple[float, float],   # x interval used for the FIT
    solo_positivos: bool = True,
    title: str | None = None,
    width: int = 1000,
    height: int = 500,
    output_html: str | None = None,
    reg_line_color: str = "#FF0000",    # regression line colour (red)
):
    """Scatter plot of ``y`` vs ``x`` with a straight-line fit over an x range.

    All retained points are drawn, but the degree-1 ``np.polyfit`` only uses
    the points whose x lies inside ``fit_range_x`` (bounds included). The range
    is shaded, the fitted line is drawn across it, and the equation, R² and n
    are shown as an annotation.

    Parameters
    ----------
    df : source data; it is copied, never modified.
    x, y : column names (coerced to numeric; rows with NaN in either are dropped).
    fit_range_x : two bounds of the fit interval; they may be given in either order.
    solo_positivos : keep only rows where both x and y are > 0.
    title : figure title; defaults to ``"{y} vs {x} (regresión en rango)"``.
    width, height : figure size in pixels.
    output_html : if given, the figure is also written there as an HTML fragment.
    reg_line_color : colour of the fitted line.

    Returns
    -------
    plotly.graph_objects.Figure

    Raises
    ------
    ValueError
        If fewer than two points, or fewer than two distinct x values, fall
        inside the fit range (a line cannot be determined in either case).
    """
    d = df.copy()
    d[x] = pd.to_numeric(d[x], errors="coerce")
    d[y] = pd.to_numeric(d[y], errors="coerce")
    d = d.dropna(subset=[x, y])

    if solo_positivos:
        d_scatter = d[(d[x] > 0) & (d[y] > 0)].copy()
    else:
        d_scatter = d.copy()

    # Accept the bounds in either order instead of silently selecting nothing.
    xmin, xmax = sorted(fit_range_x)
    d_fit = d_scatter[(d_scatter[x] >= xmin) & (d_scatter[x] <= xmax)].copy()
    if len(d_fit) < 2:
        raise ValueError(f"No hay suficientes puntos en el rango {fit_range_x} para ajustar.")

    xv = d_fit[x].to_numpy()
    yv = d_fit[y].to_numpy()
    # With a single distinct x the least-squares problem is rank deficient and
    # polyfit would return an arbitrary line, so refuse explicitly.
    if np.unique(xv).size < 2:
        raise ValueError(
            f"Se necesitan al menos dos valores distintos de '{x}' "
            f"en el rango {fit_range_x} para ajustar."
        )

    slope, intercept = np.polyfit(xv, yv, 1)
    y_pred = intercept + slope * xv
    ss_res = np.sum((yv - y_pred) ** 2)
    ss_tot = np.sum((yv - np.mean(yv)) ** 2)
    # R² is undefined when y is constant inside the range.
    r2 = float(1 - ss_res / ss_tot) if ss_tot > 0 else float("nan")
    n = int(len(d_fit))

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=d_scatter[x], y=d_scatter[y],
        mode="markers",
        marker=dict(size=7, opacity=0.85, color=CORP_MARKER),
        name="Datos"
    ))

    # Shade the interval that was used for the fit.
    fig.add_vrect(x0=xmin, x1=xmax, fillcolor=CORP_BAND, line_width=0, layer="below")

    # The fitted line is only drawn across the fit interval.
    x_line = np.array([xmin, xmax])
    y_line = intercept + slope * x_line
    fig.add_trace(go.Scatter(
        x=x_line, y=y_line,
        mode="lines",
        line=dict(width=3, color=reg_line_color),
        name="Regresión (rango)"
    ))

    params_txt = (
        f"Ajuste en [{xmin:.2f}, {xmax:.2f}]<br>"
        f"y = {intercept:.3f} + {slope:.3f}·x<br>"
        f"R² = {r2:.3f} · n = {n}"
    )
    fig.add_annotation(
        xref="paper", yref="paper", x=0.02, y=0.98,
        text=params_txt, showarrow=False, align="left",
        font=dict(color="black"), bgcolor="rgba(0,0,0,0)"
    )

    fig.update_layout(
        width=width, height=height,
        title=title or f"{y} vs {x} (regresión en rango)",
        title_x=0.5,
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
        font=dict(color="black"),
        margin=dict(l=70, r=30, t=80, b=60),
        showlegend=False
    )
    fig.update_xaxes(title_text=x, showline=True, linecolor="black",
                     tickfont=dict(color="black"), title_font=dict(color="black"))
    fig.update_yaxes(title_text=y, showline=True, linecolor="black",
                     tickfont=dict(color="black"), title_font=dict(color="black"))

    if output_html:
        fig.write_html(output_html, include_plotlyjs="cdn", full_html=False)

    return fig


def boxplot_corporativo_con_mediana(
    df: pd.DataFrame,
    x: str,
    y: str,
    *,
    color_by: str | None = None,
    title: str | None = None,
    yaxis_title: str | None = None,
    points: str | bool = "outliers",
    solo_y_positivos: bool = False,
    output_html: str | None = None,
    median_line_color: str = "#FF0000",   # red
):
    """Box plot of ``y`` per ``x`` category with a dashed line at the overall median.

    Note: a second function with this same name is defined further down in
    this cell; once the whole cell has run, that later definition is the one
    bound to the name.

    Parameters
    ----------
    df : source data; it is copied, never modified.
    x, y : grouping column and value column (``y`` is coerced to numeric).
    color_by : optional column used to colour the boxes.
    title : figure title; defaults to ``"Distribución de {y}"``.
    yaxis_title : y-axis label; defaults to ``y``.
    points : forwarded to ``px.box``.
    solo_y_positivos : keep only rows with ``y > 0``.
    output_html : if given, the figure is also written there as an HTML fragment.
    median_line_color : colour of the dashed median line.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    datos = df.copy()
    datos[y] = pd.to_numeric(datos[y], errors="coerce")
    if solo_y_positivos:
        datos = datos[datos[y] > 0]
    datos = datos.dropna(subset=[x, y])

    fig = px.box(
        datos,
        x=x,
        y=y,
        points=points,
        title=title or f"Distribución de {y}",
        color_discrete_sequence=CORP_PALETTE,
        color=color_by if color_by else None,
    )
    fig.update_traces(boxmean=False)

    # Dashed reference line at the median of all retained y values.
    valor_mediana = datos[y].median(skipna=True)
    fig.add_hline(
        y=valor_mediana,
        line_dash="dash",
        line_color=median_line_color,
        annotation_text=f"Mediana = {valor_mediana:.2f}",
        annotation_position="top right",
        annotation_font_color="blue",
    )

    # Corporate look: centred title, transparent background, black text/axes.
    texto_negro = dict(color="black")
    estilo_eje = dict(
        showline=True, linecolor="black",
        tickfont=texto_negro, title_font=texto_negro,
    )
    fig.update_layout(
        title_x=0.5,
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
        font=texto_negro,
        xaxis=estilo_eje,
        yaxis=estilo_eje,
        legend=dict(font=texto_negro),
    )
    fig.update_yaxes(title_text=yaxis_title or y)

    if output_html:
        fig.write_html(output_html, include_plotlyjs="cdn", full_html=False)

    return fig


import pandas as pd
import plotly.express as px

# Fallback corporate palette, defined only when no CORP_PALETTE exists yet.
if "CORP_PALETTE" not in globals():
    CORP_PALETTE = [
        "#1C8074", "#666666", "#1A494C",
        "#94AF92", "#E6ECD8", "#C9C9C9",
    ]

def violin_corporativo_con_mediana(
    df: pd.DataFrame,
    x: str,
    y: str,
    *,
    color_by: str | None = None,      # optional column used for colouring
    title: str | None = None,
    yaxis_title: str | None = None,
    points: str | bool = "outliers",  # "outliers" | "all" | False
    solo_y_positivos: bool = False,
    output_html: str | None = None,
    median_line_color: str = "#FF0000",  # GLOBAL median line (red)
    box_inside: bool = True,             # mini box plot inside each violin
    meanline: bool = False,              # per-category MEAN line
    width: int = 1000,
    height: int = 500,
):
    """Violin plot of ``y`` per ``x`` category with a dashed global-median line.

    Parameters
    ----------
    df : source data; it is copied, never modified.
    x, y : grouping column and value column (``y`` is coerced to numeric).
    color_by : optional column used to colour the violins.
    title : figure title; defaults to ``"Distribución (violín) de {y}"``.
    yaxis_title : y-axis label; defaults to ``y``.
    points : forwarded to ``px.violin``.
    solo_y_positivos : keep only rows with ``y > 0``.
    output_html : if given, the figure is also written there as an HTML fragment.
    median_line_color : colour of the dashed median line.
    box_inside : forwarded to ``px.violin`` as ``box``.
    meanline : when true, the mean line of every violin trace is made visible.
    width, height : figure size in pixels.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    datos = df.copy()
    datos[y] = pd.to_numeric(datos[y], errors="coerce")
    if solo_y_positivos:
        datos = datos[datos[y] > 0]
    datos = datos.dropna(subset=[x, y])

    fig = px.violin(
        datos,
        x=x,
        y=y,
        points=points,
        box=box_inside,
        title=title or f"Distribución (violín) de {y}",
        color_discrete_sequence=CORP_PALETTE,
        color=color_by if color_by else None,
    )

    if meanline:
        # Preferred path first; if that call fails, set the property trace by trace.
        try:
            fig.update_traces(meanline_visible=True, selector=dict(type="violin"))
        except Exception:
            def _mostrar_media(traza):
                if traza.type == "violin":
                    traza.update(meanline=dict(visible=True))

            fig.for_each_trace(_mostrar_media)

    # Dashed reference line at the median of all retained y values.
    valor_mediana = datos[y].median(skipna=True)
    fig.add_hline(
        y=valor_mediana,
        line_dash="dash",
        line_color=median_line_color,
        annotation_text=f"Mediana = {valor_mediana:.2f}",
        annotation_position="top right",
        annotation_font_color="black",
    )

    # Corporate look: size, centred title, transparent background, black text.
    texto_negro = dict(color="black")
    estilo_eje = dict(
        showline=True, linecolor="black",
        tickfont=texto_negro, title_font=texto_negro,
    )
    fig.update_layout(
        width=width, height=height,
        title_x=0.5,
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
        font=texto_negro,
        xaxis=estilo_eje,
        yaxis=estilo_eje,
        legend=dict(font=texto_negro),
        margin=dict(l=60, r=30, t=70, b=50),
    )
    fig.update_yaxes(title_text=yaxis_title or y)

    if output_html:
        fig.write_html(output_html, include_plotlyjs="cdn", full_html=False)

    return fig


import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

def _orden_eje_x(valores: pd.Series) -> list:
    """Return the unique values of ``valores`` in the order used for the x axis.

    Values that all parse as numbers (including numeric strings) are sorted
    numerically; other non-string values (dates, periods, ...) are sorted with
    their natural ordering when they are comparable; anything else keeps its
    order of first appearance. The original values are returned, not converted
    copies, so they still match the categories plotted on the axis.
    """
    unicos = valores.unique().tolist()
    if not unicos:
        return unicos

    try:
        como_num = pd.to_numeric(pd.Series(unicos, dtype=object), errors="coerce")
        todo_numerico = bool(como_num.notna().all())
    except (TypeError, ValueError):
        todo_numerico = False
    if todo_numerico:
        posiciones = np.argsort(como_num.to_numpy(dtype=float), kind="stable")
        return [unicos[i] for i in posiciones]

    if not any(isinstance(v, str) for v in unicos):
        try:
            return sorted(unicos)
        except TypeError:
            pass  # values are not mutually comparable: keep appearance order
    return unicos


def boxplot_corporativo_con_mediana(
    df: pd.DataFrame,
    x: str,
    y: str,
    *,
    color_by: str | None = None,
    title: str | None = None,
    yaxis_title: str | None = None,
    points: str | bool = "outliers",
    solo_y_positivos: bool = False,
    output_html: str | None = None,
    median_line_color: str = "#FF0000",    # colour of the optional global median line
    median_connect_color: str = "#1f77b4", # colour of the line(s) connecting the medians
    show_global_median: bool = False,
    width: int | None = None,
    height: int | None = None,
):
    """Box plot of ``y`` per ``x`` with a dashed line connecting the group medians.

    Without ``color_by`` a single line joins the median of every ``x`` group and
    is drawn in ``median_connect_color``. With ``color_by`` there is one line
    per category; each line reuses the colour of that category's boxes and
    falls back to ``median_connect_color`` when that colour is not available.
    The median points are connected in the same order used for the x axis.

    Parameters
    ----------
    df : source data; it is copied, never modified.
    x, y : grouping column and value column (``y`` is coerced to numeric).
    color_by : optional column used to colour the boxes and split the lines.
    title : figure title; defaults to ``"Distribución de {y}"``.
    yaxis_title : y-axis label; defaults to ``y``.
    points : forwarded to ``px.box``.
    solo_y_positivos : keep only rows with ``y > 0``.
    output_html : if given, the figure is also written there as an HTML fragment.
    median_line_color : colour of the global median line (see ``show_global_median``).
    median_connect_color : colour of the median-connecting line(s).
    show_global_median : also draw a dashed horizontal line at the median of
        all retained ``y`` values. Off by default.
    width, height : optional figure size in pixels; left to Plotly when ``None``.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    d = df.copy()
    d[y] = pd.to_numeric(d[y], errors="coerce")
    if solo_y_positivos:
        d = d[d[y] > 0]
    d = d.dropna(subset=[x, y])

    # One shared ordering for the axis categories AND the median lines, so the
    # lines always run left to right across the boxes.
    x_order = _orden_eje_x(d[x])
    x_rank = {valor: pos for pos, valor in enumerate(x_order)}

    kwargs = dict(
        x=x, y=y, points=points,
        title=title or f"Distribución de {y}",
        color_discrete_sequence=CORP_PALETTE,
    )
    if color_by:
        kwargs["color"] = color_by

    fig = px.box(d, **kwargs)
    fig.update_traces(boxmean=False)

    # Optional horizontal line at the global median.
    if show_global_median and not d.empty:
        mediana_global = d[y].median(skipna=True)
        fig.add_hline(
            y=mediana_global,
            line_dash="dash",
            line_color=median_line_color,
            annotation_text=f"Mediana = {mediana_global:.2f}",
            annotation_position="top right",
            annotation_font_color="black"
        )

    # ===== Connect the medians of each x group (per colour category if any) =====
    if color_by:
        # Box colour of each category, keyed by trace name, so that every
        # median line can match its boxes.
        colores_caja = {
            tr.name: tr.marker.color for tr in fig.data if tr.type == "box"
        }
        meds = (
            d.groupby([x, color_by], dropna=False)[y]
              .median()
              .reset_index()
        )
        for k, sub in meds.groupby(color_by, dropna=False):
            sub = sub.sort_values(by=x, key=lambda s: s.map(x_rank))
            fig.add_trace(
                go.Scatter(
                    x=sub[x],
                    y=sub[y],
                    mode="lines+markers",
                    name=f"Mediana ({color_by}={k})",
                    line=dict(
                        dash="dash",
                        color=colores_caja.get(str(k)) or median_connect_color,
                    ),
                    marker=dict(size=6),
                    legendgroup=f"med-{k}",
                    showlegend=True
                )
            )
    else:
        meds = d.groupby(x, dropna=False)[y].median().reset_index()
        meds = meds.sort_values(by=x, key=lambda s: s.map(x_rank))

        fig.add_trace(
            go.Scatter(
                x=meds[x],
                y=meds[y],
                mode="lines+markers",
                name=f"Mediana por grupo ({x})",
                line=dict(dash="dash", color=median_connect_color),
                marker=dict(size=6),
                showlegend=True
            )
        )

    # Corporate look: centred title, transparent background, black text/axes.
    layout = dict(
        title_x=0.5,
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
        font=dict(color="black"),
        xaxis=dict(
            showline=True, linecolor="black",
            tickfont=dict(color="black"), title_font=dict(color="black"),
            categoryorder="array", categoryarray=x_order
        ),
        yaxis=dict(showline=True, linecolor="black",
                   tickfont=dict(color="black"), title_font=dict(color="black")),
        legend=dict(font=dict(color="black"))
    )
    if width is not None:
        layout["width"] = width
    if height is not None:
        layout["height"] = height
    fig.update_layout(**layout)
    fig.update_yaxes(title_text=yaxis_title or y)

    if output_html:
        fig.write_html(output_html, include_plotlyjs="cdn", full_html=False)

    return fig



import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots

# Corporate palette
CORP_PALETTE = ["#1C8074", "#666666",  "#E6ECD8", "#C9C9C9", "#1A494C", "#94AF92", "#E6ECD8", "#C9C9C9"]


def boxgrid_2x2_corporativo(
    df: pd.DataFrame,
    *,
    x_col: str,
    y_cols: list[str],                 # exactly 4 value columns, one per panel
    subplot_titles: list[str] | None = None,
    points: str | bool = "outliers",
    suptitle: str = "Distribuciones",
    output_html: str | None = None,
    solo_y_positivos: bool = False,
    width: int = 1000,
    height: int = 500,
    median_line_color: str = "#FF0000",
):
    """Build a 2x2 grid of box plots, one panel per column in ``y_cols``.

    Every panel shows the distribution of one variable per ``x_col`` category
    plus a dashed horizontal line at that variable's overall median. Panels are
    filled row by row. Rows with a missing value in ``x_col`` or in ANY of the
    four variables are removed before plotting, so all panels share the same
    rows; with ``solo_y_positivos`` only rows where all four variables are > 0
    are kept.

    Parameters
    ----------
    df : source data; it is copied, never modified.
    x_col : grouping column shown on every x axis.
    y_cols : the four value columns (coerced to numeric).
    subplot_titles : panel titles; defaults to ``y_cols``.
    points : forwarded to ``px.box``.
    suptitle : figure title (centred).
    output_html : if given, the figure is also written there as a full HTML page.
    solo_y_positivos : keep only rows where every variable is > 0.
    width, height : figure size in pixels.
    median_line_color : colour of the dashed median lines.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    assert len(y_cols) == 4, "y_cols debe tener 4 variables (2x2)."

    datos = df.copy()
    for variable in y_cols:
        datos[variable] = pd.to_numeric(datos[variable], errors="coerce")
    datos = datos.dropna(subset=[x_col] + y_cols)
    if solo_y_positivos:
        datos = datos[(datos[y_cols] > 0).all(axis=1)]

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=subplot_titles or y_cols,
        horizontal_spacing=0.08,
        vertical_spacing=0.3
    )

    for idx, variable in enumerate(y_cols):
        # Row-major placement: 0 -> (1, 1), 1 -> (1, 2), 2 -> (2, 1), 3 -> (2, 2).
        fila, columna = divmod(idx, 2)
        celda = dict(row=fila + 1, col=columna + 1)

        caja = px.box(
            datos, x=x_col, y=variable,
            points=points,
            color_discrete_sequence=CORP_PALETTE,
        )
        # No legend entries for the individual panel traces.
        for traza in caja.data:
            traza.showlegend = False
            fig.add_trace(traza, **celda)

        # Dashed line at the overall median of this variable.
        mediana = datos[variable].median(skipna=True)
        fig.add_hline(
            y=mediana,
            line_dash="dash",
            line_color=median_line_color,
            annotation_text=f"Mediana = {mediana:.2f}",
            annotation_position="top right",
            annotation_font_color="black",
            **celda
        )

        fig.update_xaxes(title_text=x_col, **celda)
        fig.update_yaxes(title_text="log10(UFC/g)", **celda)

    # Global style and size.
    texto_negro = dict(color="black")
    fig.update_layout(
        width=width,
        height=height,
        title=suptitle,
        title_x=0.5,
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
        font=texto_negro,
        legend=dict(font=texto_negro),
        margin=dict(l=70, r=30, t=90, b=60),
    )
    estilo_eje = dict(showline=True, linecolor="black",
                      tickfont=texto_negro, title_font=texto_negro)
    fig.update_xaxes(**estilo_eje)
    fig.update_yaxes(**estilo_eje)

    if output_html:
        fig.write_html(output_html, include_plotlyjs="cdn", full_html=True)

    return fig


# =========================
# EJEMPLOS DE USO
# =========================
# Ejemplo 1: un solo boxplot
# boxplot_corporativo_con_mediana(
#     r, x="AW", y="log_coliformes",
#     title="log10 Coliformes por rango de actividad de agua",
#     yaxis_title="log10(UFC/g)",
#     points="outliers",
#     output_html=f"{ROOT_IMAGEN}/box_coliformes.html",
#     width=1000, height=500
# )

# Ejemplo 2: grid 2×2
# fig = boxgrid_2x2_corporativo(
#     r,
#     x_col="AW",
#     y_cols=["log_coliformes", "log_rat", "log_levaduras", "log_hongos"],
#     subplot_titles=[
#         "log10 Coliformes por rango de actividad de agua",
#         "log10 RAT por rango de actividad de agua",
#         "log10 Levaduras por rango de actividad de agua",
#         "log10 Hongos por rango de actividad de agua",
#     ],
#     suptitle="Boxplots por AW (paleta corporativa, mediana roja)",
#     output_html=f"{ROOT_IMAGEN}/boxgrid_aw_2x2.html",
#     width=1000, height=500
# )
# fig.show()


def scatter_corporativo(
    df: pd.DataFrame,
    *,
    x_col: str,
    y_col: str,
    color_by: str | None = None,     # e.g. "Especie"
    filter_col: str | None = None,   # e.g. "log_coliformes" (column used to filter rows)
    min_filter_val: float | None = None,  # e.g. 0 (exclusive lower bound for filter_col)
    width: int = 900,
    height: int = 520,
    title: str = "",
    output_html: str | None = None,
    palette: list[str] | None = None
):
    """Single scatter plot of ``y_col`` vs ``x_col`` in the corporate style.

    Parameters
    ----------
    df : source data; it is copied, never modified.
    x_col, y_col : plotted columns (coerced to numeric; rows with NaN dropped).
    color_by : optional column; when present in ``df`` one trace per distinct
        value is drawn and the legend is shown. Missing values are grouped
        under the label ``"NA"``.
    filter_col : optional column (coerced to numeric) used to filter rows; rows
        where it is missing are dropped.
    min_filter_val : when given with ``filter_col``, only rows with
        ``filter_col > min_filter_val`` are kept.
    width, height : figure size in pixels.
    title : figure title (centred).
    output_html : if given, the figure is also written there as an HTML fragment.
    palette : colours cycled over the categories; defaults to the global
        ``CORP_PALETTE`` if defined, else Plotly's ``Set2``.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    d = df.copy()

    # Make sure both axes are numeric.
    d[x_col] = pd.to_numeric(d[x_col], errors="coerce")
    d[y_col] = pd.to_numeric(d[y_col], errors="coerce")

    # Optional filter on a third column (like r[r['log_coliformes'] > 0]).
    if filter_col is not None:
        d[filter_col] = pd.to_numeric(d[filter_col], errors="coerce")
        if min_filter_val is not None:
            d = d[d[filter_col] > min_filter_val]
        d = d.dropna(subset=[filter_col])

    d = d.dropna(subset=[x_col, y_col])

    if palette is None:
        palette = globals().get("CORP_PALETTE", px.colors.qualitative.Set2)

    fig = go.Figure()

    if color_by and color_by in d.columns:
        # Label missing categories BEFORE converting to str: astype(str) turns
        # NaN into the text "nan", after which fillna() has nothing to fill.
        # Going through object dtype also works for categorical columns.
        etiquetas = (
            d[color_by].astype(object)
            .where(d[color_by].notna(), "NA")
            .astype(str)
        )
        for i, c in enumerate(etiquetas.unique().tolist()):
            g = d[(etiquetas == c).to_numpy()]
            fig.add_trace(
                go.Scatter(
                    x=g[x_col], y=g[y_col],
                    mode="markers",
                    name=str(c),
                    marker=dict(size=7, opacity=1,
                                color=palette[i % len(palette)]),
                )
            )
        showlegend = True
    else:
        fig.add_trace(
            go.Scatter(
                x=d[x_col], y=d[y_col],
                mode="markers",
                name=f"{x_col} vs {y_col}",
                marker=dict(size=7, opacity=1,
                            color=palette[0]),
            )
        )
        showlegend = False

    # Corporate look: transparent background, black text, horizontal legend on top.
    fig.update_layout(
        width=width, height=height,
        title=title, title_x=0.5,
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
        font=dict(color="black"),
        margin=dict(l=70, r=30, t=60, b=60),
        legend=dict(font=dict(color="black"), orientation="h",
                    yanchor="bottom", y=1.02, xanchor="left", x=0),
        showlegend=showlegend
    )
    fig.update_xaxes(title_text=x_col, showline=True, linecolor="black",
                     tickfont=dict(color="black"), title_font=dict(color="black"))
    fig.update_yaxes(title_text=y_col, showline=True, linecolor="black",
                     tickfont=dict(color="black"), title_font=dict(color="black"))

    if output_html:
        fig.write_html(output_html, include_plotlyjs="cdn", full_html=False)

    return fig


In [41]:
import unicodedata

def normalizar_categoria(s):
    """Normalise a category label for comparison.

    The value is converted to text, trimmed, lower-cased, stripped of accents
    (combining marks are removed after NFKD decomposition) and runs of
    whitespace are collapsed to a single space. Missing values (anything
    ``pd.isna`` flags) are returned unchanged. Non-string values such as
    numeric codes are accepted and normalised through their ``str()`` form.
    """
    if pd.isna(s):
        return s
    # str() lets numeric or other non-text labels through instead of failing
    # on the missing .strip() method.
    texto = str(s).strip().lower()
    sin_acentos = ''.join(
        c for c in unicodedata.normalize("NFKD", texto)
        if not unicodedata.combining(c)
    )
    return ' '.join(sin_acentos.split())  # collapse repeated whitespace



In [42]:
# Product names classified as corn and corn derivatives. The entries are
# lower-case and accent-free, i.e. already in the form produced by
# normalizar_categoria.
categoria_maiz = [
    "maiz quebrado",
    #"acemite",  # disabled entry: currently not counted in this group
    "harina de maiz",
    "maiz amarillo grano entero",
    "maiz grueso 5.0mm x 7.0mm",
    "maiz molido 3mm x 3mm",
    "maiz grueso 5.0mm x 5.0mm",
    "maiz grueso 8.0mm x 5mm",
    "maiz grueso 8.0mm x 8.0mm",
    "maiz molienda fina",
    "maiz amarillo molino venta",
    "maiz grueso 7.0mm x 7.0mm",
    "semola maiz",
    "maiz grueso 6.0mm x 7.0mm",
    "maiz molido 4mm x 4mm",
]


In [43]:
# Both sheets come from the same workbook; each one is read into its own frame.
df_micro, df_toxinas = (
    s3.read_excel(
        "raw/costa_rica/BASE DE DATOS MICROBIOLOGIA_2025 (3).xlsx",
        sheet_name=hoja,
    )
    for hoja in ("Base Microbiologia 2025", "Base Micotoxinas 2025")
)


Conditional Formatting extension is not supported and will be removed



In [44]:
import unicodedata
def normalizar_categoria(s):
    """Normalise a category label for comparison.

    The value is converted to text, trimmed, lower-cased, stripped of accents
    (combining marks are removed after NFKD decomposition) and runs of
    whitespace are collapsed to a single space. Missing values (anything
    ``pd.isna`` flags) are returned unchanged. Non-string values such as
    numeric codes are accepted and normalised through their ``str()`` form.
    """
    if pd.isna(s):
        return s
    # str() lets numeric or other non-text labels through instead of failing
    # on the missing .strip() method.
    texto = str(s).strip().lower()
    sin_acentos = ''.join(
        c for c in unicodedata.normalize("NFKD", texto)
        if not unicodedata.combining(c)
    )
    return ' '.join(sin_acentos.split())  # collapse repeated whitespace


In [45]:
# Normalise the column names of both sheets (in place): trimmed, lower-case.
dfs = [df_micro, df_toxinas]
for df in dfs:
    df.columns = [x.strip().lower() for x in df.columns]

# Keep only rows that have a client.
df_micro = df_micro[df_micro["clientes"].notnull()].copy()

# Exclude the rows whose client is one of these two labels.
clients = ['SUPERFICIE', 'AGUA']
df_micro = df_micro[~df_micro["clientes"].isin(clients)]

# Drop rows without a lot BEFORE casting to str: astype(str) can turn missing
# values into the text "nan", after which notnull() no longer detects them.
# The .copy() makes the column assignments below act on an independent frame.
df_micro = df_micro[df_micro["lote"].notnull()].copy()
df_micro["lote"] = df_micro["lote"].astype(str)

df_micro["producto"] = df_micro["producto"].apply(normalizar_categoria)
df_micro["procedencia"] = df_micro["procedencia"].apply(normalizar_categoria)
df_micro["tolva"] = df_micro["tolva"].astype(str).apply(normalizar_categoria)
# Stage: "mp" when the client is 'MATERIA PRIMA', "pt" for everything else.
df_micro["etapa"] = np.where(df_micro["clientes"]=='MATERIA PRIMA', "mp", "pt")

# Counts as numbers plus their log10(count + 1) transform (the +1 keeps zero counts finite).
for cl in ["hongos", "levaduras"]:
    df_micro[cl] = pd.to_numeric(df_micro[cl])
    df_micro[f"log_{cl}"] = np.log10(df_micro[cl]+1)

for cl in ["fecha", "fecha de toma de muestra"]:
    df_micro[cl] = pd.to_datetime(df_micro[cl], errors="coerce")

# Difference in days (fecha - fecha de toma de muestra)
df_micro["diff_dias"] = (df_micro["fecha"] - df_micro["fecha de toma de muestra"]).dt.days
df_micro["anio_mes"] = df_micro['fecha'].dt.to_period("M")#.astype(str)

df_micro["grupo_mp"] = np.where(
    df_micro["producto"].isin(categoria_maiz),
    "maiz y derivados",
    "otras materias primas",
)


# Columns kept for the analysis (diff_dias is not among them).
cls_micro = [
'fecha',  'productos', 'producto', 'grupo_mp',
'procedencia', 'tolva', 'lote', 'placa',
'coliformes totales', 'e.coli', 'hongos', 'levaduras',
'rta',
'log_hongos', 'log_levaduras',
'anio_mes', 'etapa']
df_micro = df_micro[cls_micro].copy()
cols_micro = [
'log_hongos',
'log_levaduras'

]
for col in cols_micro:
    df_micro[col] = pd.to_numeric(df_micro[col])

# Analysis subsets: corn group, and finished product ("pt").
df_micro_maiz = df_micro[df_micro["grupo_mp"]=='maiz y derivados'].copy()
df_micro_pt = df_micro[df_micro["etapa"]=='pt'].copy()
# In the finished-product subset the lot is converted to a number;
# values that cannot be parsed become NaN.
df_micro_pt["lote"] = pd.to_numeric(df_micro_pt["lote"], errors="coerce")

In [46]:
# Mean log counts per product and origin within the corn group.
(
    df_micro_maiz
    .groupby(["producto", "procedencia"])
    .agg(
        log_hongos=("log_hongos", "mean"),
        log_levaduras=("log_levaduras", "mean"),
    )
)

Unnamed: 0_level_0,Unnamed: 1_level_0,log_hongos,log_levaduras
producto,procedencia,Unnamed: 2_level_1,Unnamed: 3_level_1
harina de maiz,fhacasa,0.0,0.0
harina de maiz,tres jotas,2.466437,1.807136
maiz amarillo grano entero,adm,0.0,5.000004
maiz amarillo grano entero,avin,0.0,3.301247
maiz amarillo grano entero,avuga,0.0,0.347131
maiz amarillo grano entero,bodega 1,,
maiz amarillo grano entero,bodega 1 manejos a granel,,
maiz amarillo grano entero,bodega cordoda,1.491362,0.0
maiz amarillo grano entero,bodegas calderas,1.041393,0.0
maiz amarillo grano entero,coproagro,1.775262,0.624962


In [47]:
# Mean log counts per product and origin for finished product ("pt").
(
    df_micro_pt
    .groupby(["producto", "procedencia"])
    .agg(
        log_hongos=("log_hongos", "mean"),
        log_levaduras=("log_levaduras", "mean"),
    )
)

Unnamed: 0_level_0,Unnamed: 1_level_0,log_hongos,log_levaduras
producto,procedencia,Unnamed: 2_level_1,Unnamed: 3_level_1
alicerdo 2,linea 1,3.000434,1.322219
alicerdo 3,linea 2,2.604866,1.500217
alicerdo 3,vida util,1.322219,1.041393
aliengorde 1,,,
aliengorde 1,linea 1,1.041393,1.491362
...,...,...,...
vitapostura inicio,linea 1,3.301247,2.149219
vitapostura inicio,linea 2,1.347276,1.935764
vitapostura inicio,marco tulio castillo,3.000434,3.049606
vitapostura prepostura,jason dueck,,


In [48]:
# Stage flag: "mp" when the client type is 'Materia Prima', otherwise "pt".
# (This column is not part of the final column selection below.)
df_toxinas["etapa_mico"] = np.where(
    df_toxinas["tipo de cliente"].eq('Materia Prima'), "mp", "pt"
)
df_toxinas["fecha"] = pd.to_datetime(df_toxinas["fecha"], errors="coerce")

# Keep rows that have both a lot and a valid date, then one row per lot
# (the first occurrence wins).
df_toxinas = (
    df_toxinas
    .dropna(subset=["lote", "fecha"])
    .copy()
    .drop_duplicates(["lote"])
)

# Original column header -> short toxin name.
rename_toxinas = {
    'aflatoxina (20 ppb mp- 15 ppb pt)': 'aflatoxina',
    'don (5 ppm mp- 4 ppm pt)': 'don',
    't2 (60 ppb mp- 20 ppb pt)': 't2',
    'fumonisina (50 ppm mp- 50 ppm pt)': 'fumonisina',
    'ocratóxina (10 ppb mp- 10 ppb pt': 'ocratoxina',
    'zearelenona (200 ppb mp- 150 ppb pt)': 'zearelenona',
}
# The original headers, in the same order as the mapping above.
cls_toxinas = list(rename_toxinas)

df_toxinas = df_toxinas.rename(columns=rename_toxinas)

# Toxin readings as numbers; anything unparseable becomes NaN.
for cl in rename_toxinas.values():
    df_toxinas[cl] = pd.to_numeric(df_toxinas[cl], errors="coerce")

cols_micotoxinas = [
    'lote',
    'fecha',
    'producto',
    'procedencia',
    'don',
    't2',
    'aflatoxina',
    'fumonisina',
    'ocratoxina',
    'zearelenona',
    "tipo de cliente",
]
# Descriptive columns get a "_micotoxinas" suffix so they stay distinguishable
# from the same-named columns of the microbiology frame.
df_toxinas = (
    df_toxinas[cols_micotoxinas]
    .copy()
    .rename(columns={
        nombre: f"{nombre}_micotoxinas"
        for nombre in ("fecha", "producto", "procedencia")
    })
)

# "pt" = client type Terceros or Integracion; every other row goes to "mp".
# NOTE(review): this split is not the same rule as etapa_mico above — confirm
# which one is intended for client types outside these three values.
df_toxinas_pt = df_toxinas.loc[
    df_toxinas["tipo de cliente"].isin(["Terceros", "Integracion"])
].copy()
df_toxinas_mp = df_toxinas.loc[
    ~df_toxinas["tipo de cliente"].isin(["Terceros", "Integracion"])
].copy()

In [49]:
# Corn microbiology joined with the "mp" mycotoxin rows that share the same
# lot; only matching lots are kept.
rev_maiz = df_micro_maiz.merge(df_toxinas_mp, how="inner", on="lote")
rev_maiz

Unnamed: 0,fecha,productos,producto,grupo_mp,procedencia,tolva,lote,placa,coliformes totales,e.coli,...,fecha_micotoxinas,producto_micotoxinas,procedencia_micotoxinas,don,t2,aflatoxina,fumonisina,ocratoxina,zearelenona,tipo de cliente
0,2024-02-19,,maiz amarillo grano entero,maiz y derivados,las palmas,,Silo 1,S-9536,0.0,0.0,...,2024-09-23,Maiz Molido Fino,TRANS 407,1.37,0.0,1.9,0.59,0.0,54.0,Materia Prima
1,2024-08-14,,maiz molido 3mm x 3mm,maiz y derivados,,,SILO 4,,10.0,0.0,...,2024-12-17,Maiz amarillo entero,COPROAGRO,0.01,28.0,4.8,0.05,0.2,23.0,Materia Prima
2,2024-09-11,,maiz molido 3mm x 3mm,maiz y derivados,,,SILO 4,,0.0,0.0,...,2024-12-17,Maiz amarillo entero,COPROAGRO,0.01,28.0,4.8,0.05,0.2,23.0,Materia Prima
3,2024-09-17,,maiz grueso 5.0mm x 7.0mm,maiz y derivados,,,SILO 4,,0.0,0.0,...,2024-12-17,Maiz amarillo entero,COPROAGRO,0.01,28.0,4.8,0.05,0.2,23.0,Materia Prima
4,2025-02-18,,maiz amarillo grano entero,maiz y derivados,las palmas,18 ext,SILO 4,,,,...,2024-12-17,Maiz amarillo entero,COPROAGRO,0.01,28.0,4.8,0.05,0.2,23.0,Materia Prima
5,2024-08-27,,maiz grueso 5.0mm x 7.0mm,maiz y derivados,,,SILO 3,,240.0,0.0,...,2024-10-25,Polvo de Maiz,Jose Boza,0.98,3.0,2.1,44.0,2.1,88.0,Materia Prima
6,2024-08-28,,maiz grueso 5.0mm x 7.0mm,maiz y derivados,,,SILO 3,,200.0,0.0,...,2024-10-25,Polvo de Maiz,Jose Boza,0.98,3.0,2.1,44.0,2.1,88.0,Materia Prima
7,2024-09-04,,maiz grueso 7.0mm x 7.0mm,maiz y derivados,,,SILO 3,,0.0,0.0,...,2024-10-25,Polvo de Maiz,Jose Boza,0.98,3.0,2.1,44.0,2.1,88.0,Materia Prima
8,2024-09-04,,maiz molido 3mm x 3mm,maiz y derivados,,,SILO 3,,0.0,0.0,...,2024-10-25,Polvo de Maiz,Jose Boza,0.98,3.0,2.1,44.0,2.1,88.0,Materia Prima
9,2024-09-05,,maiz grueso 7.0mm x 7.0mm,maiz y derivados,,,SILO 3,,10.0,0.0,...,2024-10-25,Polvo de Maiz,Jose Boza,0.98,3.0,2.1,44.0,2.1,88.0,Materia Prima


In [50]:
# Finished-product microbiology joined with the "pt" mycotoxin rows on lot.
rev_pt = df_micro_pt.merge(df_toxinas_pt, how="inner", on="lote")

In [51]:
# Per mycotoxin product (missing product kept as its own group): number of
# rows with a lot, and the mean of every microbiology and toxin variable.
(
    rev_maiz
    .groupby(["producto_micotoxinas"], dropna=False)
    .agg(
        count=("lote", "count"),
        coliformes=("coliformes totales", "mean"),
        **{
            columna: (columna, "mean")
            for columna in [
                "hongos", "levaduras",
                "aflatoxina", "don", "t2",
                "fumonisina", "ocratoxina", "zearelenona",
            ]
        },
    )
)


Unnamed: 0_level_0,count,coliformes,hongos,levaduras,aflatoxina,don,t2,fumonisina,ocratoxina,zearelenona
producto_micotoxinas,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Harina de soya,9,91.666667,25.0,6.666667,13.955556,0.003333,42.666667,0.0,0.888889,292.0
Maiz Entero,3,83.333333,1030.0,10.0,0.0,0.5,0.0,0.03,0.0,0.0
Maiz Molido Fino,1,0.0,1530.0,410.0,1.9,1.37,0.0,0.59,0.0,54.0
Maiz amarillo entero,4,3.333333,136.666667,13.333333,4.8,0.01,28.0,0.05,0.2,23.0
Polvo de Maiz,5,90.0,100.0,26.0,2.1,0.98,3.0,44.0,2.1,88.0
Pulido de Maiz,1,,,,9.4,44.0,27.0,,0.4,656.0


In [52]:
# Corporate colour sequence (hex codes).
CORPORATE_COLORS = [
    "#1A494C",  # dark green
    "#17877D",  # secondary green
    "#94AF92",  # soft green
    "#F6B27A",  # soft orange
    "#F18F01",  # strong orange
    "#E4572E",  # red/orange
    "#6C757D",  # medium grey
    "#343A40",  # dark grey
    "#A3CED0",  # soft teal blue
]


def plot_bar_log_micro(
    df: pd.DataFrame,
    x: str,                    # categorical column (e.g. 'procedencia_micotoxinas')
    y_cols: List[str],         # value columns (e.g. ['log_hongos', 'log_levaduras'])
    title: str = "<b>Comparación de parámetros microbiológicos</b>",
    y_axis_title: str = "Log\u2081\u2080(UFC/g)",
    output_html: Optional[str] = None,  # optional path for an HTML export
) -> go.Figure:
    """
    Draw a grouped bar chart with one bar series per column in ``y_cols``.

    Bars are coloured from ``CORPORATE_COLORS`` (cycling when there are more
    series than colours) and both axes get a black frame.

    Parameters
    ----------
    df : DataFrame holding the category column and the value columns.
    x : name of the categorical column used for the X axis.
    y_cols : columns to plot; each one becomes a bar trace.
    title : figure title (may contain <b> </b> for bold).
    y_axis_title : label for the Y axis.
    output_html : when given, the figure is also written to this HTML path.

    Returns
    -------
    fig : plotly.graph_objects.Figure
    """
    fig = go.Figure()

    for i, col in enumerate(y_cols):
        fig.add_trace(
            go.Bar(
                x=df[x],
                y=df[col],
                # Legend label: "log_hongos" -> "Log Hongos".
                name=col.replace("_", " ").title(),
                marker=dict(color=CORPORATE_COLORS[i % len(CORPORATE_COLORS)]),
            )
        )

    # Corporate layout: white background, centred title, horizontal legend
    # anchored above the plot area on the right.
    fig.update_layout(
        barmode="group",
        title=dict(text=title, x=0.5),
        plot_bgcolor="white",
        paper_bgcolor="white",
        font=dict(family="Arial", size=12, color="black"),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
            font=dict(color="black"),
        ),
    )

    # Black frame on both axes. `title_font` (i.e. title.font) replaces the
    # deprecated `titlefont` attribute, which newer Plotly releases reject.
    axis_frame = dict(
        showline=True,
        linewidth=1.5,
        linecolor="black",
        mirror=True,
        tickfont=dict(color="black"),
        title_font=dict(color="black"),
    )
    fig.update_xaxes(title_text=x.replace("_", " ").title(), **axis_frame)
    fig.update_yaxes(title_text=y_axis_title, **axis_frame)

    if output_html is not None:
        fig.write_html(output_html)

    return fig

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Corporate palette.
# NOTE(review): same nine values as the CORPORATE_COLORS list defined at the
# top of this cell; this re-definition is redundant.
CORPORATE_COLORS = [
    "#1A494C", "#17877D", "#94AF92", "#F6B27A", "#F18F01",
    "#E4572E", "#6C757D", "#343A40", "#A3CED0",
]
def cluster_procedencias_micro(
    df: pd.DataFrame,
    n_clusters: int = 3,
    feature_cols=None,
) -> "Tuple[pd.DataFrame, List[str], KMeans, StandardScaler]":
    """
    Group the rows of an aggregated DataFrame by microbial load using KMeans.

    Parameters
    ----------
    df : aggregated DataFrame (one row per item to cluster, e.g. summary_pt).
    n_clusters : number of groups to build (2, 3, 4, ...).
    feature_cols : columns used as clustering features. Defaults to whichever
        of ``log_hongos`` and ``log_levaduras`` are present in ``df``.

    Returns
    -------
    (df_out, feature_cols, kmeans, scaler)
        df_out : copy of ``df`` with an extra ``cluster_micro`` column
                 (labels 0..n_clusters-1).
        feature_cols : list of the feature columns actually used.
        kmeans : the fitted KMeans estimator.
        scaler : the fitted StandardScaler applied before clustering.

    Raises
    ------
    ValueError
        If no feature column is available, or if any feature contains NaN.
    """
    df_ = df.copy()

    # Without explicit features, fall back to the classic micro columns.
    if feature_cols is None:
        posibles = ["log_hongos", "log_levaduras"]
        feature_cols = [c for c in posibles if c in df_.columns]
    else:
        # Accept any iterable; a tuple would otherwise be read as a single key.
        feature_cols = list(feature_cols)

    if len(feature_cols) == 0:
        raise ValueError("No se encontraron columnas microbiológicas en el DataFrame.")

    # KMeans cannot handle missing values; fail early with an actionable message
    # instead of an error raised deep inside scikit-learn.
    nan_cols = [c for c in feature_cols if df_[c].isna().any()]
    if nan_cols:
        raise ValueError(
            f"Las columnas {nan_cols} contienen NaN; elimine o impute esos valores antes de agrupar."
        )

    X = df_[feature_cols].values

    # Standardise so every feature carries a comparable weight.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Fixed random_state keeps the cluster labels reproducible between runs.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    df_["cluster_micro"] = kmeans.fit_predict(X_scaled)

    return df_, feature_cols, kmeans, scaler


import plotly.express as px
import plotly.graph_objects as go


def apply_corporate_layout(fig: go.Figure, title: str) -> go.Figure:
    """
    Apply the shared corporate styling to ``fig`` and return it.

    Sets a centred title, the ``plotly_white`` template, white backgrounds,
    black Arial text, a horizontal legend anchored above the plot on the right,
    and a black frame on every X and Y axis. The figure is modified in place.

    Parameters
    ----------
    fig : figure to style.
    title : title text.

    Returns
    -------
    The same figure object that was passed in.
    """
    fig.update_layout(
        title=title,
        title_x=0.5,
        template="plotly_white",
        font=dict(family="Arial", size=12, color="black"),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
            font=dict(color="black"),
        ),
        plot_bgcolor="white",
        paper_bgcolor="white",
    )

    # `title_font` (i.e. title.font) replaces the deprecated `titlefont`
    # attribute, which newer Plotly releases reject.
    axis_frame = dict(
        showline=True,
        linewidth=1.5,
        linecolor="black",
        mirror=True,
        tickfont=dict(color="black"),
        title_font=dict(color="black"),
    )
    fig.update_xaxes(**axis_frame)
    fig.update_yaxes(**axis_frame)
    return fig

def plot_micro_clusters(df_clusters: pd.DataFrame, output_html: str | None = None) -> go.Figure:
    """
    Scatter the clustered rows on the log_hongos vs log_levaduras plane,
    coloured by ``cluster_micro``.

    Parameters
    ----------
    df_clusters : DataFrame with ``log_hongos``, ``log_levaduras`` and
        ``cluster_micro``. If a ``Coliformes``/``coliformes`` column exists it
        drives the marker size; if a ``producto`` column exists it is used as
        the hover title.
    output_html : when given, the figure is also written to this HTML path.

    Returns
    -------
    fig : plotly.graph_objects.Figure

    Raises
    ------
    ValueError
        If any of the three required columns is missing.
    """
    if not {"log_hongos", "log_levaduras", "cluster_micro"}.issubset(df_clusters.columns):
        raise ValueError("El DataFrame debe contener log_hongos, log_levaduras y cluster_micro.")

    plot_df = df_clusters.copy()

    # Cluster labels are categories, not magnitudes. Plotly Express treats a
    # numeric colour column as continuous (colour bar) and then ignores
    # `color_discrete_sequence`, so the labels are turned into strings.
    cluster_order = [str(c) for c in sorted(plot_df["cluster_micro"].unique())]
    plot_df["cluster_micro"] = plot_df["cluster_micro"].astype(str)

    # Optional columns: prefer "Coliformes", fall back to "coliformes".
    size_col = next((c for c in ("Coliformes", "coliformes") if c in plot_df.columns), None)
    hover_col = "producto" if "producto" in plot_df.columns else None

    fig = px.scatter(
        plot_df,
        x="log_hongos",
        y="log_levaduras",
        color="cluster_micro",
        category_orders={"cluster_micro": cluster_order},
        hover_name=hover_col,
        size=size_col,
        size_max=20,
        color_discrete_sequence=CORPORATE_COLORS,
    )

    fig.update_xaxes(title_text="log₁₀(Hongos UFC/g)")
    fig.update_yaxes(title_text="log₁₀(Levaduras UFC/g)")
    fig = apply_corporate_layout(fig, "<b>Grupos de productos según carga microbiológica</b>")

    if output_html is not None:
        fig.write_html(output_html)

    return fig



In [53]:
# Summary of `rev_maiz` per origin (`procedencia_micotoxinas`, NaN kept as its
# own group): lot count plus the mean of each log count and mycotoxin.
summary_maiz_procedencia = (
    rev_maiz
    .groupby(["procedencia_micotoxinas"], dropna=False)
    .agg(
        count=("lote", "count"),
        # These output columns keep the name of their source column.
        **{
            name: (name, "mean")
            for name in (
                "log_hongos", "log_levaduras", "aflatoxina", "don", "t2",
                "fumonisina", "ocratoxina", "zearelenona",
            )
        },
        coliformes=("coliformes totales", "mean"),
    )
    .reset_index()
)
summary_maiz_procedencia

Unnamed: 0,procedencia_micotoxinas,count,log_hongos,log_levaduras,aflatoxina,don,t2,fumonisina,ocratoxina,zearelenona,coliformes
0,COPROAGRO,5,1.974434,0.403196,3.84,0.014,22.4,0.04,0.16,52.8,5.0
1,Jose Boza,5,1.815682,0.825622,2.1,0.98,3.0,44.0,2.1,88.0,90.0
2,TRANS 407,1,3.184975,2.613842,1.9,1.37,0.0,0.59,0.0,54.0,0.0
3,,12,1.03259,0.497049,11.25,3.791667,34.25,0.008182,0.7,259.333333,98.75


In [54]:
# Grouped bars of the mean log counts (hongos, levaduras) for each origin.
fig = plot_bar_log_micro(
    summary_maiz_procedencia,
    "procedencia_micotoxinas",
    ["log_hongos", "log_levaduras"],
    title="<b>Hongos y levaduras (Log\u2081\u2080 UFC/g) por procedencia</b>",
)

fig.show()
s3.save_plotly_html(fig, "hongos_levaduras_por_proveedor.html")


In [55]:
# Grouped bars of the mycotoxins labelled in ppb, one group per origin.
fig = plot_bar_log_micro(
    summary_maiz_procedencia,
    "procedencia_micotoxinas",
    ["aflatoxina", "t2", "ocratoxina", "zearelenona"],
    y_axis_title="Concentración (ppb)",
    title="<b>Micotoxinas (PPB) por procedencia</b>",
)

fig.show()
s3.save_plotly_html(fig, "ppb_por_proveedor.html")

In [56]:
# Grouped bars of the mycotoxins labelled in ppm (DON, fumonisin), one group
# per origin.
fig = plot_bar_log_micro(
    df=summary_maiz_procedencia,
    x="procedencia_micotoxinas",
    y_cols=["don", "fumonisina"],
    title="<b>Micotoxinas (PPM) por procedencia</b>",
    y_axis_title="Concentración (ppm)",
)

fig.show()
# Plain string: the previous f-string had no placeholders.
s3.save_plotly_html(fig, "ppm_por_proveedor.html")

In [75]:

# Side-by-side grouped bars per origin: ppb-labelled mycotoxins on the left
# panel, ppm-labelled mycotoxins on the right panel.
y_cols_ppb = ["aflatoxina", "t2", "ocratoxina", "zearelenona"]
y_cols_ppm = ["don", "fumonisina"]

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=(
        "Micotoxinas (PPB) por procedencia",
        "Micotoxinas (PPM) por procedencia",
    ),
)

# One running colour index across both panels, so the right panel continues
# the palette where the left one stopped.
color_idx = 0
for panel, toxin_cols in enumerate((y_cols_ppb, y_cols_ppm), start=1):
    for toxin in toxin_cols:
        bar = go.Bar(
            x=summary_maiz_procedencia["procedencia_micotoxinas"],
            y=summary_maiz_procedencia[toxin],
            name=toxin.capitalize(),
            marker=dict(color=CORPORATE_COLORS[color_idx % len(CORPORATE_COLORS)]),
        )
        fig.add_trace(bar, row=1, col=panel)
        color_idx += 1

# Axis titles for each panel.
for panel, y_title in ((1, "Concentración (ppb)"), (2, "Concentración (ppm)")):
    fig.update_xaxes(title_text="Procedencia Micotoxinas", row=1, col=panel)
    fig.update_yaxes(title_text=y_title, row=1, col=panel)

# Corporate layout.
fig.update_layout(barmode="group")
fig = apply_corporate_layout(fig, "<b>Micotoxinas por procedencia</b>")

# Show and upload.
fig.show()
s3.save_plotly_html(fig, "micotoxinas_ppb_ppm_por_proveedor.html")

In [58]:
# Summary of `rev_maiz` per lot (NaN kept as its own group): row count plus the
# mean of each log count and mycotoxin.
rev_maiz_lote = (
    rev_maiz
    .groupby(["lote"], dropna=False)
    .agg(
        count=("lote", "count"),
        # These output columns keep the name of their source column.
        **{
            name: (name, "mean")
            for name in (
                "log_hongos", "log_levaduras", "aflatoxina", "don", "t2",
                "fumonisina", "ocratoxina", "zearelenona",
            )
        },
        coliformes=("coliformes totales", "mean"),
    )
    .reset_index()
)
rev_maiz_lote

Unnamed: 0,lote,count,log_hongos,log_levaduras,aflatoxina,don,t2,fumonisina,ocratoxina,zearelenona,coliformes
0,MV EVA CARLTON,3,2.753574,0.787871,0.0,0.5,0.0,0.03,0.0,0.0,83.333333
1,MV JIN RUI,1,2.178977,0.0,0.0,0.03,0.0,0.0,0.0,172.0,10.0
2,MV. Dioni GR 14-24,8,0.0,0.322557,15.7,0.0,48.0,0.0,1.0,307.0,108.0
3,SILO 2,1,,,9.4,44.0,27.0,,0.4,656.0,
4,SILO 3,5,1.815682,0.825622,2.1,0.98,3.0,44.0,2.1,88.0,90.0
5,SILO 4,4,1.906253,0.537595,4.8,0.01,28.0,0.05,0.2,23.0,3.333333
6,Silo 1,1,3.184975,2.613842,1.9,1.37,0.0,0.59,0.0,54.0,0.0


In [59]:
# Grouped bars of the mean log counts (hongos, levaduras) for each lot.
fig = plot_bar_log_micro(
    rev_maiz_lote,
    "lote",
    ["log_hongos", "log_levaduras"],
    title="<b>Hongos y levaduras (Log\u2081\u2080 UFC/g) por Lote</b>",
)

fig.show()
s3.save_plotly_html(fig, "hongos_levaduras_por_lote.html")


In [60]:
# Correlation triangle between the log counts and the mycotoxins in `rev_maiz`.
fig = plot_corr_triangle(
    df=rev_maiz,
    value_cols=[
        "log_hongos", "log_levaduras",
        "aflatoxina", "don", "t2", "fumonisina", "ocratoxina", "zearelenona",
    ],
    title="<b>Correlación microbiología–micotoxinas en materias primas de maíz</b>",
    width=900,
    height=500,
    decimals=2,
)
fig.show()
s3.save_plotly_html(fig, "correlacion_maiz.html")

In [61]:
# Scatter of T2 vs. zearalenone in `rev_maiz`.
# The title is now explicit: it used to be f"{cl}", a variable that is not
# defined in this cell and so depended on state leaked from another cell.
f = scatter_corporativo(
        rev_maiz,
        x_col="t2",
        y_col="zearelenona",
        min_filter_val=0,
        title="T2 vs zearalenona en materias primas de maíz",
    )
f.show()
# NOTE(review): the file name does not match what is plotted (t2 vs zearelenona
# from rev_maiz) and may overwrite another asset with the same name — confirm
# the intended name before changing it.
s3.save_plotly_html(f, "log_levaduras_pt.html")

In [62]:
# Summary of `rev_pt` per origin (`procedencia_micotoxinas`, NaN kept as its
# own group): lot count plus the mean of each log count and mycotoxin.
# Stored under its own name: it was previously assigned to `summary_pt`, which
# the next cell (per-product summary) overwrites immediately.
summary_pt_procedencia = rev_pt.groupby(["procedencia_micotoxinas"], dropna=False).agg(
    count=("lote", "count"),
    log_hongos=("log_hongos", "mean"),
    log_levaduras=("log_levaduras", "mean"),
    aflatoxina=("aflatoxina", "mean"),
    don=("don", "mean"),
    t2=("t2", "mean"),
    fumonisina=("fumonisina", "mean"),
    ocratoxina=("ocratoxina", "mean"),
    zearelenona=("zearelenona", "mean"),
    coliformes=("coliformes totales", "mean"),
).reset_index()
summary_pt_procedencia


Unnamed: 0,procedencia_micotoxinas,count,log_hongos,log_levaduras,aflatoxina,don,t2,fumonisina,ocratoxina,zearelenona,coliformes
0,Arenal 2,1,0.0,0.0,0.5,0.0,0.0,0.46,0.0,2.0,0.0
1,Cascada seca,1,1.041393,0.0,0.0,0.0,0.0,1.86,0.0,38.0,0.0
2,Cipresal,1,,,12.2,0.53,32.0,0.9,2.8,150.0,
3,Don Jose,1,1.322219,0.0,4.1,0.0,3.0,0.67,0.0,80.0,10.0
4,El Alto,2,0.925629,0.0,0.35,0.0,0.0,0.135,0.0,11.0,0.0
5,El Cristo,1,,,12.6,0.07,7.0,0.14,0.1,99.0,
6,Esteban Jara,1,3.301247,3.301247,2.9,1.86,5.0,0.43,0.5,164.0,200.0
7,Euardo Alpizar,1,0.0,1.041393,3.2,0.0,6.0,0.17,0.0,65.0,0.0
8,Inara,1,1.491362,1.041393,2.1,0.0,28.0,0.53,0.0,80.0,190.0
9,La Pradera,2,0.0,0.66111,5.05,0.145,4.5,0.59,0.0,60.0,5.0


In [63]:
# Summary of `rev_pt` per product (NaN kept as its own group): lot count plus
# the mean of each log count and mycotoxin.
summary_pt = (
    rev_pt
    .groupby(["producto"], dropna=False)
    .agg(
        count=("lote", "count"),
        # These output columns keep the name of their source column.
        **{
            name: (name, "mean")
            for name in (
                "log_hongos", "log_levaduras", "aflatoxina", "don", "t2",
                "fumonisina", "ocratoxina", "zearelenona",
            )
        },
        coliformes=("coliformes totales", "mean"),
    )
    .reset_index()
)
summary_pt


Unnamed: 0,producto,count,log_hongos,log_levaduras,aflatoxina,don,t2,fumonisina,ocratoxina,zearelenona,coliformes
0,aliengorde 3,1,1.041393,0.0,3.7,0.29,0.0,1.16,0.0,22.0,0.0
1,cerdo crecimiento especial,1,1.612784,1.322219,0.0,0.0,0.0,0.06,0.0,40.0,100.0
2,cerdo gestacion especial,1,3.000434,1.041393,1.7,0.04,0.0,0.22,0.0,41.0,250.0
3,cerdo inicio especial,1,1.041393,1.322219,8.2,0.27,0.0,0.06,1.4,76.0,0.0
4,econoforraje etapa 2,2,,,11.05,0.765,8.5,0.445,0.0,80.5,
5,fase 1,2,1.002161,0.0,1.35,0.0,1.0,0.34,0.15,41.5,0.0
6,fase 2,7,0.699137,0.624836,5.3,0.245714,15.571429,0.337143,0.285714,91.714286,50.0
7,fase 3,5,0.851251,0.330555,9.2,0.368,23.8,1.464,1.26,91.0,27.5
8,fase 4,6,0.921458,0.330555,6.283333,0.311667,11.666667,0.896667,0.466667,94.666667,677.5
9,gr-11,1,1.041393,0.0,0.0,0.0,1.0,0.13,0.0,59.0,0.0


In [64]:

# Correlation triangle between the log counts and the mycotoxins in `rev_pt`.
fig = plot_corr_triangle(
    df=rev_pt,
    value_cols=["log_hongos", "log_levaduras", "aflatoxina", "don",
                "t2", "fumonisina", "ocratoxina", "zearelenona"],
    # The bold tag is now closed with </b>; it used to end with a second <b>.
    title="<b>Correlación variables en producto terminado</b>",
    decimals=2,
    width=900,
    height=500,
)
fig.show()
s3.save_plotly_html(fig, "correlacion_pt.html")

In [65]:
# Scatter of aflatoxin vs. zearalenone in `rev_pt`.
f = scatter_corporativo(
        rev_pt,
        x_col="aflatoxina",
        y_col="zearelenona",
    )
f.show()
# Plain string: the previous f-string had no placeholders.
s3.save_plotly_html(f, "zeare_afla_pt.html")

In [66]:

# Base scatter of aflatoxin vs. zearalenone in `rev_pt`.
fig = scatter_corporativo(
    rev_pt,
    x_col="aflatoxina",
    y_col="zearelenona",
)
# NOTE(review): the upload happens before the fit line is added, so the saved
# file contains only the base scatter.
s3.save_plotly_html(fig, "zeare_afla_pt.html")

# Keep only the rows where both measurements are present.
x = rev_pt["aflatoxina"].astype(float)
y = rev_pt["zearelenona"].astype(float)
mask = x.notna() & y.notna()
x_vals = x[mask].values
y_vals = y[mask].values

# --- Least-squares straight line: y = m x + b ---
m, b = np.polyfit(x_vals, y_vals, 1)

# Evaluate the line over the observed x range.
x_line = np.linspace(x_vals.min(), x_vals.max(), 100)
y_line = m * x_line + b

fig.add_trace(
    go.Scatter(
        x=x_line,
        y=y_line,
        mode="lines",
        name="Ajuste lineal",
        line=dict(color=CORPORATE_COLORS[1], width=3),
    )
)

fig.show()


In [67]:

# Keep only the rows where both measurements are present.
x = rev_pt["aflatoxina"].astype(float)
y = rev_pt["zearelenona"].astype(float)
mask = x.notna() & y.notna()
x_vals = x[mask].values
y_vals = y[mask].values

# Base scatter. The figure is uploaded once, at the end of the cell: an earlier
# upload under the same file name was overwritten by the final one.
fig = scatter_corporativo(
    rev_pt,
    x_col="aflatoxina",
    y_col="zearelenona",
)

# x range over which both fitted curves are evaluated.
x_line = np.linspace(x_vals.min(), x_vals.max(), 200)

# ---- Linear fit: y = m x + b1 ----
coef1 = np.polyfit(x_vals, y_vals, 1)
m, b1 = coef1
y_line1 = m * x_line + b1

eq_lin = f"y = {m:.2f}x + {b1:.2f}"

fig.add_trace(
    go.Scatter(
        x=x_line,
        y=y_line1,
        mode="lines",
        name=f"Ajuste lineal ({eq_lin})",
        line=dict(color=CORPORATE_COLORS[3], width=5),
    )
)

# ---- Quadratic fit: y = a2 x² + b2 x + c2 ----
coef2 = np.polyfit(x_vals, y_vals, 2)
a2, b2, c2 = coef2
y_line2 = np.polyval(coef2, x_line)

eq_quad = f"y = {a2:.3f}x² + {b2:.3f}x + {c2:.3f}"

fig.add_trace(
    go.Scatter(
        x=x_line,
        y=y_line2,
        mode="lines",
        name=f"Ajuste cuadrático ({eq_quad})",
        line=dict(color=CORPORATE_COLORS[5], width=5, dash="dash"),
    )
)

# Boxed annotation with both equations, pinned to the top-left of the plot area.
fig.add_annotation(
    xref="paper", yref="paper",
    x=0.02, y=0.98,
    text=f"Lineal: {eq_lin}<br>Cuadrático: {eq_quad}",
    showarrow=False,
    align="left",
    font=dict(color="black", size=11),
    bordercolor="black",
    borderwidth=1,
    borderpad=4,
    bgcolor="white",
)

fig.show()
s3.save_plotly_html(fig, "zeare_afla_pt.html")


In [68]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Corporate palette.
# NOTE(review): same nine values as the CORPORATE_COLORS lists defined in an
# earlier cell of this notebook; this re-definition is redundant after a full run.
CORPORATE_COLORS = [
    "#1A494C", "#17877D", "#94AF92", "#F6B27A", "#F18F01",
    "#E4572E", "#6C757D", "#343A40", "#A3CED0",
]
def cluster_procedencias_micro(
    df: pd.DataFrame,
    n_clusters: int = 3,
    feature_cols=None,
) -> "Tuple[pd.DataFrame, List[str], KMeans, StandardScaler]":
    """
    Group the rows of an aggregated DataFrame by microbial load using KMeans.

    Parameters
    ----------
    df : aggregated DataFrame (one row per item to cluster, e.g. summary_pt).
    n_clusters : number of groups to build (2, 3, 4, ...).
    feature_cols : columns used as clustering features. Defaults to whichever
        of ``log_hongos`` and ``log_levaduras`` are present in ``df``.

    Returns
    -------
    (df_out, feature_cols, kmeans, scaler)
        df_out : copy of ``df`` with an extra ``cluster_micro`` column
                 (labels 0..n_clusters-1).
        feature_cols : list of the feature columns actually used.
        kmeans : the fitted KMeans estimator.
        scaler : the fitted StandardScaler applied before clustering.

    Raises
    ------
    ValueError
        If no feature column is available, or if any feature contains NaN.
    """
    df_ = df.copy()

    # Without explicit features, fall back to the classic micro columns.
    if feature_cols is None:
        posibles = ["log_hongos", "log_levaduras"]
        feature_cols = [c for c in posibles if c in df_.columns]
    else:
        # Accept any iterable; a tuple would otherwise be read as a single key.
        feature_cols = list(feature_cols)

    if len(feature_cols) == 0:
        raise ValueError("No se encontraron columnas microbiológicas en el DataFrame.")

    # KMeans cannot handle missing values; fail early with an actionable message
    # instead of an error raised deep inside scikit-learn.
    nan_cols = [c for c in feature_cols if df_[c].isna().any()]
    if nan_cols:
        raise ValueError(
            f"Las columnas {nan_cols} contienen NaN; elimine o impute esos valores antes de agrupar."
        )

    X = df_[feature_cols].values

    # Standardise so every feature carries a comparable weight.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Fixed random_state keeps the cluster labels reproducible between runs.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    df_["cluster_micro"] = kmeans.fit_predict(X_scaled)

    return df_, feature_cols, kmeans, scaler


import plotly.express as px
import plotly.graph_objects as go


def apply_corporate_layout(fig: go.Figure, title: str) -> go.Figure:
    """
    Apply the shared corporate styling to ``fig`` and return it.

    Sets a centred title, the ``plotly_white`` template, white backgrounds,
    black Arial text, a horizontal legend anchored above the plot on the right,
    and a black frame on every X and Y axis. The figure is modified in place.

    Parameters
    ----------
    fig : figure to style.
    title : title text.

    Returns
    -------
    The same figure object that was passed in.
    """
    fig.update_layout(
        title=title,
        title_x=0.5,
        template="plotly_white",
        font=dict(family="Arial", size=12, color="black"),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
            font=dict(color="black"),
        ),
        plot_bgcolor="white",
        paper_bgcolor="white",
    )

    # `title_font` (i.e. title.font) replaces the deprecated `titlefont`
    # attribute, which newer Plotly releases reject.
    axis_frame = dict(
        showline=True,
        linewidth=1.5,
        linecolor="black",
        mirror=True,
        tickfont=dict(color="black"),
        title_font=dict(color="black"),
    )
    fig.update_xaxes(**axis_frame)
    fig.update_yaxes(**axis_frame)
    return fig

def plot_micro_clusters(df_clusters: pd.DataFrame, output_html: str | None = None) -> go.Figure:
    """
    Scatter the clustered rows on the log_hongos vs log_levaduras plane,
    coloured by ``cluster_micro``.

    Parameters
    ----------
    df_clusters : DataFrame with ``log_hongos``, ``log_levaduras`` and
        ``cluster_micro``. If a ``Coliformes``/``coliformes`` column exists it
        drives the marker size; if a ``producto`` column exists it is used as
        the hover title.
    output_html : when given, the figure is also written to this HTML path.

    Returns
    -------
    fig : plotly.graph_objects.Figure

    Raises
    ------
    ValueError
        If any of the three required columns is missing.
    """
    if not {"log_hongos", "log_levaduras", "cluster_micro"}.issubset(df_clusters.columns):
        raise ValueError("El DataFrame debe contener log_hongos, log_levaduras y cluster_micro.")

    plot_df = df_clusters.copy()

    # Cluster labels are categories, not magnitudes. Plotly Express treats a
    # numeric colour column as continuous (colour bar) and then ignores
    # `color_discrete_sequence`, so the labels are turned into strings.
    cluster_order = [str(c) for c in sorted(plot_df["cluster_micro"].unique())]
    plot_df["cluster_micro"] = plot_df["cluster_micro"].astype(str)

    # Optional columns: prefer "Coliformes", fall back to "coliformes".
    size_col = next((c for c in ("Coliformes", "coliformes") if c in plot_df.columns), None)
    hover_col = "producto" if "producto" in plot_df.columns else None

    fig = px.scatter(
        plot_df,
        x="log_hongos",
        y="log_levaduras",
        color="cluster_micro",
        category_orders={"cluster_micro": cluster_order},
        hover_name=hover_col,
        size=size_col,
        size_max=20,
        color_discrete_sequence=CORPORATE_COLORS,
    )

    fig.update_xaxes(title_text="log₁₀(Hongos UFC/g)")
    fig.update_yaxes(title_text="log₁₀(Levaduras UFC/g)")
    fig = apply_corporate_layout(fig, "<b>Grupos de productos según carga microbiológica</b>")

    if output_html is not None:
        fig.write_html(output_html)

    return fig

In [69]:
# Cluster the per-product summary into three groups. Rows with a NaN in any
# column are dropped first.
summary_clusters, feature_cols, kmeans, scaler = cluster_procedencias_micro(
    df=summary_pt.dropna(how="any"),
    n_clusters=3,
)
summary_clusters

Unnamed: 0,producto,count,log_hongos,log_levaduras,aflatoxina,don,t2,fumonisina,ocratoxina,zearelenona,coliformes,cluster_micro
0,aliengorde 3,1,1.041393,0.0,3.7,0.29,0.0,1.16,0.0,22.0,0.0,0
1,cerdo crecimiento especial,1,1.612784,1.322219,0.0,0.0,0.0,0.06,0.0,40.0,100.0,0
2,cerdo gestacion especial,1,3.000434,1.041393,1.7,0.04,0.0,0.22,0.0,41.0,250.0,1
3,cerdo inicio especial,1,1.041393,1.322219,8.2,0.27,0.0,0.06,1.4,76.0,0.0,0
5,fase 1,2,1.002161,0.0,1.35,0.0,1.0,0.34,0.15,41.5,0.0,0
6,fase 2,7,0.699137,0.624836,5.3,0.245714,15.571429,0.337143,0.285714,91.714286,50.0,0
7,fase 3,5,0.851251,0.330555,9.2,0.368,23.8,1.464,1.26,91.0,27.5,0
8,fase 4,6,0.921458,0.330555,6.283333,0.311667,11.666667,0.896667,0.466667,94.666667,677.5,0
9,gr-11,1,1.041393,0.0,0.0,0.0,1.0,0.13,0.0,59.0,0.0,0
10,gr-13,1,0.0,0.0,0.5,0.0,0.0,0.46,0.0,2.0,0.0,0


In [70]:

# Plot the clustered products, upload the figure and show it.
fig = plot_micro_clusters(summary_clusters)
s3.save_plotly_html(fig, "clusters_carga_micro.html")
fig.show()

# Cluster assignment table. It is now the last expression of the cell so the
# notebook renders it; as the first statement its result was discarded.
summary_clusters[["producto", "cluster_micro"] + feature_cols].sort_values("cluster_micro")


In [73]:
# Per-cluster profile: the sorted, de-duplicated product names joined into one
# string, plus the mean of each mycotoxin rounded to 2 decimals.
# NOTE(review): the `procedencias` column actually holds values of `producto`;
# the name is kept so the exported CSV keeps its columns.
groups = summary_clusters.groupby("cluster_micro").agg(
    procedencias=(
        "producto",
        lambda x: ", ".join(sorted(set(map(str, x))))
    ),
    aflatoxina=("aflatoxina", "mean"),
    don=("don", "mean"),
    t2=("t2", "mean"),
    fumonisina=("fumonisina", "mean"),
    ocratoxina=("ocratoxina", "mean"),
    zearelenona=("zearelenona", "mean"),
).reset_index().round(2)

# Plain string: the previous f-string had no placeholders.
s3.save_dataframe(groups, "clusters_carga_micro.csv")


In [74]:
# Display the per-cluster summary table.
groups

Unnamed: 0,cluster_micro,procedencias,aflatoxina,don,t2,fumonisina,ocratoxina,zearelenona
0,0,"aliengorde 3, cerdo crecimiento especial, cerd...",4.06,0.19,6.71,0.44,0.25,58.57
1,1,"cerdo gestacion especial, postura 2 ponedora, ...",3.2,0.34,5.46,0.51,0.2,59.0
2,2,"super postura fase 2, vitalechero 16 hiper, vi...",7.77,1.08,15.67,0.78,0.67,93.67
