In [4]:


import sys
sys.path.append(os.path.abspath(".."))
from core.viz import plot_line, create_subplot_grid, plot_bar, plot_statistical_strip
from core.s3 import S3AssetManager

from pathlib import Path
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import numpy as np
import pandas as pd
from typing import Dict, List, Optional, Tuple, Iterable, Mapping, Sequence
import matplotlib.pyplot as plt
import seaborn as sns
import re, unicodedata
import datetime

import os
import glob

In [12]:
notebook_name = "el_dorado_seguimiento_orp"
s3 = S3AssetManager(notebook_name=notebook_name)

In [13]:
colors = [ "#1c8074","#1a494c", "#94af92", "#666666", "#f9ee77", "#f5ad68", "#c76931"]
PALETTE =  [ "#1c8074","#1a494c", "#94af92", "#666666", "#f9ee77", "#f5ad68", "#c76931"]
CORPORATE_COLORS = [
    "#1A494C",  # 0
    "#17877D",  # 1
    "#94AF92",  # 2
    "#F6B27A",  # 3
    "#F18F01",  # 4
    "#E4572E",  # 5
    "#6C757D",  # 6
    "#343A40",  # 7
    "#A3CED0",  # 8
]


In [14]:
def build_stats(
    df: pd.DataFrame,
    group_col: str,
    value_cols: Sequence[str],
) -> pd.DataFrame:
    """
    Calcula estad√≠sticas descriptivas por grupo para un conjunto de columnas num√©ricas.

    Para cada columna en `value_cols` y para cada categor√≠a de `group_col` se calculan:
    - min, max, mean, median, std
    - q05: cuantil 5%
    - q95: cuantil 95%
    - cv : coeficiente de variaci√≥n (std / mean)

    Par√°metros
    ----------
    df : pd.DataFrame
        DataFrame de origen.
    group_col : str
        Nombre de la columna por la que se agrupar√°.
    value_cols : Sequence[str]
        Lista de columnas num√©ricas sobre las que se calcular√°n las estad√≠sticas.

    Returns
    -------
    pd.DataFrame
        DataFrame con una fila por grupo y columnas del tipo:
        `<variable>_<estadistico>` (por ejemplo: `col1_mean`, `col1_cv`, etc.).
    """

    # Definimos funciones de cuantiles con nombre para que salgan bien en las columnas
    def q05(x: pd.Series) -> float:
        return x.quantile(0.05)
    q05.__name__ = "q05"

    def q95(x: pd.Series) -> float:
        return x.quantile(0.95)
    q95.__name__ = "q95"

    # Agrupaci√≥n y c√°lculo de estad√≠sticas b√°sicas
    g = (
        df
        .groupby(group_col,observed=False)[list(value_cols)]
        .agg(["min",q05, "mean", "median", "std",  q95, "max",])
    )

    # A√±adir coeficiente de variaci√≥n (cv = std / mean)
    for c in value_cols:
        mean_ = g[(c, "mean")]
        std_ = g[(c, "std")]

        # Evita divisi√≥n por cero: donde mean_ = 0, el cv se deja en NaN
        cv_ = std_ / mean_.replace(0, np.nan)
        g[(c, "cv")] = cv_

    # Aplanar MultiIndex de columnas: (variable, estadistico) -> "variable_estadistico"
    g.columns = [f"{var}_{stat}" for var, stat in g.columns]

    # Devolver con el √≠ndice (grupo) como columna
    g = g.reset_index()

    return g


In [15]:
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd

def compute_prev_by_metric(
    df: pd.DataFrame,
    *,
    metric_col: str,
    target_col: str,
    n_bins: int = 10,
) -> pd.DataFrame:
    """
    Prevalencia del target (0/1) por rangos de una m√©trica.
    Devuelve: bin, prevalencia, n_total, n_positivos
    """
    tmp = df[[metric_col, target_col]].dropna().copy()

    vmin, vmax = tmp[metric_col].min(), tmp[metric_col].max()
    bins = np.linspace(vmin, vmax, n_bins + 1)

    tmp["bin"] = pd.cut(tmp[metric_col], bins, include_lowest=True)

    prev_por_bin = (
        tmp.groupby("bin", observed=False)[target_col]
           .agg(
               prevalencia="mean",
               n_total="size",
               n_positivos="sum",
           )
           .reset_index()
    )
    return prev_por_bin
def plot_salmo_campy_by_metric(
    df: pd.DataFrame,
    *,
    metric_col: str,          # ej: "cloro_chiller_mean"
    metric_label: str,        # ej: "Cloro chiller"
    salmo_col: str = "salmo_pos",
    campy_col: str = "campy_pos",
    n_bins: int = 10,
    width: int = 800,
    height: int = 600,
) -> go.Figure:

    prev_salmo = compute_prev_by_metric(
        df, metric_col=metric_col, target_col=salmo_col, n_bins=n_bins
    )
    prev_campy = compute_prev_by_metric(
        df, metric_col=metric_col, target_col=campy_col, n_bins=n_bins
    )

    fig = make_subplots(
        rows=1,
        cols=2,
        subplot_titles=("Salmonella", "Campylobacter"),
        shared_yaxes=True,
    )

    # Panel 1: Salmonella
    fig.add_trace(
        go.Bar(
            x=prev_salmo["bin"].astype(str),
            y=prev_salmo["prevalencia"],
            name="Salmonella",
            marker=dict(
                color=CORPORATE_COLORS[1],                   # verde principal
                line=dict(color=CORPORATE_COLORS[0], width=1),
            ),
            customdata=np.stack(
                [prev_salmo["n_total"], prev_salmo["n_positivos"]], axis=-1
            ),
            hovertemplate=(
                "Rango: %{x}<br>"
                "Prevalencia: %{y:.2%}<br>"
                "An√°lisis totales: %{customdata[0]}<br>"
                "An√°lisis positivos: %{customdata[1]}<extra></extra>"
            ),
            showlegend=False,
        ),
        row=1, col=1,
    )

    # Panel 2: Campylobacter
    fig.add_trace(
        go.Bar(
            x=prev_campy["bin"].astype(str),
            y=prev_campy["prevalencia"],
            name="Campylobacter",
            marker=dict(
                color=CORPORATE_COLORS[5],                   # rojo corporativo
                line=dict(color=CORPORATE_COLORS[0], width=1),
            ),
            customdata=np.stack(
                [prev_campy["n_total"], prev_campy["n_positivos"]], axis=-1
            ),
            hovertemplate=(
                "Rango: %{x}<br>"
                "Prevalencia: %{y:.2%}<br>"
                "An√°lisis totales: %{customdata[0]}<br>"
                "An√°lisis positivos: %{customdata[1]}<extra></extra>"
            ),
            showlegend=False,
        ),
        row=1, col=2,
    )

    # T√≠tulos de ejes
    fig.update_xaxes(title_text=f"Rango de {metric_label}", row=1, col=1)
    fig.update_xaxes(title_text=f"Rango de {metric_label}", row=1, col=2)
    fig.update_yaxes(
        title_text="Prevalencia (%)",
        row=1, col=1,
    )

    # Recuadro (marcos) en negro y sin grid
    fig.update_xaxes(
        showline=True,
        linewidth=1,
        linecolor="black",   # üîπ recuadro negro
        mirror=True,
        showgrid=False,
        tickfont=dict(color="black"),   # üîπ ticks negros
        title_font=dict(color="black"), # üîπ label eje X negro
    )
    fig.update_yaxes(
        showline=True,
        linewidth=1,
        linecolor="black",   # üîπ recuadro negro
        mirror=True,
        showgrid=False,
        tickfont=dict(color="black"),   # üîπ ticks negros
        title_font=dict(color="black"), # üîπ label eje Y negro
    )

    fig.update_layout(
        title=f"<b>Prevalencia por rango de {metric_label} <b>",
        paper_bgcolor="rgba(0,0,0,0)",   # sin fondo
        plot_bgcolor="rgba(0,0,0,0)",
        bargap=0.05,
        width=width,
        height=height,
        font=dict(color="black"),
    )

    return fig



In [16]:

# Paleta corporativa
CORPORATE_COLORS = [
    "#1A494C",  # 0
    "#17877D",  # 1
    "#94AF92",  # 2
    "#F6B27A",  # 3
    "#F18F01",  # 4
    "#E4572E",  # 5
    "#6C757D",  # 6
    "#343A40",  # 7
    "#A3CED0",  # 8
]


def plot_validacion_variable_diaria(
    df: pd.DataFrame,
    *,
    date_col: str = "date",
    value_col: str = "cloro_chiller",
    central: str = "median",                # "mean" o "median"
    p_low: float = 0.02,                    # percentil bajo (2 %)
    p_high: float = 0.98,                   # percentil alto (98 %)
    rango_objetivo: tuple[float, float] | None = None,  # (min, max) o None
    title: str | None = None,
    yaxis_title: str | None = None,
    colors: list[str] | None = None,
) -> go.Figure:
    """
    Grafica validaci√≥n diaria de una variable con estilo corporativo:

    - L√≠nea central diaria (media o mediana).
    - Sombra entre los percentiles [p_low, p_high] diarios (por defecto P2‚ÄìP98).
    - Zona √≥ptima HACCP como franja resaltada por encima del fondo.

    Par√°metros
    ----------
    df : DataFrame con columnas [date_col, value_col] (idealmente datos crudos).
    date_col : columna de fecha/hora (se convierte a datetime).
    value_col : columna con la medici√≥n (cloro, pH, ORP, etc.).
    central : "mean" o "median".
    p_low, p_high : percentiles (0‚Äì1) para la banda sombreada.
    rango_objetivo : (min, max) para dibujar banda de zona √≥ptima HACCP.
    """

    if colors is None:
        colors = CORPORATE_COLORS

    # --- Preprocesado ---
    df_plot = df[[date_col, value_col]].dropna(subset=[value_col]).copy()
    df_plot[date_col] = pd.to_datetime(df_plot[date_col], errors="coerce")

    # Agrupamos por d√≠a (ignorando hora)
    df_plot["__date__"] = df_plot[date_col].dt.normalize()
    group = df_plot.groupby("__date__")[value_col]

    # Estad√≠sticos b√°sicos (media y mediana diaria)
    stats = group.agg(["mean", "median"])

    # Percentiles p_low‚Äìp_high (P2‚ÄìP98 por defecto)
    q = group.quantile([p_low, p_high]).unstack(level=-1)
    q.columns = [f"q{int(qi * 100)}" for qi in q.columns]  # q2, q98
    stats = stats.join(q)

    stats = stats.reset_index().rename(columns={"__date__": "date"})
    stats = stats.sort_values("date")

    # Elegir serie central
    central = central.lower()
    if central not in {"mean", "median"}:
        raise ValueError("central debe ser 'mean' o 'median'")
    central_series = stats[central]

    col_low = f"q{int(p_low * 100)}"      # q2
    col_high = f"q{int(p_high * 100)}"    # q98

    # --- Figura ---
    fig = go.Figure()

    # Banda P2‚ÄìP98 (fondo)
    fig.add_trace(
        go.Scatter(
            x=stats["date"],
            y=stats[col_high],
            mode="lines",
            line=dict(width=0),
            showlegend=False,
            hoverinfo="skip",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=stats["date"],
            y=stats[col_low],
            mode="lines",
            line=dict(width=0),
            fill="tonexty",
            fillcolor="rgba(163, 206, 208, 0.55)",  # azul corporativo m√°s intenso
            name=f"Rango diario (P{int(p_low*100)}‚ÄìP{int(p_high*100)})",
            hoverinfo="skip",
        )
    )

    # Zona √≥ptima HACCP (m√°s oscura y por encima del fondo)
    if rango_objetivo is not None:
        y0, y1 = rango_objetivo
        label = f"<b>Zona √≥ptima HACCP ({y0:g}‚Äì{y1:g})<b>"

        fig.add_hrect(
            y0=y0,
            y1=y1,
            fillcolor="rgba(228, 87, 46, 0.45)",  # rojo corporativo m√°s intenso
            line_width=0,
            layer="above",                         # por encima de la franja de fondo
        )

        # Traza dummy solo para leyenda
        fig.add_trace(
            go.Scatter(
                x=[stats["date"].min()],
                y=[y1],
                mode="lines",
                line=dict(width=10, color="rgba(228, 87, 46, 0.9)"),
                name=label,
                showlegend=True,
                hoverinfo="skip",
            )
        )

    # L√≠nea central (m√°s gruesa)
    fig.add_trace(
        go.Scatter(
            x=stats["date"],
            y=central_series,
            mode="lines+markers",
            name=f"{value_col} ({'media' if central=='mean' else 'mediana'} diaria)",
            line=dict(width=3, color=colors[1]),  # l√≠nea m√°s gruesa
            marker=dict(size=6, color=colors[1]),
        )
    )

    # T√≠tulos por defecto
    if title is None:
        title = f"Validaci√≥n diaria de {value_col}"
    if yaxis_title is None:
        yaxis_title = value_col

    # --- Estilo corporativo: sin fondo, con marco negro ---
    fig.update_layout(
        title=title,
        xaxis_title="Fecha",
        yaxis_title=yaxis_title,
        plot_bgcolor="rgba(0,0,0,0)",     # sin fondo
        paper_bgcolor="rgba(0,0,0,0)",    # sin fondo
        font=dict(family="Inter, Arial, sans-serif", size=12, color="#343A40"),
        xaxis=dict(
            showgrid=True,
            gridcolor="#E9ECEF",
            zeroline=False,
            showline=True,
            linecolor="#000000",          # marco negro
            mirror=True,
            ticks="outside",
            tickcolor="#000000",
        ),
        yaxis=dict(
            showgrid=True,
            gridcolor="#E9ECEF",
            zeroline=False,
            showline=True,
            linecolor="#000000",          # marco negro
            mirror=True,
            ticks="outside",
            tickcolor="#000000",
        ),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
        ),
        margin=dict(l=60, r=20, t=60, b=40),
    )

    return fig


In [40]:
def plot_chiller_by_hour_range(
    df_stats: pd.DataFrame,
    hour_col: str = "hour_range",
    title: str = "<b>Comportamiento por rango de hora en salida de chiller</b>",
) -> go.Figure:
    """
    Grafica ORP, Cloro y pH promedio por rango de hora,
    incluyendo la banda entre Q05 y Q95.
    """
    df_plot = df_stats.copy()
    df_plot[hour_col] = df_plot[hour_col].astype(str)
    x = df_plot[hour_col]

    fig = make_subplots(
        rows=3,
        cols=1,
        shared_xaxes=True,
        vertical_spacing=0.05,
        subplot_titles=[
            "ORP medio por rango de hora (mV)",          # fila 1
            "Cloro libre medio por rango de hora (ppm)",# fila 2
            "pH medio por rango de hora",               # fila 3
        ],
    )

    # --------- 1) ORP (fila 1) ----------
    fig.add_trace(
        go.Scatter(
            x=x,
            y=df_plot["orp_chiller_q05"],
            mode="lines",
            line=dict(width=0),
            hoverinfo="skip",
            showlegend=False,
        ),
        row=1,
        col=1,
    )
    fig.add_trace(
        go.Scatter(
            x=x,
            y=df_plot["orp_chiller_q95"],
            mode="lines",
            line=dict(width=0),
            fill="tonexty",
            fillcolor="rgba(163, 206, 208, 0.35)",
            hoverinfo="skip",
            name="Rango (Q05‚ÄìQ95)",
            showlegend=True,
        ),
        row=1,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=x,
            y=df_plot["orp_chiller_mean"],
            mode="lines+markers",
            name="ORP (mV)",
            line=dict(color=CORPORATE_COLORS[4], width=2),
            marker=dict(size=6),
        ),
        row=1,
        col=1,
    )

    # --------- 2) CLORO (fila 2) ----------
    fig.add_trace(
        go.Scatter(
            x=x,
            y=df_plot["cloro_chiller_q05"],
            mode="lines",
            line=dict(width=0),
            hoverinfo="skip",
            showlegend=False,
        ),
        row=2,
        col=1,
    )
    fig.add_trace(
        go.Scatter(
            x=x,
            y=df_plot["cloro_chiller_q95"],
            mode="lines",
            line=dict(width=0),
            fill="tonexty",
            fillcolor="rgba(163, 206, 208, 0.35)",
            hoverinfo="skip",
            showlegend=False,  # la banda ya est√° en la leyenda arriba
        ),
        row=2,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=x,
            y=df_plot["cloro_chiller_mean"],
            mode="lines+markers",
            name="Cloro libre (ppm)",
            line=dict(color=CORPORATE_COLORS[1], width=2),
            marker=dict(size=6),
        ),
        row=2,
        col=1,
    )

    # --------- 3) pH (fila 3) ----------
    fig.add_trace(
        go.Scatter(
            x=x,
            y=df_plot["ph_chiller_q05"],
            mode="lines",
            line=dict(width=0),
            hoverinfo="skip",
            showlegend=False,
        ),
        row=3,
        col=1,
    )
    fig.add_trace(
        go.Scatter(
            x=x,
            y=df_plot["ph_chiller_q95"],
            mode="lines",
            line=dict(width=0),
            fill="tonexty",
            fillcolor="rgba(163, 206, 208, 0.35)",
            hoverinfo="skip",
            showlegend=False,
        ),
        row=3,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=x,
            y=df_plot["ph_chiller_mean"],
            mode="lines+markers",
            name="pH",
            line=dict(color=CORPORATE_COLORS[2], width=2),
            marker=dict(size=6),
        ),
        row=3,
        col=1,
    )

    # Ejes
    fig.update_yaxes(title_text="ORP (mV)", row=1, col=1)
    fig.update_yaxes(title_text="Cloro (ppm)", row=2, col=1)
    fig.update_yaxes(title_text="pH", row=3, col=1)

    fig.update_xaxes(
        title_text="Rango de hora",
        type="category",
        categoryorder="array",
        categoryarray=df_plot[hour_col].tolist(),
        row=3,
        col=1,
    )

    # Estilo general
    fig.update_layout(
        title=title,
        hovermode="x unified",
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
        font=dict(color="black"),
    )

    return fig


In [24]:
import awswrangler as wr
import io
import re

def procesar_carpeta_dorado_s3(s3_path, s3_manager):
    """
    Procesa archivos Excel desde una ruta de S3 usando la l√≥gica de negocio 'Dorado'.
    """
    # 1. Listar archivos en S3 usando awswrangler
    print(f"Buscando archivos en: {s3_path}...")
    archivos = wr.s3.list_objects(path=s3_path, suffix=".xlsx", boto3_session=s3_manager.session)

    print(f"Encontrados {len(archivos)} archivos en S3.")

    lista_dfs = []

    for archivo_uri in archivos:
        # Extraer nombre del archivo para logs y columnas
        nombre_archivo = archivo_uri.split('/')[-1]

        if nombre_archivo.startswith("~$"):
            continue

        print(f"Procesando: {nombre_archivo}...")

        try:
            # 2. Leer el archivo desde S3 a memoria (BytesIO)
            # Parseamos bucket y key desde la URI s3://
            path_parts = archivo_uri.replace("s3://", "").split("/", 1)
            bucket = path_parts[0]
            key = path_parts[1]

            obj = s3_manager.s3_client.get_object(Bucket=bucket, Key=key)
            archivo_bytes = io.BytesIO(obj['Body'].read())

            # 3. Cargar en Pandas ExcelFile usando el objeto en memoria
            xls = pd.ExcelFile(archivo_bytes)

            # --- AQUI COMIENZA TU L√ìGICA ORIGINAL EXACTA ---
            for nombre_hoja in xls.sheet_names:
                df = pd.read_excel(xls, sheet_name=nombre_hoja)

                valores_col_a = df.iloc[:, 0].dropna().astype(str).tolist()
                fecha_encontrada = None
                lote_encontrado = None

                for valor in valores_col_a:
                    if re.search(r'\d{2}/\d{2}/\d{4}', valor) or re.search(r'\d{4}-\d{2}-\d{2}', valor):
                        fecha_encontrada = valor
                    elif "Fecha" not in valor and "Lote" not in valor:
                        lote_encontrado = valor
                
                if df.shape[1] > 1:
                    df = df.dropna(subset=[df.columns[1]])

                df['Fecha'] = fecha_encontrada
                if lote_encontrado:
                    df['Lote'] = lote_encontrado
                else:
                    df['Lote'] = None
                
                # Reordenamiento y limpieza columnas
                cols = ['Fecha', 'Lote'] + [c for c in df.columns if c not in ['Fecha', 'Lote']]
                df = df[cols]
                # Ajuste de √≠ndices seguro
                if len(df.columns) > 2:
                    df = df.drop(df.columns[2], axis=1)
                
                df['Archivo_Origen'] = nombre_archivo # Usamos el nombre extra√≠do
                lista_dfs.append(df)
            # --- FIN L√ìGICA ORIGINAL ---

        except Exception as e:
            print(f"Error leyendo {nombre_archivo}: {e}")

    if not lista_dfs:
        return "No se encontraron datos para procesar."

    df_final = pd.concat(lista_dfs, ignore_index=True)

    # Procesamiento final de columnas
    df_final['Fecha'] = df_final['Fecha'].astype(str)
    separacion = df_final['Fecha'].str.split(' ', n=1, expand=True)

    df_final['Fecha'] = separacion[0]
    if 1 in separacion.columns:
         df_final['Lote'] = df_final['Lote'].fillna(separacion[1])

    # Limpieza de Hora (Aseg√∫rate que 'normalizar_hora' est√© definida en tu scope)
    col_hora = [c for c in df_final.columns if "ora" in str(c).lower()]
    if col_hora:
        try:
            nombre_col_hora = col_hora[0]
            # Nota: Asumo que normalizar_hora existe en tu entorno global
            df_final['Hora_Limpia'] = df_final[nombre_col_hora].apply(normalizar_hora)
            
            df_final['Fecha_dt'] = pd.to_datetime(df_final['Fecha'], dayfirst=True, errors='coerce')
            df_final['Fecha_Str'] = df_final['Fecha_dt'].dt.strftime('%Y-%m-%d')
            df_final['Fecha_Hora'] = df_final['Fecha_Str'].astype(str) + ' ' + df_final['Hora_Limpia'].astype(str)
            df_final['Fecha_Hora'] = pd.to_datetime(df_final['Fecha_Hora'], errors='coerce')
            df_final = df_final.drop(columns=['Fecha_dt', 'Fecha_Str'])
        except NameError:
            print("Advertencia: La funci√≥n 'normalizar_hora' no est√° definida. Saltando limpieza de hora.")

    return df_final

In [32]:
ruta_archivos_s3 = f"s3://{s3.bucket_name}/raw/dorado/sensores/"
df_consolidado = procesar_carpeta_dorado_s3(ruta_archivos_s3, s3)
micro = s3.read_csv("raw/dorado/sensores/ARCHIVOS INTERNOS EL DORADO/microbiologia.csv")

Buscando archivos en: s3://galileo-c4e9a2f1/raw/dorado/sensores/...
Encontrados 11 archivos en S3.
Procesando: 10. OCTUBRE.xlsx...
Procesando: 11. Noviembre.xlsx...
Procesando: 3. Control de paraÃÅmetros PCC Marzo 25.xlsx...
Procesando: 4.Control de parametros PCC Abril 2025.xlsx...
Procesando: 5. Control de parametros Enero.xlsx...
Procesando: 5.Control de paraÃÅmetros PCC Mayo.xlsx...
Procesando: 6. Control de paraÃÅmetros Febrero.xlsx...
Procesando: 6.Control de paraÃÅmetros PCC Junio.xlsx...
Procesando: 7.Control de paraÃÅmetros PCC Julio.xlsx...
Procesando: 8. AGOSTO.xlsx...
Procesando: 9.SEPTIEMBRE.xlsx...




In [33]:

def normalizar_hora(valor):
    if pd.isna(valor) or valor == "": return None
    if isinstance(valor, datetime.time): return valor.strftime("%H:%M:%S")
    if isinstance(valor, datetime.datetime): return valor.time().strftime("%H:%M:%S")
    if isinstance(valor, str):
        t = valor.upper().strip().replace("::", ":").replace(";", ":").replace(".", "").replace("  ", " ")
        try:
            dt = pd.to_datetime(t, errors='coerce')
            if pd.notna(dt): return dt.strftime("%H:%M:%S")
        except: pass
    return valor

In [34]:
cls = [
'Lote','Fecha_Hora', 'Hora_Limpia',
'Residual de cloro Tanque   Chiller 1,0-5,0 ppm',
'ph chiller Tanque Chiller', 'Medici√≥n ORP Mv Tanque Chiller',
 'Residual de cloro Prechiller 1,0-5,0 ppm',
'ph Prechiller', 'Medici√≥n ORP Mv Prechiller',
'ppm Cloro Menudencia  30-40 ppm',
'Medici√≥n ORP Mv Menudenicas ',
'ppm cloro Agua Duchado final 30-40 ppm',
'Medici√≥n Maquina lavadora de canales 30-40ppm',
'ppm Cloro Red Potable 1,0-2,0 ppm', 'Realiz√≥', 'Granja',
'Archivo_Origen',

]

rename = {
'Lote': 'lot',
'Fecha_Hora': 'datetime',
'Hora_Limpia': 'hour',
'Residual de cloro Tanque   Chiller 1,0-5,0 ppm': 'cloro_chiller',
'ph chiller Tanque Chiller': 'ph_chiller',
'Medici√≥n ORP Mv Tanque Chiller': 'orp_chiller',
'Residual de cloro Prechiller 1,0-5,0 ppm': 'cloro_prechiller',
'ph Prechiller': 'ph_prechiller',
'Medici√≥n ORP Mv Prechiller': 'orp_prechiller',
'ppm Cloro Menudencia  30-40 ppm': 'cloro_menudencia',
'Medici√≥n ORP Mv Menudenicas ': 'orp_menudencia',
'ppm cloro Agua Duchado final 30-40 ppm': 'cloro_agua_ducha_final',
'Medici√≥n Maquina lavadora de canales 30-40ppm': 'cloro_lavadora_canales',
'ppm Cloro Red Potable 1,0-2,0 ppm': 'cloro_red_potable',
'Realiz√≥': 'operario',
'Granja': 'granja',
'Archivo_Origen': 'source'
}
df_sensor = df_consolidado[cls].rename(columns=rename)
cls_num = [
'cloro_chiller',
'orp_chiller',
'ph_chiller',
'cloro_prechiller',
'ph_prechiller',
'orp_prechiller',
'cloro_menudencia',
'orp_menudencia',
'cloro_agua_ducha_final',
'cloro_lavadora_canales',
'cloro_red_potable',
]

for cl in cls_num:
    df_sensor[cl] = pd.to_numeric(df_sensor[cl], errors='coerce')


df_sensor = df_sensor[df_sensor['datetime'].notnull()]

df_sensor.sort_values(by=['datetime'], ascending=True, inplace=True)
df_sensor["date"] = pd.to_datetime(df_sensor['datetime'], errors='coerce').dt.date
df_sensor["month"] = pd.to_datetime(df_sensor['datetime'], errors='coerce').dt.month
df_sensor[cls_num].describe()

Unnamed: 0,cloro_chiller,orp_chiller,ph_chiller,cloro_prechiller,ph_prechiller,orp_prechiller,cloro_menudencia,orp_menudencia,cloro_agua_ducha_final,cloro_lavadora_canales,cloro_red_potable
count,4260.0,4256.0,4253.0,1371.0,1364.0,1334.0,1296.0,1301.0,1292.0,1209.0,1380.0
mean,4.3298,538.623355,5.942152,6.493795,8.555784,427.511244,37.822353,343.115296,339.834335,27.903271,1.677246
std,0.422952,180.215456,0.176595,20.132414,10.436013,181.541927,2.168892,187.170152,5416.621035,12.365313,0.068473
min,2.0,24.0,5.383,1.826087,4.51,210.0,2.1,0.4,31.48,5.9,1.4
25%,4.1,450.0,5.86,4.0,7.4,320.0,37.44,230.0,37.44,11.06,1.7
50%,4.4,498.0,5.91,4.2,7.45,359.0,38.29,288.0,38.29,36.16,1.7
75%,4.6,561.0,5.98,4.4,7.5,453.0,38.29,396.0,38.71,37.1,1.7
max,13.0,8763.0,7.58,227.799,120.0,2936.0,39.99,842.0,97500.0,49.107692,1.8


In [36]:
#TODO valores permitidos
ORP_MAX = 1_000
PH = 9
CLORO_LIBRE = 7


In [39]:
df_sensor

Unnamed: 0,lot,datetime,hour,cloro_chiller,ph_chiller,orp_chiller,cloro_prechiller,ph_prechiller,orp_prechiller,cloro_menudencia,orp_menudencia,cloro_agua_ducha_final,cloro_lavadora_canales,cloro_red_potable,operario,granja,source,date,month
2091,0022025,2025-01-02 00:08:00,00:08:00,4.4,5.79,826.0,,,,,,,,,,,5. Control de parametros Enero.xlsx,2025-01-02,1
2092,0022025,2025-01-02 01:15:00,01:15:00,3.5,5.90,503.0,,,,,,,,,,,5. Control de parametros Enero.xlsx,2025-01-02,1
2093,0022025,2025-01-02 02:06:00,02:06:00,4.0,5.86,516.0,,,,,,,,,,,5. Control de parametros Enero.xlsx,2025-01-02,1
2094,0022025,2025-01-02 03:01:00,03:01:00,4.3,5.92,493.0,,,,,,,,,,,5. Control de parametros Enero.xlsx,2025-01-02,1
2095,0022025,2025-01-02 04:05:00,04:05:00,4.1,6.00,451.0,,,,,,,,,,,5. Control de parametros Enero.xlsx,2025-01-02,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1173,[ ] ppm cloro,2025-11-13 11:07:00,11:07:00,3.9,5.80,461.0,4.0,7.41,321.0,38.29,210.0,38.29,10.21,1.7,E.Araque,,11. Noviembre.xlsx,2025-11-13,11
1174,[ ] ppm cloro,2025-11-13 11:48:00,11:48:00,4.1,5.70,430.0,4.0,7.48,316.0,38.71,186.0,38.71,10.64,1.7,Verif.U.Cardozo,,11. Noviembre.xlsx,2025-11-13,11
1175,[ ] ppm cloro,2025-11-13 12:02:00,12:02:00,4.1,5.87,449.0,4.2,7.44,310.0,37.44,163.0,37.86,10.64,1.7,E.Araque,,11. Noviembre.xlsx,2025-11-13,11
1176,[ ] ppm cloro,2025-11-13 13:05:00,13:05:00,4.0,5.82,440.0,4.1,7.40,274.0,37.86,150.0,38.29,11.49,1.7,E.Araque,,11. Noviembre.xlsx,2025-11-13,11


In [41]:
df_sensor.loc[df_sensor["orp_chiller"]>ORP_MAX, "orp_chiller"] = df_sensor["orp_chiller"].median()
df_sensor.loc[df_sensor["cloro_chiller"]>CLORO_LIBRE, "cloro_chiller"] = df_sensor["cloro_chiller"].median()
df_sensor.loc[df_sensor["ph_chiller"]>PH, "ph_chiller"] = df_sensor["ph_chiller"].median()

df_sensor.loc[df_sensor["orp_prechiller"]>ORP_MAX, "orp_prechiller"] = df_sensor["orp_prechiller"].median()
df_sensor.loc[df_sensor["cloro_prechiller"]>CLORO_LIBRE, "cloro_prechiller"] = df_sensor["cloro_prechiller"].median()
df_sensor.loc[df_sensor["ph_prechiller"]>PH, "ph_prechiller"] = df_sensor["ph_prechiller"].median()

In [42]:
df_sensor

Unnamed: 0,lot,datetime,hour,cloro_chiller,ph_chiller,orp_chiller,cloro_prechiller,ph_prechiller,orp_prechiller,cloro_menudencia,orp_menudencia,cloro_agua_ducha_final,cloro_lavadora_canales,cloro_red_potable,operario,granja,source,date,month
2091,0022025,2025-01-02 00:08:00,00:08:00,4.4,5.79,826.0,,,,,,,,,,,5. Control de parametros Enero.xlsx,2025-01-02,1
2092,0022025,2025-01-02 01:15:00,01:15:00,3.5,5.90,503.0,,,,,,,,,,,5. Control de parametros Enero.xlsx,2025-01-02,1
2093,0022025,2025-01-02 02:06:00,02:06:00,4.0,5.86,516.0,,,,,,,,,,,5. Control de parametros Enero.xlsx,2025-01-02,1
2094,0022025,2025-01-02 03:01:00,03:01:00,4.3,5.92,493.0,,,,,,,,,,,5. Control de parametros Enero.xlsx,2025-01-02,1
2095,0022025,2025-01-02 04:05:00,04:05:00,4.1,6.00,451.0,,,,,,,,,,,5. Control de parametros Enero.xlsx,2025-01-02,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1173,[ ] ppm cloro,2025-11-13 11:07:00,11:07:00,3.9,5.80,461.0,4.0,7.41,321.0,38.29,210.0,38.29,10.21,1.7,E.Araque,,11. Noviembre.xlsx,2025-11-13,11
1174,[ ] ppm cloro,2025-11-13 11:48:00,11:48:00,4.1,5.70,430.0,4.0,7.48,316.0,38.71,186.0,38.71,10.64,1.7,Verif.U.Cardozo,,11. Noviembre.xlsx,2025-11-13,11
1175,[ ] ppm cloro,2025-11-13 12:02:00,12:02:00,4.1,5.87,449.0,4.2,7.44,310.0,37.44,163.0,37.86,10.64,1.7,E.Araque,,11. Noviembre.xlsx,2025-11-13,11
1176,[ ] ppm cloro,2025-11-13 13:05:00,13:05:00,4.0,5.82,440.0,4.1,7.40,274.0,37.86,150.0,38.29,11.49,1.7,E.Araque,,11. Noviembre.xlsx,2025-11-13,11


In [43]:

hour_num = pd.to_datetime(df_sensor["hour"], format="%H:%M:%S", errors="coerce").dt.hour

# 2) Crear rangos de 1 hora (0-1, 1-2, ..., 23-24)
bins = range(0, 25)  # 0‚Äì1, 1‚Äì2, ..., 23‚Äì24
labels = [f"{h:02d}:00-{h+1:02d}:00" for h in range(0, 24)]

df_sensor["hour_range"] = pd.cut(
    hour_num,
    bins=bins,
    right=False,      # [h, h+1)
    labels=labels
)
hr = build_stats(df_sensor, group_col="hour_range", value_cols=["cloro_chiller", "orp_chiller", "ph_chiller"])
hr

Unnamed: 0,hour_range,cloro_chiller_min,cloro_chiller_q05,cloro_chiller_mean,cloro_chiller_median,cloro_chiller_std,cloro_chiller_q95,cloro_chiller_max,orp_chiller_min,orp_chiller_q05,...,ph_chiller_min,ph_chiller_q05,ph_chiller_mean,ph_chiller_median,ph_chiller_std,ph_chiller_q95,ph_chiller_max,cloro_chiller_cv,orp_chiller_cv,ph_chiller_cv
0,00:00-01:00,2.4,3.4,4.151852,4.2,0.487384,4.8,4.9,432.0,717.65,...,5.79,5.841,6.112901,6.08,0.241112,6.38,7.53,0.11739,0.077367,0.039443
1,01:00-02:00,3.4,3.7,4.301619,4.4,0.351879,4.8,4.9,403.0,453.4,...,5.77,5.82,5.982834,5.96,0.143861,6.19,7.39,0.081801,0.167379,0.024046
2,02:00-03:00,2.8,3.8,4.327977,4.4,0.312372,4.8,4.9,385.0,430.4,...,5.383,5.8,5.942416,5.92,0.160967,6.109,7.42,0.072175,0.18202,0.027088
3,03:00-04:00,3.2,3.8,4.35,4.4,0.332997,4.8,5.0,380.0,439.25,...,5.43,5.8025,5.924624,5.91,0.137572,6.0575,7.4,0.076551,0.17661,0.02322
4,04:00-05:00,2.6,3.8,4.325993,4.4,0.337794,4.8,5.0,362.0,429.0,...,5.73,5.8,5.91296,5.9,0.140581,6.05,7.44,0.078085,0.160609,0.023775
5,05:00-06:00,2.8,3.7,4.315326,4.4,0.356364,4.8,4.9,24.0,420.0,...,5.77,5.8,5.925402,5.9,0.157322,6.06,7.38,0.082581,0.154621,0.02655
6,06:00-07:00,3.1,3.7,4.33668,4.4,0.319287,4.8,4.8,386.0,415.0,...,5.5,5.8,5.918224,5.9,0.145288,6.08,7.4,0.073625,0.120071,0.024549
7,07:00-08:00,3.3,3.8,4.357318,4.4,0.310806,4.8,4.8,271.0,401.0,...,5.55,5.8,5.925441,5.9,0.174244,6.08,7.5,0.07133,0.107818,0.029406
8,08:00-09:00,2.0,3.8,4.325856,4.4,0.326467,4.7,5.0,270.0,412.1,...,5.78,5.801,5.937681,5.91,0.202483,6.088,7.58,0.075469,0.109824,0.034101
9,09:00-10:00,3.1,3.8,4.33254,4.4,0.303191,4.7,4.9,305.0,410.55,...,5.6,5.8,5.930833,5.91,0.171837,6.0745,7.47,0.06998,0.107315,0.028974


In [45]:
stats_hour = build_stats(
    df_sensor,
    group_col="hour_range",
    value_cols=[ "orp_chiller", "cloro_chiller","ph_chiller"]
)

fig = plot_chiller_by_hour_range(stats_hour)
fig.show()
file = f"comportamiento_rango_hora_global.html"
s3.save_plotly_html(fig, file)


In [46]:
stats_hour_ = build_stats(
        df_sensor[df_sensor["month"].isin([10,11])],
        group_col="hour_range",
        value_cols=["cloro_chiller", "orp_chiller", "ph_chiller"],
    )

stats_hour_[(stats_hour_["cloro_chiller_mean"]>4.4)]

Unnamed: 0,hour_range,cloro_chiller_min,cloro_chiller_q05,cloro_chiller_mean,cloro_chiller_median,cloro_chiller_std,cloro_chiller_q95,cloro_chiller_max,orp_chiller_min,orp_chiller_q05,...,ph_chiller_min,ph_chiller_q05,ph_chiller_mean,ph_chiller_median,ph_chiller_std,ph_chiller_q95,ph_chiller_max,cloro_chiller_cv,orp_chiller_cv,ph_chiller_cv
6,06:00-07:00,3.7,3.86,4.427273,4.5,0.268413,4.7,4.8,390.0,400.0,...,5.78,5.81,5.894242,5.89,0.058632,6.004,6.05,0.060627,0.078326,0.009947
17,17:00-18:00,4.0,4.105,4.4125,4.4,0.229518,4.665,4.7,406.0,414.4,...,5.85,5.8535,5.8875,5.89,0.026049,5.9165,5.92,0.052015,0.036868,0.004425


In [47]:
stats_hour_[(stats_hour_["ph_chiller_mean"]<5.9)]

Unnamed: 0,hour_range,cloro_chiller_min,cloro_chiller_q05,cloro_chiller_mean,cloro_chiller_median,cloro_chiller_std,cloro_chiller_q95,cloro_chiller_max,orp_chiller_min,orp_chiller_q05,...,ph_chiller_min,ph_chiller_q05,ph_chiller_mean,ph_chiller_median,ph_chiller_std,ph_chiller_q95,ph_chiller_max,cloro_chiller_cv,orp_chiller_cv,ph_chiller_cv
4,04:00-05:00,2.6,3.77,4.24359,4.3,0.400523,4.7,4.8,396.0,400.0,...,5.8,5.82,5.893333,5.9,0.046698,5.961,6.02,0.094383,0.119374,0.007924
6,06:00-07:00,3.7,3.86,4.427273,4.5,0.268413,4.7,4.8,390.0,400.0,...,5.78,5.81,5.894242,5.89,0.058632,6.004,6.05,0.060627,0.078326,0.009947
7,07:00-08:00,3.8,3.9,4.357576,4.4,0.306217,4.7,4.8,389.0,400.0,...,5.55,5.764,5.888485,5.89,0.093545,6.038,6.05,0.070272,0.065837,0.015886
13,13:00-14:00,3.9,4.0,4.246154,4.2,0.19643,4.5,4.7,372.0,377.75,...,5.81,5.82,5.895,5.895,0.04901,5.965,6.01,0.046261,0.056331,0.008314
16,16:00-17:00,4.0,4.11,4.375,4.4,0.186474,4.6,4.6,387.0,399.65,...,5.8,5.83,5.896364,5.91,0.041779,5.94,5.94,0.042623,0.048909,0.007085
17,17:00-18:00,4.0,4.105,4.4125,4.4,0.229518,4.665,4.7,406.0,414.4,...,5.85,5.8535,5.8875,5.89,0.026049,5.9165,5.92,0.052015,0.036868,0.004425


In [48]:
for mt in [10,11]:

    stats_hour_ = build_stats(
        df_sensor[df_sensor["month"] == mt],
        group_col="hour_range",
        value_cols=["cloro_chiller", "orp_chiller", "ph_chiller"],
    )

    fig = plot_chiller_by_hour_range(
        stats_hour_,
        title = f"<b>Comportamiento por rango de hora en salida de chiller mes {mt} </b>")
    fig.show()
    file = f"comportamiento_rango_hora_{mt}.html"
    s3.save_plotly_html(fig, file)

In [52]:

micro_chiller = micro.loc[micro["etapa"] == "salida chiller"].copy()
micro_chiller["date"] = pd.to_datetime(
    micro_chiller["date"], errors="coerce"
).dt.date

# 2) Definir grupo: Salmonella vs otros
micro_chiller["grupo"] = np.where(
    micro_chiller["microorganismo"] == "Salmonella",
    "salmo",
    "campy",
)

# 3) Un solo groupby para ambos grupos
agg = (
    micro_chiller
    .groupby(["date", "grupo"])
    .agg(
        n_analysis=("date", "count"),
        positive=("have_micro", "sum"),
        result=("result", "sum"),
        #ph=("ph", "mean"),
        #orp=("orp", "mean"),
        #cloro=("cloro", "mean"),
    )
    .reset_index()
)

agg["prev"] = agg["positive"] / agg["n_analysis"]
agg["log_result"] = np.log10(agg["result"] + 1)

wide = (
    agg
    .set_index(["date", "grupo"])
    .unstack("grupo")
)

wide.columns = [f"{col}_{grp}" for col, grp in wide.columns]
wide = wide.reset_index()
wide


Unnamed: 0,date,n_analysis_campy,n_analysis_salmo,positive_campy,positive_salmo,result_campy,result_salmo,prev_campy,prev_salmo,log_result_campy,log_result_salmo
0,2025-01-02,,1.0,,0.0,,0.0,,0.0,,0.000000
1,2025-01-03,,1.0,,0.0,,0.0,,0.0,,0.000000
2,2025-01-04,,1.0,,0.0,,0.0,,0.0,,0.000000
3,2025-01-06,,1.0,,1.0,,28.0,,1.0,,1.462398
4,2025-01-07,,1.0,,1.0,,642255.0,,1.0,,5.807708
...,...,...,...,...,...,...,...,...,...,...,...
227,2025-10-04,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
228,2025-10-06,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
229,2025-10-07,1.0,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.301030,0.000000
230,2025-10-08,1.0,3.0,1.0,0.0,1409.0,0.0,1.0,0.0,3.149219,0.000000


In [53]:
stats_chiller = build_stats(df_sensor, group_col="date", value_cols=["cloro_chiller", "orp_chiller", "ph_chiller"])
stats_chiller = pd.merge(stats_chiller, wide, on='date', how='left')

stats_chiller["salmo_pos"] = (stats_chiller["positive_salmo"] > 0).astype(int)
stats_chiller["campy_pos"] = (stats_chiller["positive_campy"] > 0).astype(int)  # 1 si hubo Salmonella ese d√≠a


In [54]:
stats_chiller.groupby("salmo_pos")["cloro_chiller_mean"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
salmo_pos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,175.0,4.317784,0.165096,3.669231,4.216324,4.321053,4.436731,4.694118
1,70.0,4.33945,0.17919,3.852632,4.257843,4.365686,4.454017,4.664706


In [55]:
metricas = {
    "cloro_chiller_mean": "Cloro chiller",
    "orp_chiller_mean": "ORP chiller",
    "ph_chiller_mean": "pH chiller",
}

for col, label in metricas.items():
    fig = plot_salmo_campy_by_metric(
        stats_chiller,
        metric_col=col,
        metric_label=label,
        salmo_col="salmo_pos",
        campy_col="campy_pos",
        n_bins=10,
         width=1300,
        height=400,
    )
    fig.show()
    file = f"prev_{col}.html"
    s3.save_plotly_html(fig, file)


In [56]:
stats_chiller[["orp_chiller_mean", "orp_chiller_median", "ph_chiller_mean", "cloro_chiller_mean",  "n_analysis_salmo", "n_analysis_campy", "positive_campy", "positive_salmo", "prev_campy", "prev_salmo","log_result_campy", "log_result_salmo"]]

Unnamed: 0,orp_chiller_mean,orp_chiller_median,ph_chiller_mean,cloro_chiller_mean,n_analysis_salmo,n_analysis_campy,positive_campy,positive_salmo,prev_campy,prev_salmo,log_result_campy,log_result_salmo
0,548.555556,519.5,5.924444,4.261111,1.0,,,0.0,,0.0,,0.000000
1,556.722222,537.5,5.934444,4.522222,1.0,,,0.0,,0.0,,0.000000
2,577.187500,552.5,5.924375,4.456250,1.0,,,0.0,,0.0,,0.000000
3,564.944444,511.0,5.984444,4.461111,1.0,,,1.0,,1.0,,1.462398
4,592.388889,571.0,5.913889,4.205556,1.0,,,1.0,,1.0,,5.807708
...,...,...,...,...,...,...,...,...,...,...,...,...
240,483.000000,441.0,5.916471,4.152941,,,,,,,,
241,539.687500,475.5,5.903125,4.231250,,,,,,,,
242,559.250000,517.5,5.910000,4.418750,,,,,,,,
243,544.235294,491.0,5.941176,4.370588,,,,,,,,


In [57]:


THRESH = 0.10        # 10 %
BIN_WIDTH = 30       # ancho de los rangos de ORP (aj√∫stalo)

# 1) Definir rangos de ORP seg√∫n tus datos
orp_min = stats_chiller["orp_chiller_mean"].min()
orp_max = stats_chiller["orp_chiller_mean"].max()

bins = np.arange(np.floor(orp_min / BIN_WIDTH) * BIN_WIDTH,
                 np.ceil(orp_max / BIN_WIDTH) * BIN_WIDTH + BIN_WIDTH,
                 BIN_WIDTH)

stats_chiller["orp_bin"] = pd.cut(
    stats_chiller["orp_chiller_mean"],
    bins=bins,
    include_lowest=True
)

# 2) Agrupar por rango de ORP
by_orp = (
    stats_chiller
    .groupby("orp_bin")
    .agg(
        n=("prev_campy", "size"),
        prev_campy_mean=("prev_campy", "mean"),
        prev_salmo_mean=("prev_salmo", "mean"),
        ph_mean=("ph_chiller_mean", "mean"),
        ph_median=("ph_chiller_mean", "median"),
        cloro_mean=("cloro_chiller_mean", "mean"),
        cloro_median=("cloro_chiller_mean", "median"),
        orp_mean=("orp_chiller_mean", "mean"),
        orp_median=("orp_chiller_median", "median"),
        log_result_campy=('log_result_campy', "mean"),
        log_result_salmo =('log_result_salmo', "mean"),
    )
)

# 3) Rangos ‚Äú√≥ptimos‚Äù: ambas prevalencias < 10 %
optimal_ranges = by_orp[
    (by_orp["prev_campy_mean"] < THRESH) &
    (by_orp["prev_salmo_mean"] < THRESH)
].copy()
optimal_ranges




Unnamed: 0_level_0,n,prev_campy_mean,prev_salmo_mean,ph_mean,ph_median,cloro_mean,cloro_median,orp_mean,orp_median,log_result_campy,log_result_salmo
orp_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"(660.0, 690.0]",1,0.0,0.0,5.912222,5.912222,4.655556,4.655556,669.666667,618.0,0.0,0.0


In [59]:

# --- Par√°metros ---
BIN_WIDTH = 30

# 1) Definir los bins de ORP
orp_min = stats_chiller["orp_chiller_mean"].min()
orp_max = stats_chiller["orp_chiller_mean"].max()

bins = np.arange(
    np.floor(orp_min / BIN_WIDTH) * BIN_WIDTH,
    np.ceil(orp_max / BIN_WIDTH) * BIN_WIDTH + BIN_WIDTH,
    BIN_WIDTH
)

stats_chiller["orp_bin"] = pd.cut(
    stats_chiller["orp_chiller_mean"],
    bins=bins,
    include_lowest=True
)

# 2) Agrupar por rango de ORP
by_bin = (
    stats_chiller
    .groupby("orp_bin")
    .agg(
        # totales para calcular prevalencia
        n_salmo=("n_analysis_salmo", "sum"),
        pos_salmo=("positive_salmo", "sum"),
        n_campy=("n_analysis_campy", "sum"),
        pos_campy=("positive_campy", "sum"),
        # valores t√≠picos de proceso
        orp_mean=("orp_chiller_mean", "mean"),
        ph_mean=("ph_chiller_mean", "mean"),
        cloro_mean=("cloro_chiller_mean", "mean"),
        log_result_campy=('log_result_campy', "mean"),
        log_result_salmo =('log_result_salmo', "mean"),
    ).reset_index()
)

# 3) Prevalencias por rango (cuidando divisi√≥n por cero)
by_bin["prev_salmo"] = np.where(
    by_bin["n_salmo"] > 0,
    by_bin["pos_salmo"] / by_bin["n_salmo"],
    np.nan
)

by_bin["prev_campy"] = np.where(
    by_bin["n_campy"] > 0,
    by_bin["pos_campy"] / by_bin["n_campy"],
    np.nan
)

bins_ordenados_salmo = by_bin.sort_values(["prev_salmo", "prev_campy"])
prev = ["prev_salmo", "prev_campy"]
for pr in prev:
    bins_ordenados_salmo[pr] = bins_ordenados_salmo[pr] * 100
    # Cambiar 0 por "<10" y otros n√∫meros por "<n√∫mero"
    bins_ordenados_salmo[pr] = bins_ordenados_salmo[pr].apply(
        lambda x: "< 10" if x == 0 else f"< {x:.0f}"
    )

cls_help = [
    'log_result_campy', 'log_result_salmo', 'n_salmo', 'n_campy'
]
bins_ordenados_salmo = bins_ordenados_salmo[["orp_bin", "orp_mean", "cloro_mean", "ph_mean", "prev_salmo", "prev_campy"]].round(2) #+cls_help
bins_ordenados_salmo = bins_ordenados_salmo.rename(columns={
    "orp_bin": "Rango HACCP",
    "orp_mean": "ORP Promedio (mV)",
    "ph_mean": "PH Promedio",
    "cloro_mean": "Cloro Promedio (ppm)",
    "prev_salmo": "Prevalencia Salmonella Estimada (%)",
    "prev_campy": "Prevalencia Campylobacter Estimada (%)",
})
s3.save_dataframe(bins_ordenados_salmo, "orp_bin.csv")
bins_ordenados_salmo


Unnamed: 0,Rango HACCP,ORP Promedio (mV),Cloro Promedio (ppm),PH Promedio,Prevalencia Salmonella Estimada (%),Prevalencia Campylobacter Estimada (%)
8,"(660.0, 690.0]",669.67,4.66,5.91,< 10,< 10
2,"(480.0, 510.0]",498.67,4.28,5.93,< 7,< 54
3,"(510.0, 540.0]",525.91,4.31,5.94,< 9,< 53
1,"(450.0, 480.0]",471.11,4.17,6.02,< 12,< 60
4,"(540.0, 570.0]",552.93,4.33,5.92,< 14,< 49
5,"(570.0, 600.0]",581.44,4.4,5.94,< 28,< 55
7,"(630.0, 660.0]",637.4,4.51,6.01,< 33,< 100
6,"(600.0, 630.0]",611.04,4.46,5.92,< 37,< 29
0,"(419.999, 450.0]",440.11,4.37,7.45,< 50,< 100


In [60]:
bins_ordenados_salmo["n_salmo"].sum(), bins_ordenados_salmo["n_campy"].sum()

KeyError: 'n_salmo'

In [61]:
micro["month"] = pd.to_datetime(micro["date"]).dt.month
micro[micro["etapa"]=="salida chiller"].groupby(["month", "microorganismo"]).agg(count=("microorganismo", "count"))

Unnamed: 0_level_0,Unnamed: 1_level_0,count
month,microorganismo,Unnamed: 2_level_1
1,Campylobacter,41
1,Salmonella,60
2,Campylobacter,26
2,Salmonella,72
3,Campylobacter,26
3,Salmonella,75
4,Campylobacter,24
4,Salmonella,72
5,Campylobacter,26
5,Salmonella,78


In [68]:
fig = plot_validacion_variable_diaria(
        df_sensor,
        date_col="date",
        value_col="orp_chiller",
        central="mean",
        p_low=0.02,
        p_high=0.98,
       #(660.0, 690.0]
        rango_objetivo=(660, 690),
        title="<b>Comportamiento diario del ORP en salida de chiller<b>",
        yaxis_title="ORP (mV)",
)
fig.show()
file = f"comportamiento_orp_diario.html"
s3.save_plotly_html(fig, file)

In [69]:
fig = plot_validacion_variable_diaria(
        df_sensor,
        date_col="date",
        value_col="ph_chiller",
        central="mean",
        p_low=0.02,
        p_high=0.98,
        rango_objetivo=(5.85, 5.91),
        title="<b>Comportamiento diario del pH en salida de chiller<b>",
        yaxis_title="pH",
)
fig.show()
file = "comportamiento_ph_diario.html"
s3.save_plotly_html(fig, file)

In [63]:
fig = plot_validacion_variable_diaria(
        df_sensor,
        date_col="date",
        value_col="cloro_chiller",
        central="mean",
        p_low=0.02,
        p_high=0.98,
        rango_objetivo=(4.65, 4.71),
        title="<b>Comportamiento diario del Cloro en salida de chiller<b>",
        yaxis_title="Cloro (ppm)",
)
fig.show()
file = f"comportamiento_cloro_diario.html"
s3.save_plotly_html(fig, file)


In [64]:


# Paleta corporativa de ejemplo (ajusta si ya la tienes definida)
CORPORATE_COLORS = [
    "#1A494C",  # 0
    "#17877D",  # 1
    "#94AF92",  # 2
    "#F6B27A",  # 3
    "#F18F01",  # 4
    "#E4572E",  # 5
    "#6C757D",  # 6
    "#343A40",  # 7
    "#A3CED0",  # 8
]


def plot_metrics_distribution_grid(
    df: pd.DataFrame,
    *,
    metric_cols: list[str],
    metric_labels: list[str] | None = None,
    nbins_list: list[int] | None = None,
    title: str = "<b>Distribuci√≥n de las medidas en la salida del chiller<b>",
    show_median: bool = False,
) -> go.Figure:
    """
    Crea un subplot 1xN con la distribuci√≥n (histograma) de varias m√©tricas,
    sin fondo, con marcos y con la media pintada en cada panel.
    Adem√°s:
      - En ORP: franja vertical 660‚Äì690 mV + flecha "Rango √≥ptimo".
      - En Cloro: l√≠nea vertical en 4.66 ppm.
      - En pH: l√≠nea vertical en 5.91.
    """
    n = len(metric_cols)
    if metric_labels is None:
        metric_labels = metric_cols

    if nbins_list is None:
        nbins_list = [30] * n

    fig = make_subplots(
        rows=1,
        cols=n,
        subplot_titles=metric_labels,
        shared_yaxes=True,
    )

    for i, (col, label, nbins) in enumerate(zip(metric_cols, metric_labels, nbins_list), start=1):
        data = df[col].dropna()
        mean_val = data.mean()
        med_val = data.median()

        # --- Histograma ---
        fig.add_trace(
            go.Histogram(
                x=data,
                nbinsx=nbins,
                name=label,
                marker=dict(color=CORPORATE_COLORS[1]),
                opacity=0.85,
                showlegend=False,
                  histnorm="percent"

            ),
            row=1,
            col=i,
        )

        # --- L√≠nea de media ---
        fig.add_vline(
            x=mean_val,
            line_color=CORPORATE_COLORS[4],
            line_width=3,
            line_dash="solid",
            row=1,
            col=i,
        )

        # --- Overlays espec√≠ficos por m√©trica ---
        if col == "orp_chiller":
            # Franja vertical gris entre 660 y 690 mV
            fig.add_vrect(
                x0=660,
                x1=690,
                fillcolor="rgba(128,128,128,0.25)",  # gris claro transl√∫cido
                line_width=0,
                row=1,
                col=i,
                layer="below",  # detr√°s de las barras
            )

            # Flecha gris oscuro con texto "Rango √≥ptimo"
            fig.add_annotation(
                x=(660 + 690) / 2,  # centro del rango
                xref=f"x{i}",
                y=1.02,
                yref="paper",
                text="Rango √≥ptimo",
                showarrow=True,
                arrowhead=2,
                arrowsize=1,
                arrowwidth=2,
                arrowcolor="#343A40",  # gris oscuro
                font=dict(color="#343A40", size=14),
                ax=0,    # sin desplazamiento horizontal
                ay=-30,   # flecha hacia abajo
            )

        if col == "cloro_chiller":
            # L√≠nea vertical en 4.66 ppm
            fig.add_vline(
                x=4.66,
                line_color="#343A40",
                line_width=3,
                line_dash="dash",
                row=1,
                col=i,
            )

        if col == "ph_chiller":
            # L√≠nea vertical en pH 5.91
            fig.add_vline(
                x=5.91,
                line_color="#343A40",
                line_width=3,
                line_dash="dash",
                row=1,
                col=i,
            )

        # --- Anotaci√≥n de la mediana ---
        if show_median:
            fig.add_annotation(
                x=mean_val + mean_val / 4,
                y=0.1,
                xref=f"x{i}",
                yref="paper",
                text=f"Mediana: {med_val:.2f}",
                showarrow=False,
                font=dict(color=CORPORATE_COLORS[4], size=15),
                align="center",
            )

        fig.update_xaxes(title_text=label, row=1, col=i)

    # Eje Y
    fig.update_yaxes(title_text="Porcentaje", row=1, col=1)

    # Marcos + labels y ticks en negro
    fig.update_xaxes(
        showline=True,
        linewidth=1,
        linecolor="black",
        mirror=True,
        showgrid=False,
        tickfont=dict(color="black"),
        title_font=dict(color="black"),
    )
    fig.update_yaxes(
        showline=True,
        linewidth=1,
        linecolor="black",
        mirror=True,
        showgrid=False,
        tickfont=dict(color="black"),
        title_font=dict(color="black"),
    )

    # Layout general
    fig.update_layout(
        title=title,
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
        bargap=0.05,
        font=dict(color="black"),
    )

    return fig


In [65]:
fig = plot_metrics_distribution_grid(
       df_sensor,
    metric_cols=["orp_chiller", "cloro_chiller", "ph_chiller"],
    metric_labels=[ "ORP (mV)","Cloro Libre (ppm)", "pH"],
    nbins_list=[40, 40, 100],
)
fig.show()
file = f"histogramas_fq.html"
s3.save_plotly_html(fig, file)