In [1]:
import pandas as pd
import numpy as np
import sys
import os
sys.path.append(os.path.abspath(".."))
from core.viz import plot_line, create_subplot_grid, plot_bar
from core.s3 import S3AssetManager


In [2]:

def parse_cfu_series(s: pd.Series,
                     *,
                     lt_as_nan: bool = True,   # "<L" -> missing
                     lt_as_half: bool = False  # "<L" -> L/2 (when True)
                     ) -> pd.Series:
    """
    Parse CFU counts written as text into a nullable Float64 Series.

    Accepted forms include "277 x 10^2", "1X10^2", "10^2", "10²", "123",
    "<10", "<10^2", "< 1x10^3", "ND", "NA", etc.

    Parameters
    ----------
    s : pd.Series
        Raw values; every element is converted with ``astype(str)`` first.
    lt_as_nan : bool
        Only consulted when ``lt_as_half`` is False. True leaves censored
        "<L" values missing; False replaces them with the limit L itself.
    lt_as_half : bool
        When True, censored "<L" values become L/2 (takes precedence over
        ``lt_as_nan``).

    Returns
    -------
    pd.Series
        Float64 (nullable) values on the index of ``s``. Unparseable text and
        the markers ND / NA / NDC / N/A come back as missing.

    Notes
    -----
    * "," is always read as a decimal separator ("2,5" -> 2.5).
    * Superscript exponents are rewritten as "^digits" ("10²" -> "10^2").
    * A bare power of ten must not have a digit directly after "10", so plain
      integers such as "102" or "1000" are parsed as numbers and not as
      10^2 / 10^00. The "mantissa x 10N" form still accepts a missing caret.
    """
    superscripts = "⁰¹²³⁴⁵⁶⁷⁸⁹⁻⁺"
    x = (
        s.astype(str).str.strip()
         .str.replace("×", "x", regex=False)
         .str.replace("X", "x", regex=False)        # uppercase X -> x
         .str.replace(",", ".", regex=False)
         .str.replace(r"\s+", " ", regex=True)
         # A run of superscripts is an exponent: make the caret explicit so
         # "10²" becomes "10^2" instead of the ambiguous "102".
         .str.replace(rf"\^?\s*([{superscripts}]+)", r"^\1", regex=True)
         .str.translate(str.maketrans(superscripts, "0123456789-+"))
    )

    def _as_float(col: pd.Series) -> np.ndarray:
        # Extracted regex groups -> float64 array, NaN where the group is absent.
        return pd.to_numeric(col, errors="coerce").to_numpy(dtype="float64", na_value=np.nan)

    # ---------------- "<L" (censored), including scientific forms -----------
    # Cases: "< 10^2" | "<10^2" | "< 1x10^2" | "<1 x 10^3" | "< 123.4"
    lt_df = x.str.extract(
        r"^\s*<\s*(?:(?P<mant>\d+(?:\.\d+)?)\s*(?:x|\*)\s*10\^?\s*(?P<exp>[-+]?\d+)"
        r"|10(?!\d)\^?\s*(?P<exp2>[-+]?\d+)"
        r"|(?P<num>\d+(?:\.\d+)?))\s*$"
    )
    lt_mask = lt_df.notna().any(axis=1).to_numpy(dtype=bool)

    # The three alternatives are mutually exclusive, so each row has at most
    # one non-NaN candidate for the limit L.
    lt_exp2 = _as_float(lt_df["exp2"])
    lt_num = _as_float(lt_df["num"])
    lt_val = _as_float(lt_df["mant"]) * np.power(10.0, _as_float(lt_df["exp"]))
    lt_val = np.where(np.isnan(lt_exp2), lt_val, np.power(10.0, lt_exp2))
    lt_val = np.where(np.isnan(lt_num), lt_val, lt_num)

    out = np.full(len(x), np.nan, dtype="float64")

    # Apply the "<L" policy; with lt_as_nan=True the rows simply stay missing.
    if lt_as_half:
        out = np.where(lt_mask, lt_val / 2, out)
    elif not lt_as_nan:
        out = np.where(lt_mask, lt_val, out)

    # ---------------- Regular numeric and scientific forms ------------------
    # 1) mant x 10^exp  (accepts 'x' or '*', with or without spaces)
    sci = x.str.extract(r"^\s*(?P<mant>-?\d+(?:\.\d+)?)\s*(?:x|\*)\s*10\^?\s*(?P<exp>[-+]?\d+)\s*$")
    sci_ok = sci.notna().all(axis=1).to_numpy(dtype=bool)
    sci_val = _as_float(sci["mant"]) * np.power(10.0, _as_float(sci["exp"]))
    out = np.where(sci_ok, sci_val, out)

    # 2) 10^exp (implicit mantissa 1); "(?!\d)" keeps "102"/"1000" out of here
    pow_exp = _as_float(x.str.extract(r"^\s*10(?!\d)\^?\s*(?P<exp>[-+]?\d+)\s*$")["exp"])
    out = np.where(np.isnan(pow_exp), out, np.power(10.0, pow_exp))

    # 3) plain number: only fills rows nothing above has resolved
    plain = _as_float(x)
    out = np.where(np.isnan(out), plain, out)

    # ND/NA/etc. override any value
    na_mask = x.str.fullmatch(r"(?i)\s*(?:nd|na|ndc|n/?a)\s*", na=False).to_numpy(dtype=bool)
    out[na_mask] = np.nan

    return pd.Series(out, index=s.index, dtype="Float64")


# -------- Conversor de VARIAS columnas en un DataFrame --------
def convert_cfu_columns(df: pd.DataFrame,
                        columns: list[str],
                        *,
                        suffix: str = "_num",
                        create_log10: bool = True,
                        log10_prefix: str = "log10_",
                        lt_as_nan: bool = True,
                        lt_as_half: bool = False,
                        inplace: bool = True) -> pd.DataFrame:
    """
    Add parsed numeric (and optionally log10) versions of CFU text columns.

    For every name in ``columns`` that exists in the frame, a Float64 column
    ``<name><suffix>`` is created with ``parse_cfu_series``. When
    ``create_log10`` is True, ``<log10_prefix><name>`` is added as well,
    holding log10 of the parsed value and left missing wherever the parsed
    value is not strictly positive. Names absent from the frame are skipped
    silently.

    ``lt_as_nan`` / ``lt_as_half`` are forwarded to ``parse_cfu_series``.
    With ``inplace=True`` (default) the columns are added to ``df`` itself;
    otherwise a copy is modified. The modified frame is returned either way.
    """
    frame = df.copy() if not inplace else df

    for name in columns:
        # Membership is re-checked on every pass because the frame grows.
        if name in frame.columns:
            numeric_name = f"{name}{suffix}"
            frame[numeric_name] = parse_cfu_series(
                frame[name], lt_as_nan=lt_as_nan, lt_as_half=lt_as_half
            )

            if create_log10:
                parsed = frame[numeric_name]
                # log10 is only meaningful for positive counts; mask the rest.
                frame[f"{log10_prefix}{name}"] = np.log10(parsed).where(parsed.gt(0))

    return frame


import re
import unicodedata
import pandas as pd

# Lookup table: product-name prefix -> label (both lowercase, accent-free).
# Every label currently equals its prefix; entries are tried in this order.
PREFIX_MAP = {
    prefix: prefix
    for prefix in (
        "gran ave",
        "gran cerdo",
        "gran cuy",
        "gran lechero",
        "gran tilapia",
        "mas ahorro",
        "casta dog",
    )
}

def _normalize(s: str) -> str:
    if s is None or (isinstance(s, float) and pd.isna(s)):
        return ""
    s = str(s).strip().lower()
    # quita acentos
    s = unicodedata.normalize("NFD", s)
    s = "".join(c for c in s if unicodedata.category(c) != "Mn")
    # colapsa espacios múltiples
    s = re.sub(r"\s+", " ", s)
    return s

def clasificar_prefijo(texto: str, default: str = "gran can") -> str:
    """
    Classify a product name by its leading words.

    The text is normalized with ``_normalize`` and compared against the
    prefixes of ``PREFIX_MAP`` in insertion order; the label of the first
    matching prefix is returned, or ``default`` when none matches.
    """
    cleaned = _normalize(texto)
    matching_labels = (
        label for prefix, label in PREFIX_MAP.items() if cleaned.startswith(prefix)
    )
    return next(matching_labels, default)


In [3]:
# Asset manager used for every read and save below.
# NOTE(review): notebook_name presumably namespaces the saved outputs — confirm in core.s3.
notebook_name = "avipaz_microbiology_overview"
s3 = S3AssetManager(notebook_name=notebook_name)

In [4]:
# Extensive report export; the "ExtensiveReport" sheet is the one analysed below.
report_key = 'raw/avipaz/Reporte Extenso_All-Products_20250801_0000_To_20251024_1024_avg AGO SEPT OCT (2).xlsx'
report = s3.read_excel(report_key, sheet_name="ExtensiveReport")

In [5]:

# Keep only rows whose "Sample ID" is numeric and expose it as a string lot id.
report = report.assign(lot=pd.to_numeric(report["Sample ID"], errors="coerce"))
report = report.loc[report["lot"].notna()].copy()  # independent copy of the kept rows
report = report.assign(
    lot=report["lot"].astype(int).astype(str),
    date=pd.to_datetime(report["Date/Time of Analysis"], format="%d/%m/%Y %H:%M:%S"),
)

In [6]:
# Sanity check: first and last analysis timestamp in the cleaned report.
report["date"].min(), report["date"].max()

(Timestamp('2025-08-01 07:50:29'), Timestamp('2025-10-24 09:31:02'))

In [7]:
# Number of report rows per product name, most frequent first.
report["Product Name"].value_counts()

62 GRAN AVE ECO CRECIMIENTO               139
33 GRAN AVE POLLO CRECIMIENTO FINQUERO    139
13 GRAN AVE POLLO CRECIMIENTO             128
34 GRAN AVE POLLO ENGORDE FINQUERO         77
14 GRAN AVE POLLO ENGORDE                  68
63 GRAN AVE ECO ENGORDE                    60
31 GRAN CERDO CRECIMIENTO FINQUERO         59
12 GRAN AVE POLLO INICIAL 2                58
22 GRAN CERDO CRECIMIENTO                  57
60 GRAN CERDO ECO CRECIMIENTO              52
15 GRAN AVE POSTURA                        52
32 GRAN CERDO ENGORDE FINQUERO             50
61 GRAN CERDO ECO ENGORDE                  49
11 GRAN AVE POLLO INICIAL 1                49
25 GRAN CERDO LACTANCIA                    37
23 GRAN CERDO ENGORDE                      32
28 GRAN LECHERO VALLE                      31
24 GRAN CERDO GESTACION                    28
30 GRAN LECHERO ALTO                       24
73 GRAN CERDO CONCENTRADO PROTEICO         23
29 GRAN LECHERO TERNERAS                   19
72 GRAN AVE CONCENTRADO PROTEICO  

In [8]:
# --- Raw materials (MP) microbiology log ------------------------------------
# skiprows=4: the header row is expected on the 5th row of the sheet.
mp = s3.read_excel("raw/avipaz/LB 07 01 INFORME  MICROBIOLOGIA DE MP 2025 (1).xlsx", skiprows=4, sheet_name="MATERIAS PRIMAS 2025")
mp["date"] = pd.to_datetime(mp["Fecha"], errors="coerce")
# Keep rows that have both a lot and a parseable date. The explicit .copy()
# makes the result an independent frame, so the in-place column additions in
# later cells do not raise SettingWithCopyWarning.
mp = mp[(mp["Lote"].notnull()) & mp["date"].notnull()].copy()
# Headers in this sheet may carry stray surrounding spaces.
mp.columns = [x.strip() for x in mp.columns]

rename_map = {
    "Materia Prima": "raw_material",
    "Presentación": "packaging",
    "Lote": "lot",
    "Flora Total ufc/g": "total_cfu_g",
    "Flora Coliforme ufc/g": "coliforms_cfu_g",
    "E. coli": "e_coli",
    "Salmonella": "salmonella",
    "Flora Micotica ufc/g": "yeasts_molds_cfu_g",
    "Disposición": "disposition",
    "date": "date",
}
mp = mp.rename(columns=rename_map)


# --- Finished product (PT) microbiology log ---------------------------------
pt = s3.read_excel("raw/avipaz/LB 07 02 INFORME MICRO PT 2025 (1).xlsx", skiprows=4, sheet_name="Hoja1")

pt["date"] = pd.to_datetime(pt["Fecha"], errors="coerce")
# Same filter as for MP, again copied so later cells can add columns safely.
pt = pt[(pt["Lote"].notnull()) & pt["date"].notnull()].copy()

# Headers are NOT stripped for this sheet, hence the trailing space in
# "Presentación " below.
rename_map = {
    "Producto": "product",
    "Presentación ": "packaging",
    "Lote": "lot",
    "Flora Total ufc/g": "total_cfu_g",
    "Flora Coliforme ufc/g": "coliforms_cfu_g",
    "E. coli": "e_coli",
    "Salmonella": "salmonella",
    "Flora Micotica ufc/g": "yeasts_molds_cfu_g",
    "Disposición": "disposition",
    "date": "date",
}
pt = pt.rename(columns=rename_map)

In [9]:
# Parse the CFU text columns of both logs with one shared policy:
# "<L" values are left missing and a log10_<column> companion is created.
cfu_options = dict(
    suffix="_num",
    create_log10=True,
    log10_prefix="log10_",
    lt_as_nan=True,
    lt_as_half=False,
    inplace=True,
)
pt = convert_cfu_columns(
    pt,
    columns=["total_cfu_g", "coliforms_cfu_g", "yeasts_molds_cfu_g"],
    **cfu_options,
)
mp = convert_cfu_columns(
    mp,
    columns=["total_cfu_g", "coliforms_cfu_g", 'e_coli', "yeasts_molds_cfu_g"],
    **cfu_options,
)

In [10]:
# Label each finished product from the prefix of its name.
pt["specie"] = pt["product"].apply(clasificar_prefijo)

# Both logs get a string lot id and the calendar month of the sample date.
for frame in (pt, mp):
    frame["lot"] = frame["lot"].astype(str)
    frame["month"] = frame["date"].dt.month

In [11]:
# Monthly mean counts per species for finished product, plus log10 of each mean.
cols = ["total_cfu_g_num", "coliforms_cfu_g_num", "yeasts_molds_cfu_g_num"]
mean_specs = {name: (name, "mean") for name in cols}
pt_month_species = (
    pt.groupby(["month", "specie"])
    .agg(**mean_specs, n_analysis=("specie", "count"))
    .reset_index()
)
for col in cols:
    # log10 of the arithmetic mean (not the mean of per-sample logs)
    pt_month_species[f"log_{col}"] = np.log10(pt_month_species[col])
pt_month_species

Unnamed: 0,month,specie,total_cfu_g_num,coliforms_cfu_g_num,yeasts_molds_cfu_g_num,n_analysis,log_total_cfu_g_num,log_coliforms_cfu_g_num,log_yeasts_molds_cfu_g_num
0,1,gran ave,24550.0,,1700.0,2,4.390051,,3.230449
1,1,gran cerdo,3200.0,,100.0,1,3.50515,,2.0
2,1,gran tilapia,7400.0,,100.0,1,3.869232,,2.0
3,1,mas ahorro,400.0,,,1,2.60206,,
4,2,casta dog,2200.0,,,1,3.342423,,
5,2,gran ave,28800.0,,400.0,1,4.459392,,2.60206
6,2,gran cerdo,3500.0,,,1,3.544068,,
7,2,gran tilapia,9600.0,,100.0,1,3.982271,,2.0
8,2,mas ahorro,4200.0,,,1,3.623249,,
9,3,casta dog,2700.0,,500.0,2,3.431364,,2.69897


In [12]:
# Monthly log10 total-flora trend, one line per species (finished product).
# Unlike the yeasts/molds figure, no null filter is applied to the input here.
fig1 = plot_line(
    pt_month_species,
    x_col="month",
    y_col="log_total_cfu_g_num",
    group_col="specie",
    title="Flora total Log10(UFC/g)",
    x_title="Mes",
    y_title="Log10 UFC/g",
)
fig1.show()


In [13]:
# Monthly log10 yeasts/molds trend per species; rows without a value are dropped.
yeasts_by_species = pt_month_species.dropna(subset=["log_yeasts_molds_cfu_g_num"])
fig2 = plot_line(
    yeasts_by_species,
    x_col="month",
    y_col="log_yeasts_molds_cfu_g_num",
    group_col="specie",
    title="Flora Micótica Log10(UFC/g)",
    x_title="Mes",
    y_title="Log10 UFC/g",
)
fig2.show()


In [14]:
# Side-by-side panel of the two per-species trends (finished product).
# The second panel is fig2, which plots yeasts/molds ("Flora Micótica"),
# so its subplot title must say so rather than "Coliformes".
fig_final = create_subplot_grid(
    figures=[fig1 , fig2],
    rows=1, cols=2,
    titles=["Flora Total Log10(UFC/g)", "Flora Micótica Log10(UFC/g)"],
    main_title="<b>Tendencia mensual de carga microbiológica por especie</b>",
    shared_y=False,
    width=1200, height=400
)

fig_final.show()
s3.save_plotly_html(fig_final, "tendencia_mensual_especie.html")

In [16]:
# Monthly mean counts per packaging type for raw materials, plus log10 of each mean.
cols = ["total_cfu_g_num", "coliforms_cfu_g_num", "yeasts_molds_cfu_g_num"]
mean_specs = {name: (name, "mean") for name in cols}
mp_pres_date = (
    mp.groupby(["packaging", "month"])
    .agg(**mean_specs, n_analysis=("packaging", "count"))
    .reset_index()
)
for col in cols:
    # log10 of the arithmetic mean (not the mean of per-sample logs)
    mp_pres_date[f"log_{col}"] = np.log10(mp_pres_date[col])
mp_pres_date

Unnamed: 0,packaging,month,total_cfu_g_num,coliforms_cfu_g_num,yeasts_molds_cfu_g_num,n_analysis,log_total_cfu_g_num,log_coliforms_cfu_g_num,log_yeasts_molds_cfu_g_num
0,GRANO,1,300.0,100.0,,1,2.477121,2.0,
1,GRANO,3,1800.0,100.0,700.0,1,3.255273,2.0,2.845098
2,GRANO,4,1100.0,,,2,3.041393,,
3,GRANO,5,16000.0,,,1,4.20412,,
4,GRANO,8,10000.0,10000.0,34000.0,1,4.0,4.0,4.531479
5,HARINA,1,400.0,,,1,2.60206,,
6,HARINA,2,10000.0,450.0,250.0,2,4.0,2.653213,2.39794
7,HARINA,3,25000.0,13550.0,200.0,4,4.39794,4.131939,2.30103
8,HARINA,4,5133.333333,400.0,800.0,3,3.710399,2.60206,2.90309
9,HARINA,5,2500.0,,,4,3.39794,,


In [17]:
# Monthly log10 yeasts/molds trend for raw materials, one line per packaging
# type. The title previously said "especie" although the grouping column is
# `packaging`, and misspelled "Levaduras".
fig = plot_line(
    mp_pres_date[mp_pres_date["log_yeasts_molds_cfu_g_num"].notnull()],
    x_col="month",
    y_col="log_yeasts_molds_cfu_g_num",
    group_col="packaging",
    title="log10 Hongos y Levaduras UFC/g por mes y presentación en materias primas",
    x_title="Mes",
    y_title="Log10(UFC/g)",
)
fig.show()


In [18]:
# Monthly log10 total-flora trend for raw materials, one line per packaging
# type. The title previously said "especie" although the grouping column is
# `packaging`.
fig = plot_line(
    mp_pres_date[mp_pres_date["log_total_cfu_g_num"].notnull()],
    x_col="month",
    y_col="log_total_cfu_g_num",
    group_col="packaging",
    title="log10 Total UFC/g por mes y presentación en materias primas",
    x_title="Mes",
    y_title="Log10(UFC/g)",
)
fig.show()


In [19]:
# Tag each log with its origin, stack them, and average per month and origin.
mp["sample"] = "mp"
pt["sample"] = "pt"
all_samples = pd.concat([mp, pt])

cols = ["total_cfu_g_num", "coliforms_cfu_g_num", "yeasts_molds_cfu_g_num"]
all_samples_month = (
    all_samples.groupby(["month", "sample"])
    .agg(**{name: (name, "mean") for name in cols})
    .reset_index()
)
for col in cols:
    # log10 of the arithmetic mean (not the mean of per-sample logs)
    all_samples_month[f"log_{col}"] = np.log10(all_samples_month[col])
all_samples_month

Unnamed: 0,month,sample,total_cfu_g_num,coliforms_cfu_g_num,yeasts_molds_cfu_g_num,log_total_cfu_g_num,log_coliforms_cfu_g_num,log_yeasts_molds_cfu_g_num
0,1,mp,350.0,100.0,,2.544068,2.0,
1,1,pt,12020.0,,900.0,4.079904,,2.954243
2,2,mp,10000.0,450.0,250.0,4.0,2.653213,2.39794
3,2,pt,9660.0,,250.0,3.984977,,2.39794
4,3,mp,20360.0,9066.666667,450.0,4.308778,3.957448,2.653213
5,3,pt,13916.666667,1266.666667,366.666667,4.143535,3.102662,2.564271
6,4,mp,3520.0,400.0,800.0,3.546543,2.60206,2.90309
7,4,pt,16180.0,100.0,100.0,4.208979,2.0,2.0
8,5,mp,5200.0,,,3.716003,,
9,5,pt,7000.0,,100.0,3.845098,,2.0


In [20]:
# Monthly log10 total flora, raw materials (mp) vs finished product (pt).
total_by_sample = all_samples_month.dropna(subset=["log_total_cfu_g_num"])
fig1 = plot_line(
    total_by_sample,
    x_col="month",
    y_col="log_total_cfu_g_num",
    group_col="sample",
    title="Flora Total Log10(UFC/g)",
    x_title="Mes",
    y_title="Log10(UFC/g)",
)
fig1.show()


In [21]:
# Monthly log10 coliforms, raw materials (mp) vs finished product (pt).
coliforms_by_sample = all_samples_month.dropna(subset=["log_coliforms_cfu_g_num"])
fig2 = plot_line(
    coliforms_by_sample,
    x_col="month",
    y_col="log_coliforms_cfu_g_num",
    group_col="sample",
    title="Flora Coliformes Log10(UFC/g)",
    x_title="Mes",
    y_title="Log10(UFC/g)",
)
fig2.show()

In [22]:
# Monthly log10 yeasts/molds, raw materials (mp) vs finished product (pt).
yeasts_by_sample = all_samples_month.dropna(subset=["log_yeasts_molds_cfu_g_num"])
fig3 = plot_line(
    yeasts_by_sample,
    x_col="month",
    y_col="log_yeasts_molds_cfu_g_num",
    group_col="sample",
    title="Flora Micótica Log10(UFC/g)",
    x_title="Mes",
    y_title="Log10(UFC/g)",
)
fig3.show()


In [23]:
# One-row, three-column panel comparing MP and PT for each indicator.
# NOTE(review): the output file name says "2x2" although the grid is 1x3;
# kept unchanged so existing links to the saved HTML keep working.
panel_figures = [fig1 , fig2, fig3]
panel_titles = [
    "Flora Total Log10(UFC/g)",
    "Flora Coliformes Log10(UFC/g)",
    "Flora Micótica Log10(UFC/g)",
]
fig_final = create_subplot_grid(
    figures=panel_figures,
    rows=1, cols=3,
    titles=panel_titles,
    main_title="<b>Tendencia mensual de carga microbiológica por muestra en MP y PT</b>",
    shared_y=False,
    width=1200, height=400
)

fig_final.show()
s3.save_plotly_html(fig_final, "log_microbiologia_samples_subplots_2x2.html")