In [1]:
# Setup and imports
import sys
import os

# Add project root to path (handles both Docker /work and local environments)
if os.path.exists('/work'):
    sys.path.insert(0, '/work')  # Docker environment
else:
    sys.path.insert(0, os.path.abspath('..'))  # Local environment

# ====================================================================================
# ===============================================                                   ==
# Libraries for dataset verification with DVC. ==                                   ==
# ===============================================                                   ==
from pathlib import Path  # Cross-platform path handling                            ==
from typing import Dict, Tuple, Optional  # Optional type hints for better clarity  ==
import os  # File system and environment variable handling                          ==
import yaml  # Read .dvc (YAML) pointer files                                       ==
import hashlib  # Compute MD5 hashes to verify data integrity                       ==
import subprocess    # Execute SO commands                                          ==
# ====================================================================================

#import de librerias para EDA
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import math
import numpy as np

#importa libraies for data drifting
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.mixture import GaussianMixture
from alibi_detect.cd import KSDrift

# Import our custom modules
from src import config
from src.data_utils import load_data
from src.plots import (
    plot_target_distribution,
    plot_correlation_matrix,
    create_eda_summary_dashboard,
    plot_categorical_analysis,
    plot_numerical_relationship
)

# Import Phase 1 feature engine (from features.py file, not features/ directory)
import importlib
features_module = importlib.import_module('src.features_engine')
AbsenteeismFeatureEngine = features_module.AbsenteeismFeatureEngine

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# ===============================================================
# DVC dataset verification helpers (MD5 check + pull fallback) ==
# ===============================================================

def ensure_repo_ready(repo_root: str = "/work") -> None:
    """
    Verifies that:
    - `repo_root` is a valid project folder with Git and DVC.
    - Directory `repo_root` exists.
    - It contains a `.git` subdirectory (it's a Git repo).
    - It contains a `.dvc` subdirectory (it's a DVC repo).

    Raises:
    - FileNotFoundError if `repo_root` does not exist.
    - RuntimeError if `.git` or `.dvc` is missing.
    """
    if not os.path.isdir(repo_root):
        raise FileNotFoundError(f"Repo root does not exist: {repo_root}")
    if not os.path.isdir(os.path.join(repo_root, ".git")):
        raise RuntimeError(f"Not a Git repo: {repo_root}")
    if not os.path.isdir(os.path.join(repo_root, ".dvc")):
        raise RuntimeError(f"Not a DVC repo: {repo_root} (.dvc not found)")


def _md5_file(path: str, chunk_size: int = 1024 * 1024) -> str:
    """
    Computes the MD5 hash of a file by streaming it from disk to verify integrity
    against the value stored by DVC in the `.dvc` pointer (default md5-based cache).

    Parameters:
    - path: absolute file path.
    - chunk_size: read block size in bytes (default 1 MB).

    Returns:
    - Hex MD5 string of the file content.
    """
    h = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()


def _read_expected_md5_from_dvc(pointer_path: str) -> Optional[str]:
    """
    Reads the expected MD5 from a single-file `.dvc` pointer.

    `.dvc` format:
      - md5: <hash>
      - hash: md5
      - path: <file_name>

    Parameters:
    - pointer_path: absolute path to the `.dvc` file.

    Returns:
    - The MD5 string if present, or None if the pointer does not exist / lacks md5.

    Use:
    - Compare the expected MD5 from `.dvc` with the actual local file MD5.
    """
    if not os.path.exists(pointer_path):
        return None
    with open(pointer_path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f) or {}
    outs = data.get("outs") or []
    if not outs:
        return None
    out = outs[0]
    return out.get("md5") or out.get("checksum") or None


def _dvc_pull_target(path_repo_rel: str, repo_root: str = "/work") -> None:
    """
    Runs `dvc pull <path>` or `<path>.dvc` to materialize the correct version from the remote (S3)
    into the local workspace/cache. Raises if it fails (credentials, permissions, etc.).

    Parameters:
    - path_repo_rel: repo-relative path to fetch (e.g., "data/raw/file.csv").
    - repo_root: repo root (e.g., "/work").
    """
# Build possible targets
    dvc_pointer = os.path.join(repo_root, path_repo_rel + ".dvc")
    if os.path.exists(dvc_pointer):
        target = path_repo_rel + ".dvc"
    else:
        target = path_repo_rel

    # Try pulling
    result = subprocess.run(
        ["dvc", "pull", "--quiet", target],
        cwd=repo_root,
        capture_output=True,
        text=True,
    )

    # Raise if failed
    if result.returncode != 0:
        raise RuntimeError(
            f"Failed to run 'dvc pull {target}':\n"
            f"STDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}"
        )



def dvc_read_csv_verified(
    path_repo_rel: str,
    repo_root: str = "/work",
    prefer_dvc: bool = False,
    verify_local_md5: bool = True,
    pandas_read_csv_kwargs: Optional[Dict] = None,
) -> Tuple[pd.DataFrame, str]:
    """
    Read a DVC-versioned CSV ensuring integrity when reading locally.

    Strategy:
    - If `prefer_dvc=True`: force fetching the official version with `dvc pull`
      and then read locally. Returns ("pulled").
    - If `prefer_dvc=False`:
        1) If the local file exists and `verify_local_md5=True`, compare local MD5
           with the expected MD5 from the `.dvc` pointer.
           * If equal -> read local (fast). Returns ("local").
           * If NOT equal -> run `dvc pull` and read the official version. Returns ("pulled").
        2) If the file does NOT exist -> run `dvc pull` and read the official version. Returns ("pulled").

    Parameters:
    - path_repo_rel: repo-relative CSV path (e.g., "data/raw/file.csv").
    - repo_root: repo root (e.g., "/work").
    - prefer_dvc: if True, ignore local state and fetch official version with `dvc pull`.
    - verify_local_md5: if True, validate local MD5 before trusting local read.
    - pandas_read_csv_kwargs: kwargs for `pandas.read_csv()` (sep, encoding, etc.).

    Returns:
    - (df, source) where source ‚àà {"local", "pulled"} describing the read source.

    Exceptions:
    - Raises if the file cannot be materialized from the remote (credentials,
      permissions, or missing blob).
    """
    ensure_repo_ready(repo_root)
    if pandas_read_csv_kwargs is None:
        pandas_read_csv_kwargs = {}

    local_path = os.path.join(repo_root, path_repo_rel)
    dvc_pointer = local_path + ".dvc"  # e.g., data/raw/file.csv.dvc
    expected_md5 = _read_expected_md5_from_dvc(dvc_pointer)

    # Option: force ‚Äúofficial‚Äù read by fetching from S3
    if prefer_dvc:
        _dvc_pull_target(path_repo_rel, repo_root)
        # Note: when forcing, we don‚Äôt compare MD5; we assume `dvc pull` fetched the official version.
        return pd.read_csv(local_path, **pandas_read_csv_kwargs), "pulled"

    # If a local file exists, decide based on MD5
    if os.path.exists(local_path):
        if verify_local_md5 and expected_md5:
            try:
                md5_local = _md5_file(local_path)
                if md5_local == expected_md5:
                    # Note: ‚ÄúMD5 OK: local matches .dvc‚Äù
                    # Use the local version (faster) because it‚Äôs identical to the ‚Äúofficial‚Äù one.
                    return pd.read_csv(local_path, **pandas_read_csv_kwargs), "local"
                else:
                    # MD5 differs: local != .dvc ‚Üí run dvc pull
                    _dvc_pull_target(path_repo_rel, repo_root)
                    return pd.read_csv(local_path, **pandas_read_csv_kwargs), "pulled"
            except Exception:
                # Any issue during the check ‚Üí ensure consistency with a pull
                _dvc_pull_target(path_repo_rel, repo_root)
                return pd.read_csv(local_path, **pandas_read_csv_kwargs), "pulled"
        else:
            # Local read without MD5 verification
            return pd.read_csv(local_path, **pandas_read_csv_kwargs), "local"

    # If no local file, fetch the official version
    _dvc_pull_target(path_repo_rel, repo_root)
    return pd.read_csv(local_path, **pandas_read_csv_kwargs), "pulled"


In [3]:
# =======================================
# Configurable dataset read parameters ==
# =======================================

# Docker mounts the project at /work. If your compose changes, adjust REPO_ROOT accordingly.
REPO_ROOT = "/work"  # Where the repo is mounted.
PATH = "data/raw/work_absenteeism_modified.csv"  # Repo-relative path of the DVC-versioned dataset.

# Arguments forwarded to pandas.read_csv. Optional: delimiter, encoding, etc.
READ_KW: Dict = {}  # e.g.: {"sep": ",", "encoding": "utf-8"}

# Read mode:
# - PREFER_DVC=True  -> Force fetching the official version with `dvc pull` and read it.
# - PREFER_DVC=False -> Prefer local only if (and only if) MD5 matches the one in the .dvc.
PREFER_DVC = False
VERIFY_LOCAL_MD5 = True

# =====================================================
# Environment inspection + demo read with MD5 legend ==
# =====================================================
print("Repo root:", REPO_ROOT, "| exists:", Path(REPO_ROOT).exists())
print("Expected CSV:", PATH)

# Show expected MD5 (if the pointer exists)
pointer_path = os.path.join(REPO_ROOT, PATH) + ".dvc"
expected = _read_expected_md5_from_dvc(pointer_path)
print("Expected MD5 (.dvc):", expected or "<no md5 in pointer>")

# If a local file exists, compute local MD5 and compare
local_abs = os.path.join(REPO_ROOT, PATH)
if os.path.exists(local_abs) and expected:
    try:
        local_md5 = _md5_file(local_abs)
        print("Local MD5:", local_md5)
        print("MD5 matches .dvc?", "YES ‚úÖ" if local_md5 == expected else "NO ‚ùå")
    except Exception as e:
        print("Could not compute local MD5:", type(e).__name__, str(e)[:120])

# --- Robust read with integrity verification ---
# dvc_read_csv_verified does:
#   1) If PREFER_DVC=True -> run `dvc pull` and read the official version (‚Äúpulled‚Äù).
#   2) If PREFER_DVC=False:
#        - If the local file exists and VERIFY_LOCAL_MD5=True:
#            compare local MD5 against the MD5 from the .dvc pointer.
#            * If equal -> read local (fast) and consistent.
#            * If different -> `dvc pull` and read the official version.
#        - If the file does not exist locally -> `dvc pull` and read the official version.
df, source = dvc_read_csv_verified(
    PATH,
    repo_root=REPO_ROOT,
    prefer_dvc=PREFER_DVC,
    verify_local_md5=VERIFY_LOCAL_MD5,
    pandas_read_csv_kwargs=READ_KW,
)

print(f"Read from: {source} | rows={len(df)} | cols={len(df.columns)}")


Repo root: /work | exists: True
Expected CSV: data/raw/work_absenteeism_modified.csv
Expected MD5 (.dvc): 96c318341d1846f567be7127f52d03e1
Local MD5: 96c318341d1846f567be7127f52d03e1
MD5 matches .dvc? YES ‚úÖ
Read from: local | rows=754 | cols=22


**1. Visualizaci√≥n del dataset**

In [4]:
absenteeism_df = df

In [5]:
def analyze_potential_targets(absenteeism_df):
    """Analiza columnas candidatas a ser target."""

    print("="*70)
    print("AN√ÅLISIS DE POTENCIALES VARIABLES TARGET")
    print("="*70)

    for col in df.columns:
        print(f"\nüìä Columna: {col}")
        print(f"   Tipo: {absenteeism_df[col].dtype}")
        print(f"   Nulos: {absenteeism_df[col].isnull().sum()} ({df[col].isnull().mean()*100:.1f}%)")
        print(f"   √önicos: {absenteeism_df[col].nunique()}")

        # Si es num√©rica
        if pd.api.types.is_numeric_dtype(df[col]):
            print(f"   Rango: [{absenteeism_df[col].min():.2f}, {df[col].max():.2f}]")
            print(f"   Media: {absenteeism_df[col].mean():.2f}")

        # Si es categ√≥rica
        if df[col].nunique() < 20:
            print(f"   Valores: {absenteeism_df[col].value_counts().to_dict()}")

        print("-"*70)

# Uso:
analyze_potential_targets(df)

AN√ÅLISIS DE POTENCIALES VARIABLES TARGET

üìä Columna: ID
   Tipo: object
   Nulos: 8 (1.1%)
   √önicos: 59
----------------------------------------------------------------------

üìä Columna: Reason for absence
   Tipo: object
   Nulos: 6 (0.8%)
   √önicos: 51
----------------------------------------------------------------------

üìä Columna: Month of absence
   Tipo: object
   Nulos: 11 (1.5%)
   √önicos: 31
----------------------------------------------------------------------

üìä Columna: Day of the week
   Tipo: object
   Nulos: 8 (1.1%)
   √önicos: 21
----------------------------------------------------------------------

üìä Columna: Seasons
   Tipo: object
   Nulos: 4 (0.5%)
   √önicos: 17
   Valores: {'4.0': 191, '2.0': 183, '3.0': 173, '1.0': 160, ' 2.0 ': 10, ' 4.0 ': 10, ' 3.0 ': 7, ' 1.0 ': 7, '986.0': 1, ' NAN ': 1, '643.0': 1, '963.0': 1, '866.0': 1, '949.0': 1, '45.0': 1, '79.0': 1, ' 246.0 ': 1}
------------------------------------------------------------------

In [6]:
def validate_target(absenteeism_df, target_col):
    """Valida si una columna es adecuada como target."""

    print(f"\nüîç Validando: {target_col}")

    # 1. Verificar existencia
    if target_col not in absenteeism_df.columns:
        print("‚ùå La columna no existe")
        return False

    # 2. Verificar nulos
    null_pct = absenteeism_df[target_col].isnull().mean() * 100
    if null_pct > 30:
        print(f"‚ö†Ô∏è Muchos nulos: {null_pct:.1f}%")

    # 3. Verificar variabilidad
    n_unique = absenteeism_df[target_col].nunique()
    if n_unique == 1:
        print("‚ùå Sin variabilidad (todos iguales)")
        return False

    # 4. Verificar tipo
    if pd.api.types.is_numeric_dtype(absenteeism_df[target_col]):
        print(f"‚úÖ Num√©rica: {n_unique} valores √∫nicos")
    else:
        print(f"‚úÖ Categ√≥rica: {n_unique} clases")

    # 5. Distribuci√≥n
    print(f"\nDistribuci√≥n:")
    if n_unique < 20:
        print(absenteeism_df[target_col].value_counts())
    else:
        print(absenteeism_df[target_col].describe())

    return True

# Uso:
validate_target(absenteeism_df, 'Absenteeism time in hours')


üîç Validando: Absenteeism time in hours
‚úÖ Categ√≥rica: 37 clases

Distribuci√≥n:
count     744
unique     37
top       8.0
freq      195
Name: Absenteeism time in hours, dtype: object


True

In [7]:
def show_dataset_info(absenteeism_df):
    """Muestra informaci√≥n b√°sica del dataset."""
    print("Primeras filas del dataset:")
    display(absenteeism_df.head())

    print("\nEstad√≠sticas descriptivas:")
    display(absenteeism_df.describe().T)

In [8]:
# =====================================================================
# FUNCIONES DE ENTRENAMIENTO Y EVALUACI√ìN
# =====================================================================

def train_model(X_train, y_train, n_estimators=10, max_depth=10, random_state=42):
    """Entrena un clasificador Random Forest."""
    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state
    )
    clf.fit(X_train, y_train)
    return clf


def evaluate_model(clf, X_test, y_test, data_type="original"):
    """Eval√∫a el modelo y retorna las m√©tricas."""
    y_pred = clf.predict(X_test)

    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred, average="weighted"),
        'f1': f1_score(y_test, y_pred, average="weighted")
    }

    print(f'\nM√©tricas con datos {data_type}:')
    print(f'  Accuracy: {metrics["accuracy"]:.2f}')
    print(f'  Recall: {metrics["recall"]:.2f}')
    print(f'  F1-Score: {metrics["f1"]:.2f}')

    return metrics

In [9]:

# =====================================================================
# FUNCIONES DE GENERACI√ìN DE DATOS SINT√âTICOS
# =====================================================================

def create_gmm_model(data, n_components=5, random_state=0):
    """Crea y ajusta un modelo GMM a los datos."""
    gmm = GaussianMixture(n_components=n_components, random_state=random_state)
    gmm.fit(data.reshape(-1, 1))
    return gmm


def modify_gmm_means(gmm, mean_shift):
    """Modifica las medias del GMM aplicando un desplazamiento."""
    print(f"\nMedias originales del GMM:\n{gmm.means_}")

    modified_means = gmm.means_.copy()
    for i in range(len(modified_means)):
        modified_means[i] *= (1 + mean_shift)

    gmm.means_ = modified_means
    print(f"\nMedias modificadas del GMM:\n{gmm.means_}")

    return gmm


def generate_synthetic_data(wine_df, target_column, mean_shift):
    """Genera datos sint√©ticos modificando una columna con GMM."""
    # Crear y entrenar GMM
    gmm = create_gmm_model(wine_df[target_column].to_numpy())

    # Modificar medias
    gmm = modify_gmm_means(gmm, mean_shift)

    # Generar datos sint√©ticos
    synthetic_df = wine_df.copy()
    n_samples = int(wine_df[target_column].count())
    synthetic_df[target_column] = gmm.sample(n_samples)[0].reshape(-1)

    return synthetic_df

In [10]:
# =====================================================================
# FUNCIONES DE DETECCI√ìN DE DRIFT
# =====================================================================

def detect_drift(X_reference, X_test, p_val=0.05):
    """Detecta data drift usando Kolmogorov-Smirnov test."""
    cd = KSDrift(X_reference, p_val=p_val)
    drift_pred = cd.predict(X_test)

    is_drift = drift_pred['data']['is_drift']
    print(f'\n¬øDrift detectado?: {is_drift}')

    return is_drift, drift_pred

In [11]:
# =====================================================================
# FUNCIONES DE VISUALIZACI√ìN
# =====================================================================

def plot_distribution_comparison(original_data, synthetic_data, column_name):
    """Compara distribuciones de datos originales vs sint√©ticos."""
    plt.figure(figsize=(10, 5))

    # Distribuci√≥n original
    sns.kdeplot(
        original_data[column_name],
        label=f'Original Data ({column_name})',
        fill=True,
        color='blue'
    )

    # Distribuci√≥n sint√©tica
    sns.kdeplot(
        synthetic_data[column_name],
        label=f'Synthetic Data ({column_name})',
        fill=True,
        color='red'
    )

    plt.title(
        f'Comparison: Original vs Synthetic Data ({column_name})',
        fontsize=16
    )
    plt.xlabel(f'{column_name} content', fontsize=14)
    plt.ylabel('Density', fontsize=14)
    plt.legend()
    plt.grid(True)
    plt.show()


In [12]:
# =====================================================================
# FUNCI√ìN ORQUESTADORA PRINCIPAL
# =====================================================================

def run_drift_detection_pipeline(
    target_column='Absenteeism time in hours',
    mean_shift=0.3,
    test_size=0.3,
    random_state=42,
    p_val=0.05
):
    """
    Pipeline completo de detecci√≥n de drift.

    Args:
        target_column: Columna a modificar para generar datos sint√©ticos
        mean_shift: Porcentaje de desplazamiento en las medias del GMM
        test_size: Proporci√≥n de datos para test
        random_state: Semilla aleatoria
        p_val: Valor p para el test de drift
    """
    print("="*70)
    print("INICIO DEL PIPELINE DE DETECCI√ìN DE DRIFT")
    print("="*70)


    # 1. Obtener variable target
    print("\n[1/7] obtener variables target...")
    y = absenteeism_df[target_column]
    X = absenteeism_df.drop(columns=[target_column])

    # Limpiar datos - convertir a num√©rico y rellenar NaNs
    print("\n[2/7] limpieza general...")
    for col in X.columns:
        X[col] = pd.to_numeric(X[col], errors='coerce')
    X = X.fillna(X.median())
    y = pd.to_numeric(y, errors='coerce')
    y = y.fillna(y.median())

    # 2. Dividir datos
    print("\n[3/7] Divisi√≥n de datos...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state
    )

    # 3. Entrenar modelo
    print("\n[4/7] Entrenando modelo...")
    clf = train_model(X_train, y_train, random_state=random_state)

    # 4. Evaluar con datos originales
    print("\n[5/7] Evaluando con datos originales...")
    original_metrics = evaluate_model(clf, X_test, y_test, "original")

    # 5. Generar datos sint√©ticos
    print(f"\n[6/7] Generando datos sint√©ticos (shift={mean_shift})...")
    synthetic_df = generate_synthetic_data(absenteeism_df, target_column, mean_shift)
    X_synthetic = synthetic_df[wine.feature_names].values
    y_synthetic = synthetic_df['target'].values

    # 6. Evaluar con datos sint√©ticos
    print("\n[7/7] Evaluando con datos sint√©ticos...")
    synthetic_metrics = evaluate_model(clf, X_synthetic, y_synthetic, "synthetic")

    # 7. Detectar drift
    print("\n[8/7] Detectando drift...")
    is_drift, drift_pred = detect_drift(X_train, X_synthetic, p_val=p_val)

    # 8. Visualizar comparaci√≥n
    print("\n[VISUALIZACI√ìN] Generando gr√°fico de comparaci√≥n...")
    plot_distribution_comparison(absenteeism_df, synthetic_df, target_column)

    print("\n" + "="*70)
    print("PIPELINE COMPLETADO")
    print("="*70)

    # Retornar resultados
    return {
        'df': absenteeism_df,
        'synthetic_df': synthetic_df,
        'model': clf,
        'original_metrics': original_metrics,
        'synthetic_metrics': synthetic_metrics,
        'drift_detected': is_drift,
        'drift_prediction': drift_pred
    }


In [13]:
# =====================================================================
# EJECUCI√ìN
# =====================================================================

if __name__ == "__main__":
    # Ejecutar pipeline con par√°metros por defecto
    results = run_drift_detection_pipeline(
        #target_column='Absenteeism time in hours',
        mean_shift=0.3,
        test_size=0.3,
        random_state=42,
        p_val=0.05
    )

INICIO DEL PIPELINE DE DETECCI√ìN DE DRIFT

[1/7] obtener variables target...

[2/7] limpieza general...

[3/7] Divisi√≥n de datos...

[4/7] Entrenando modelo...

[5/7] Evaluando con datos originales...

M√©tricas con datos original:
  Accuracy: 0.48
  Recall: 0.48
  F1-Score: 0.44

[6/7] Generando datos sint√©ticos (shift=0.3)...


ValueError: could not convert string to float: 'error'