<a href="https://colab.research.google.com/github/javierhellch/MLOps/blob/main/MLOps_PenguinsML_Train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


Penguin Species Classification - Production Training Pipeline
Modelo: Random Forest (optimizado para evitar overfitting)


In [None]:
import pandas as pd
import numpy as np
import pickle
import json
from datetime import datetime
from pathlib import Path

# ML Libraries
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, learning_curve
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

# Visualization
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [None]:
def clonar_repo(repo_origen,repo_destino):
    """Clone a git repository into a local directory, skipping if already cloned.

    The clone is executed with an argument list (no shell), so the URL and
    the destination are passed to git verbatim and are never re-parsed by a
    shell. Re-running the cell is safe: when the destination is an existing,
    non-empty directory nothing is cloned.

    Args:
        repo_origen: URL (or local path) of the repository to clone.
        repo_destino: Directory the repository is cloned into.

    Raises:
        subprocess.CalledProcessError: if ``git clone`` exits with an error.
    """
    import subprocess  # local import so this cell does not depend on a new top-level import

    destino = Path(repo_destino)

    # `git clone` refuses to write into a non-empty directory; treat that case
    # as "already cloned" so Restart & Run All does not fail here.
    if destino.is_dir() and any(destino.iterdir()):
        print(f"El destino '{destino}' ya existe y no esta vacio; se omite el clonado.")
        return

    # "--" stops option parsing so a value starting with "-" is never read as a git flag.
    subprocess.run(["git", "clone", "--", str(repo_origen), str(destino)], check=True)

In [None]:
# Clone the MLOps repository into /content/MLOps; the training cell below reads
# its dataset from /content/MLOps/PenguinsML/penguins.csv.
clonar_repo("https://github.com/javierhellch/MLOps.git","/content/MLOps")

Cloning into '/content/MLOps'...
remote: Enumerating objects: 14, done.[K
remote: Counting objects:   7% (1/14)[Kremote: Counting objects:  14% (2/14)[Kremote: Counting objects:  21% (3/14)[Kremote: Counting objects:  28% (4/14)[Kremote: Counting objects:  35% (5/14)[Kremote: Counting objects:  42% (6/14)[Kremote: Counting objects:  50% (7/14)[Kremote: Counting objects:  57% (8/14)[Kremote: Counting objects:  64% (9/14)[Kremote: Counting objects:  71% (10/14)[Kremote: Counting objects:  78% (11/14)[Kremote: Counting objects:  85% (12/14)[Kremote: Counting objects:  92% (13/14)[Kremote: Counting objects: 100% (14/14)[Kremote: Counting objects: 100% (14/14), done.[K
remote: Compressing objects:  10% (1/10)[Kremote: Compressing objects:  20% (2/10)[Kremote: Compressing objects:  30% (3/10)[Kremote: Compressing objects:  40% (4/10)[Kremote: Compressing objects:  50% (5/10)[Kremote: Compressing objects:  60% (6/10)[Kremote: Compressing objects:  70%

In [None]:

class PenguinPipeline:
    """Pipeline completo de entrenamiento para clasificaciÃ³n de pingÃ¼inos"""

    def __init__(self, data_path, output_dir='./models', random_state=42):
        """Set up an untrained pipeline and make sure the artifact directory exists.

        Args:
            data_path: Location of the CSV file read by ``load_data``.
            output_dir: Directory where the model, transformers, metadata and
                plots are written. Created (including parents) if missing.
            random_state: Seed forwarded to the train/test split, the CV folds
                and the Random Forest.
        """
        # Run configuration
        self.data_path = data_path
        artifacts_dir = Path(output_dir)
        self.output_dir = artifacts_dir
        artifacts_dir.mkdir(parents=True, exist_ok=True)
        self.random_state = random_state

        # Preprocessing components; the encoders and the model are created by later stages
        self.label_encoders = dict()
        self.target_encoder = None
        self.scaler = StandardScaler()
        self.imputer = KNNImputer(weights='distance', n_neighbors=5)
        self.model = None

        # Data holders, filled by load_data / preprocess_data
        self.raw_data = None
        self.X_train = self.X_test = None
        self.y_train = self.y_test = None
        self.feature_names = None

        # Metrics collected by cross_validate / evaluate_model
        self.results = dict()

    def load_data(self):
        """Carga y limpieza inicial de datos"""
        print("\n[1/8] Cargando datos...")
        self.raw_data = pd.read_csv(self.data_path)

        # Eliminar columnas irrelevantes
        cols_to_drop = ['Unnamed: 0', 'year'] if 'Unnamed: 0' in self.raw_data.columns else ['year']
        if 'year' in self.raw_data.columns:
            self.raw_data = self.raw_data.drop(columns=cols_to_drop, errors='ignore')

        print(f"âœ“ Datos cargados: {self.raw_data.shape}")
        print(f"  Especies: {self.raw_data['species'].value_counts().to_dict()}")

        # AnÃ¡lisis de valores faltantes
        missing = self.raw_data.isnull().sum()
        missing = missing[missing > 0]
        if len(missing) > 0:
            print(f"\n  Valores faltantes detectados:")
            for col, count in missing.items():
                pct = (count / len(self.raw_data) * 100)
                print(f"    â€¢ {col}: {count} ({pct:.2f}%)")

        return self

    def preprocess_data(self):
        """Encode, engineer features, split, impute and scale the raw data.

        Steps, in order:
          1. Label-encode every object-dtype column. ``species`` is the target
             and gets its own encoder; in the other columns only non-missing
             cells are encoded, so missing values stay NaN for the imputer.
          2. Add three derived features: ``bill_ratio``, ``body_mass_index``
             and ``bill_size``.
          3. Stratified 80/20 train/test split.
          4. KNN imputation, fitted on the training split only.
          5. Standard scaling, fitted on the training split only.

        The imputer is fitted after the split so that test rows never act as
        neighbours when filling training rows (fitting it on the full dataset
        leaks test information into training). The imputer still sees the
        same columns in the same order as ``self.feature_names``.

        Populates ``label_encoders``, ``target_encoder``, ``feature_names``,
        ``X_train``, ``X_test``, ``y_train`` and ``y_test``.

        Returns:
            self, to allow chaining.
        """
        print("\n[2/8] Preprocesando datos...")
        df = self.raw_data.copy()

        # 1. Encode categorical variables
        categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

        for col in categorical_cols:
            if col != 'species':
                le = LabelEncoder()
                # Encode only the cells that hold a value; NaN cells are left for the imputer
                mask = df[col].notna()
                df.loc[mask, col] = le.fit_transform(df.loc[mask, col].astype(str))
                self.label_encoders[col] = le
            else:
                # Encode the target
                self.target_encoder = LabelEncoder()
                df['species'] = self.target_encoder.fit_transform(df['species'])

        print(f"âœ“ Variables categÃ³ricas codificadas")

        # 2. Feature engineering (the small epsilon guards against division by zero)
        df['bill_ratio'] = df['bill_length_mm'] / (df['bill_depth_mm'] + 1e-6)
        df['body_mass_index'] = df['body_mass_g'] / (df['flipper_length_mm'] + 1e-6)
        df['bill_size'] = df['bill_length_mm'] * df['bill_depth_mm']

        print(f"âœ“ Feature engineering completado (3 nuevas features)")

        # 3. Separate features and target
        X = df.drop(columns=['species'])
        y = df['species']

        self.feature_names = X.columns.tolist()

        # 4. Stratified train/test split, done BEFORE any fitted transformer
        #    so that nothing learned from the test rows reaches training.
        X_train_raw, X_test_raw, self.y_train, self.y_test = train_test_split(
            X, y,
            test_size=0.2,
            random_state=self.random_state,
            stratify=y
        )

        print(f"\nâœ“ Split completado:")
        print(f"  Train: {X_train_raw.shape[0]} muestras")
        print(f"  Test: {X_test_raw.shape[0]} muestras")

        # 5. KNN imputation: fit on train, apply the fitted imputer to test
        X_train_imputed = pd.DataFrame(
            self.imputer.fit_transform(X_train_raw),
            columns=self.feature_names,
            index=X_train_raw.index
        )
        X_test_imputed = pd.DataFrame(
            self.imputer.transform(X_test_raw),
            columns=self.feature_names,
            index=X_test_raw.index
        )

        print(f"âœ“ ImputaciÃ³n KNN completada")

        # 6. Scaling: fit on train, apply the fitted scaler to test
        self.X_train = pd.DataFrame(
            self.scaler.fit_transform(X_train_imputed),
            columns=self.feature_names,
            index=X_train_imputed.index
        )

        self.X_test = pd.DataFrame(
            self.scaler.transform(X_test_imputed),
            columns=self.feature_names,
            index=X_test_imputed.index
        )

        print(f"âœ“ Escalado aplicado")

        return self

    def build_model(self):
        """Instantiate the (still unfitted) Random Forest classifier.

        The hyperparameters restrict tree growth (bounded depth, minimum
        samples per split and per leaf), enable the out-of-bag score and use
        balanced class weights. The same values are echoed to stdout.

        Returns:
            self, to allow chaining.
        """
        print("\n[3/8] Construyendo modelo Random Forest...")

        rf_params = dict(
            n_estimators=100,          # number of trees in the forest
            max_depth=8,               # bounded tree depth
            min_samples_split=10,      # samples required to split a node
            min_samples_leaf=4,        # samples required at each leaf
            max_features='sqrt',       # features considered per split
            bootstrap=True,            # each tree is fitted on a bootstrap sample
            oob_score=True,            # exposes oob_score_ after fitting
            class_weight='balanced',   # re-weights classes by inverse frequency
            random_state=self.random_state,
            n_jobs=-1,
        )
        self.model = RandomForestClassifier(**rf_params)

        summary_lines = (
            f"âœ“ Random Forest creado con hiperparÃ¡metros conservadores:",
            f"  â€¢ n_estimators: 100",
            f"  â€¢ max_depth: 8 (evita Ã¡rboles muy profundos)",
            f"  â€¢ min_samples_split: 10",
            f"  â€¢ min_samples_leaf: 4",
            f"  â€¢ max_features: sqrt",
            f"  â€¢ class_weight: balanced",
        )
        for line in summary_lines:
            print(line)

        return self

    def train_model(self):
        """Entrena el modelo y realiza validaciÃ³n cruzada"""
        print("\n[4/8] Entrenando modelo...")

        # Entrenar
        self.model.fit(self.X_train, self.y_train)

        # Out-of-bag score (similar a validaciÃ³n cruzada)
        oob_score = self.model.oob_score_
        print(f"âœ“ Modelo entrenado")
        print(f"  â€¢ OOB Score: {oob_score:.4f}")

        return self

    def cross_validate(self):
        """Run stratified, shuffled 5-fold cross-validation on the training split.

        Stores the per-fold accuracies and their mean and standard deviation
        in ``self.results`` under ``cv_scores``, ``cv_mean`` and ``cv_std``,
        and prints a warning when the standard deviation exceeds 0.05.

        Returns:
            self, to allow chaining.
        """
        print("\n[5/8] ValidaciÃ³n cruzada (5-fold)...")

        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=self.random_state)
        fold_accuracies = cross_val_score(
            self.model,
            self.X_train,
            self.y_train,
            cv=folds,
            scoring='accuracy',
            n_jobs=-1,
        )
        mean_accuracy = fold_accuracies.mean()
        std_accuracy = fold_accuracies.std()

        self.results.update(
            cv_scores=fold_accuracies,
            cv_mean=mean_accuracy,
            cv_std=std_accuracy,
        )

        formatted_scores = [f'{s:.4f}' for s in fold_accuracies]
        print(f"âœ“ Cross-validation completada:")
        print(f"  â€¢ Accuracy: {mean_accuracy:.4f} Â± {std_accuracy:.4f}")
        print(f"  â€¢ Scores: {formatted_scores}")

        # A large spread between folds is reported as a warning
        if std_accuracy > 0.05:
            print(f"  âš  DesviaciÃ³n estÃ¡ndar alta (>{0.05:.2f}) - posible overfitting")

        return self

    def evaluate_model(self):
        """Score the fitted model on the held-out test split.

        Stores ``test_accuracy``, ``test_f1`` (weighted), ``confusion_matrix``
        and ``classification_report`` in ``self.results``, prints them, and
        prints the train-minus-test accuracy gap with a verdict
        (> 0.05 high, < 0.02 low, otherwise moderate).

        The confusion matrix and the report are computed over every class
        known to the target encoder, so their rows/columns always line up
        with ``target_encoder.classes_`` even if a class is missing from the
        test split.

        Returns:
            self, to allow chaining.
        """
        print("\n[6/8] Evaluando en conjunto de test...")

        # Predictions (class probabilities are not needed by any metric below)
        y_pred = self.model.predict(self.X_test)

        # The target was label-encoded, so the class ids are 0..n_classes-1
        class_names = self.target_encoder.classes_
        class_ids = np.arange(len(class_names))

        # Metrics
        self.results['test_accuracy'] = accuracy_score(self.y_test, y_pred)
        self.results['test_f1'] = f1_score(self.y_test, y_pred, average='weighted')
        self.results['confusion_matrix'] = confusion_matrix(
            self.y_test, y_pred, labels=class_ids
        )
        self.results['classification_report'] = classification_report(
            self.y_test, y_pred,
            labels=class_ids,
            target_names=class_names
        )

        print(f"\nâœ“ Resultados en Test Set:")
        print(f"  â€¢ Accuracy: {self.results['test_accuracy']:.4f}")
        print(f"  â€¢ F1-Score: {self.results['test_f1']:.4f}")

        print(f"\n  Classification Report:")
        print(self.results['classification_report'])

        print(f"\n  Confusion Matrix:")
        print(self.results['confusion_matrix'])

        # Compare train vs test accuracy to detect overfitting
        train_accuracy = self.model.score(self.X_train, self.y_train)
        gap = train_accuracy - self.results['test_accuracy']

        print(f"\n  AnÃ¡lisis de Overfitting:")
        print(f"  â€¢ Train Accuracy: {train_accuracy:.4f}")
        print(f"  â€¢ Test Accuracy: {self.results['test_accuracy']:.4f}")
        print(f"  â€¢ Gap: {gap:.4f}")

        if gap > 0.05:
            print(f"  âš  Gap alto (>{0.05:.2f}) - posible overfitting")
        elif gap < 0.02:
            print(f"  âœ“ Gap bajo (<0.02) - buena generalizaciÃ³n")
        else:
            print(f"  âœ“ Gap moderado - generalizaciÃ³n aceptable")

        return self

    def create_visualizations(self):
        """Render a 2x2 diagnostic figure and save it as ``model_analysis.png``.

        Panels: feature importances (up to the ten largest), test-set
        confusion matrix, train / CV / test accuracy comparison, and per-fold
        CV accuracies with a +/-1 standard deviation band.

        Requires ``cross_validate`` and ``evaluate_model`` to have filled
        ``self.results``. The figure is written to ``self.output_dir`` and
        closed afterwards.

        Returns:
            self, to allow chaining.
        """
        print("\n[7/8] Creando visualizaciones...")

        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        fig.suptitle('Random Forest - Production Model Analysis', fontsize=16, fontweight='bold')

        # 1. Feature importance
        ax1 = axes[0, 0]
        importances = self.model.feature_importances_
        indices = np.argsort(importances)[-10:]  # ascending order, so the largest ends up on top

        ax1.barh(range(len(indices)), importances[indices], color='#4ECDC4')
        ax1.set_yticks(range(len(indices)))
        ax1.set_yticklabels([self.feature_names[i] for i in indices])
        ax1.set_title('Top 10 Feature Importance', fontweight='bold')
        ax1.set_xlabel('Importance')

        # 2. Confusion matrix
        ax2 = axes[0, 1]
        cm = self.results['confusion_matrix']
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax2,
                    xticklabels=self.target_encoder.classes_,
                    yticklabels=self.target_encoder.classes_)
        ax2.set_title('Confusion Matrix (Test Set)', fontweight='bold')
        ax2.set_ylabel('True Label')
        ax2.set_xlabel('Predicted Label')

        # 3. Train vs CV vs test accuracy (overfitting check)
        ax3 = axes[1, 0]
        train_acc = self.model.score(self.X_train, self.y_train)
        test_acc = self.results['test_accuracy']
        cv_acc = self.results['cv_mean']

        metrics = ['Train', 'CV (5-fold)', 'Test']
        accuracies = [train_acc, cv_acc, test_acc]
        colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']

        bars = ax3.bar(metrics, accuracies, color=colors, alpha=0.8)
        # Keep the 0.85 floor for good models, but extend it downwards when a
        # score is lower so no bar falls outside the axes. The upper limit
        # leaves headroom for the value labels drawn above bars close to 1.0.
        acc_floor = min(0.85, max(0.0, min(accuracies) - 0.05))
        ax3.set_ylim([acc_floor, 1.05])
        ax3.set_title('Accuracy Comparison (Overfitting Check)', fontweight='bold')
        ax3.set_ylabel('Accuracy')
        ax3.axhline(y=0.95, color='gray', linestyle='--', alpha=0.5, label='Target: 95%')

        for bar, acc in zip(bars, accuracies):
            height = bar.get_height()
            ax3.text(bar.get_x() + bar.get_width()/2., height + 0.005,
                    f'{acc:.4f}', ha='center', va='bottom', fontweight='bold')

        ax3.legend()

        # 4. Cross-validation scores
        ax4 = axes[1, 1]
        cv_scores = self.results['cv_scores']
        folds = [f'Fold {i+1}' for i in range(len(cv_scores))]
        cv_mean = cv_scores.mean()
        cv_std = cv_scores.std()

        ax4.bar(folds, cv_scores, color='#96CEB4', alpha=0.8)
        ax4.axhline(y=cv_mean, color='red', linestyle='--',
                   label=f'Mean: {cv_mean:.4f}')
        # axhspan covers the full axes width; a fill_between over range(n)
        # would stop at the centres of the first and last bars.
        ax4.axhspan(cv_mean - cv_std, cv_mean + cv_std,
                    alpha=0.2, color='red', label=f'Â±1 STD: {cv_std:.4f}')
        ax4.set_title('Cross-Validation Scores (Stability Check)', fontweight='bold')
        ax4.set_ylabel('Accuracy')
        # Same idea as above: 0.90 floor unless a fold scored lower
        cv_floor = min(0.90, max(0.0, cv_scores.min() - 0.05))
        ax4.set_ylim([cv_floor, 1.0])
        ax4.legend()
        ax4.tick_params(axis='x', rotation=45)

        fig.tight_layout()

        viz_path = self.output_dir / 'model_analysis.png'
        # Save and close this specific figure instead of relying on pyplot's
        # implicit "current figure".
        fig.savefig(viz_path, dpi=300, bbox_inches='tight')
        plt.close(fig)

        print(f"âœ“ VisualizaciÃ³n guardada: {viz_path}")

        return self

    def save_model(self):
        """Guarda el modelo y todos los artefactos necesarios para producciÃ³n"""
        print("\n[8/8] Guardando modelo y artefactos...")

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # 1. Guardar modelo
        model_path = self.output_dir / 'penguin_model.pkl'
        with open(model_path, 'wb') as f:
            pickle.dump(self.model, f)

        # 2. Guardar transformadores
        transformers = {
            'label_encoders': self.label_encoders,
            'target_encoder': self.target_encoder,
            'scaler': self.scaler,
            'imputer': self.imputer,
            'feature_names': self.feature_names
        }

        transformer_path = self.output_dir / 'transformers.pkl'
        with open(transformer_path, 'wb') as f:
            pickle.dump(transformers, f)

        # 3. Guardar metadata
        metadata = {
            'model_type': 'RandomForestClassifier',
            'timestamp': timestamp,
            'train_date': datetime.now().isoformat(),
            'random_state': self.random_state,
            'hyperparameters': {
                'n_estimators': self.model.n_estimators,
                'max_depth': self.model.max_depth,
                'min_samples_split': self.model.min_samples_split,
                'min_samples_leaf': self.model.min_samples_leaf,
                'max_features': self.model.max_features,
                'class_weight': 'balanced'
            },
            'performance': {
                'test_accuracy': float(self.results['test_accuracy']),
                'test_f1_score': float(self.results['test_f1']),
                'cv_accuracy_mean': float(self.results['cv_mean']),
                'cv_accuracy_std': float(self.results['cv_std']),
                'oob_score': float(self.model.oob_score_),
                'train_accuracy': float(self.model.score(self.X_train, self.y_train)),
                'overfitting_gap': float(self.model.score(self.X_train, self.y_train) - self.results['test_accuracy'])
            },
            'data_info': {
                'total_samples': len(self.raw_data),
                'train_samples': len(self.X_train),
                'test_samples': len(self.X_test),
                'n_features': len(self.feature_names),
                'target_classes': self.target_encoder.classes_.tolist()
            },
            'feature_names': self.feature_names
        }

        metadata_path = self.output_dir / 'model_metadata.json'
        with open(metadata_path, 'w') as f:
            json.dump(metadata, f, indent=2)

        print(f"âœ“ Modelo guardado: {model_path}")
        print(f"âœ“ Transformadores guardados: {transformer_path}")
        print(f"âœ“ Metadata guardada: {metadata_path}")

        # 4. Crear archivo de configuraciÃ³n para FastAPI
        config = {
            'model_path': 'models/penguin_model.pkl',
            'transformers_path': 'models/transformers.pkl',
            'metadata_path': 'models/model_metadata.json'
        }

        config_path = self.output_dir / 'config.json'
        with open(config_path, 'w') as f:
            json.dump(config, f, indent=2)

        print(f"âœ“ ConfiguraciÃ³n guardada: {config_path}")

        return self

    def run(self):
        """Ejecuta el pipeline completo"""
        print("\n" + "="*70)
        print("PENGUIN CLASSIFICATION - PRODUCTION TRAINING PIPELINE")
        print("="*70)

        self.load_data()
        self.preprocess_data()
        self.build_model()
        self.train_model()
        self.cross_validate()
        self.evaluate_model()
        self.create_visualizations()
        self.save_model()

        print("\n" + "="*70)
        print("âœ“ PIPELINE COMPLETADO EXITOSAMENTE")
        print("="*70)

        # Resumen final
        overfitting_gap = self.model.score(self.X_train, self.y_train) - self.results['test_accuracy']

        print(f"\nðŸ“Š RESUMEN:")
        print(f"  â€¢ Modelo: Random Forest (conservador)")
        print(f"  â€¢ Test Accuracy: {self.results['test_accuracy']:.4f}")
        print(f"  â€¢ CV Accuracy: {self.results['cv_mean']:.4f} Â± {self.results['cv_std']:.4f}")
        print(f"  â€¢ OOB Score: {self.model.oob_score_:.4f}")
        print(f"  â€¢ Overfitting Gap: {overfitting_gap:.4f}")

        if overfitting_gap < 0.02:
            print(f"\nâœ“ Modelo con buena generalizaciÃ³n - LISTO PARA PRODUCCIÃ“N")
        elif overfitting_gap < 0.05:
            print(f"\nâœ“ Modelo con generalizaciÃ³n aceptable - APTO PARA PRODUCCIÃ“N")
        else:
            print(f"\nâš  Modelo con posible overfitting - REVISAR ANTES DE PRODUCCIÃ“N")

        return self.model

In [None]:
# Entry point of the notebook: trains the penguin classifier end to end.
# Both paths point inside /content/MLOps, the directory the repository was
# cloned into by an earlier cell.
if __name__ == "__main__":
    # Train the model
    pipeline = PenguinPipeline(
        data_path='/content/MLOps/PenguinsML/penguins.csv',
        output_dir='/content/MLOps/PenguinsML/Training'
    )

    # run() executes every stage and returns the fitted model
    model = pipeline.run()


PENGUIN CLASSIFICATION - PRODUCTION TRAINING PIPELINE

[1/8] Cargando datos...
âœ“ Datos cargados: (344, 7)
  Especies: {'Adelie': 152, 'Gentoo': 124, 'Chinstrap': 68}

  Valores faltantes detectados:
    â€¢ bill_length_mm: 2 (0.58%)
    â€¢ bill_depth_mm: 2 (0.58%)
    â€¢ flipper_length_mm: 2 (0.58%)
    â€¢ body_mass_g: 2 (0.58%)
    â€¢ sex: 11 (3.20%)

[2/8] Preprocesando datos...
âœ“ Variables categÃ³ricas codificadas
âœ“ Feature engineering completado (3 nuevas features)
âœ“ ImputaciÃ³n KNN completada

âœ“ Split completado:
  Train: 275 muestras
  Test: 69 muestras
âœ“ Escalado aplicado

[3/8] Construyendo modelo Random Forest...
âœ“ Random Forest creado con hiperparÃ¡metros conservadores:
  â€¢ n_estimators: 100
  â€¢ max_depth: 8 (evita Ã¡rboles muy profundos)
  â€¢ min_samples_split: 10
  â€¢ min_samples_leaf: 4
  â€¢ max_features: sqrt
  â€¢ class_weight: balanced

[4/8] Entrenando modelo...
âœ“ Modelo entrenado
  â€¢ OOB Score: 0.9818

[5/8] ValidaciÃ³n cruzada (5-fold)..