In [3]:
# TP Objective
#  • Master the theoretical trade-offs (Bias vs Variance) inherent in modeling.
#  • Implement and compare regularized linear regression models (Ridge and LASSO).
#  • Estimate the optimal regularization parameters (alpha) using Cross-Validation.
#  • Understand and analyze the concept of Sparsity introduced by LASSO.
#  • Initiate the management of the model life cycle (MLOps) through Packaging and
#  Tracking (MLflow).

In [2]:
#  1 Phase I: Advanced Linear Regression and Trade-off
#  Analysis
#  1.1 Data Preparation and OLS Model
#  We will use the Diabetes dataset, which aims to predict a quantitative measure of disease
#  progression from ten physiological variables (age, sex, BMI, etc.).
#  • Task 1.1: Load the dataset. Split the data into training and test sets (80% / 20%).
#  • Task 1.2: OLS Regression: Train an Ordinary Least Squares (OLS) Linear Regression
#  model on the normalized data. Evaluate its performance in terms of R^2 and
#  Root Mean Squared Error (RMSE).

In [1]:
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
 # Load the diabetes regression data as a (features, target) pair of arrays
 X, y = load_diabetes(return_X_y=True)
 # Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
 X_train, X_test, y_train, y_test = train_test_split(
     X,
     y,
     test_size=0.2,
     random_state=42,
 )

In [5]:
 # Standardization (the dataset is already pre-processed; applied anyway for consistency).
 # The scaler statistics are learned on the training split only and then reused
 # unchanged on the test split.
 scaler = StandardScaler().fit(X_train)
 X_train_scaled = scaler.transform(X_train)
 X_test_scaled = scaler.transform(X_test)

In [7]:
 # Ordinary Least Squares baseline: fit on the scaled training split
 ols_reg = LinearRegression().fit(X_train_scaled, y_train)
 # Predictions for the held-out (scaled) test features
 y_pred_ols = ols_reg.predict(X_test_scaled)

In [8]:
# Score the OLS predictions against the test targets
r2_ols = r2_score(y_true=y_test, y_pred=y_pred_ols)
# RMSE is the square root of the mean squared error (same unit as the target)
rmse_ols = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_ols))

In [9]:
# Report the OLS metrics (rounded to two decimals) and the fitted coefficient vector
print(f"OLS: RMSE={rmse_ols:.2f}, R2={r2_ols:.2f}")
print("Coefficients OLS:", ols_reg.coef_)

OLS: RMSE=53.85, R2=0.45
Coefficients OLS: [  1.75375799 -11.51180908  25.60712144  16.82887167 -44.44885564
  24.64095356   7.67697768  13.1387839   35.16119521   2.35136365]


In [7]:
#  1.2 Bias, Variance, and Regularization
#  • Task 1.3: Theory: Describe the relationship between Bias and Variance in Machine
#  Learning. Explain how adding a regularization term (penalty) modifies this
#  trade-off and prevents overfitting.
#  • Task 1.4: Ridge Regression (L2): Implement a Ridge model. Justify why the L2
#  penalty tends to reduce the magnitude of coefficients, but does not completely zero
#  them out.

In [10]:
 # Ridge Regression
 from sklearn.linear_model import Ridge
 from sklearn.model_selection import cross_val_score

In [11]:
 # Training with an initial alpha value
 # alpha is Ridge's regularization strength; 1.0 is a fixed starting point, not a tuned value
 alpha_initial = 1.0
 ridge_reg = Ridge(alpha=alpha_initial)
 # Fit on the scaled training split (as the last expression, the fitted estimator is the cell output)
 ridge_reg.fit(X_train_scaled, y_train)

In [12]:
# 5-fold cross-validated RMSE of the Ridge model on the scaled training split.
# The scorer returns the *negated* RMSE, so the sign is flipped after averaging the folds.
scores_ridge = cross_val_score(
    ridge_reg,
    X_train_scaled,
    y_train,
    cv=5,
    scoring='neg_root_mean_squared_error',
)
rmse_cv_ridge = -scores_ridge.mean()

# Report the CV score together with the coefficients held by ridge_reg
print(f"\nRidge (alpha={alpha_initial}): RMSE CV={rmse_cv_ridge:.2f}")
print("Coefficients Ridge:", ridge_reg.coef_)


Ridge (alpha=1.0): RMSE CV=55.92
Coefficients Ridge: [  1.80734179 -11.44818951  25.73269892  16.73429974 -34.67195409
  17.05307485   3.36991411  11.76426044  31.3783838    2.45813922]


In [13]:
# 1.3 Sparsity and Hyperparameter Tuning
#  • Task 1.5: LASSO Regression (L1): Implement a LASSO model. Explain the concept
#  of Sparsity. Why can the L1 penalty (unlike L2) zero out certain coefficients,
#  thereby implicitly performing feature selection?
#  • Task 1.6: Tuning with Cross-Validation: Use LassoCV and RidgeCV (which
#  include internal cross-validation) to find the optimal value of the hyperparameter
#  alpha for each model.

In [14]:
# Tuning with CV for Ridge and LASSO
from sklearn.linear_model import RidgeCV, LassoCV

# Ridge: search a log-spaced grid of regularization strengths with 5-fold CV
# (RidgeCV's default grid only contains three candidate values).
ridge_alphas = np.logspace(-3, 3, 50)
ridge_final = RidgeCV(alphas=ridge_alphas, cv=5).fit(X_train_scaled, y_train)

# LASSO: LassoCV builds its own alpha path and selects the best value by 5-fold CV.
# `lasso_final` is the model consumed by the packaging and MLflow cells further down;
# without this definition those cells fail with NameError on a fresh kernel.
lasso_final = LassoCV(cv=5, random_state=42).fit(X_train_scaled, y_train)

print(f"Best alpha Ridge: {ridge_final.alpha_:.4f}")
print(f"Best alpha LASSO: {lasso_final.alpha_:.4f}")

In [19]:
# Packaging (with the best model, here LASSO)
import joblib

In [21]:
import os

# Persist the fitted scaler and the final model in the current working directory
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(lasso_final, 'lasso_model.pkl')

# Target folder for the deployment copies
target_dir = "../models"  # e.g. "deployment/models" or an absolute path to a models folder
os.makedirs(target_dir, exist_ok=True)

# Build both destination paths inside the target folder
scaler_path, model_path = (
    os.path.join(target_dir, artifact_name)
    for artifact_name in ("scaler.pkl", "lasso_model.pkl")
)

# Write the same two objects again, this time under the target folder
joblib.dump(scaler, scaler_path)
joblib.dump(lasso_final, model_path)

print(f"\nScaler and LASSO model saved for deployment in: {os.path.abspath(target_dir)}")


Scaler and LASSO model saved for deployment in: c:\Users\DELL 5480\Desktop\2025\ML TPS\TP2_Regression\models


In [28]:
# Example of a production-style prediction function
def predict_new_data(raw_data, model_path='lasso_model.pkl', scaler_path='scaler.pkl'):
    """Simulate the production prediction pipeline for a single observation.

    The persisted scaler and model are loaded from disk, the raw sample is
    scaled with the loaded scaler, and the model's prediction is returned.

    Parameters
    ----------
    raw_data : array-like
        One unscaled sample, given either as a 1-D sequence of feature values
        or as a 2-D array holding a single row.
    model_path : str
        Path of the joblib file containing the fitted model.
    scaler_path : str
        Path of the joblib file containing the fitted scaler.

    Returns
    -------
    The first element of ``loaded_model.predict(...)`` for the sample, or
    ``None`` (after printing an error message) when the model or scaler file
    cannot be found.
    """
    import os

    # Check for the files first so a missing artifact is reported before any
    # heavier dependency is imported.
    if not (os.path.exists(model_path) and os.path.exists(scaler_path)):
        print("❌ Erreur : Fichiers du modèle ou du scaler introuvables.")
        return None

    import joblib
    import numpy as np

    try:
        # NOTE(security): joblib.load unpickles arbitrary objects; only load
        # artifacts that come from a trusted source.
        loaded_scaler = joblib.load(scaler_path)
        loaded_model = joblib.load(model_path)
    except FileNotFoundError:
        # A file may disappear between the existence check and the load.
        print("❌ Erreur : Fichiers manquants.")
        return None

    # Step 1: shape the raw sample as one row of features and scale it.
    # reshape(1, -1) yields the same array as wrapping a 1-D sample in a list
    # and additionally accepts an input that is already a single 2-D row.
    sample_row = np.asarray(raw_data).reshape(1, -1)
    scaled_data = loaded_scaler.transform(sample_row)

    # Step 2: predict and unwrap the single result
    prediction = loaded_model.predict(scaled_data)[0]
    return prediction


In [30]:
# Smoke-test the prediction function on the first (unscaled) sample of the test set
example_raw_input = X_test[0]

# Point the function at the files written under the deployment folder
# (scaler_path / model_path were defined when the artifacts were saved)
predicted_value = predict_new_data(
    example_raw_input,
    model_path=model_path,
    scaler_path=scaler_path,
)

# predict_new_data signals a failure by returning None, so handle that case first
if predicted_value is None:
    print("La prédiction a échoué - vérifiez que les fichiers modèle/scaler existent dans ../models/")
else:
    print(f"Prediction for the test example: {predicted_value:.2f}")

Prediction for the test example: 140.05


In [55]:
# 2.2: Dependencies: Identify the dependencies required for running this model in production 

# numpy / pandas → manipulation des données.

# scikit-learn → pour charger le modèle, le scaler, et effectuer les prédictions.

# joblib → pour sauvegarder et recharger les objets Python.

# mlflow → pour le suivi des expériences et métriques (tracking).

# flask / fastapi → pour servir le modèle via une API web en production.

In [56]:
 # 2.2 MLflow for Lifecycle Management (Simulated)
 # • Task 2.3: Tracking: MLflow is a key platform for managing the entire ML life cycle.
 # Write the necessary code to start an "Experiment," logging the hyperparameters (alpha)
 # and the final metric (RMSE) for the three models (OLS, Ridge, LASSO).

In [32]:
pip install mlflow

Note: you may need to restart the kernel to use updated packages.


In [33]:
 # 2.3: Tracking with MLflow (API Simulation)

import mlflow
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Experiment configuration: select the MLflow experiment named "TP_Advanced_Regression"
mlflow.set_experiment("TP_Advanced_Regression")

def log_model_results(model_name, params, metrics, model):
    """Record one model's hyperparameters and metrics as a single MLflow run.

    Args:
        model_name: Used as the MLflow run name and echoed in the confirmation message.
        params: Hyperparameters, forwarded to ``mlflow.log_params``.
        metrics: Evaluation metrics, forwarded to ``mlflow.log_metrics``.
        model: Not used at the moment; kept so the fitted model can later be
            logged as an artifact, e.g. ``mlflow.sklearn.log_model(model, "model")``.
    """
    run_context = mlflow.start_run(run_name=model_name)
    with run_context:
        # Hyperparameters first, then metrics, all attached to the same run
        mlflow.log_params(params)
        mlflow.log_metrics(metrics)
        # Confirmation is printed while the run is still open
        print(f"✅ MLflow Run '{model_name}' enregistré.")

# Example with the LASSO model: evaluate on the scaled test split, then log the run
y_pred_lasso = lasso_final.predict(X_test_scaled)
r2_lasso = r2_score(y_test, y_pred_lasso)
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))

# alpha_ is cast to a plain float before being logged as a parameter
lasso_params = dict(alpha=float(lasso_final.alpha_), solver="cd", penalty="l1")
lasso_metrics = dict(RMSE=rmse_lasso, R2=r2_lasso)

log_model_results(
    model_name="LASSO_Optimal",
    params=lasso_params,
    metrics=lasso_metrics,
    model=lasso_final,
)


2025/10/27 11:00:53 INFO mlflow.tracking.fluent: Experiment with name 'TP_Advanced_Regression' does not exist. Creating a new experiment.


✅ MLflow Run 'LASSO_Optimal' enregistré.


In [34]:
!mlflow ui

'mlflow' n'est pas reconnu en tant que commande interne
ou externe, un programme ex�cutable ou un fichier de commandes.


In [None]:
# 2.3 Containerization (Docker)
#  • Task 2.4: Dockerfile (Theoretical): Provide a basic Dockerfile that allows
#  containerizing a small web API (for example, based on Flask/FastAPI) exposing the
#  lasso_model.pkl model for predictions. Explain the role of the commands FROM,
#  COPY, and CMD.