### House price prediction with MLflow
In this tutorial, we will
1. Run hyperparameter tuning while training a model
2. Log every hyperparameter and metric in the MLflow UI
3. Compare the results of the various runs in the MLflow UI
4. Choose the best run and register it as a model



In [3]:
import pandas as pd
import mlflow 
import mlflow.sklearn 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset through scikit-learn's loader.
# The returned object's `data`, `feature_names` and `target` attributes
# are used by the next cell to build the working DataFrame.
housing = fetch_california_housing()


In [6]:
# Assemble the working DataFrame in one expression: the feature matrix,
# labelled with the dataset's feature names, plus the target as "Price".
data = pd.DataFrame(housing.data, columns=housing.feature_names).assign(
    Price=housing.target
)
data.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [7]:
from urllib.parse import urlparse  # standard-library helper used to parse URLs

# Separate the dependent variable (target) from the independent variables (features).
y = data["Price"]
X = data.drop(columns=["Price"])

In [8]:
## Hyperparameter tuning using GridSearchCV

def hyperparameter_tuning(X_train, y_train, param_grid, cv=3, n_jobs=2, verbose=2,
                          scoring="neg_mean_squared_error", random_state=None):
    """Grid-search a RandomForestRegressor over ``param_grid``.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed straight to ``GridSearchCV.fit``.
    param_grid : dict
        Mapping of RandomForestRegressor parameter names to candidate values.
    cv : int, default 3
        Number of cross-validation folds.
    n_jobs : int, default 2
        Number of parallel jobs used by the search.
    verbose : int, default 2
        Verbosity level forwarded to ``GridSearchCV``.
    scoring : str, default "neg_mean_squared_error"
        Scoring used to rank the candidates.
    random_state : int or None, default None
        Seed for the forest. ``None`` keeps the original unseeded behaviour;
        pass an integer to make the search reproducible.

    Returns
    -------
    GridSearchCV
        The fitted search object; callers read ``best_estimator_`` and
        ``best_params_`` from it.
    """
    rf = RandomForestRegressor(random_state=random_state)
    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=cv,
        n_jobs=n_jobs,
        verbose=verbose,
        scoring=scoring,
    )

    grid_search.fit(X_train, y_train)
    return grid_search

In [10]:
## Split the data into train and test sets (fixed seed so the split is reproducible)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from mlflow.models import infer_signature

# Model signature: input schema inferred from the training features,
# output schema inferred from the training target.
signature = infer_signature(X_train, y_train)

## Define the hyperparameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 40, 50],
    'min_samples_split': [15, 20],
    'min_samples_leaf': [1, 8],
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3. For a
    # regressor it meant "use all features", which is spelled 1.0 and is
    # accepted by both old and new versions.
    'max_features': [1.0, 'sqrt'],
}

## Tracking URI
# Set the tracking URI BEFORE starting the run. If it is changed after
# start_run(), the run is created in whatever store was active at that moment
# while later calls target the new server.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
tracking_uri_type_store = urlparse(mlflow.get_tracking_uri()).scheme

## Start the MLflow run

with mlflow.start_run():
    ## Perform hyperparameter tuning
    grid_search = hyperparameter_tuning(X_train, y_train, param_grid)

    ## Get the best model
    best_model = grid_search.best_estimator_

    ## Evaluate the best model on the held-out test set
    y_predict = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_predict)

    # Log every tuned hyperparameter under a uniform "best_<name>" key
    # (best_n_estimators, best_max_depth, ...) together with the test MSE.
    mlflow.log_params(
        {f"best_{name}": value for name, value in grid_search.best_params_.items()}
    )
    mlflow.log_metric("mse", mse)

    # Register the model only when the tracking URI scheme is not "file";
    # the signature is attached in both cases.
    if tracking_uri_type_store != "file":
        mlflow.sklearn.log_model(
            best_model,
            "model",
            signature=signature,
            registered_model_name="Best RandomForest model",
        )
    else:
        mlflow.sklearn.log_model(best_model, "model", signature=signature)

    print(f"best hyperparameters : {grid_search.best_params_}")
    print(f"Mean Squared Error : {mse}")


Fitting 3 folds for each of 48 candidates, totalling 144 fits


Successfully registered model 'Best RandomForest model'.
2024/11/02 20:26:36 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best RandomForest model, version 1
Created version '1' of model 'Best RandomForest model'.
2024/11/02 20:26:36 INFO mlflow.tracking._tracking_service.client: 🏃 View run welcoming-conch-262 at: http://127.0.0.1:5000/#/experiments/0/runs/296d7a5924584fd3b6a8e01c3d163bd7.
2024/11/02 20:26:36 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0.


best hyperparameters : {'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 15, 'n_estimators': 200}
Mean Squared Error : 0.25669262014710115
