House pricing prediction
* run a hyperparamater tuning while training a model
* log every hyperparameter and metrics in the mlflow ui
* compare the results of the various runs in the mlflow ui
* choose the best run and regsiter it as a model

In [1]:
import pandas as pd 
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import train_test_split,GridSearchCV

from sklearn.metrics import mean_squared_error
from sklearn.datasets import fetch_california_housing

In [15]:
## loading the dataset
housing=fetch_california_housing()
housing

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]], shape=(20640, 8)),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894], shape=(20640,)),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': 

In [22]:
##dividing the dataset in X and y

data=pd.DataFrame(housing.data,columns=housing.feature_names)
data.head()

#included the lastest column // target 
data["Price"]=housing.target

In [24]:
# dividing the dataset into X and y
from urllib.parse import urlparse
X=data.drop(columns='Price')
y=data["Price"]

In [51]:
##splitting the data I have
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=25,shuffle=True)

In [52]:
## create / fit the model 
rfr=RandomForestRegressor()
rfr.fit(X_train,y_train)

In [53]:
y_pred=rfr.predict(X_test)
#
y_pred


array([2.4021922, 3.7077219, 2.8966703, ..., 2.5378201, 2.4947001,
       2.36094  ], shape=(5160,))

In [54]:
##checking accuracy score 
from sklearn.metrics import mean_squared_error ,r2_score
mserr=mean_squared_error(y_test,y_pred)
mserr

0.2383087442845596

In [55]:
r2_score=r2_score(y_test,y_pred)
r2_score

0.8181842315459616

In [56]:
## Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

def hyperparameter_tuning(X_train,y_train,param_grid):
    rf=RandomForestRegressor()
    grid_search=GridSearchCV(estimator=rf,param_grid=param_grid,cv=3,n_jobs=-1,verbose=2,
                             scoring="neg_mean_squared_error")
    grid_search.fit(X_train,y_train)
    return grid_search

In [5]:
import mlflow
import mlflow.sklearn
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd

# Set MLflow tracking URI
mlflow.set_tracking_uri("http://localhost:5000")

# Load dataset
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['target']), df['target'], test_size=0.2, random_state=42)

# Define hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

# Train model with GridSearchCV
rf = RandomForestRegressor()
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Log experiment with MLflow
mlflow.set_experiment("california_housing_rf")

with mlflow.start_run() as run:
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    # Log parameters, metrics, and model
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric("mse", mse)
    
    input_example = X_test.iloc[:1]
    mlflow.sklearn.log_model(best_model, "random_forest_model", input_example=input_example)
    
    print(f"Logged model with MSE: {mse}")

    # Retrieve Run ID dynamically
    run_id = run.info.run_id

# Model inference
loaded_model = mlflow.sklearn.load_model(f"runs:/{run_id}/random_forest_model")
sample_input = X_test.iloc[:1]
prediction = loaded_model.predict(sample_input)
print(f"Sample Prediction: {prediction}")

# Versioning
mlflow.register_model(f"runs:/{run_id}/random_forest_model", "California_Housing_RF")


Logged model with MSE: 0.2535611365781679
🏃 View run bouncy-foal-563 at: http://localhost:5000/#/experiments/675282779494826991/runs/c7a6bdebcaec4d178553a067dc02b301
🧪 View experiment at: http://localhost:5000/#/experiments/675282779494826991


Successfully registered model 'California_Housing_RF'.
2025/01/31 17:27:44 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: California_Housing_RF, version 1


Sample Prediction: [0.49916178]


Created version '1' of model 'California_Housing_RF'.


<ModelVersion: aliases=[], creation_timestamp=1738344464026, current_stage='None', description='', last_updated_timestamp=1738344464026, name='California_Housing_RF', run_id='c7a6bdebcaec4d178553a067dc02b301', run_link='', source='mlflow-artifacts:/675282779494826991/c7a6bdebcaec4d178553a067dc02b301/artifacts/random_forest_model', status='READY', status_message=None, tags={}, user_id='', version='1'>