# Modelo de Regressão para Previsão de Preço de Imóveis

## Carregando os dados

In [3]:
import pandas as pd

processed_data_path = "../data/processed/"

df = pd.read_csv(processed_data_path + "casas.csv")
df.head()

Unnamed: 0,tamanho,ano,garagem,preco
0,159.0,2003,2,208500
1,117.0,1976,2,181500
2,166.0,2001,2,223500
3,160.0,1915,3,140000
4,204.0,2000,3,250000


## Separando _features_ e _target_

In [4]:
features = df.drop(columns=["preco"])
target = df["preco"]

## Separando em treino e teste

In [5]:
from sklearn.model_selection import train_test_split

SEED = 42

features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.3, random_state=SEED)

## Configurando MLFlow

In [6]:
import mlflow

In [7]:
from mlflow.models import infer_signature

### Login no MLFlow Tracking do Databricks

In [8]:
mlflow.login()

2024/07/29 14:38:59 INFO mlflow.utils.credentials: Successfully connected to MLflow hosted tracking server! Host: https://community.cloud.databricks.com.


In [9]:
mlflow.set_tracking_uri("databricks")

### Definindo o experimento

In [10]:
mlflow.set_experiment("/Users/leandrosouza159@gmail.com/house-prices-eda")

<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/2842189901217684', creation_time=1722272429978, experiment_id='2842189901217684', last_update_time=1722272429978, lifecycle_stage='active', name='/Users/leandrosouza159@gmail.com/house-prices-eda', tags={'mlflow.experiment.sourceName': '/Users/leandrosouza159@gmail.com/house-prices-eda',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'leandrosouza159@gmail.com',
 'mlflow.ownerId': '7181426075552825'}>

## Linear Regression

In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, root_mean_squared_error

linear_regression = LinearRegression()
linear_regression.fit(features_train, target_train)

predicted = linear_regression.predict(features_test)

r2 = r2_score(target_test, predicted)
rmse = root_mean_squared_error(target_test, predicted)

### Logging in MLFlow

In [13]:
with mlflow.start_run():

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "Basic LR model for houses data")

    # Infer the model signature
    signature = infer_signature(features_train, linear_regression.predict(features_train))

    # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=linear_regression,
        artifact_path="houses",
        signature=signature,
        input_example=features_train,
    )

    # Log metrics
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("rmse", rmse)

2024/07/29 14:42:18 INFO mlflow.tracking._tracking_service.client: 🏃 View run traveling-sow-28 at: https://community.cloud.databricks.com/ml/experiments?o=352727760772812/ml/experiments/2842189901217684/runs/c2ff9826d1934556b490c142b212ec5e.
2024/07/29 14:42:18 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://community.cloud.databricks.com/ml/experiments?o=352727760772812/ml/experiments/2842189901217684.


## XGBoost

In [17]:
from xgboost import XGBRFRegressor

xgb_params = {
    "learning_rate": 0.1,
    "n_estimators": 50,
    "random_state": SEED
}

xgb = XGBRFRegressor(**xgb_params)
xgb.fit(features_train, target_train)

predicted = xgb.predict(features_test)

r2 = r2_score(target_test, predicted)
rmse = root_mean_squared_error(target_test, predicted)

### Logging

In [18]:
with mlflow.start_run(run_name="XGBRFRegressor-02"):

    # Log the hyperparameters
    mlflow.log_params(xgb_params)

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "Basic XGBRFRegressor model for houses data")

    # Infer the model signature
    signature = infer_signature(features_train, xgb.predict(features_train))

    # Log the model
    model_info = mlflow.xgboost.log_model(
        xgb_model=xgb,
        artifact_path="houses",
        signature=signature,
        input_example=features_train,
    )

    # Log metrics
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("rmse", rmse)

2024/07/29 14:59:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBRFRegressor-02 at: https://community.cloud.databricks.com/ml/experiments?o=352727760772812/ml/experiments/2842189901217684/runs/dda898b2e091453f8a8d123a1d683228.
2024/07/29 14:59:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://community.cloud.databricks.com/ml/experiments?o=352727760772812/ml/experiments/2842189901217684.


## Obtendo informações do MLFlow Tracking

In [21]:
experiment = mlflow.get_experiment_by_name("/Users/leandrosouza159@gmail.com/house-prices-eda")
experiment

<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/2842189901217684', creation_time=1722272429978, experiment_id='2842189901217684', last_update_time=1722275965438, lifecycle_stage='active', name='/Users/leandrosouza159@gmail.com/house-prices-eda', tags={'mlflow.experiment.sourceName': '/Users/leandrosouza159@gmail.com/house-prices-eda',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'leandrosouza159@gmail.com',
 'mlflow.ownerId': '7181426075552825'}>

In [22]:
runs = mlflow.search_runs(experiment_ids=["2842189901217684"])
runs

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.r2,metrics.rmse,params.n_estimators,params.random_state,params.learning_rate,tags.mlflow.runName,tags.mlflow.source.type,tags.mlflow.source.name,tags.mlflow.user,tags.mlflow.log-model.history,tags.Training Info
0,dda898b2e091453f8a8d123a1d683228,2842189901217684,FINISHED,dbfs:/databricks/mlflow-tracking/2842189901217...,2024-07-29 17:59:25.438000+00:00,2024-07-29 17:59:44.855000+00:00,0.149491,77038.506312,50.0,42.0,0.1,XGBRFRegressor-02,LOCAL,/Users/leandrosouza/workspace/mlflow/.venv/lib...,leandrosouza159@gmail.com,"[{""artifact_path"":""houses"",""saved_input_exampl...",Basic XGBRFRegressor model for houses data
1,38e899a5ad4f4b369cb673a6833c2f0f,2842189901217684,FINISHED,dbfs:/databricks/mlflow-tracking/2842189901217...,2024-07-29 17:50:24.417000+00:00,2024-07-29 17:50:42.288000+00:00,0.283944,70687.397202,,,,XGBRFRegressor-01,LOCAL,/Users/leandrosouza/workspace/mlflow/.venv/lib...,leandrosouza159@gmail.com,"[{""artifact_path"":""houses"",""saved_input_exampl...",Basic XGBRFRegressor model for houses data
2,c2ff9826d1934556b490c142b212ec5e,2842189901217684,FINISHED,dbfs:/databricks/mlflow-tracking/2842189901217...,2024-07-29 17:41:58.837000+00:00,2024-07-29 17:42:17.372000+00:00,0.702115,45592.399783,,,,LR-01,LOCAL,/Users/leandrosouza/workspace/mlflow/.venv/lib...,leandrosouza159@gmail.com,"[{""artifact_path"":""houses"",""saved_input_exampl...",Basic LR model for houses data


In [23]:
run = mlflow.get_run("dda898b2e091453f8a8d123a1d683228")
run

<Run: data=<RunData: metrics={'r2': 0.14949095249176025, 'rmse': 77038.50631181702}, params={'learning_rate': '0.1', 'n_estimators': '50', 'random_state': '42'}, tags={'Training Info': 'Basic XGBRFRegressor model for houses data',
 'mlflow.log-model.history': '[{"artifact_path":"houses","saved_input_example_info":{"artifact_path":"input_example.json","type":"dataframe","pandas_orient":"split","serving_input_path":"serving_input_payload.json"},"signature":{"inputs":"[{\\"type\\": '
                             '\\"double\\", \\"name\\": \\"tamanho\\", '
                             '\\"required\\": true}, {\\"type\\": \\"long\\", '
                             '\\"name\\": \\"ano\\", \\"required\\": true}, '
                             '{\\"type\\": \\"long\\", \\"name\\": '
                             '\\"garagem\\", \\"required\\": '
                             'true}]","outputs":"[{\\"type\\": \\"tensor\\", '
                             '\\"tensor-spec\\": {\\"dtype\\": \\"float3