In [1]:
import mlflow
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Reading and preparing data

In [2]:
df = pd.read_csv('../data/houses.csv')
df.head(1)

Unnamed: 0,tamanho,ano,garagem,preco
0,159.0,2003,2,208500


In [3]:
X = df.drop(columns='preco')
y = df['preco']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1022, 3), (1022,), (438, 3), (438,))

# Some tests using default backend

> Using the default backend the Registry option is disabled

In [None]:
mlflow.set_experiment('house-prices-eda')

## Linear Regression

In [71]:
mlflow.start_run()

<ActiveRun: >

In [65]:
model1 = LinearRegression()
model1.fit(X_train, y_train)

LinearRegression()

In [66]:
mlflow.log_params(model1.get_params())

In [67]:
mlflow.sklearn.log_model(model1, 'linear_regression')

In [68]:
y_pred = model1.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mse, r2

(2078666917.9289913, 0.7021153642898048)

In [69]:
mlflow.log_metric('mse', mse)
mlflow.log_metric('r2', r2)

In [72]:
mlflow.end_run()

## XGBoost

### No params

In [54]:
with mlflow.start_run():
    model2 = XGBRegressor(random_state=42)
    model2.fit(X_train, y_train)
    
    mlflow.xgboost.log_model(model2, 'xgboost')
    
    y_pred = model2.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    mlflow.log_metric('mse', mse)
    mlflow.log_metric('r2', r2)

### Setting model params before running

In [55]:
params = {
    'learning_rate': 0.2,
    'n_estimators': 50,
    'random_state': 42
}

with mlflow.start_run():
    model2 = XGBRegressor(**params)
    model2.fit(X_train, y_train)
    
    mlflow.xgboost.log_model(model2, 'xgboost')
    
    y_pred = model2.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    mlflow.log_metric('mse', mse)
    mlflow.log_metric('r2', r2)

# MLflow stuff

In [56]:
mlflow.get_experiment_by_name('house-prices-eda')

<Experiment: artifact_location='file:///l/disk0/iaraujo/%C3%81rea%20de%20Trabalho/estudos/Courses/alura-mlflow-gestao-ciclo-vidas-modelos-ml/src/mlruns/1', experiment_id='1', lifecycle_stage='active', name='house-prices-eda', tags={}>

In [58]:
mlflow.list_run_infos(experiment_id='1')

[<RunInfo: artifact_uri='file:///l/disk0/iaraujo/%C3%81rea%20de%20Trabalho/estudos/Courses/alura-mlflow-gestao-ciclo-vidas-modelos-ml/src/mlruns/1/841a00f759b047c7b63802e8c96cac1b/artifacts', end_time=1643813416332, experiment_id='1', lifecycle_stage='active', run_id='841a00f759b047c7b63802e8c96cac1b', run_uuid='841a00f759b047c7b63802e8c96cac1b', start_time=1643813415261, status='FINISHED', user_id='iaraujo'>,
 <RunInfo: artifact_uri='file:///l/disk0/iaraujo/%C3%81rea%20de%20Trabalho/estudos/Courses/alura-mlflow-gestao-ciclo-vidas-modelos-ml/src/mlruns/1/667fba01c7e74948a51a21d6d6176652/artifacts', end_time=1643813117235, experiment_id='1', lifecycle_stage='active', run_id='667fba01c7e74948a51a21d6d6176652', run_uuid='667fba01c7e74948a51a21d6d6176652', start_time=1643813116136, status='FINISHED', user_id='iaraujo'>,
 <RunInfo: artifact_uri='file:///l/disk0/iaraujo/%C3%81rea%20de%20Trabalho/estudos/Courses/alura-mlflow-gestao-ciclo-vidas-modelos-ml/src/mlruns/1/602a397137e94ad6a07a3f0e0

In [59]:
mlflow.get_run(run_id='841a00f759b047c7b63802e8c96cac1b')

<Run: data=<RunData: metrics={'mse': 1386727460.1346002, 'r2': 0.8012741720529797}, params={}, tags={'mlflow.log-model.history': '[{"run_id": "841a00f759b047c7b63802e8c96cac1b", '
                             '"artifact_path": "xgboost", "utc_time_created": '
                             '"2022-02-02 14:50:15.361772", "flavors": '
                             '{"python_function": {"loader_module": '
                             '"mlflow.xgboost", "python_version": "3.8.10", '
                             '"data": "model.xgb", "env": "conda.yaml"}, '
                             '"xgboost": {"xgb_version": "1.5.2", "data": '
                             '"model.xgb", "model_class": '
                             '"xgboost.sklearn.XGBRegressor"}}}]',
 'mlflow.source.name': '/l/disk0/iaraujo/.local/lib/python3.8/site-packages/ipykernel_launcher.py',
 'mlflow.source.type': 'LOCAL',
 'mlflow.user': 'iaraujo'}>, info=<RunInfo: artifact_uri='file:///l/disk0/iaraujo/%C3%81rea%20de%20Trabalho/e

## Predict

> It's also possible to predict via command line using:
>> mlflow models predict -m PATH_TO_MODEL -i PATH_TO_INPUT -t INPUT_FILE_TYPE -o OUTPUT_PATH

In [78]:
logged_model = 'runs:/876bf098afd74361bfaaa96bb2d9e66c/linear_regression'

loaded_model = mlflow.pyfunc.load_model(logged_model)

In [79]:
data = df.drop(columns='preco')

loaded_model.predict(pd.DataFrame(data))

array([225330.9699176 , 169894.26391015, 229616.83552065, ...,
       200454.55960921, 110202.60589025, 135744.06785697])

## Serving model as API and testing it

> mlflow models serve -m PATH_TO_MODEL -p PORT

> e.g.: mlflow models serve -m runs:/876bf098afd74361bfaaa96bb2d9e66c/linear_regression -p 5001

In [80]:
import requests

In [86]:
url = 'http://127.0.0.1:5001/invocations'
data = {
    'columns': ['tamanho', 'ano', 'garagem'],
    'data': [[159.0, 2003, 1], [159.0, 2010, 1]]
}
header = {'Content-Type': 'application/json'}

In [87]:
response = requests.post(url, json=data, headers=header)
response, response.text

(<Response [200]>, '[199563.7645382667, 204898.3949670836]')

# Some tests using sqlite backend

> mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./artifacts --host 0.0.0.0

> This enables model Registry, helping on models lifecycle admin

> Now it's easier to serve the model in production:
> * export MLFLOW_TRACKING_URI=http://127.0.0.1:5000
> * mlflow models serve -m 'models:/House Price Predictor/Production' -p 5001

## Linear Regression

In [4]:
# NEW!

mlflow.set_tracking_uri('http://127.0.0.1:5000')

In [6]:
mlflow.set_experiment('house-prices-eda')

2022/02/02 15:24:32 INFO mlflow.tracking.fluent: Experiment with name 'house-prices-eda' does not exist. Creating a new experiment.


<Experiment: artifact_location='./artifacts/1', experiment_id='1', lifecycle_stage='active', name='house-prices-eda', tags={}>

In [9]:
mlflow.start_run()

<ActiveRun: >

In [10]:
model1 = LinearRegression()
model1.fit(X_train, y_train)

LinearRegression()

In [11]:
mlflow.log_params(model1.get_params())

In [12]:
mlflow.sklearn.log_model(model1, 'linear_regression')

ModelInfo(artifact_path='linear_regression', flavors={'python_function': {'model_path': 'model.pkl', 'loader_module': 'mlflow.sklearn', 'python_version': '3.8.10', 'env': 'conda.yaml'}, 'sklearn': {'pickled_model': 'model.pkl', 'sklearn_version': '1.0', 'serialization_format': 'cloudpickle'}}, model_uri='runs:/bdc8d1712417433bbfbf6fddff1689f1/linear_regression', model_uuid='0e75a51230664ccbb0971f5648725ac1', run_id='bdc8d1712417433bbfbf6fddff1689f1', saved_input_example_info=None, signature_dict=None, utc_time_created='2022-02-02 18:24:40.596103')

In [13]:
y_pred = model1.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mse, r2

(2078666917.9289913, 0.7021153642898048)

In [14]:
mlflow.log_metric('mse', mse)
mlflow.log_metric('r2', r2)

In [15]:
mlflow.end_run()

## XGBoost

In [16]:
with mlflow.start_run():
    model2 = XGBRegressor(random_state=42)
    model2.fit(X_train, y_train)
    
    mlflow.xgboost.log_model(model2, 'xgboost')
    
    y_pred = model2.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    mlflow.log_metric('mse', mse)
    mlflow.log_metric('r2', r2)

## Testing again

In [21]:
import requests

In [22]:
url = 'http://127.0.0.1:5001/invocations'
data = {
    'columns': ['tamanho', 'ano', 'garagem'],
    'data': [[159.0, 2003, 1], [159.0, 2010, 1]]
}
header = {'Content-Type': 'application/json'}

In [20]:
#beforing updating production model

response = requests.post(url, json=data, headers=header)
response, response.text

(<Response [200]>, '[199563.7645382667, 204898.3949670836]')

In [23]:
#after updating

response = requests.post(url, json=data, headers=header)
response, response.text

(<Response [200]>, '[177157.125, 209713.640625]')

# Docker

> mlflow models build-docker -m 'models:/House Price Predictor/Production' -n "house-prices"

> docker run -p 5001:8080 "house-prices"

> And now the Production model is served and can receive a post as shown above