In [1]:
import pandas as pd
import awswrangler as wr

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

import sklearn
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor


from sklearn.model_selection import cross_validate

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.options.mode.chained_assignment = None 

import mlflow


SEED = 42

In [2]:
print(f"Pandas version is {pd.__version__}")
print(f"Scikit-learn version is {sklearn.__version__}")
print(f"MLflow version is {mlflow.__version__}")

Pandas version is 1.5.3
Scikit-learn version is 1.2.2
MLflow version is 2.2.2


In [3]:
TRACKING_SERVER_HOST = "ec2-34-250-13-150.eu-west-1.compute.amazonaws.com" # fill in with the public DNS of the EC2 instance
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")
mlflow.set_experiment("godel-cozy-ds")

2023/04/16 19:19:46 INFO mlflow.tracking.fluent: Experiment with name 'godel-cozy-ds' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:/Users/u.darhevich/Documents/godel_mlflow_demo/mlruns/993004040168716830', creation_time=1681665586748, experiment_id='993004040168716830', last_update_time=1681665586748, lifecycle_stage='active', name='godel-cozy-ds', tags={}>

In [21]:
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'http://ec2-34-250-13-150.eu-west-1.compute.amazonaws.com:5000'


In [3]:
path1 = 's3://test-bucket-vlad-godel/data/olx_house_price_Q122.csv'

df = wr.s3.read_csv([path1], encoding='utf_8')

In [4]:
df.head(10)

Unnamed: 0,offer_title,price,price_per_meter,offer_type,floor,area,rooms,offer_type_of_building,market,city_name,voivodeship,month,year,population,longitude,latitude
0,Kawalerka na sprzedaĹĽ,240000.0,8888.89,Private,1.0,27.0,1,Housing Block,primary,BolesĹ‚awiec,Lower Silesia,January,2022,39603,15.565,51.263
1,Nowoczesna kawalerka z winda plus garaĹĽ podzi...,250000.0,7142.86,Private,1.0,35.0,1,Housing Block,primary,Jelcz-Laskowice,Lower Silesia,January,2022,15828,17.35,51.04
2,Nowa kawalerka z Balkonem/Legnicka/Magnolia,259000.0,10360.0,Estate Agency,2.0,25.0,1,Housing Block,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.033,51.109
3,Kawalerka z balkonem/klucze I kwartaĹ‚ 2022/60...,269000.0,10275.02,Private,3.0,26.18,1,Apartment Building,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.033,51.109
4,40 tys. taniej od dewelopera/Kawalerka/Magnoli...,258000.0,9923.08,Estate Agency,3.0,26.0,1,Housing Block,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.033,51.109
5,Mieszkanie na start,255000.0,11283.19,Private,3.0,22.6,1,Housing Block,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.033,51.109
6,Okazja! Na sprzedaĹĽ nowa kawalerka przy Odrze...,416120.0,12975.37,Estate Agency,5.0,32.07,1,Apartment Building,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.033,51.109
7,"Bulwary Staromiejskie, kawalerka z balkonem wi...",275900.0,14913.51,Estate Agency,4.0,18.5,1,Apartment Building,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.033,51.109
8,B Urban - Kawalerki inwestycyjne pod wynajem,201000.0,8040.0,Estate Agency,,,1,,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.033,51.109
9,Sprzedam punkt w Pasazu ZieliĹ„skiego,1000.0,100.0,Private,0.0,10.0,1,Other,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.033,51.109


In [5]:
df['price'].describe()

count       62818.000
mean       399405.855
std       2023250.514
min             1.000
25%        260000.000
50%        345000.000
75%        460000.000
max     504151616.000
Name: price, dtype: float64

In [6]:
df.shape

(62818, 16)

In [7]:
df.dtypes

offer_title                object
price                     float64
price_per_meter           float64
offer_type                 object
floor                     float64
area                      float64
rooms                       int64
offer_type_of_building     object
market                     object
city_name                  object
voivodeship                object
month                      object
year                        int64
population                  int64
longitude                 float64
latitude                  float64
dtype: object

In [8]:
df.isna().sum()

offer_title                  0
price                        0
price_per_meter              0
offer_type                   0
floor                     1487
area                      1487
rooms                        0
offer_type_of_building    1487
market                       0
city_name                    0
voivodeship                  0
month                        0
year                         0
population                   0
longitude                    0
latitude                     0
dtype: int64

In [9]:
categorical_features = ['offer_type', 'offer_type_of_building',
                        'market', 'voivodeship', 'month']

numeric_features = ['floor', 'area', 'rooms', 'longitude', 'latitude']

In [10]:
df = df[(df["price"] <= df["price"].quantile(0.95)) & (df["price"] >= df["price"].quantile(0.05))]

In [11]:
categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='infrequent_if_exist', min_frequency=2000))
            ])

numeric_transformer = Pipeline(steps=[
        ('imputer', IterativeImputer(initial_strategy='mean', max_iter=5, random_state=SEED, verbose=0)),
        ('scaler' , StandardScaler())
            ])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ])

y = df["price"]
X_train, X_test, y_train, y_test= train_test_split(df, y, test_size=0.2, random_state=SEED)

X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)
feature_names = preprocessor.get_feature_names_out()

In [12]:
feature_names

array(['num__floor', 'num__area', 'num__rooms', 'num__longitude',
       'num__latitude', 'cat__offer_type_Private',
       'cat__offer_type_of_building_Housing Block',
       'cat__offer_type_of_building_Tenement',
       'cat__offer_type_of_building_infrequent_sklearn',
       'cat__market_primary', 'cat__voivodeship_Kuyavia-Pomerania',
       'cat__voivodeship_Lesser Poland', 'cat__voivodeship_Lodzkie',
       'cat__voivodeship_Lower Silesia', 'cat__voivodeship_Lublin',
       'cat__voivodeship_Masovia', 'cat__voivodeship_Pomerania',
       'cat__voivodeship_Silesia', 'cat__voivodeship_Warmia-Masuria',
       'cat__voivodeship_West Pomerania',
       'cat__voivodeship_infrequent_sklearn', 'cat__month_January',
       'cat__month_March'], dtype=object)

In [13]:
X_train_transformed.shape, X_test_transformed.shape

((45235, 23), (11309, 23))

In [14]:
import pickle


with open('preprocessor.b', 'wb') as f_out:
    pickle.dump(preprocessor, f_out)

In [15]:
models = (
    [Ridge, "Ridge"], 
    [RandomForestRegressor, "RForest"], 
    [LinearSVR, "LinearSVR"], 
    [KNeighborsRegressor, "KNN"]
)


In [37]:
mlflow.sklearn.autolog()

for model_class in models:

    with mlflow.start_run():
        mlflow.set_tag("developer", "vlad")
        mlflow.set_tag("company", "godel")
        mlflow.log_param("Train datset size", X_train_transformed.shape)
        mlflow.log_param("model", model_class[1])
        estimator = model_class[0]()
        cv_results = cross_validate(estimator,
                    X_train_transformed, y_train,
                    cv=5, n_jobs=-1,
                    scoring=('neg_mean_absolute_percentage_error', 'neg_root_mean_squared_error'),
                    return_train_score=True
                    )
        mean_test_mape = cv_results['test_neg_mean_absolute_percentage_error'].mean()
        mean_train_mape = cv_results['train_neg_mean_absolute_percentage_error'].mean()
        mean_test_rmse = cv_results['test_neg_root_mean_squared_error'].mean()
        mean_train_rmse = cv_results['train_neg_root_mean_squared_error'].mean()
        mlflow.log_metric("mean_test_mape", mean_test_mape)
        mlflow.log_metric("mean_train_mape", mean_train_mape)
        mlflow.log_metric("mean_test_rmse", mean_test_rmse)
        mlflow.log_metric("mean_train_rmse", mean_train_rmse)

        mlflow.sklearn.log_model(estimator, artifact_path="models")
        print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")
        mlflow.log_artifact(local_path="preprocessor.b", artifact_path="models_pickle")
        mlflow.end_run()



default artifacts URI: 's3://test-bucket-vlad-godel/mlflow_artifacts/8/96df2c3a659d4f35be4e91dd6cfcfb6b/artifacts'
default artifacts URI: 's3://test-bucket-vlad-godel/mlflow_artifacts/8/e42c6404447848919a7205190a808ad8/artifacts'
default artifacts URI: 's3://test-bucket-vlad-godel/mlflow_artifacts/8/4d63909390ec4c69a6a7a5edcfe33473/artifacts'
default artifacts URI: 's3://test-bucket-vlad-godel/mlflow_artifacts/8/310806035cde44759dc8a93951ad9afc/artifacts'


In [38]:
from mlflow.tracking import MlflowClient


In [39]:
client = MlflowClient(tracking_uri=f"http://{TRACKING_SERVER_HOST}:5000")

In [72]:
experiment = (
    client.search_experiments(filter_string="name = 'godel-cozy-ds'")
) 


'8'

In [76]:
from mlflow.entities import ViewType


runs = client.search_runs(
    experiment_ids=experiment[0].experiment_id,
    filter_string="metrics.mean_test_mape > -0.2",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.mean_test_mape DESC"]
)

In [80]:
run_id  = runs[0].info.run_id
model_uri = f"runs:/{run_id}/model"
model_name = "test-model"
mlflow.register_model(model_uri=model_uri, name=model_name)

Registered model 'test-model' already exists. Creating a new version of this model...
2023/04/08 19:39:06 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: test-model, version 2
Created version '2' of model 'test-model'.


<ModelVersion: creation_timestamp=1680975542503, current_stage='None', description='', last_updated_timestamp=1680975542503, name='test-model', run_id='e42c6404447848919a7205190a808ad8', run_link='', source='s3://test-bucket-vlad-godel/mlflow_artifacts/8/e42c6404447848919a7205190a808ad8/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='2'>

In [82]:
model_version = 2
new_stage = "Staging"
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)

<ModelVersion: creation_timestamp=1680975542503, current_stage='Staging', description='', last_updated_timestamp=1680975590702, name='test-model', run_id='e42c6404447848919a7205190a808ad8', run_link='', source='s3://test-bucket-vlad-godel/mlflow_artifacts/8/e42c6404447848919a7205190a808ad8/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='2'>

In [83]:
from datetime import datetime


date = datetime.today().date()
client.update_model_version(
    name=model_name,
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_stage} on {date}"
)

<ModelVersion: creation_timestamp=1680975542503, current_stage='Staging', description='The model version 2 was transitioned to Staging on 2023-04-08', last_updated_timestamp=1680975622373, name='test-model', run_id='e42c6404447848919a7205190a808ad8', run_link='', source='s3://test-bucket-vlad-godel/mlflow_artifacts/8/e42c6404447848919a7205190a808ad8/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='2'>

In [87]:
mlflow.artifacts.download_artifacts(run_id=run_id, dst_path='.')


'c:\\Users\\u.darhevich\\Documents\\godel_mlflow_demo\\'

In [26]:
import xgboost as xgb
import optuna
from tqdm import tqdm


from sklearn.model_selection import RepeatedKFold

In [27]:
dtrain = xgb.DMatrix(X_train_transformed, label=y_train, feature_names=feature_names)
dtest = xgb.DMatrix(X_test_transformed, feature_names=feature_names)

In [33]:
def objective(trial):
    """Objective function used to track experiment in Optuna

    Args:
        trial : a trial used for experiment

    Returns:
        float: target metric
    """
    param = {
        'objective': 'reg:squarederror',
        'eval_metric': ['mape', 'rmse'],
        'booster': 'gbtree',
        'verbosity': 0,
        'eta': trial.suggest_float('eta', 0.001, 0.3),
        'gamma': trial.suggest_float('gamma', 0.001, 10, log=True),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 10e-5, 10.0,  log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 10e-5, 10.0, log=True),
    }
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "test-rmse")
    history = xgb.cv(param, dtrain,
                    folds=RepeatedKFold(n_splits=4, n_repeats=2),
                    num_boost_round=500,
                    early_stopping_rounds=50,
                    seed=SEED,
                    callbacks=[pruning_callback])
    return history['test-rmse-mean'].min()

In [34]:
pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study = optuna.create_study(
                    study_name=f"xgboost_cv_optimize",
                    pruner=pruner,
                    direction="minimize")
study.optimize(objective, n_trials=10, timeout=25*60)

[32m[I 2023-04-18 20:14:26,274][0m A new study created in memory with name: xgboost_cv_optimize[0m
[32m[I 2023-04-18 20:15:51,314][0m Trial 0 finished with value: 57658.807234664324 and parameters: {'eta': 0.05430744153044991, 'gamma': 5.89946197834448, 'max_depth': 9, 'subsample': 0.7001389193625145, 'colsample_bytree': 0.7718708337381767, 'min_child_weight': 34, 'reg_alpha': 0.14005479108101337, 'reg_lambda': 0.15109203485913955}. Best is trial 0 with value: 57658.807234664324.[0m
[32m[I 2023-04-18 20:18:05,822][0m Trial 1 finished with value: 54223.09194426981 and parameters: {'eta': 0.2793201990280029, 'gamma': 0.6629936762436845, 'max_depth': 13, 'subsample': 0.5857010554330324, 'colsample_bytree': 0.7568140173700031, 'min_child_weight': 73, 'reg_alpha': 0.0007027783441686488, 'reg_lambda': 0.3209713189059149}. Best is trial 1 with value: 54223.09194426981.[0m
[32m[I 2023-04-18 20:20:19,973][0m Trial 2 finished with value: 55504.8856624805 and parameters: {'eta': 0.1551

In [43]:
?trial

Object `trial` not found.
