In [34]:
import os

import pandas as pd
import awswrangler as wr

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

import sklearn
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import cross_validate

# Notebook-wide pandas settings: show every column, print floats with three
# decimals, and silence the chained-assignment warning.
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.3f}'.format
pd.set_option('mode.chained_assignment', None)

import mlflow


# Single random seed reused by the iterative imputer and the train/test split.
SEED = 42

In [35]:
# Record the library versions this notebook was executed with.
for label, module in (("Pandas", pd), ("Scikit-learn", sklearn), ("MLflow", mlflow)):
    print(f"{label} version is {module.__version__}")

Pandas version is 1.5.1
Scikit-learn version is 1.2.2
MLflow version is 2.2.2


In [36]:
# Public DNS of the EC2 instance that serves as the MLflow tracking host.
# Set the MLFLOW_TRACKING_SERVER_HOST environment variable to point the
# notebook at another instance; the default is the host used so far.
TRACKING_SERVER_HOST = os.environ.get(
    "MLFLOW_TRACKING_SERVER_HOST",
    "ec2-34-250-13-150.eu-west-1.compute.amazonaws.com",
)
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")

In [37]:
# Echo the URI MLflow is currently configured to log to.
current_tracking_uri = mlflow.get_tracking_uri()
print(f"tracking URI: '{current_tracking_uri}'")

tracking URI: 'http://ec2-34-250-13-150.eu-west-1.compute.amazonaws.com:5000'


In [38]:
# S3 object holding the listings CSV used throughout the notebook.
path1 = 's3://test-bucket-vlad-godel/data/olx_house_price_Q122.csv'

# awswrangler takes a list of S3 paths and returns one DataFrame.
df = wr.s3.read_csv(path=[path1], encoding='utf-8')

In [39]:
# Preview the first ten rows of the loaded listings.
df.head(10)

Unnamed: 0,offer_title,price,price_per_meter,offer_type,floor,area,rooms,offer_type_of_building,market,city_name,voivodeship,month,year,population,longitude,latitude
0,Kawalerka na sprzedaĹĽ,240000.0,8888.89,Private,1.0,27.0,1,Housing Block,primary,BolesĹ‚awiec,Lower Silesia,January,2022,39603,15.565,51.263
1,Nowoczesna kawalerka z winda plus garaĹĽ podzi...,250000.0,7142.86,Private,1.0,35.0,1,Housing Block,primary,Jelcz-Laskowice,Lower Silesia,January,2022,15828,17.35,51.04
2,Nowa kawalerka z Balkonem/Legnicka/Magnolia,259000.0,10360.0,Estate Agency,2.0,25.0,1,Housing Block,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.033,51.109
3,Kawalerka z balkonem/klucze I kwartaĹ‚ 2022/60...,269000.0,10275.02,Private,3.0,26.18,1,Apartment Building,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.033,51.109
4,40 tys. taniej od dewelopera/Kawalerka/Magnoli...,258000.0,9923.08,Estate Agency,3.0,26.0,1,Housing Block,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.033,51.109
5,Mieszkanie na start,255000.0,11283.19,Private,3.0,22.6,1,Housing Block,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.033,51.109
6,Okazja! Na sprzedaĹĽ nowa kawalerka przy Odrze...,416120.0,12975.37,Estate Agency,5.0,32.07,1,Apartment Building,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.033,51.109
7,"Bulwary Staromiejskie, kawalerka z balkonem wi...",275900.0,14913.51,Estate Agency,4.0,18.5,1,Apartment Building,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.033,51.109
8,B Urban - Kawalerki inwestycyjne pod wynajem,201000.0,8040.0,Estate Agency,,,1,,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.033,51.109
9,Sprzedam punkt w Pasazu ZieliĹ„skiego,1000.0,100.0,Private,0.0,10.0,1,Other,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.033,51.109


In [40]:
# Summary statistics of `price`, the column later used as the regression target.
df['price'].describe()

count       62818.000
mean       399405.855
std       2023250.514
min             1.000
25%        260000.000
50%        345000.000
75%        460000.000
max     504151616.000
Name: price, dtype: float64

In [41]:
# (rows, columns) of the raw frame.
df.shape

(62818, 16)

In [42]:
# Column dtypes as inferred by the CSV reader.
df.dtypes

offer_title                object
price                     float64
price_per_meter           float64
offer_type                 object
floor                     float64
area                      float64
rooms                       int64
offer_type_of_building     object
market                     object
city_name                  object
voivodeship                object
month                      object
year                        int64
population                  int64
longitude                 float64
latitude                  float64
dtype: object

In [43]:
# Number of missing values per column.
df.isna().sum()

offer_title                  0
price                        0
price_per_meter              0
offer_type                   0
floor                     1487
area                      1487
rooms                        0
offer_type_of_building    1487
market                       0
city_name                    0
voivodeship                  0
month                        0
year                         0
population                   0
longitude                    0
latitude                     0
dtype: int64

In [44]:
# Columns routed to the one-hot encoding branch of the preprocessor.
categorical_features = [
    'offer_type',
    'offer_type_of_building',
    'market',
    'voivodeship',
    'month',
]

# Columns routed to the impute-and-scale branch of the preprocessor.
numeric_features = [
    'floor',
    'area',
    'rooms',
    'longitude',
    'latitude',
]

In [45]:
# Keep only listings whose price lies between the 5th and 95th percentile
# (both bounds inclusive).
# NOTE: this rebinds `df`, so re-running the cell on its own trims the already
# trimmed frame again; re-run from the load cell to get the same rows.
price_floor, price_cap = df["price"].quantile([0.05, 0.95])
df = df[df["price"].between(price_floor, price_cap)]

In [46]:
# Categorical branch: replace missing values with the literal level 'missing',
# then one-hot encode. Levels seen fewer than 2000 times are grouped into one
# "infrequent" column and the first level of each feature is dropped.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(drop='first', sparse_output=False,
                              handle_unknown='infrequent_if_exist', min_frequency=2000)),
])

# Numeric branch: model-based imputation (seeded for reproducibility), then
# standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer(initial_strategy='mean', max_iter=5, random_state=SEED, verbose=0)),
    ('scaler', StandardScaler()),
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ])

# Split an explicit feature frame rather than the whole `df`. Previously the
# target `price` (and `price_per_meter`) travelled inside X_train/X_test and
# was only kept out of the model because ColumnTransformer drops unlisted
# columns by default. The transformed matrices are unchanged by this.
X = df[numeric_features + categorical_features]
y = df["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Fit the preprocessing on the training rows only, then apply it to the test rows.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)
feature_names = preprocessor.get_feature_names_out()

2023/04/08 14:51:21 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '4c9e77cf3c884dd7bceb509e51558a3e', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2023/04/08 14:51:25 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '537e118eecc54823ba7a470969ea649b', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


In [47]:
# Output column names produced by the fitted preprocessor.
feature_names

array(['num__floor', 'num__area', 'num__rooms', 'num__longitude',
       'num__latitude', 'cat__offer_type_Private',
       'cat__offer_type_of_building_Housing Block',
       'cat__offer_type_of_building_Tenement',
       'cat__offer_type_of_building_infrequent_sklearn',
       'cat__market_primary', 'cat__voivodeship_Kuyavia-Pomerania',
       'cat__voivodeship_Lesser Poland', 'cat__voivodeship_Lodzkie',
       'cat__voivodeship_Lower Silesia', 'cat__voivodeship_Lublin',
       'cat__voivodeship_Masovia', 'cat__voivodeship_Pomerania',
       'cat__voivodeship_Silesia', 'cat__voivodeship_Warmia-Masuria',
       'cat__voivodeship_West Pomerania',
       'cat__voivodeship_infrequent_sklearn', 'cat__month_January',
       'cat__month_March'], dtype=object)

In [48]:
# (rows, columns) of the preprocessed train and test matrices.
X_train_transformed.shape, X_test_transformed.shape

((45235, 23), (11309, 23))

In [56]:
# Candidate regressors as [estimator class, short label] pairs; the label is
# what gets logged as the "model" param of each MLflow run.
models = (
    [Ridge, "Ridge"],
    [RandomForestRegressor, "RF"],
    [LinearSVR, "LinearSVR"],
    [KNeighborsRegressor, "KNN"],
)


In [57]:
mlflow.set_experiment("my-experiment-godel")
mlflow.sklearn.autolog()

# One MLflow run per candidate model: 5-fold cross-validation on the
# preprocessed training matrix, mean scores logged as metrics, and the fitted
# model stored under the "models" artifact path.
#
# NOTE(review): X_train_transformed was produced by a preprocessor fitted on
# the whole training set, so each validation fold has already influenced the
# imputer/scaler/encoder. Wrapping preprocessor + estimator in one Pipeline and
# cross-validating on the raw frame would remove that optimism.
for estimator_cls, model_name in models:

    # The context manager closes the run on exit, so no explicit end_run() call.
    with mlflow.start_run():

        mlflow.log_param("Train datset size", X_train_transformed.shape)
        mlflow.log_param("model", model_name)

        estimator = estimator_cls()
        # Seed every estimator that exposes `random_state` so reruns are comparable.
        if "random_state" in estimator.get_params():
            estimator.set_params(random_state=SEED)

        cv_results = cross_validate(estimator,
                    X_train_transformed, y_train,
                    cv=5, n_jobs=-1,
                    scoring=('neg_mean_absolute_percentage_error', 'neg_root_mean_squared_error'),
                    return_train_score=True
                    )

        # The "neg_*" scorers return negated errors, so the logged values are
        # <= 0 and "higher is better"; later run searches rely on that sign.
        mean_test_mape = cv_results['test_neg_mean_absolute_percentage_error'].mean()
        mean_train_mape = cv_results['train_neg_mean_absolute_percentage_error'].mean()
        mean_test_rmse = cv_results['test_neg_root_mean_squared_error'].mean()
        mean_train_rmse = cv_results['train_neg_root_mean_squared_error'].mean()
        mlflow.log_metric("mean_test_mape", mean_test_mape)
        mlflow.log_metric("mean_train_mape", mean_train_mape)
        mlflow.log_metric("mean_test_rmse", mean_test_rmse)
        mlflow.log_metric("mean_train_rmse", mean_train_rmse)

        # cross_validate only fits clones and leaves `estimator` untouched.
        # Fit it on the full training matrix so the logged artifact is a
        # trained model that can predict after being loaded.
        estimator.fit(X_train_transformed, y_train)
        mlflow.sklearn.log_model(estimator, artifact_path="models")
        print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")

2023/04/08 14:55:36 INFO mlflow.tracking.fluent: Experiment with name 'my-experiment-godel' does not exist. Creating a new experiment.


default artifacts URI: 's3://test-bucket-vlad-godel/mlflow_artifacts/4/473f7cf208d64b5b8a0e09dc50587010/artifacts'
default artifacts URI: 's3://test-bucket-vlad-godel/mlflow_artifacts/4/77173d555adf444188b8e9fcee2edca4/artifacts'
default artifacts URI: 's3://test-bucket-vlad-godel/mlflow_artifacts/4/467e10ba61154f58927998fb2862eadf/artifacts'
default artifacts URI: 's3://test-bucket-vlad-godel/mlflow_artifacts/4/53c8ef4fdcc949dfbd5d763e561ae2d1/artifacts'


In [59]:
from mlflow.tracking import MlflowClient


In [101]:
# Low-level client bound to the tracking URI configured earlier in the notebook.
client = MlflowClient(tracking_uri=mlflow.get_tracking_uri())

In [126]:
# Fetch the experiments known to the tracking server (default search arguments).
experiments = client.search_experiments()


In [127]:
# Display the experiments returned by the search.
experiments

[<Experiment: artifact_location='s3://test-bucket-vlad-godel/mlflow_artifacts/6', creation_time=1680963899678, experiment_id='6', last_update_time=1680963899678, lifecycle_stage='active', name='godeldemo-2023-04-08-16-04-03', tags={}>,
 <Experiment: artifact_location='s3://test-bucket-vlad-godel/mlflow_artifacts/5', creation_time=1680961665901, experiment_id='5', last_update_time=1680961665901, lifecycle_stage='active', name='godeldemo-2023-04-08-15-04-49', tags={}>,
 <Experiment: artifact_location='s3://test-bucket-vlad-godel/mlflow_artifacts/4', creation_time=1680958533002, experiment_id='4', last_update_time=1680958533002, lifecycle_stage='active', name='my-experiment-godel', tags={}>,
 <Experiment: artifact_location='s3://test-bucket-vlad-godel/mlflow_artifacts/3', creation_time=1680957409102, experiment_id='3', last_update_time=1680957409102, lifecycle_stage='active', name='my-experiment-test2', tags={}>]

In [128]:
# Open a new, empty run in the first experiment of the search result.
# NOTE(review): `experiments[0]` depends on the order the server returns —
# confirm this is the intended experiment, or select it by name.
run = client.create_run(experiment_id=experiments[0].experiment_id)
run

<Run: data=<RunData: metrics={}, params={}, tags={'mlflow.runName': 'salty-goat-814'}>, info=<RunInfo: artifact_uri='s3://test-bucket-vlad-godel/mlflow_artifacts/6/3097311aa38d47a39ef87576eb0bd06b/artifacts', end_time=None, experiment_id='6', lifecycle_stage='active', run_id='3097311aa38d47a39ef87576eb0bd06b', run_name='salty-goat-814', run_uuid='3097311aa38d47a39ef87576eb0bd06b', start_time=1680964114943, status='RUNNING', user_id='unknown'>>

In [129]:
# ID of the run created through the client.
run.info.run_id

'3097311aa38d47a39ef87576eb0bd06b'

In [130]:
# Experiment the client-created run belongs to.
# NOTE(review): no later visible cell reads `exp_id`; the run search uses
# `run.info.experiment_id` directly.
exp_id = run.info.experiment_id

In [131]:
# Attach a demo tag to the run created through the client.
client.set_tag(run.info.run_id, "tag", "test")

# Runs opened with client.create_run() are not closed automatically; without
# this call the run stays in RUNNING status on the tracking server.
client.set_terminated(run.info.run_id)


In [132]:
from mlflow.entities import ViewType


# Up to five active runs from the client-created run's experiment whose
# mean_test_mape exceeds -0.2, highest value first. In this notebook the metric
# comes from a "neg_*" scorer, so values are negated errors and a higher value
# means a lower MAPE.
# `experiment_ids` is documented as a list of IDs, so pass one explicitly
# instead of a bare string.
runs = client.search_runs(
    experiment_ids=[run.info.experiment_id],
    filter_string="metrics.mean_test_mape > -0.2",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.mean_test_mape DESC"],
)

In [133]:
# Run ID of the first search hit; raises IndexError if the search returned no runs.
runs[0].info.run_id

'1b9c7d0f51aa461db1796e46e02b80eb'

In [135]:
import mlflow
# Build the model URI from the top-ranked run of the search instead of a
# pasted run ID, which goes stale as soon as the notebook is re-run.
logged_model = f"runs:/{runs[0].info.run_id}/models"

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

# Predict on a Pandas DataFrame.
