In [1]:
import pandas as pd
import awswrangler as wr
import sklearn
import boto3
import pickle
import xgboost as xgb
import numpy as np
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error

# Display settings: never truncate columns and print floats with three decimals.
pd.options.display.max_columns = None
pd.options.display.float_format = lambda value: '%.3f' % value
# Turn off pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None

# Random seed reused by the stochastic steps below (e.g. the train/test split).
SEED = 42

In [2]:
# Record the library versions this notebook was run with (one per line).
print(
    f"Pandas version is {pd.__version__}",
    f"Scikit-learn version is {sklearn.__version__}",
    f"MLflow version is {mlflow.__version__}",
    sep="\n",
)

Pandas version is 1.5.3
Scikit-learn version is 1.2.2
MLflow version is 2.2.2


In [3]:
# Point MLflow at the remote tracking server (plain HTTP on port 5000) and
# select the experiment to work with; the returned Experiment is displayed
# as the cell output.
TRACKING_SERVER_HOST = "ec2-3-253-112-217.eu-west-1.compute.amazonaws.com" # fill in with the public DNS of the EC2 instance
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")
mlflow.set_experiment("godel-cozy-ds")

<Experiment: artifact_location='s3://test-bucket-vlad-godel/mlflow_artifacts/4', creation_time=1685455257640, experiment_id='4', last_update_time=1685455257640, lifecycle_stage='active', name='godel-cozy-ds', tags={}>

In [4]:
# Sanity check: echo the tracking URI MLflow is currently configured with.
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'http://ec2-3-253-112-217.eu-west-1.compute.amazonaws.com:5000'


In [5]:
# Load the raw listings CSV straight from S3 into a DataFrame.
# NOTE(review): the preview of this frame shows garbled Polish characters in
# offer_title / city_name — confirm the file really is UTF-8 encoded.
path1 = 's3://test-bucket-vlad-godel/data/olx_house_price_Q122.csv'

df = wr.s3.read_csv([path1], encoding='utf-8')

In [6]:
# Preview the first ten rows of the raw listings.
df.head(10)

Unnamed: 0,offer_title,price,price_per_meter,offer_type,floor,area,rooms,offer_type_of_building,market,city_name,voivodeship,month,year,population,longitude,latitude
0,Kawalerka na sprzedaĹĽ,240000.0,8888.89,Private,1.0,27.0,1,Housing Block,primary,BolesĹ‚awiec,Lower Silesia,January,2022,39603,15.565,51.263
1,Nowoczesna kawalerka z winda plus garaĹĽ podzi...,250000.0,7142.86,Private,1.0,35.0,1,Housing Block,primary,Jelcz-Laskowice,Lower Silesia,January,2022,15828,17.35,51.04
2,Nowa kawalerka z Balkonem/Legnicka/Magnolia,259000.0,10360.0,Estate Agency,2.0,25.0,1,Housing Block,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.033,51.109
3,Kawalerka z balkonem/klucze I kwartaĹ‚ 2022/60...,269000.0,10275.02,Private,3.0,26.18,1,Apartment Building,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.033,51.109
4,40 tys. taniej od dewelopera/Kawalerka/Magnoli...,258000.0,9923.08,Estate Agency,3.0,26.0,1,Housing Block,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.033,51.109
5,Mieszkanie na start,255000.0,11283.19,Private,3.0,22.6,1,Housing Block,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.033,51.109
6,Okazja! Na sprzedaĹĽ nowa kawalerka przy Odrze...,416120.0,12975.37,Estate Agency,5.0,32.07,1,Apartment Building,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.033,51.109
7,"Bulwary Staromiejskie, kawalerka z balkonem wi...",275900.0,14913.51,Estate Agency,4.0,18.5,1,Apartment Building,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.033,51.109
8,B Urban - Kawalerki inwestycyjne pod wynajem,201000.0,8040.0,Estate Agency,,,1,,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.033,51.109
9,Sprzedam punkt w Pasazu ZieliĹ„skiego,1000.0,100.0,Private,0.0,10.0,1,Other,primary,WrocĹ‚aw,Lower Silesia,January,2022,634487,17.033,51.109


In [7]:
# Summary statistics of the price column (the prediction target further down).
df['price'].describe()

count       62818.000
mean       399405.855
std       2023250.514
min             1.000
25%        260000.000
50%        345000.000
75%        460000.000
max     504151616.000
Name: price, dtype: float64

In [8]:
# (rows, columns) of the raw frame.
df.shape

(62818, 16)

In [9]:
# Column dtypes as inferred by the CSV reader.
df.dtypes

offer_title                object
price                     float64
price_per_meter           float64
offer_type                 object
floor                     float64
area                      float64
rooms                       int64
offer_type_of_building     object
market                     object
city_name                  object
voivodeship                object
month                      object
year                        int64
population                  int64
longitude                 float64
latitude                  float64
dtype: object

In [10]:
# Number of missing values per column.
df.isna().sum()

offer_title                  0
price                        0
price_per_meter              0
offer_type                   0
floor                     1487
area                      1487
rooms                        0
offer_type_of_building    1487
market                       0
city_name                    0
voivodeship                  0
month                        0
year                         0
population                   0
longitude                    0
latitude                     0
dtype: int64

In [11]:
# Number of listings per floor value.
df['floor'].value_counts()

1.000     12791
0.000     11384
2.000     11313
3.000     10769
4.000      9139
5.000      1432
10.000     1061
6.000      1048
7.000       803
8.000       777
9.000       603
11.000      149
-1.000       62
Name: floor, dtype: int64

In [12]:
# Keep only listings with a known floor.
# NOTE(review): floor, area and offer_type_of_building report the same number
# of missing values — presumably the same rows; verify if that matters.
df = df[df['floor'].notna()]

In [13]:
# Trim price outliers: keep listings between the 5th and 95th price
# percentiles (both bounds inclusive).
price = df["price"]
lower_bound = price.quantile(0.05)
upper_bound = price.quantile(0.95)
df = df[(price >= lower_bound) & (price <= upper_bound)]

In [14]:
# 70/30 split with a fixed seed. The whole frame is split (target column
# included) because later cells read X_train['price'] / X_test['price'].
# NOTE(review): the frame still carries `price` and `price_per_meter`;
# presumably the pickled preprocessor selects the feature columns — verify
# that neither leaks the target into the model.
y = df["price"]
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.3,
    random_state=SEED,
)

In [15]:
from mlflow.tracking import MlflowClient


In [16]:
# Client bound to the same tracking server URL as configured earlier; used
# below to query experiments and the model registry.
client = MlflowClient(tracking_uri=f"http://{TRACKING_SERVER_HOST}:5000")

In [20]:
# List the experiments known to the tracking server. Despite the singular
# name, `experiment` holds the whole list returned by the search.
experiment = client.search_experiments()
experiment

[<Experiment: artifact_location='s3://test-bucket-vlad-godel/mlflow_artifacts/6', creation_time=1685516502288, experiment_id='6', last_update_time=1685516502288, lifecycle_stage='active', name='godel-cozy-ds-hyperopt_2023-05-31_09-01-40', tags={}>,
 <Experiment: artifact_location='s3://test-bucket-vlad-godel/mlflow_artifacts/5', creation_time=1685458629352, experiment_id='5', last_update_time=1685458629352, lifecycle_stage='active', name='godel-cozy-ds-hyperopt_2023-05-30_16-57-08', tags={}>,
 <Experiment: artifact_location='s3://test-bucket-vlad-godel/mlflow_artifacts/4', creation_time=1685455257640, experiment_id='4', last_update_time=1685455257640, lifecycle_stage='active', name='godel-cozy-ds', tags={}>,
 <Experiment: artifact_location='s3://test-bucket-vlad-godel/mlflow_artifacts/3', creation_time=1685454262485, experiment_id='3', last_update_time=1685454262485, lifecycle_stage='active', name='godel-cozy-ds-hyperopt_2023-05-30_15-44-21', tags={}>,
 <Experiment: artifact_location='

In [21]:
# Pick the run behind the first registered model's first "latest version".
models = client.search_registered_models()
if not models or not models[0].latest_versions:
    raise RuntimeError("No registered model version found on the MLflow tracking server")

# The (misspelled) names are kept because the download cell below reads them.
requierd_run_id = models[0].latest_versions[0].run_id
# Derive the experiment id from the run itself instead of hard-coding it, so
# the S3 artifact path stays correct whichever experiment produced the run.
requierd_experiment_id = client.get_run(requierd_run_id).info.experiment_id

In [22]:
# Set the description of one version of the registered model. This writes to
# the model registry every time the cell runs.
# NOTE(review): version 1 is hard-coded, while the latest version reported in
# the next cell is '3' — confirm this is the version meant to be described.
client.update_model_version(
    name="house_pricing_xgboost_model",
    version=1,
    description="This model will be used for house pricing predictions",
)


<ModelVersion: creation_timestamp=1685456013492, current_stage='None', description='This model will be used for house pricing predictions', last_updated_timestamp=1685526881957, name='house_pricing_xgboost_model', run_id='10b25ffd3a0141f8ac9d4c60000c5204', run_link='', source='s3://test-bucket-vlad-godel/mlflow_artifacts/3/10b25ffd3a0141f8ac9d4c60000c5204/artifacts/models_artifacts', status='READY', status_message='', tags={}, user_id='', version='1'>

In [23]:
# Fetch the latest version entries of the registered model and display the
# version number of the first entry.
m = client.get_latest_versions('house_pricing_xgboost_model')
m[0].version


'3'

In [24]:
# Download the preprocessor and the XGBoost model of the selected run from
# the MLflow artifact store on S3 into the working directory.
s3 = boto3.client('s3')
artifact_root = f'mlflow_artifacts/{requierd_experiment_id}/{requierd_run_id}/artifacts'
for artifact_key, local_name in (
    ('models_pickle/preprocessor.b', 'preprocessor.b'),
    ('model/model.xgb', 'model.xgb'),
):
    s3.download_file('test-bucket-vlad-godel', f'{artifact_root}/{artifact_key}', local_name)

In [25]:
# Load the preprocessor downloaded from S3. The context manager closes the
# file handle, which a bare open() inside the call would leave open.
# NOTE: unpickling can execute arbitrary code — only load artifacts that come
# from a trusted bucket/run.
with open('preprocessor.b', 'rb') as preprocessor_file:
    preprocessor = pickle.load(preprocessor_file)

In [26]:
# Run both splits through the unpickled preprocessor; only transform() is
# called here, nothing is re-fitted.
X_tr_train = preprocessor.transform(X_train)
X_tr_test = preprocessor.transform(X_test)

In [27]:
# Create an empty Booster and load the model file downloaded from S3 into it.
model_xgb = xgb.Booster()
model_xgb.load_model("model.xgb")

In [28]:
# Predict for both splits; the transformed features are wrapped in a DMatrix
# before being passed to the Booster.
predictions_train = model_xgb.predict(xgb.DMatrix(X_tr_train))
predictions_test = model_xgb.predict(xgb.DMatrix(X_tr_test))

In [29]:
# Store the predictions next to the true prices. This adds a column to the
# split frames in place (the chained-assignment warning is disabled in the
# first cell).
X_train['predictions'] = predictions_train
X_test['predictions'] = predictions_test

In [30]:
def calculate_regression_metrics(y_true, y_pred):
    """Compute a standard set of regression metrics.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        dict with the keys ``'MSE'``, ``'MAE'``, ``'R2'``, ``'RMSE'`` and
        ``'MAPE'`` (in that order); RMSE is the square root of the MSE.
    """
    mse = mean_squared_error(y_true, y_pred)
    return {
        'MSE': mse,
        'MAE': mean_absolute_error(y_true, y_pred),
        'R2': r2_score(y_true, y_pred),
        'RMSE': np.sqrt(mse),
        'MAPE': mean_absolute_percentage_error(y_true, y_pred),
    }

In [31]:
# Train vs. test metrics side by side (rows = metrics, columns = split).
# Each split is evaluated exactly once.
results = pd.DataFrame({
    'TRAIN': calculate_regression_metrics(X_train['price'], X_train['predictions']),
    'TEST': calculate_regression_metrics(X_test['price'], X_test['predictions']),
})
results

Unnamed: 0,TRAIN,TEST
MSE,516648088.593,525635764.291
MAE,10047.038,10180.268
R2,0.97,0.97
RMSE,22729.894,22926.748
MAPE,0.028,0.028


In [35]:
# Spot-check ten test listings: true price vs. prediction. Seeded so the same
# rows are shown on every run.
X_test[['price', 'predictions']].sample(10, random_state=SEED)

Unnamed: 0,price,predictions
33237,185000.0,189409.609
55874,699000.0,542710.25
49158,745000.0,733992.688
46779,495000.0,479441.844
900,568360.0,568297.0
37941,249000.0,251599.812
29159,399000.0,402528.062
48648,435000.0,456069.781
55459,290000.0,262284.562
2594,285000.0,284507.875
