In [13]:
#!pip install category_encoders
#!pip install pandas
#!pip install scikit-learn

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_percentage_error,
    mean_absolute_error)

from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
import os
print(os.getcwd())
os.chdir("..")
print(os.getcwd())

c:\Users\Admin\Bayes\Work\Bain\property_friends_real_estate_case\property_friends_real_estate_case\challenge
c:\Users\Admin\Bayes\Work\Bain\property_friends_real_estate_case\property_friends_real_estate_case


In [2]:
def load_data(train_path:str, test_path:str) -> (pd.DataFrame, pd.DataFrame):
    """
    Loads the train and test data into pandas DataFrames
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    return train, test

train, test = load_data('./ml/data/raw/train.csv', './ml/data/raw/test.csv')

In [3]:
len(train) + len(test)

23161

In [4]:
test.head()

Unnamed: 0,type,sector,net_usable_area,net_area,n_rooms,n_bathroom,latitude,longitude,price
0,casa,vitacura,152.0,257.0,3.0,3.0,-33.3794,-70.5447,18500
1,departamento,las condes,140.0,165.0,4.0,4.0,-33.41135,-70.56977,14500
2,departamento,la reina,101.0,101.0,4.0,3.0,-33.44154,-70.55704,6522
3,departamento,providencia,80.0,112.0,1.0,2.0,-33.42486,-70.60868,6100
4,departamento,vitacura,200.0,200.0,3.0,4.0,-33.4049,-70.5945,19000


In [5]:
categorical_cols = ["type", "sector"]
target           = "price"
train_cols = [
    col for col in train.columns if col not in ['id', target]
    ]

print(train_cols)

['type', 'sector', 'net_usable_area', 'net_area', 'n_rooms', 'n_bathroom', 'latitude', 'longitude']


In [6]:
train.shape

(16212, 9)

In [7]:
test.shape

(6949, 9)

In [8]:
categorical_transformer = TargetEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('categorical',
          categorical_transformer,
          categorical_cols)
    ])

In [9]:
steps = [
    ('preprocessor', preprocessor),]

In [12]:
pipe = Pipeline(steps)
pipe.fit_transform(train[train_cols], train[target])

array([[11391.53303366, 20974.73776706],
       [23073.46840205, 13913.20591133],
       [23073.46840205, 15365.34788782],
       ...,
       [11391.53303366,  9345.74580855],
       [11391.53303366, 15365.34788782],
       [11391.53303366, 15365.34788782]])

In [6]:


steps = [
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor(**{
        "learning_rate":0.01,
        "n_estimators":300,
        "max_depth":5,
        "loss":"absolute_error"
    }))
]

pipeline = Pipeline(steps)


In [8]:
pipeline.fit(train[train_cols], train[target])

In [9]:
train.head(1)

Unnamed: 0,type,sector,net_usable_area,net_area,n_rooms,n_bathroom,latitude,longitude,price
0,departamento,vitacura,140.0,170.0,4.0,4.0,-33.40123,-70.58056,11900


In [10]:
from joblib import dump
dump(pipeline, "./app/ml/model/model.joblib")

['./app/ml/model/model.joblib']

In [11]:
test[train_cols]

Unnamed: 0,type,sector,net_usable_area,net_area,n_rooms,n_bathroom,latitude,longitude
0,casa,vitacura,152.0,257.0,3.0,3.0,-33.37940,-70.54470
1,departamento,las condes,140.0,165.0,4.0,4.0,-33.41135,-70.56977
2,departamento,la reina,101.0,101.0,4.0,3.0,-33.44154,-70.55704
3,departamento,providencia,80.0,112.0,1.0,2.0,-33.42486,-70.60868
4,departamento,vitacura,200.0,200.0,3.0,4.0,-33.40490,-70.59450
...,...,...,...,...,...,...,...,...
6944,departamento,nunoa,45.0,57.0,1.0,1.0,-33.44861,-70.61880
6945,departamento,providencia,66.0,78.0,2.0,2.0,-33.43054,-70.59725
6946,departamento,las condes,58.0,58.0,1.0,1.0,-33.40033,-70.50269
6947,departamento,las condes,135.0,135.0,4.0,4.0,-33.42368,-70.58209


In [11]:
test_predictions = pipeline.predict(test[train_cols])
test_target = test[target].values

In [7]:
type(test_predictions), type(test_target)

(numpy.ndarray, numpy.ndarray)

In [8]:
def print_metrics(predictions, target):
    print("RMSE: ", np.sqrt(mean_squared_error(predictions, target)))
    print("MAPE: ", mean_absolute_percentage_error(predictions, target))
    print("MAE : ", mean_absolute_error(predictions, target))

In [9]:
print_metrics(test_predictions, test_target)

RMSE:  10254.155686652393
MAPE:  0.40042979298798137
MAE :  5859.374796053153
