In [1]:
import pandas as pd

# Load the interim houses CSV; the path is relative to the notebook's working directory.
csv_path = '../data/interim/immo_houses.csv'
df = pd.read_csv(csv_path)

In [2]:
# Sanity check: number of rows that exactly repeat an earlier row across all columns.
df.duplicated().sum()

0

In [3]:
# List the columns available in the raw frame before choosing features and target.
df.columns

Index(['Price', 'Postal Code', 'Facades', 'Habitable Surface', 'Land Surface',
       'Type', 'Subtype', 'Bedroom Count', 'Bathroom Count', 'Toilet Count',
       'Room Count', 'Kitchen Type', 'Furnished', 'Terrace', 'Terrace Surface',
       'Garden Exists', 'State of Building', 'Living Surface', 'EPC',
       'Consumption Per m2', 'Heating Type', 'Province'],
      dtype='object')

In [4]:
# Columns kept out of the model inputs: the target ('Price') plus 'Postal Code' and 'Type'.
# NOTE(review): presumably 'Province' already carries the location signal and 'Type' is
# constant in a houses-only file — confirm before relying on that.
excluded_cols = ['Postal Code', 'Price', 'Type']
features = df.drop(columns=excluded_cols)
features

Unnamed: 0,Facades,Habitable Surface,Land Surface,Subtype,Bedroom Count,Bathroom Count,Toilet Count,Room Count,Kitchen Type,Furnished,Terrace,Terrace Surface,Garden Exists,State of Building,Living Surface,EPC,Consumption Per m2,Heating Type,Province
0,2.0,123.0,175.0,HOUSE,3.0,1.0,2.0,6.0,USA_HYPER_EQUIPPED,0,1,,0,AS_NEW,30.0,,,,OOST-VLAANDEREN
1,2.0,123.0,191.0,HOUSE,3.0,1.0,2.0,6.0,USA_HYPER_EQUIPPED,0,1,,0,AS_NEW,33.0,,,,OOST-VLAANDEREN
2,3.0,123.0,168.0,VILLA,3.0,1.0,2.0,6.0,USA_HYPER_EQUIPPED,0,1,,0,AS_NEW,31.0,,,GAS,OOST-VLAANDEREN
3,3.0,123.0,234.0,VILLA,3.0,1.0,2.0,6.0,USA_HYPER_EQUIPPED,0,1,,0,AS_NEW,31.0,,,,OOST-VLAANDEREN
4,3.0,123.0,252.0,VILLA,3.0,1.0,2.0,6.0,USA_HYPER_EQUIPPED,0,1,,0,AS_NEW,31.0,,,,OOST-VLAANDEREN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8057,2.0,149.0,289.0,HOUSE,4.0,1.0,1.0,6.0,,0,1,,0,GOOD,22.0,F,635.0,,WEST-VLAANDEREN
8058,3.0,100.0,344.0,HOUSE,3.0,1.0,,4.0,,0,0,,0,,32.0,F,836.0,GAS,ANTWERPEN
8059,4.0,168.0,153.0,HOUSE,3.0,2.0,2.0,7.0,,0,0,,0,GOOD,,D,376.0,GAS,OOST-VLAANDEREN
8060,2.0,92.0,221.0,HOUSE,2.0,1.0,,3.0,,0,0,,1,,13.0,F,434.0,FUELOIL,LUIK


In [5]:
# Regression target: the 'Price' column of the raw frame.
target = df['Price']
target

0       378500.0
1       381000.0
2       399500.0
3       410400.0
4       411900.0
          ...   
8057    229000.0
8058    220000.0
8059    399000.0
8060     90000.0
8061    349000.0
Name: Price, Length: 8062, dtype: float64

In [6]:
# Confirm that 'Postal Code', 'Price' and 'Type' are no longer among the feature columns.
features.columns

Index(['Facades', 'Habitable Surface', 'Land Surface', 'Subtype',
       'Bedroom Count', 'Bathroom Count', 'Toilet Count', 'Room Count',
       'Kitchen Type', 'Furnished', 'Terrace', 'Terrace Surface',
       'Garden Exists', 'State of Building', 'Living Surface', 'EPC',
       'Consumption Per m2', 'Heating Type', 'Province'],
      dtype='object')

In [7]:

# Feature columns that hold labels rather than numbers; they are cast to the
# pandas 'category' dtype further down so the encoder can select them by dtype.
cat_cols = [
    'Subtype',
    'Kitchen Type',
    'State of Building',
    'EPC',
    'Heating Type',
    'Province',
]
cat_cols

['Subtype',
 'Kitchen Type',
 'State of Building',
 'EPC',
 'Heating Type',
 'Province']

In [8]:
# Model inputs: a new frame in which every column listed in `cat_cols` is cast to
# 'category'; astype returns a new object, so `features` itself is left unchanged.
X = features.astype({col: 'category' for col in cat_cols})

# Independent copy of the target (Series.copy is deep by default).
y = target.copy()

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import HistGradientBoostingRegressor

# Preprocessing: one-hot encode every column whose dtype is "category" and forward
# all remaining columns unchanged. Categories not seen during fit are ignored at
# transform time (encoded as all zeros) instead of raising.
categorical_selector = make_column_selector(dtype_include="category")
dense_one_hot = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
one_hot_encoder = make_column_transformer(
    (dense_one_hot, categorical_selector),
    remainder="passthrough",
)

# Model: histogram-based gradient boosting on squared error, seeded for reproducibility.
hist_regressor = HistGradientBoostingRegressor(
    loss='squared_error',
    learning_rate=0.1,
    random_state=42,
)

# Full pipeline: encoder followed by the regressor.
hist_one_hot = make_pipeline(one_hot_encoder, hist_regressor)

In [11]:
# Display the assembled (not yet fitted) pipeline: encoder followed by regressor.
hist_one_hot

In [12]:
# Fit the encoder and the regressor on the training split only.
hist_one_hot.fit(X_train, y_train)

In [13]:
# Predictions for the held-out rows. NOTE: `y_pred` is not read by any of the
# cells visible after this one; the score below is computed from X_test directly.
y_pred = hist_one_hot.predict(X_test)

In [14]:
# Pipeline.score() delegates to the final regressor's score(), which is the
# coefficient of determination (R^2) on the test split, not a classification accuracy.
r2_hist = hist_one_hot.score(X_test, y_test)
accuracy = r2_hist  # legacy name, kept so any cell still reading `accuracy` keeps working
r2_hist

0.5430107244888819

## XGBoost

In [19]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import HistGradientBoostingRegressor
from xgboost import XGBRegressor

# Same preprocessing as the histogram gradient-boosting pipeline: one-hot encode the
# "category" columns and pass every other column through unchanged.
one_hot_encoder = make_column_transformer(
    (
        OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
        make_column_selector(dtype_include="category"),
    ),
    remainder="passthrough",
)

# XGBoost regressor with the DART booster. random_state is pinned to the same seed
# as the train/test split and the HistGradientBoostingRegressor so the run stays
# reproducible if sampling or dropout parameters are tuned later.
# NOTE(review): with the default sampling/dropout settings the seed should not change
# the fitted model — re-run and confirm the recorded score is unchanged.
xgb_one_hot = make_pipeline(
    one_hot_encoder,
    XGBRegressor(
        learning_rate=0.1,
        n_estimators=150,
        booster='dart',
        random_state=42,
    ),
)

# Backward-compatible alias: the following cells still refer to `hist_one_hot`.
# Be aware that this rebinds the name, so the previously fitted
# HistGradientBoosting pipeline is no longer reachable through it.
hist_one_hot = xgb_one_hot

In [20]:
# Display the pipeline currently bound to `hist_one_hot`; after the previous cell
# this is the XGBoost pipeline, despite the name.
hist_one_hot

In [21]:
# Fit the encoder and the XGBoost regressor on the training split only.
hist_one_hot.fit(X_train, y_train)

In [22]:
# `hist_one_hot` refers to the XGBoost pipeline at this point (the name was rebound
# when that pipeline was built). score() on a regressor returns R^2 on the test
# split, not a classification accuracy.
r2_xgb = hist_one_hot.score(X_test, y_test)
accuracy = r2_xgb  # legacy name, kept so any cell still reading `accuracy` keeps working
r2_xgb

0.6782984123789675