In [34]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split,KFold,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error,mean_squared_error,mean_absolute_percentage_error

from xgboost import XGBRFRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor,GradientBoostingRegressor,RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

In [2]:
df = pd.read_csv('../data/housing_price.csv')
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [5]:
X = df.drop(columns='price')
y = df.price

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
num_cols = Pipeline([
    ('scl',StandardScaler())
])

cat_cols = Pipeline([
    ('enc',OneHotEncoder())
])

prep = ColumnTransformer([
    ('num',num_cols,['area','bedrooms','stories','parking']),
    ('cat',cat_cols,['mainroad','guestroom','basement','hotwaterheating','airconditioning','prefarea','furnishingstatus'])
])

In [8]:
models = [XGBRFRegressor(),LinearRegression(),AdaBoostRegressor(),GradientBoostingRegressor(),RandomForestRegressor(),
          SVR(), KNeighborsRegressor()]

In [36]:
MAE = []
MSE =[]
MAPE = []
MODEL_NAME = []

for model in models:
    pipeline = Pipeline([
        ('prep',prep),
        ('model',model)
    ])
    pipeline.fit(X_train,y_train)
    y_pred = pipeline.predict(X_test)
    mae = mean_absolute_error(y_test,y_pred)
    mse = mean_squared_error(y_test,y_pred)
    mape = mean_absolute_percentage_error(y_test,y_pred)
    MAE.append(mae)
    MODEL_NAME.append(model.__class__.__name__)
    MSE.append(mse)
    MAPE.append(mape)

pd.DataFrame({
    'Model Name': MODEL_NAME,
    'MAE': MAE,
    'MSE':MSE,
    'MAPE':MAPE,
})

Unnamed: 0,Model Name,MAE,MSE,MAPE
0,XGBRFRegressor,1106221.0,2319859000000.0,0.237069
1,LinearRegression,967460.6,1884282000000.0,0.205813
2,AdaBoostRegressor,1245253.0,2602387000000.0,0.284192
3,GradientBoostingRegressor,980154.2,1970014000000.0,0.208505
4,RandomForestRegressor,1025096.0,2146519000000.0,0.217444
5,SVR,1763888.0,5567935000000.0,0.367129
6,KNeighborsRegressor,1050246.0,2501423000000.0,0.215623


Berdasarkan hasil seleksi model di atas saya memilih LinearRegression dan GradientBoosting karena menghasilkan error yang lebih kecl dari yang lain. Dengan metric penilaian MSE (mean squared error)