In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Reading data

In [2]:
train_valid = pd.read_csv('Data/train.csv')
test = pd.read_csv('Data/test.csv')

In [3]:
train_valid

Unnamed: 0,price,bedrooms,bathrooms,living,lot,floors,waterfront,view,condition,grade,above,basement,yr_built,yr_renovated,zipcode,lat,long,living15,lot15
0,325000,-0.406924,-1.020294,-0.326556,-0.048584,-0.808562,0,-0.305767,0.907771,1.142623,-0.010175,-0.658642,0.408326,-0.210133,-0.671691,-1.393225,0.439485,1.113865,0.011946
1,257000,-1.508293,-1.020294,-1.175827,-0.275405,-0.808562,0,-0.305767,-0.629146,-1.409591,-1.193642,-0.206737,-1.430099,-0.210133,0.748744,-0.058073,-0.540447,-1.045531,-0.284530
2,293000,-0.406924,0.340518,-0.097907,-0.181804,1.003409,0,-0.305767,-0.629146,-0.558853,0.243426,-0.658642,0.646641,-0.210133,-1.026799,-1.943163,-0.973605,-0.126329,-0.205714
3,225000,-0.406924,-1.020294,-0.794744,-0.128303,-0.808562,0,-0.305767,0.907771,-0.558853,-0.529451,-0.658642,-0.102347,-0.210133,-1.045489,-2.612183,1.497528,-0.432730,-0.171726
4,479000,-1.508293,0.340518,-0.369020,-0.329991,1.003409,0,-0.305767,-0.629146,0.291885,-0.413520,0.007918,1.225404,-0.210133,-0.821210,1.041082,0.034731,0.150891,-0.084779
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17284,378000,-0.406924,-1.020294,-1.175827,-0.197810,-0.808562,0,-0.305767,-0.629146,-0.558853,-0.952118,-0.658642,-0.817291,-0.210133,0.879573,1.113974,-0.746375,-1.439475,-0.213222
17285,3567000,1.795815,3.062142,3.016084,-0.109207,1.003409,1,4.914012,-0.629146,1.993361,2.115236,2.301337,1.225404,-0.210133,-1.307148,0.247208,0.737725,2.164382,0.201478
17286,575000,-0.406924,0.340518,0.043638,-0.249331,1.003409,0,-0.305767,-0.629146,-0.558853,0.400416,-0.658642,1.123270,-0.210133,-0.466101,0.872925,1.291600,-0.432730,-0.370597
17287,245000,-2.609662,-2.381106,-1.850888,-0.002593,-0.808562,0,-0.305767,-0.629146,-2.260328,-1.700842,-0.658642,-0.272572,-0.210133,1.683240,-0.570482,-0.774778,-1.191436,0.081716


# Train/valid splitting

In [4]:
from sklearn.model_selection import train_test_split
train, valid = train_test_split(train_valid, test_size=0.2, random_state=42)

X_train = train.drop(columns=['price'])
y_train = train['price']
X_valid = valid.drop(columns=['price'])
y_valid = valid['price']

# Define objective function

In [5]:
from sklearn.svm import SVR, LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor

In [6]:
def objective(model, loss_func):
    model.fit(X_train, y_train)
    predict = model.predict(X_valid)
    return loss_func(y_valid, predict)


# Baseline

In [7]:
((y_train - y_train.mean()) ** 2).mean()

134015870278.58614

# Linear Regression

In [8]:
objective(LinearRegression(), mean_squared_error)

37004420129.132484

In [9]:
objective(LinearRegression(), mean_absolute_error)

124657.91291045053

In [10]:
objective(LinearRegression(), r2_score)

0.6880428738057232

# Support Vector Machine
## Linear SVM

In [11]:
objective(LinearSVR(max_iter=10000), mean_squared_error)



389351013592.0525

### Hyperparameter finetuning

In [12]:
import optuna

In [13]:
X = train_valid.drop(columns=['price'])
y = train_valid['price']

In [14]:
def optuna_cross_validation(params, model):
    optuna_params = params
    optuna_search = optuna.integration.OptunaSearchCV(
        cv=5,
        estimator=model,
        param_distributions=optuna_params,
        scoring='neg_mean_squared_error',
        n_trials=100,
    )
    optuna_search.fit(X, y)
    return optuna_search


In [None]:
linear_svm_cv = optuna_cross_validation(
    params={
        'C': optuna.distributions.FloatDistribution(1e-10, 1e10)
    },
    model=LinearSVR(max_iter=10000)
)

In [32]:
linear_svm_cv.best_params_

{'C': 3907246685.23547}

## Non-linear SVM

In [16]:
objective(SVR(max_iter=10000, kernel='rbf'), mean_squared_error)

125372888070.63678

In [None]:
svm_cv = optuna_cross_validation(
    params={
        'C': optuna.distributions.FloatDistribution(1e-10, 1e10),
        'kernel': optuna.distributions.CategoricalDistribution(['rbf', 'poly'])
    },
    model=SVR(max_iter=10000)
)

In [31]:
svm_cv.best_params_

{'C': 4525681.402821764, 'kernel': 'rbf'}

# Decision Tree

In [18]:
objective(DecisionTreeRegressor(), mean_squared_error)

31961431009.566803

In [19]:
objective(DecisionTreeRegressor(), r2_score)

0.7314148994516922

In [20]:
dt_cv = optuna_cross_validation(
    params={
        'max_depth': optuna.distributions.IntDistribution(1,100)
    },
    model=DecisionTreeRegressor()
)

  optuna_search = optuna.integration.OptunaSearchCV(
[I 2024-05-29 20:22:51,306] A new study created in memory with name: no-name-23056ef0-dc3e-47d6-8a4a-9fb6423ab4e7
[I 2024-05-29 20:22:52,416] Trial 0 finished with value: -34273603671.289803 and parameters: {'max_depth': 67}. Best is trial 0 with value: -34273603671.289803.
[I 2024-05-29 20:22:52,654] Trial 1 finished with value: -45341431760.71805 and parameters: {'max_depth': 4}. Best is trial 0 with value: -34273603671.289803.
[I 2024-05-29 20:22:53,843] Trial 2 finished with value: -34725045332.55603 and parameters: {'max_depth': 94}. Best is trial 0 with value: -34273603671.289803.
[I 2024-05-29 20:22:55,014] Trial 3 finished with value: -35137283852.54122 and parameters: {'max_depth': 27}. Best is trial 0 with value: -34273603671.289803.
[I 2024-05-29 20:22:56,224] Trial 4 finished with value: -33621281427.29641 and parameters: {'max_depth': 98}. Best is trial 4 with value: -33621281427.29641.
[I 2024-05-29 20:22:56,608] Trial 

In [21]:
dt_cv.best_params_

{'max_depth': 9}

# Random Forest

In [22]:
objective(RandomForestRegressor(), mean_squared_error)


15972338036.309834

In [23]:
objective(RandomForestRegressor(), r2_score)

0.864842636934591

In [24]:
rf_cv = optuna_cross_validation(
    params={
        'max_depth': optuna.distributions.IntDistribution(1,100)
    },
    model=RandomForestRegressor()
)

  optuna_search = optuna.integration.OptunaSearchCV(
[I 2024-05-29 20:24:44,548] A new study created in memory with name: no-name-a1abd91a-b042-4f67-af3a-e9cca662b725
[I 2024-05-29 20:26:13,542] Trial 0 finished with value: -16380405628.536234 and parameters: {'max_depth': 74}. Best is trial 0 with value: -16380405628.536234.
[I 2024-05-29 20:27:52,727] Trial 1 finished with value: -16548576744.073456 and parameters: {'max_depth': 39}. Best is trial 0 with value: -16380405628.536234.
[I 2024-05-29 20:29:15,522] Trial 2 finished with value: -16425845075.126846 and parameters: {'max_depth': 68}. Best is trial 0 with value: -16380405628.536234.
[I 2024-05-29 20:30:42,410] Trial 3 finished with value: -16446628706.477352 and parameters: {'max_depth': 32}. Best is trial 0 with value: -16380405628.536234.
[I 2024-05-29 20:32:24,368] Trial 4 finished with value: -16276323523.41676 and parameters: {'max_depth': 34}. Best is trial 4 with value: -16276323523.41676.
[I 2024-05-29 20:34:07,920] Tr

In [30]:
rf_cv.best_params_

{'max_depth': 42}