In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Reading data

In [2]:
train_valid = pd.read_csv('datadata/train.csv')
test = pd.read_csv('datadata/test.csv')

In [3]:
train_valid

Unnamed: 0,price,bedrooms,bathrooms,living,lot,floors,waterfront,view,grade,above,basement,yr_built,yr_renovated,zipcode,lat,living15,lot15
0,325000,3,1,165.366035,1216.555184,1,0,0,9,165.366035,0.000000,1983,0,98042,47.3670,255.481234,1216.555184
1,257000,2,1,92.902267,343.738387,1,0,0,6,74.321813,18.580453,1929,0,98118,47.5520,117.985879,464.511334
2,228500,3,1,100.334448,695.466369,1,0,0,6,91.973244,8.361204,1942,0,98146,47.4838,108.695652,724.637681
3,288000,3,2,194.165738,696.767001,1,0,0,7,118.914902,75.250836,1977,0,98031,47.3951,167.224080,682.831661
4,479000,2,2,161.742847,133.686362,2,0,0,8,134.336678,27.406169,2007,0,98034,47.7043,194.165738,971.200297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17285,378000,3,1,92.902267,642.326273,1,0,0,7,92.902267,0.000000,1947,0,98125,47.7144,92.902267,645.392048
17286,399950,3,2,286.789298,464.697139,2,0,0,8,286.789298,0.000000,2014,0,98023,47.2974,271.924935,481.512449
17287,575000,3,2,196.952806,444.072835,2,0,0,7,196.952806,0.000000,2004,0,98053,47.6810,157.004831,246.191007
17288,245000,1,0,35.302861,1393.534002,1,0,0,5,35.302861,0.000000,1963,0,98168,47.4810,108.695652,1393.534002


# Train/valid splitting

In [4]:
from sklearn.model_selection import train_test_split
train, valid = train_test_split(train_valid, test_size=0.2, random_state=42)

X_train = train.drop(columns=['price'])
y_train = train['price']
X_valid = valid.drop(columns=['price'])
y_valid = valid['price']

# Define objective function

In [5]:
from sklearn.svm import SVR, LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor

In [6]:
def objective(model, loss_func):
    model.fit(X_train, y_train)
    predict = model.predict(X_valid)
    return loss_func(y_valid, predict)


# Baseline: Linear Regression

In [7]:
objective(LinearRegression(), mean_squared_error)

35895389415.54213

In [8]:
objective(LinearRegression(), mean_absolute_error)

124126.53695622204

In [9]:
objective(LinearRegression(), r2_score)

0.6983886029832058

# Support Vector Machine
## Linear SVM

In [10]:
objective(LinearSVR(max_iter=10000), mean_squared_error)



73690365616.30244

### Hyperparameter finetuning

In [11]:
import optuna

In [12]:
def optuna_linear_svm(trial):
    svr_c = trial.suggest_float('svr_c', 1e-10, 1e10, log=True)
    regressor_obj = LinearSVR(C=svr_c, max_iter=10000)
    regressor_obj.fit(X_train, y_train)
    pred = regressor_obj.predict(X_valid)
    error = mean_squared_error(y_valid, pred)
    return error

In [13]:
study = optuna.create_study(direction='minimize')
study.optimize(optuna_linear_svm, n_trials=100)

[I 2024-05-28 13:29:09,233] A new study created in memory with name: no-name-8af08fcc-f2cb-4068-8eee-75bb165a2247
[I 2024-05-28 13:29:29,858] Trial 0 finished with value: 205003483804.67062 and parameters: {'svr_c': 1752477.0402667248}. Best is trial 0 with value: 205003483804.67062.
[I 2024-05-28 13:29:29,880] Trial 1 finished with value: 371794454367.4659 and parameters: {'svr_c': 2.548225800123644e-10}. Best is trial 0 with value: 205003483804.67062.
[I 2024-05-28 13:29:29,903] Trial 2 finished with value: 261326636145.30807 and parameters: {'svr_c': 1.2269810701253667e-09}. Best is trial 0 with value: 205003483804.67062.
[I 2024-05-28 13:29:42,308] Trial 3 finished with value: 80161001501.04617 and parameters: {'svr_c': 69.0428100829682}. Best is trial 3 with value: 80161001501.04617.
[I 2024-05-28 13:29:44,561] Trial 4 finished with value: 65903841927.220955 and parameters: {'svr_c': 0.010917323679713974}. Best is trial 4 with value: 65903841927.220955.
[I 2024-05-28 13:29:56,060]

In [40]:
param = study.best_trial.params['svr_c']
result = study.best_trial.value
print(f'The best result is {result} with hyperparameter C={param}')

The best result is 55467035031.50063 with hyperparameter C=107575.1248333997


## Non-linear SVM

In [14]:
objective(SVR(max_iter=10000, kernel='rbf'), mean_squared_error)

126529541859.09338

In [15]:
def optuna_svm(trial):
    svr_c = trial.suggest_float('svr_c', 1e-10, 1e10, log=True)
    svr_kernel = trial.suggest_categorical('svr_kernel', ['rbf', 'poly'])
    regressor_obj = SVR(C=svr_c,kernel=svr_kernel,max_iter=10000)
    regressor_obj.fit(X_train, y_train)
    pred = regressor_obj.predict(X_valid)
    error = mean_squared_error(y_valid, pred)
    return error

In [16]:
svm_study = optuna.create_study(direction='minimize')
svm_study.optimize(optuna_svm, n_trials=100)

[I 2024-05-28 13:45:51,104] A new study created in memory with name: no-name-08f2706c-fa0f-46af-880f-979389c65bc2
[I 2024-05-28 13:45:58,497] Trial 0 finished with value: 1319623576811.454 and parameters: {'svr_c': 2350989821.370526, 'svr_kernel': 'poly'}. Best is trial 0 with value: 1319623576811.454.
[I 2024-05-28 13:46:16,499] Trial 1 finished with value: 126529941008.08618 and parameters: {'svr_c': 5.751247774463919e-07, 'svr_kernel': 'rbf'}. Best is trial 1 with value: 126529941008.08618.
[I 2024-05-28 13:46:26,472] Trial 2 finished with value: 86792549737.49461 and parameters: {'svr_c': 187704362.4924578, 'svr_kernel': 'poly'}. Best is trial 2 with value: 86792549737.49461.
[I 2024-05-28 13:46:44,266] Trial 3 finished with value: 126529941008.07092 and parameters: {'svr_c': 2.066987156290725e-10, 'svr_kernel': 'rbf'}. Best is trial 2 with value: 86792549737.49461.
[I 2024-05-28 13:46:54,629] Trial 4 finished with value: 126529886735.21855 and parameters: {'svr_c': 0.0874655409950

In [42]:
param = svm_study.best_trial.params['svr_c']
kernel = svm_study.best_trial.params['svr_kernel']
result = svm_study.best_trial.value
print(f'The best result is {result} with hyperparameter C={param}, kernel={kernel}')

The best result is 62258517614.79775 with hyperparameter C=91482361.39960833, kernel=poly


# Decision Tree

In [17]:
objective(DecisionTreeRegressor(), mean_squared_error)

38281533358.379196

In [18]:
objective(DecisionTreeRegressor(), r2_score)

0.6887397896070498

In [19]:
def optuna_dt(trial):
    depth = trial.suggest_int('dt_max_depth', 1, 100,)
    regressor_obj = DecisionTreeRegressor(max_depth=depth)
    regressor_obj.fit(X_train, y_train)
    pred = regressor_obj.predict(X_valid)
    error = mean_squared_error(y_valid, pred)
    return error

In [20]:
dt_study = optuna.create_study(direction='minimize')
dt_study.optimize(optuna_dt, n_trials=100)

[I 2024-05-28 14:04:28,291] A new study created in memory with name: no-name-4d3c97fe-ee22-4e1e-ae4b-c1b30716e0c3
[I 2024-05-28 14:04:28,511] Trial 0 finished with value: 40664897779.759254 and parameters: {'dt_max_depth': 41}. Best is trial 0 with value: 40664897779.759254.
[I 2024-05-28 14:04:28,747] Trial 1 finished with value: 37938322938.75499 and parameters: {'dt_max_depth': 66}. Best is trial 1 with value: 37938322938.75499.
[I 2024-05-28 14:04:28,967] Trial 2 finished with value: 38220659185.90638 and parameters: {'dt_max_depth': 19}. Best is trial 1 with value: 37938322938.75499.
[I 2024-05-28 14:04:29,183] Trial 3 finished with value: 38951084533.69614 and parameters: {'dt_max_depth': 83}. Best is trial 1 with value: 37938322938.75499.
[I 2024-05-28 14:04:29,404] Trial 4 finished with value: 37937994418.718475 and parameters: {'dt_max_depth': 66}. Best is trial 4 with value: 37937994418.718475.
[I 2024-05-28 14:04:29,602] Trial 5 finished with value: 40737761007.11191 and par

In [44]:
param = dt_study.best_trial.params['dt_max_depth']
result = dt_study.best_trial.value
print(f'The best result is {result} with hyperparameter max_depth={param}')

The best result is 29312528523.130486 with hyperparameter max_depth=8


# Random Forest

In [32]:
objective(RandomForestRegressor(), mean_squared_error)

17067579367.36958

In [33]:
objective(RandomForestRegressor(), r2_score)

0.8558476898360446

In [30]:
def optuna_rf(trial):
    depth = trial.suggest_int('rf_max_depth', 1, 100,)
    # n_trees = trial.suggest_int('rf_n_trees', 10, 100)
    regressor_obj = RandomForestRegressor(n_estimators=100, max_depth=depth)
    regressor_obj.fit(X_train, y_train)
    pred = regressor_obj.predict(X_valid)
    error = mean_squared_error(y_valid, pred)
    return error

In [31]:
rf_study = optuna.create_study(direction='minimize')
rf_study.optimize(optuna_rf, n_trials=100)

[I 2024-05-28 16:11:32,223] A new study created in memory with name: no-name-301b72f8-7616-404d-b09f-9cab270594cf
[I 2024-05-28 16:11:51,615] Trial 0 finished with value: 16869934308.35958 and parameters: {'rf_max_depth': 71}. Best is trial 0 with value: 16869934308.35958.
[I 2024-05-28 16:12:10,076] Trial 1 finished with value: 17422036317.265728 and parameters: {'rf_max_depth': 19}. Best is trial 0 with value: 16869934308.35958.
[I 2024-05-28 16:12:11,287] Trial 2 finished with value: 68875722148.00703 and parameters: {'rf_max_depth': 1}. Best is trial 0 with value: 16869934308.35958.
[I 2024-05-28 16:12:19,920] Trial 3 finished with value: 20399191329.937305 and parameters: {'rf_max_depth': 9}. Best is trial 0 with value: 16869934308.35958.
[I 2024-05-28 16:12:37,676] Trial 4 finished with value: 17158196162.858395 and parameters: {'rf_max_depth': 27}. Best is trial 0 with value: 16869934308.35958.
[I 2024-05-28 16:12:54,941] Trial 5 finished with value: 16852685314.318909 and param

In [45]:
param = rf_study.best_trial.params['rf_max_depth']
result = rf_study.best_trial.value
print(f'The best result is {result} with hyperparameter max_depth={param}')

The best result is 16389835959.229929 with hyperparameter max_depth=100


In [24]:

# X_test = test.drop(columns=['price'])
# y_test = test['price']

In [25]:
# X = train_valid.drop(columns='price')
# y = train_valid['price']

In [26]:
# study.best_trial.params

In [27]:
# def objective_test(model, loss_func):
#     model.fit(X, y)
#     predict = model.predict(X_test)
#     return loss_func(y_test, predict)

In [28]:
# res = objective_test(DecisionTreeRegressor(max_depth=11), mean_squared_error)
# res