In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Reading data

In [2]:
train_valid = pd.read_csv('Data/train.csv')
test = pd.read_csv('Data/test.csv')

In [3]:
train_valid

Unnamed: 0,price,bedrooms,bathrooms,living,lot,floors,waterfront,view,condition,grade,above,basement,yr_built,yr_renovated,zipcode,lat,long,living15,lot15
0,325000,-0.406924,-1.020294,-0.326556,-0.048584,-0.808562,0,-0.305767,0.907771,1.142623,-0.010175,-0.658642,0.408326,-0.210133,-0.671691,-1.393225,0.439485,1.113865,0.011946
1,257000,-1.508293,-1.020294,-1.175827,-0.275405,-0.808562,0,-0.305767,-0.629146,-1.409591,-1.193642,-0.206737,-1.430099,-0.210133,0.748744,-0.058073,-0.540447,-1.045531,-0.284530
2,293000,-0.406924,0.340518,-0.097907,-0.181804,1.003409,0,-0.305767,-0.629146,-0.558853,0.243426,-0.658642,0.646641,-0.210133,-1.026799,-1.943163,-0.973605,-0.126329,-0.205714
3,225000,-0.406924,-1.020294,-0.794744,-0.128303,-0.808562,0,-0.305767,0.907771,-0.558853,-0.529451,-0.658642,-0.102347,-0.210133,-1.045489,-2.612183,1.497528,-0.432730,-0.171726
4,479000,-1.508293,0.340518,-0.369020,-0.329991,1.003409,0,-0.305767,-0.629146,0.291885,-0.413520,0.007918,1.225404,-0.210133,-0.821210,1.041082,0.034731,0.150891,-0.084779
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17284,378000,-0.406924,-1.020294,-1.175827,-0.197810,-0.808562,0,-0.305767,-0.629146,-0.558853,-0.952118,-0.658642,-0.817291,-0.210133,0.879573,1.113974,-0.746375,-1.439475,-0.213222
17285,3567000,1.795815,3.062142,3.016084,-0.109207,1.003409,1,4.914012,-0.629146,1.993361,2.115236,2.301337,1.225404,-0.210133,-1.307148,0.247208,0.737725,2.164382,0.201478
17286,575000,-0.406924,0.340518,0.043638,-0.249331,1.003409,0,-0.305767,-0.629146,-0.558853,0.400416,-0.658642,1.123270,-0.210133,-0.466101,0.872925,1.291600,-0.432730,-0.370597
17287,245000,-2.609662,-2.381106,-1.850888,-0.002593,-0.808562,0,-0.305767,-0.629146,-2.260328,-1.700842,-0.658642,-0.272572,-0.210133,1.683240,-0.570482,-0.774778,-1.191436,0.081716


# Train/valid splitting

In [4]:
from sklearn.model_selection import train_test_split
train, valid = train_test_split(train_valid, test_size=0.2, random_state=42)

X_train = train.drop(columns=['price'])
y_train = train['price']
X_valid = valid.drop(columns=['price'])
y_valid = valid['price']

# Define objective function

In [5]:
from sklearn.svm import SVR, LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor

In [6]:
def objective(model, loss_func):
    model.fit(X_train, y_train)
    predict = model.predict(X_valid)
    return loss_func(y_valid, predict)


# Baseline

In [41]:
((y_train - y_train.mean()) ** 2).sum()

1853573501823124.8

# Linear Regression

In [7]:
objective(LinearRegression(), mean_squared_error)

37004420129.132484

In [8]:
objective(LinearRegression(), mean_absolute_error)

124657.91291045053

In [9]:
objective(LinearRegression(), r2_score)

0.6880428738057232

# Support Vector Machine
## Linear SVM

In [10]:
objective(LinearSVR(max_iter=10000), mean_squared_error)



389351013592.0525

### Hyperparameter finetuning

In [11]:
import optuna

In [12]:
X = train_valid.drop(columns=['price'])
y = train_valid['price']

In [29]:
def optuna_cross_validation(params, model):
    optuna_params = params
    optuna_search = optuna.integration.OptunaSearchCV(
        cv=5,
        estimator=model,
        param_distributions=optuna_params,
        scoring='neg_mean_squared_error',
        n_trials=50
    )
    optuna_search.fit(X, y)
    return optuna_search


In [14]:
linear_svm_cv = optuna_cross_validation(
    params={
        'C': optuna.distributions.FloatDistribution(1e-10, 1e10)
    },
    model=LinearSVR(max_iter=10000)
)

  optuna_search = optuna.integration.OptunaSearchCV(
[I 2024-05-29 01:37:58,133] A new study created in memory with name: no-name-c377d645-582c-4d9b-875e-ee8aec58eaea
[I 2024-05-29 01:39:58,831] Trial 0 finished with value: -64156290475.730606 and parameters: {'C': 7939165714.421484}. Best is trial 0 with value: -64156290475.730606.
[I 2024-05-29 01:42:23,187] Trial 1 finished with value: -57116633349.9692 and parameters: {'C': 172426666.70082563}. Best is trial 1 with value: -57116633349.9692.
[I 2024-05-29 01:45:59,932] Trial 2 finished with value: -70581296795.01968 and parameters: {'C': 6358680348.148176}. Best is trial 1 with value: -57116633349.9692.
[I 2024-05-29 01:49:38,469] Trial 3 finished with value: -57841761626.00139 and parameters: {'C': 4108512434.035767}. Best is trial 1 with value: -57116633349.9692.
[I 2024-05-29 01:52:35,916] Trial 4 finished with value: -45394819757.512985 and parameters: {'C': 12498166.610557249}. Best is trial 4 with value: -45394819757.512985.
[

## Non-linear SVM

In [15]:
objective(SVR(max_iter=10000, kernel='rbf'), mean_squared_error)

125372888070.63678

In [16]:
svm_cv = optuna_cross_validation(
    params={
        'C': optuna.distributions.FloatDistribution(1e-10, 1e10),
        'kernel': optuna.distributions.CategoricalDistribution(['rbf', 'poly'])
    },
    model=SVR(max_iter=10000)
)

  optuna_search = optuna.integration.OptunaSearchCV(
[I 2024-05-29 03:17:20,929] A new study created in memory with name: no-name-d7004872-a37d-4eac-bcd5-ee4e20caadf8
[I 2024-05-29 03:17:37,388] Trial 0 finished with value: -3016361597044.65 and parameters: {'C': 4011175662.5568953, 'kernel': 'poly'}. Best is trial 0 with value: -3016361597044.65.
[I 2024-05-29 03:17:54,061] Trial 1 finished with value: -2319265947495.5737 and parameters: {'C': 9796930539.54014, 'kernel': 'poly'}. Best is trial 1 with value: -2319265947495.5737.
[I 2024-05-29 03:18:30,301] Trial 2 finished with value: -76123687489.83902 and parameters: {'C': 6181493541.406706, 'kernel': 'rbf'}. Best is trial 2 with value: -76123687489.83902.
[I 2024-05-29 03:19:06,881] Trial 3 finished with value: -92040254201.45938 and parameters: {'C': 8159869439.824643, 'kernel': 'rbf'}. Best is trial 2 with value: -76123687489.83902.
[I 2024-05-29 03:19:43,969] Trial 4 finished with value: -86359918142.43518 and parameters: {'C': 6

# Decision Tree

In [17]:
objective(DecisionTreeRegressor(), mean_squared_error)

32800727836.246964

In [18]:
objective(DecisionTreeRegressor(), r2_score)

0.732802812253309

In [32]:
dt_cv = optuna_cross_validation(
    params={
        'max_depth': optuna.distributions.IntDistribution(1,100)
    },
    model=DecisionTreeRegressor()
)

  optuna_search = optuna.integration.OptunaSearchCV(
[I 2024-05-29 05:00:18,076] A new study created in memory with name: no-name-dec01730-5171-496e-b58b-c0152505b255
[I 2024-05-29 05:00:19,348] Trial 0 finished with value: -35070114283.90737 and parameters: {'max_depth': 100}. Best is trial 0 with value: -35070114283.90737.
[I 2024-05-29 05:00:20,546] Trial 1 finished with value: -34976520768.37325 and parameters: {'max_depth': 27}. Best is trial 1 with value: -34976520768.37325.
[I 2024-05-29 05:00:21,661] Trial 2 finished with value: -34450240372.15851 and parameters: {'max_depth': 35}. Best is trial 2 with value: -34450240372.15851.
[I 2024-05-29 05:00:22,816] Trial 3 finished with value: -35456471317.17578 and parameters: {'max_depth': 87}. Best is trial 2 with value: -34450240372.15851.
[I 2024-05-29 05:00:23,784] Trial 4 finished with value: -32587662718.177532 and parameters: {'max_depth': 17}. Best is trial 4 with value: -32587662718.177532.
[I 2024-05-29 05:00:24,973] Trial 5

In [36]:
dt_cv.best_params_

{'max_depth': 9}

# Random Forest

In [19]:
objective(RandomForestRegressor(), mean_squared_error)

16195108850.685543

In [20]:
objective(RandomForestRegressor(), r2_score)

0.8630392962004799

In [37]:
rf_cv = optuna_cross_validation(
    params={
        'max_depth': optuna.distributions.IntDistribution(1,100)
    },
    model=RandomForestRegressor()
)

  optuna_search = optuna.integration.OptunaSearchCV(
[I 2024-05-29 05:12:09,063] A new study created in memory with name: no-name-0f1bdacd-3efb-48f0-8885-b9f54ba72c3c
[I 2024-05-29 05:13:58,174] Trial 0 finished with value: -16186698851.45846 and parameters: {'max_depth': 59}. Best is trial 0 with value: -16186698851.45846.
[I 2024-05-29 05:14:36,466] Trial 1 finished with value: -20427217093.467968 and parameters: {'max_depth': 8}. Best is trial 0 with value: -16186698851.45846.
[I 2024-05-29 05:16:15,336] Trial 2 finished with value: -16511014966.250559 and parameters: {'max_depth': 33}. Best is trial 0 with value: -16186698851.45846.
[I 2024-05-29 05:17:56,604] Trial 3 finished with value: -16497309424.417526 and parameters: {'max_depth': 76}. Best is trial 0 with value: -16186698851.45846.
[I 2024-05-29 05:19:39,391] Trial 4 finished with value: -16403368589.885925 and parameters: {'max_depth': 43}. Best is trial 0 with value: -16186698851.45846.
[I 2024-05-29 05:21:15,424] Trial 5

# Testing

In [21]:

# X_test = test.drop(columns=['price'])
# y_test = test['price']

In [22]:
# X = train_valid.drop(columns='price')
# y = train_valid['price']

In [23]:
# study.best_trial.params

In [24]:
# def objective_test(model, loss_func):
#     model.fit(X, y)
#     predict = model.predict(X_test)
#     return loss_func(y_test, predict)

In [25]:
# res = objective_test(DecisionTreeRegressor(max_depth=11), mean_squared_error)
# res