##### From Week 1

In [2]:
import warnings
warnings.filterwarnings('ignore')

import os
from os.path import join

import pandas as pd
import numpy as np

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score
import xgboost as xgb
import xgboost as xgb
import lightgbm as lgb

import seaborn as sns
import matplotlib.pyplot as plt

# %matplotlib inline
# %config InlineBackend.figure_format = 'retina'

# Input files: the labelled training rows and the unlabelled rows to predict
train_data_path = join('./data/train.csv')
sub_data_path = join('./data/test.csv')

data = pd.read_csv(train_data_path)
sub = pd.read_csv(sub_data_path)

# Pull the target column out of the training frame
y = data.pop('price')

# Stack training and submission rows so both receive the same preprocessing;
# train_len marks the position where the submission rows begin
train_len = len(data)
data = pd.concat([data, sub])

# Keep the ids of the submission rows, and drop the id column from the features
sub_id = data.pop('id').iloc[train_len:]

# Keep only the first six characters of each date string, stored as an integer
data['date'] = data['date'].map(lambda stamp: str(stamp[:6])).astype(int)

# Apply log1p to the columns listed in skew_columns
skew_columns = ['bedrooms', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement']
for c in skew_columns:
    data[c] = np.log1p(data[c].to_numpy())

# log1p of the target, kept under its own name (y itself stays untransformed here)
y_log_transformation = np.log1p(y)

# Split the combined frame back into its training and submission parts
x = data.iloc[:train_len]
sub = data.iloc[train_len:]

print(x.shape, sub.shape, sep='\n')

(15035, 19)
(6468, 19)


##### Averaging

In [3]:
# Three boosted-tree regressors, all created with the same random_state
gboost = GradientBoostingRegressor(random_state=2023)
xgboost = xgb.XGBRegressor(random_state=2023)
lightgbm = lgb.LGBMRegressor(random_state=2023)

# Each entry pairs an estimator ('model') with the label used when reporting ('name')
models = [
    {'model': estimator, 'name': label}
    for estimator, label in (
        (gboost, 'GradientBoosting'),
        (xgboost, 'XGBoost'),
        (lightgbm, 'LightGBM'),
    )
]

##### Cross Validation Function

In [4]:
def get_cv_score(models):
    """Print the mean 5-fold cross-validation score of each model.

    Parameters
    ----------
    models : list of dict
        Each entry holds an estimator under 'model' and a display label
        under 'name'.

    Notes
    -----
    Reads the notebook-level ``x`` (feature DataFrame) and ``y`` (target).
    With no ``scoring`` argument, cross_val_score uses each estimator's own
    default scorer; the per-fold scores are averaged before printing.
    """
    # Hand the splitter itself to cross_val_score. The previous
    # `KFold(n_splits=5).get_n_splits(x.values)` evaluates to the integer 5,
    # which throws the configured splitter away and leaves the choice of
    # splitting strategy to cross_val_score.
    kfold = KFold(n_splits=5)
    features = x.values
    for m in models:
        CV_score = np.mean(cross_val_score(m['model'], X=features, y=y, cv=kfold))
        print(f"Model: {m['name']}, CV score:{CV_score:.4f}")

In [5]:
# Print the mean cross-validation score of every entry in `models`
get_cv_score(models)

Model: GradientBoosting, CV score:0.8609
Model: XGBoost, CV score:0.8861
Model: LightGBM, CV score:0.8819


##### Ensemble (Averaging Blending)

In [24]:
def AveragingBlending(models, x, y, sub_x):
    """Fit every model on (x, y) and return the row-wise mean of their predictions.

    Parameters
    ----------
    models : list of dict
        Each entry exposes an estimator under the 'model' key; the estimator
        must provide ``fit`` and ``predict``. Estimators are fitted in place.
    x : pandas.DataFrame
        Training features; passed to the estimators as ``x.values``.
    y : array-like
        Training target, handed to ``fit`` unchanged.
    sub_x : pandas.DataFrame
        Rows to predict; passed to the estimators as ``sub_x.values``.

    Returns
    -------
    numpy.ndarray
        One value per row of ``sub_x``: the mean over all models' predictions.
    """
    # Train each estimator on the NumPy view of the training frame
    for entry in models:
        entry['model'].fit(x.values, y)

    # One prediction vector per model, laid side by side as columns
    per_model = [entry['model'].predict(sub_x.values) for entry in models]
    stacked = np.column_stack(per_model)

    # Average across the model axis
    return stacked.mean(axis=1)

# print(models)
# sub.head()
# x.head()
# print(x.values)

[[ 2.01410000e+05  1.38629436e+00  1.00000000e+00 ... -1.22257000e+02
   1.34000000e+03  5.65000000e+03]
 [ 2.01502000e+05  1.09861229e+00  1.00000000e+00 ... -1.22233000e+02
   2.72000000e+03  8.06200000e+03]
 [ 2.01502000e+05  1.38629436e+00  2.00000000e+00 ... -1.22045000e+02
   1.80000000e+03  7.50300000e+03]
 ...
 [ 2.01405000e+05  1.38629436e+00  2.50000000e+00 ... -1.22346000e+02
   1.53000000e+03  1.50900000e+03]
 [ 2.01502000e+05  1.60943791e+00  2.50000000e+00 ... -1.22362000e+02
   1.83000000e+03  7.20000000e+03]
 [ 2.01410000e+05  1.09861229e+00  7.50000000e-01 ... -1.22299000e+02
   1.02000000e+03  1.35700000e+03]]


In [17]:
# Fit every model on (x, y) and average their predictions for the submission rows
y_pred = AveragingBlending(models, x, y, sub)
# Number of predictions produced (one per row of `sub`)
print(len(y_pred))
y_pred

6468


array([ 529966.66304912,  430726.21272617, 1361676.91242777, ...,
        452081.69137012,  341572.97685942,  421725.1231835 ])

In [25]:
# Pair each submission id with its blended price prediction
result = pd.DataFrame({
    'id' : sub_id,
    'price' : y_pred
})

result.head()

Unnamed: 0,id,price
0,15035,529966.7
1,15036,430726.2
2,15037,1361677.0
3,15038,333803.6
4,15039,308900.6


In [26]:
# Destination of the submission file
submission_path = './data/submission.csv'

# Write only the 'id' and 'price' columns; the DataFrame index is left out
result.to_csv(submission_path, index=False)

##### Better Model

In [28]:
train = x
test = sub

# From here on the target is log1p(price), so predictions must be mapped back
# with expm1.
# Reuse the transform computed once during data preparation rather than
# `y = np.log1p(y)`: that form feeds y back into itself, so re-running this
# cell would apply the log a second time.
y = y_log_transformation

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [31]:
def rmse(y_test, y_pred):
    """Root-mean-squared error computed after applying expm1 to both inputs.

    Both arguments are passed through ``np.expm1`` before the error is
    measured, so they are expected to be on the log1p scale.

    Parameters
    ----------
    y_test : array-like
        Reference target values.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    float
        Square root of the mean squared error between the two
        expm1-transformed inputs.
    """
    actual = np.expm1(y_test)
    predicted = np.expm1(y_pred)
    mse = mean_squared_error(actual, predicted)
    return np.sqrt(mse)