##### Code From Week 1

In [2]:
import warnings
warnings.filterwarnings('ignore')

import os
from os.path import join

import pandas as pd
import numpy as np

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score
import xgboost as xgb
import xgboost as xgb
import lightgbm as lgb

import seaborn as sns
import matplotlib.pyplot as plt

# %matplotlib inline
# %config InlineBackend.figure_format = 'retina'

# Locations of the training set and of the set to predict for submission.
train_data_path = join('./data/train.csv')
sub_data_path = join('./data/test.csv')

data = pd.read_csv(train_data_path)
sub = pd.read_csv(sub_data_path)

# Detach the target column from the training frame.
y = data.pop('price')

# Stack train and submission rows so both receive identical preprocessing;
# train_len marks the boundary used to split them apart again below.
train_len = len(data)
data = pd.concat([data, sub], axis=0)

# Keep the ids of the submission rows, then drop 'id' from the features.
sub_id = data.pop('id')[train_len:]

# Keep only the first six characters of 'date' and store them as an integer
# (presumably YYYYMM -- confirm against the raw CSV).
data['date'] = data['date'].str[:6].astype(int)

skew_columns = ['bedrooms', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement']

# Apply log(1 + v) to every column listed in skew_columns.
for col in skew_columns:
    data[col] = np.log1p(data[col].to_numpy())

# log(1 + price); computed here but not used by the cells visible in this section.
y_log_transformation = np.log1p(y)

# Split the combined frame back into training rows and submission rows.
x, sub = data.iloc[:train_len], data.iloc[train_len:]

print(x.shape, sub.shape, sep='\n')

(15035, 19)
(6468, 19)


##### Ensemble Learning (blending)

Averaging

In [3]:
# Three gradient-boosting regressors, all seeded with the same random_state.
gboost = GradientBoostingRegressor(random_state=2023)
xgboost = xgb.XGBRegressor(random_state=2023)
lightgbm = lgb.LGBMRegressor(random_state=2023)

# Each entry pairs an estimator ('model') with the label ('name') used when
# reporting its score.
models = [
    {'model': estimator, 'name': label}
    for label, estimator in (
        ('GradientBoosting', gboost),
        ('XGBoost', xgboost),
        ('LightGBM', lightgbm),
    )
]

##### Cross Validation Function

In [4]:
def get_cv_score(models):
    """Print the mean 5-fold cross-validation score of each model.

    The features and target are read from the notebook-level globals ``x``
    (a DataFrame) and ``y``; they are not parameters of this function.

    Parameters
    ----------
    models : iterable of dict
        Each dict must provide ``'model'`` (an estimator accepted by
        ``cross_val_score``) and ``'name'`` (the label printed with its score).

    Returns
    -------
    None
        One line per model is printed. Because no ``scoring=`` is passed, the
        score is the estimator's own default scorer (R^2 for scikit-learn
        style regressors).
    """
    # Pass the splitter object itself. Calling .get_n_splits() on it returns
    # the plain integer 5, which throws the KFold instance away and would
    # silently drop any options (shuffle, random_state) configured on it.
    # The splitter is left unshuffled so the folds match an integer cv=5.
    kfold = KFold(n_splits=5)
    for m in models:
        fold_scores = cross_val_score(m['model'], X=x.values, y=y, cv=kfold)
        CV_score = np.mean(fold_scores)
        print(f"Model: {m['name']}, CV score:{CV_score:.4f}")

In [5]:
get_cv_score(models)

Model: GradientBoosting, CV score:0.8609
Model: XGBoost, CV score:0.8861
Model: LightGBM, CV score:0.8819


##### Baseline Model

In [None]:
def AveragingBlending(models, x, y, sub_x):
    """Fit every model on (x, y) and average their predictions for sub_x.

    Parameters
    ----------
    models : iterable of dict
        Each dict holds an estimator under ``'model'`` exposing ``fit`` and
        ``predict``. The estimators are fitted in place.
    x : object with a ``.values`` attribute (e.g. a DataFrame)
        Training features.
    y : array-like
        Training target, passed to ``fit`` unchanged.
    sub_x : object with a ``.values`` attribute
        Features of the rows to predict.

    Returns
    -------
    numpy.ndarray
        Row-wise mean of the models' predictions.
    """
    # Train all estimators first ...
    for entry in models:
        entry['model'].fit(x.values, y)

    # ... then collect one prediction vector per estimator.
    per_model = [entry['model'].predict(sub_x.values) for entry in models]

    # One column per model; averaging across columns gives the blend.
    stacked = np.column_stack(per_model)
    return stacked.mean(axis=1)

In [None]:
# Fit all models on the training rows and blend their predictions for the
# submission rows.
y_pred = AveragingBlending(models=models, x=x, y=y, sub_x=sub)

# Number of predictions, followed by the predictions themselves as cell output.
print(len(y_pred))
y_pred

##### Submit CSV File

In [6]:
# Load the sample submission file and preview its first five rows.
submission = pd.read_csv(join('./data/sample_submission.csv'))
submission.head(5)

Unnamed: 0,id,price
0,15035,100000
1,15036,100000
2,15037,100000
3,15038,100000
4,15039,100000


In [None]:
# Pair every submission id with its blended price prediction.
# Dict entries need ':' between key and value; "'price' = y_pred" was a
# SyntaxError that stopped this cell from running.
result = pd.DataFrame({
    'id': sub_id,
    'price': y_pred,
})