##### From Week 1

In [95]:
import warnings
warnings.filterwarnings('ignore')

import os
from os.path import join

import pandas as pd
import numpy as np

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score
import xgboost as xgb
import xgboost as xgb
import lightgbm as lgb

import seaborn as sns
import matplotlib.pyplot as plt

# %matplotlib inline
# %config InlineBackend.figure_format = 'retina'

# Paths to the training set and the submission (test) set.
train_data_path = join('./data/train.csv')
sub_data_path = join('./data/test.csv')

data, sub = pd.read_csv(train_data_path), pd.read_csv(sub_data_path)

# Split the target column off the training features.
y = data.pop('price')

# Stack train and test rows so both receive the same feature transforms;
# train_len marks where the training rows end.
train_len = len(data)
data = pd.concat((data, sub), axis=0)

# Remember the ids of the test rows for the submission, then drop the column.
sub_id = data['id'][train_len:]
del data['id']

# Keep only the first six characters of each date string, stored as an integer.
data['date'] = data['date'].map(lambda stamp: str(stamp[:6])).astype(int)

# log1p-transform the columns listed as skewed.
skew_columns = ['bedrooms', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement']
for col in skew_columns:
    data[col] = np.log1p(data[col].values)

# log1p-transformed copy of the target.
y_log_transformation = np.log1p(y)

# Undo the stacking: training rows first, submission rows after.
x, sub = data.iloc[:train_len, :], data.iloc[train_len:, :]

print(x.shape, sub.shape, sep='\n')

(15035, 19)
(6468, 19)


##### Averaging

In [96]:
# One default-configured regressor per boosting library, all seeded identically.
gboost = GradientBoostingRegressor(random_state=2023)
xgboost = xgb.XGBRegressor(random_state=2023)
lightgbm = lgb.LGBMRegressor(random_state=2023)

# Each entry pairs an estimator with the label used when reporting its score.
models = [
    {'model': estimator, 'name': label}
    for estimator, label in (
        (gboost, 'GradientBoosting'),
        (xgboost, 'XGBoost'),
        (lightgbm, 'LightGBM'),
    )
]

##### Cross Validation Function

In [97]:
def get_cv_score(models):
    """Print the mean 5-fold cross-validation score of each model.

    Reads the notebook-level training features `x` and target `y`.

    Parameters
    ----------
    models : list of dict
        Each entry holds an estimator under 'model' and a display label
        under 'name'.

    Notes
    -----
    No `scoring` argument is passed to `cross_val_score`, so each estimator's
    own default `score` method decides the metric.
    """
    # Pass the splitter itself. Calling `.get_n_splits(...)` on it only
    # returns the integer 5, which throws away the KFold object (and any
    # shuffle/random_state later configured on it).
    kfold = KFold(n_splits=5)
    for m in models:
        CV_score = np.mean(cross_val_score(m['model'], X=x.values, y=y, cv=kfold))
        print(f"Model: {m['name']}, CV score:{CV_score:.4f}")

In [98]:
# Print the mean cross-validation score of every candidate model.
get_cv_score(models)

Model: GradientBoosting, CV score:0.8609
Model: XGBoost, CV score:0.8861
Model: LightGBM, CV score:0.8819


##### Ensemble (Averaging Blending)

In [99]:
def AveragingBlending(models, x, y, sub_x):
    """Fit every model on (x, y) and return the mean of their predictions on sub_x.

    Parameters
    ----------
    models : list of dict
        Each entry holds an estimator under the 'model' key.
    x, sub_x : objects exposing `.values` (e.g. pandas DataFrames)
        Training features and the features to predict on; the underlying
        arrays are what gets passed to the estimators.
    y : array-like
        Training target.

    Returns
    -------
    numpy.ndarray
        One value per row of sub_x: the average over all models' predictions.
    """
    train_matrix = x.values
    predict_matrix = sub_x.values

    # First pass: train every estimator.
    for entry in models:
        entry['model'].fit(train_matrix, y)

    # Second pass: collect one prediction vector per estimator.
    per_model = []
    for entry in models:
        per_model.append(entry['model'].predict(predict_matrix))

    # Columns are models, rows are samples; average across the models.
    stacked = np.column_stack(per_model)
    return stacked.mean(axis=1)

# print(models)
# sub.head()
# x.head()
# print(x.values)

In [100]:
# Fit all models on the training rows and average their predictions for the
# submission rows. In a top-to-bottom run `y` is still the untransformed
# price here, so the predictions are on the price scale.
y_pred = AveragingBlending(models, x, y, sub)
print(len(y_pred))
y_pred

6468


array([ 529966.66304912,  430726.21272617, 1361676.91242777, ...,
        452081.69137012,  341572.97685942,  421725.1231835 ])

In [101]:
# Pair each submission id with its blended price prediction.
result = pd.DataFrame({
    'id' : sub_id,
    'price' : y_pred
})

result.head()

Unnamed: 0,id,price
0,15035,529966.7
1,15036,430726.2
2,15037,1361677.0
3,15038,333803.6
4,15039,308900.6


In [102]:
# Write the blended predictions to disk; index=False keeps the DataFrame
# index out of the CSV.
submission_path = './data/submission.csv'

result.to_csv(submission_path, index=False)

##### Better Model

In [103]:
train = x
test = sub

# From here on the models are trained on log1p(price), so predictions must be
# mapped back with np.expm1.
# Reuse the transform computed from the raw target during preprocessing rather
# than `y = np.log1p(y)`: that form would log-transform an already transformed
# target if this cell were executed more than once.
y = y_log_transformation

In [104]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [105]:
def rmse(y_test, y_pred):
    """Root mean squared error after mapping both inputs through np.expm1.

    Both arguments are expected on the log1p scale; np.expm1 inverts that
    transform before the error is computed.
    """
    actual = np.expm1(y_test)
    predicted = np.expm1(y_pred)
    mse = mean_squared_error(actual, predicted)
    return np.sqrt(mse)

In [106]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

In [107]:
# random_state = None
# Shared seed for every estimator below; get_scores also reads this global
# for its train/validation split.
random_state = 2023

gboost = GradientBoostingRegressor(random_state=random_state)
xgboost = XGBRegressor(random_state=random_state)
lightgbm = LGBMRegressor(random_state=random_state)
rdforest = RandomForestRegressor(random_state=random_state)

# Plain list of estimators; this replaces the earlier `models` list of dicts.
models = [gboost, xgboost, lightgbm, rdforest]

In [108]:
def get_scores(models, train, y):
    """Score each model on a held-out 20% validation split.

    Parameters
    ----------
    models : iterable of estimators
        Each is fitted on the training part of the split. Scores are keyed by
        class name, so two estimators of the same class share one row (the
        later one wins).
    train : features to split.
    y : target to split; `rmse` applies np.expm1 to it and to the
        predictions, so it is expected on the log1p scale.

    Returns
    -------
    pandas.DataFrame
        One row per model class with a single 'RMSE' column, sorted from the
        largest RMSE to the smallest. Empty if `models` is empty.
    """
    # The split does not depend on the model, so it is done once. Every model
    # is then evaluated on the same rows even when the notebook-level
    # `random_state` is None.
    X_train, X_val, y_train, y_val = train_test_split(
        train, y, random_state=random_state, test_size=0.2
    )

    scores = {}
    for model in models:
        model.fit(X_train, y_train)
        scores[model.__class__.__name__] = rmse(y_val, model.predict(X_val))

    # Build the table once, after the loop, so an empty `models` returns an
    # empty table instead of hitting an unbound `score_df`.
    score_df = pd.DataFrame(scores, index=['RMSE']).T.sort_values('RMSE', ascending=False)
    return score_df

##### Grid Search / Random Search
Hyperparameter tuning in machine learning models
##### Grid Search : predefined set of hyperparameter values is specified
##### Random Search : randomly selects hyperparameter values from a predefined search space

In [109]:
# Grid Search
from sklearn.model_selection import GridSearchCV

In [110]:
# Search space for the grid search: 2 x 2 = 4 candidate combinations.
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [1, 10],
}

In [111]:
# Base estimator whose hyperparameters are tuned by the grid search below.
model = LGBMRegressor(random_state=random_state)

In [112]:
# 5-fold grid search over param_grid, scored by negative mean squared error.
# In a top-to-bottom run `y` is already log1p-transformed at this point, so
# the error is measured on the log scale.
grid_model = GridSearchCV(model, param_grid=param_grid,
                          scoring='neg_mean_squared_error',
                          cv=5, verbose=1, n_jobs=5)

grid_model.fit(x,y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


GridSearchCV(cv=5, estimator=LGBMRegressor(random_state=2023), n_jobs=5,
             param_grid={'max_depth': [1, 10], 'n_estimators': [50, 100]},
             scoring='neg_mean_squared_error', verbose=1)

In [113]:
# Parameter combinations that were evaluated, in evaluation order.
params = grid_model.cv_results_['params']
params

[{'max_depth': 1, 'n_estimators': 50},
 {'max_depth': 1, 'n_estimators': 100},
 {'max_depth': 10, 'n_estimators': 50},
 {'max_depth': 10, 'n_estimators': 100}]

In [114]:
# Mean score across the folds for each combination (negative MSE).
score = grid_model.cv_results_['mean_test_score']
score

array([-0.07339763, -0.05502319, -0.02917323, -0.02702383])

In [115]:
# One row per parameter combination, with its mean score alongside.
results = pd.DataFrame(params)
results['score'] = score
results

Unnamed: 0,max_depth,n_estimators,score
0,1,50,-0.073398
1,1,100,-0.055023
2,10,50,-0.029173
3,10,100,-0.027024


scikit-learn's scoring convention is that a higher score always means a better model. Because a smaller MSE is better, GridSearchCV uses the negated value (`neg_mean_squared_error`): scores closer to zero (i.e. higher) indicate better performance, while more negative scores indicate worse performance.

Besides 'neg_mean_squared_error', GridSearchCV accepts many other scoring metrics. \
These metrics can be grouped by the type of task:
 - Classification : A problem of assigning data into predefined categories or classes. The goal is to predict the class or category to which a new data point belongs based on the given features. (accuracy, precision, recall, F1 score) 

 - Clustering : A task of grouping similar data points together based on their similarity or distance metrics. The objective is to identify natural groupings or clusters within the data without prior knowledge of the class labels. (silhouette score, completeness score)
 
 - Regression : The problem of finding a function or model that maps input variables to continuous output values. The goal is to predict a numeric or continuous target variable based on the input features. (mean absolute error(MAE), R-squared)

In [116]:
# Flip the sign of the negative-MSE score to get MSE, then take the square root.
results['RMSE'] = np.sqrt(-1 * results['score'])
results

Unnamed: 0,max_depth,n_estimators,score,RMSE
0,1,50,-0.073398,0.27092
1,1,100,-0.055023,0.23457
2,10,50,-0.029173,0.170802
3,10,100,-0.027024,0.164389


In [117]:
# No np.expm1() conversion was applied before scoring, so these values are
# RMSLE (Root Mean Squared Log Error) rather than RMSE (Root Mean Squared Error).
results = results.rename(columns = {'RMSE' : 'RMSLE'})
results

Unnamed: 0,max_depth,n_estimators,score,RMSLE
0,1,50,-0.073398,0.27092
1,1,100,-0.055023,0.23457
2,10,50,-0.029173,0.170802
3,10,100,-0.027024,0.164389


In [118]:
# Order the combinations from lowest (best) to highest RMSLE.
results = results.sort_values('RMSLE')
results

Unnamed: 0,max_depth,n_estimators,score,RMSLE
3,10,100,-0.027024,0.164389
2,10,50,-0.029173,0.170802
1,1,100,-0.055023,0.23457
0,1,50,-0.073398,0.27092


##### Turn GridSearchCV steps into a function

In [119]:
def my_GridSearch(model, train, y, param_grid, verbose=2, n_jobs=5):
  """Run a 5-fold grid search and return the scored parameter combinations.

  The search is scored with 'neg_mean_squared_error'. The returned DataFrame
  has one row per combination in `param_grid`, with the mean fold score in
  'score' and sqrt(-score) in 'RMSLE', sorted by ascending 'RMSLE'.
  The 'RMSLE' label assumes `y` is on a log scale.
  """
  search = GridSearchCV(model, param_grid=param_grid,
                        scoring='neg_mean_squared_error', cv=5,
                        verbose=verbose, n_jobs=n_jobs)
  search.fit(train, y)

  cv_results = search.cv_results_

  # One row per candidate, then attach the mean score and its RMSLE form.
  table = pd.DataFrame(cv_results['params'])
  table['score'] = cv_results['mean_test_score']
  table['RMSLE'] = np.sqrt(-1 * table['score'])

  return table.sort_values('RMSLE')

In [120]:
# Same search as the step-by-step cells above, now through the helper.
my_GridSearch(model, train, y, param_grid, verbose=2, n_jobs=5)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


Unnamed: 0,max_depth,n_estimators,score,RMSLE
3,10,100,-0.027024,0.164389
2,10,50,-0.029173,0.170802
1,1,100,-0.055023,0.23457
0,1,50,-0.073398,0.27092


In [121]:
# Refit on all training rows with the best combination from the grid search
# (max_depth=10, n_estimators=100).
model = LGBMRegressor(max_depth=10, n_estimators=100, random_state=random_state)
model.fit(train, y)

# The model was trained on the log1p target, so these predictions are on the log scale.
prediction = model.predict(test)
prediction

array([13.13580793, 13.08051399, 14.11202067, ..., 13.01592878,
       12.69894979, 12.96297768])

In [122]:
# Invert the log1p transform to get predictions on the price scale.
prediction = np.expm1(prediction)
prediction

array([ 506766.66784595,  479506.10405112, 1345155.15609376, ...,
        449515.92243642,  327402.87855805,  426332.71354302])

In [123]:
# Load the sample submission to use as a template for the output file.
submission_path = './data/sample_submission.csv'
submission = pd.read_csv(submission_path)
submission.head()

Unnamed: 0,id,price
0,15035,100000
1,15036,100000
2,15037,100000
3,15038,100000
4,15039,100000


In [124]:
# Overwrite the template's price column with the model predictions.
submission['price'] = prediction
submission.head()

Unnamed: 0,id,price
0,15035,506766.7
1,15036,479506.1
2,15037,1345155.0
3,15038,312257.9
4,15039,333864.5


In [125]:
# The file name records the model and its cross-validated RMSLE. The tag is
# the best grid-search RMSLE (0.164389); '0.164399' was a transcription slip
# that matched no measured score.
submission_csv_path = '{}/submission_{}_RMSLE_{}.csv'.format('./data', 'lgbm', '0.164389')
submission.to_csv(submission_csv_path, index=False)
print(submission_csv_path)

./data/submission_lgbm_RMSLE_0.164399.csv


##### Turn submission steps into a function

In [126]:
def save_submission(model, train, y, test, model_name, rmsle=None):
    """Fit `model`, predict on `test`, and write the result as a submission CSV.

    The model is fitted on (train, y). Its predictions are passed through
    np.expm1 and stored in the 'price' column of ./data/sample_submission.csv.
    The filled table is written to
    ./data/submission_<model_name>_RMSLE_<rmsle>.csv and the path is printed.
    """
    data_dir = './data'
    out_template = '{}/submission_{}_RMSLE_{}.csv'

    model.fit(train, y)
    prices = np.expm1(model.predict(test))

    # Use the sample submission as the template for the output file.
    submission = pd.read_csv(join(data_dir, 'sample_submission.csv'))
    submission['price'] = prices

    submission_csv_path = out_template.format(data_dir, model_name, rmsle)
    submission.to_csv(submission_csv_path, index=False)
    print('{} saved!'.format(submission_csv_path))

In [127]:
# The RMSLE tag in the file name is the best grid-search score (0.164389).
save_submission(model, train, y, test, 'lgbm', rmsle='0.164389')

./data/submission_lgbm_RMSLE_0.164399.csv saved!
