# In [9]:
import warnings
warnings.filterwarnings("ignore")

import os
from os.path import join

import pandas as pd
import numpy as np

import missingno as msno

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score
import xgboost as xgb
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

from sklearn.model_selection import GridSearchCV

# Locate the competition data under the user's home directory.
# os.getenv('HOME') returns None when HOME is unset, which would make the string
# concatenation raise TypeError, so fall back to the expanded "~" in that case.
data_dir = (os.getenv('HOME') or os.path.expanduser('~')) + '/aiffel/kaggle_kakr_housing/data'

train_data_path = join(data_dir, 'train.csv')
test_data_path = join(data_dir, 'test.csv')

train = pd.read_csv(train_data_path)
test = pd.read_csv(test_data_path)

# Preprocessing: keep only the first six characters of `date` and store them
# as an integer.
train['date'] = train['date'].apply(lambda i: i[:6]).astype(int)

# Split the target off the features and drop the identifier column.
y = train['price']
del train['price']
del train['id']

# Apply the same date/id handling to the test set (it has no target to remove here).
test['date'] = test['date'].apply(lambda i: i[:6]).astype(int)
del test['id']

# Inspect the distribution of y with seaborn's `kdeplot`.
sns.kdeplot(y)
plt.show()

# The original note describes y as heavily skewed, so train on log1p(y) instead.
# Predictions therefore have to be mapped back with np.expm1.
y = np.log1p(y)

# Check column dtypes and non-null counts.
train.info()

# Helper for computing RMSE.
def rmse(y_test, y_pred):
    """Return the RMSE between two series after undoing a log1p transform.

    Both arguments are passed through ``np.expm1`` before the error is
    computed, so they are expected to be on the log1p scale (as ``y`` is in
    this script).

    Args:
        y_test: True target values on the log1p scale.
        y_pred: Predicted target values on the log1p scale.

    Returns:
        Square root of the mean squared error of the back-transformed values.
    """
    actual = np.expm1(y_test)
    predicted = np.expm1(y_pred)
    squared_error = mean_squared_error(actual, predicted)
    return np.sqrt(squared_error)

# Fixed seed reused by every estimator below and by the train/test split.
random_state = 2020

# Baseline estimators: library defaults apart from the shared seed.
gboost, xgboost, lightgbm, rdforest = (
    estimator_cls(random_state=random_state)
    for estimator_cls in (
        GradientBoostingRegressor,
        XGBRegressor,
        LGBMRegressor,
        RandomForestRegressor,
    )
)

models = [gboost, xgboost, lightgbm, rdforest]

def get_scores(models, train, y):
    """Fit each model on a shared hold-out split and tabulate its RMSE.

    Args:
        models: Iterable of estimators exposing ``fit`` and ``predict``.
        train: Feature table passed to ``train_test_split``.
        y: Target values aligned with ``train``.

    Returns:
        A DataFrame with one row per model class name and a single ``RMSE``
        column, sorted from the largest RMSE to the smallest. An empty
        ``models`` yields an empty frame instead of raising
        ``UnboundLocalError``.

    Note:
        Rows are keyed by ``model.__class__.__name__``, so two models of the
        same class overwrite each other's score.
    """
    # The split uses a fixed seed, so computing it once gives every model the
    # same 80/20 partition that per-iteration splitting produced.
    X_train, X_test, y_train, y_test = train_test_split(
        train, y, random_state=random_state, test_size=0.2
    )

    scores = {}
    for model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        scores[model.__class__.__name__] = rmse(y_test, y_pred)

    # Build the table once, after every score has been collected.
    return pd.DataFrame(scores, index=['RMSE']).T.sort_values('RMSE', ascending=False)

# Score the four baseline models. The returned DataFrame is neither assigned
# nor printed here, so the result of this call is discarded.
get_scores(models, train, y)

def my_GridSearch(model, train, y, param_grid, verbose=2, n_jobs=5):
    """Run a 5-fold grid search and return one row per parameter combination.

    Args:
        model: Estimator to tune.
        train: Feature table used for fitting.
        y: Target values aligned with ``train``.
        param_grid: Mapping of parameter name to candidate values.
        verbose: Verbosity level forwarded to ``GridSearchCV``.
        n_jobs: Number of parallel jobs forwarded to ``GridSearchCV``.

    Returns:
        A DataFrame holding each parameter combination, its mean
        ``neg_mean_squared_error`` test score (``score``) and the square root
        of the negated score (``RMSLE``), sorted by ``RMSLE`` ascending.

    Note:
        The ``RMSLE`` column is a log-scale error only if ``y`` is already
        log-transformed, as it is in this script.
    """
    searcher = GridSearchCV(
        model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=5,
        verbose=verbose,
        n_jobs=n_jobs,
    )
    searcher.fit(train, y)

    # One row per candidate, with its mean cross-validated score alongside.
    cv_results = searcher.cv_results_
    results = pd.DataFrame(cv_results['params'])
    results['score'] = cv_results['mean_test_score']

    # The score is a negated MSE; flip the sign before taking the root.
    results['RMSLE'] = np.sqrt(-1 * results['score'])
    return results.sort_values('RMSLE')

# Grid-search space: two candidate values for each of two hyperparameters.
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[1, 10],
)

# Predict with the trained model and write a submission file.
# NOTE: everything between the triple quotes below is a bare string literal, so
# the `model` assignment and the `save_submission` definition inside it are
# never executed. The call after the string is commented out as well.
"""
model = LGBMRegressor(max_depth=10, n_estimators=100, random_state=random_state)

def save_submission(model, train, y, test, model_name, rmsle=None):
    model.fit(train, y)
    prediction = model.predict(test)
    prediction = np.expm1(prediction)
    data_dir = os.getenv('HOME')+'/aiffel/kaggle_kakr_housing/data'
    submission_path = join(data_dir, 'sample_submission.csv')
    submission = pd.read_csv(submission_path)
    submission['price'] = prediction
    submission_csv_path = '{}/submission_{}_RMSLE_{}.csv'.format(data_dir, model_name, rmsle)
    submission.to_csv(submission_csv_path, index=False)
    print('{} saved!'.format(submission_csv_path))
"""

#save_submission(model, train, y, test, 'lgbm', rmsle='0.164399')

# Estimators used for blending, with hard-coded hyperparameters. The
# commented-out my_GridSearch calls at the end of this block show how each one
# can be searched over param_grid.
model_a = GradientBoostingRegressor(max_depth=10, n_estimators=100, random_state=random_state)
model_b = XGBRegressor(max_depth=10, n_estimators=50, random_state=random_state)
model_c = LGBMRegressor(max_depth=10, n_estimators=100, random_state=random_state)
model_d = RandomForestRegressor(max_depth=10, n_estimators=100, random_state=random_state)

# Each entry pairs an estimator with a display name. model_b is the
# XGBRegressor and model_c is the LGBMRegressor, so the 'XGBoost' and
# 'LightGBM' labels must follow that pairing.
models = [
    {'model': model_a, 'name': 'GradientBoosting'},
    {'model': model_b, 'name': 'XGBoost'},
    {'model': model_c, 'name': 'LightGBM'},
    {'model': model_d, 'name': 'RandomForest'},
]

#print(my_GridSearch(model_a, train, y, param_grid, verbose=2, n_jobs=5))
#print(my_GridSearch(model_b, train, y, param_grid, verbose=2, n_jobs=5))
#print(my_GridSearch(model_c, train, y, param_grid, verbose=2, n_jobs=5))
#print(my_GridSearch(model_d, train, y, param_grid, verbose=2, n_jobs=5))

# Notebook output recorded for the cell above:
# SyntaxError: invalid syntax (357091280.py, line 143)

# In [10]:
def AveragingBlending(models, x, y, sub_x):
    """Fit every model and return the row-wise mean of their predictions.

    Args:
        models: Sequence of dicts whose ``'model'`` entry exposes ``fit`` and
            ``predict``.
        x: Training features; its ``.values`` array is passed to ``fit``.
        y: Training targets passed to ``fit`` unchanged.
        sub_x: Features to predict on; its ``.values`` array is passed to
            ``predict``.

    Returns:
        A 1-D array with one averaged prediction per row of ``sub_x``.
    """
    # Fit every model before any prediction is made.
    for entry in models:
        entry['model'].fit(x.values, y)

    # One column per model, then average across the columns.
    per_model = []
    for entry in models:
        per_model.append(entry['model'].predict(sub_x.values))

    stacked = np.column_stack(per_model)
    return np.mean(stacked, axis=1)