In [None]:
import pandas as pd
import numpy as np
from IPython.display import Image
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.simplefilter('ignore')

# Print floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

In [None]:
from sklearn.datasets import load_boston

In [None]:
# Load the data.
# NOTE(review): `load_boston` is no longer shipped with recent scikit-learn
# releases — confirm the pinned scikit-learn version before re-running.
data = load_boston()

# One DataFrame holding the feature columns plus the target appended as 'MEDV'.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(MEDV=data.target)

In [None]:
from sklearn.model_selection import train_test_split

# Split into train / test sets with a fixed seed (default test size).
# `columns=` replaces the positional `axis` argument (`df.drop('MEDV', 1)`),
# which newer pandas versions no longer accept.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns='MEDV'), df['MEDV'], random_state=42
)

In [None]:
#평가지표 만들기

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Registry of evaluated models: display name -> MSE on the test split.
my_predictions = dict()

# Palette of matplotlib color names; the MSE bar chart draws bar colors from it.
colors = [
    'r', 'c', 'm', 'y', 'k',
    'khaki', 'teal', 'orchid', 'sandybrown', 'greenyellow',
    'dodgerblue', 'deepskyblue', 'rosybrown', 'firebrick', 'deeppink',
    'crimson', 'salmon', 'darkred', 'olivedrab', 'olive',
    'forestgreen', 'royalblue', 'indigo', 'navy', 'mediumpurple',
    'chocolate', 'gold', 'darkorange', 'seagreen', 'turquoise',
    'steelblue', 'slategray', 'peru', 'midnightblue', 'slateblue',
    'dimgray', 'cadetblue', 'tomato',
]

def plot_predictions(name_, pred, actual):
    """Scatter-plot predictions against ground truth, ordered by the true value.

    Parameters
    ----------
    name_ : str
        Title of the figure (typically the model name).
    pred : array-like
        Predicted values.
    actual : array-like
        True target values, in the same order as `pred`.
    """
    # Use the `actual` argument (the original read the global `y_test`), and
    # strip any pandas index so the two inputs are paired by position rather
    # than aligned by label.
    paired = pd.DataFrame({
        'prediction': np.asarray(pred).ravel(),
        'actual': np.asarray(actual).ravel(),
    })
    paired = paired.sort_values(by='actual').reset_index(drop=True)

    plt.figure(figsize=(12, 9))
    plt.scatter(paired.index, paired['prediction'], marker='x', color='r')
    plt.scatter(paired.index, paired['actual'], alpha=0.7, marker='o', color='black')
    plt.title(name_, fontsize=15)
    plt.legend(['prediction', 'actual'], fontsize=12)
    plt.show()

def mse_eval(name_, pred, actual):
    """Record a model's MSE and redraw the comparison charts.

    Plots predictions vs. actual values, stores the MSE under `name_` in the
    module-level `my_predictions` dict (overwriting any previous entry with
    the same name), prints the ranking table and draws a horizontal bar chart
    of every recorded model.

    Parameters
    ----------
    name_ : str
        Display name of the model; used as the key in `my_predictions`.
    pred : array-like
        Predicted values.
    actual : array-like
        True target values.
    """
    # No `global` statements are needed: `my_predictions` is mutated in place
    # and `colors` is only read. (The original declared a non-existent
    # `predictions` global.)
    plot_predictions(name_, pred, actual)

    # scikit-learn's signature is (y_true, y_pred).
    mse = mean_squared_error(actual, pred)
    my_predictions[name_] = mse

    # Rank from the largest error to the smallest.
    ranking = sorted(my_predictions.items(), key=lambda item: item[1], reverse=True)

    df = pd.DataFrame(ranking, columns=['model', 'mse'])
    print(df)
    min_ = df['mse'].min() - 10
    max_ = df['mse'].max() + 10

    length = len(df)
    positions = np.arange(length)

    # Figure height grows with the number of recorded models.
    plt.figure(figsize=(10, length))
    ax = plt.subplot()
    ax.set_yticks(positions)
    ax.set_yticklabels(df['model'], fontsize=15)
    bars = ax.barh(positions, df['mse'])

    for i, v in enumerate(df['mse']):
        # Each bar gets a randomly chosen color from the palette.
        idx = np.random.choice(len(colors))
        bars[i].set_color(colors[idx])
        ax.text(v + 2, i, str(round(v, 3)), color='k', fontsize=15, fontweight='bold')

    plt.title('MSE Error', fontsize=18)
    plt.xlim(min_, max_)

    plt.show()

def remove_model(name_):
    """Delete the entry `name_` from the module-level `my_predictions` dict.

    Returns True when an entry was removed and False when no entry with that
    name was present.
    """
    if name_ in my_predictions:
        del my_predictions[name_]
        return True
    return False

def plot_coef(columns, coef):
    """Draw a horizontal bar chart of coefficients, sorted by value (descending).

    Parameters
    ----------
    columns : iterable
        Feature names.
    coef : iterable
        Coefficient values, paired with `columns` by position.
    """
    pairs = list(zip(columns, coef))
    coef_df = (
        pd.DataFrame(pairs)
        .set_axis(['feature', 'coef'], axis=1)
        .sort_values('coef', ascending=False)
        .reset_index(drop=True)
    )

    # One bar per feature, at integer positions along the y axis.
    positions = np.arange(len(coef_df))

    fig, ax = plt.subplots(figsize=(9, 7))
    ax.barh(positions, coef_df['coef'])
    ax.set_yticks(positions)
    ax.set_yticklabels(coef_df['feature'])
    fig.tight_layout()
    plt.show()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Baseline: ordinary least squares on the unscaled features.
linear_reg = LinearRegression(n_jobs=-1).fit(x_train, y_train)
pred = linear_reg.predict(x_test)
mse_eval('LinearRegression', pred, y_test)

In [None]:
# Ridge regression with alpha=1.
ridge = Ridge(alpha=1).fit(x_train, y_train)
pred = ridge.predict(x_test)
mse_eval('Ridge(alpha=1)', pred, y_test)

In [None]:
# Lasso regression with alpha=0.01.
lasso = Lasso(alpha=0.01).fit(x_train, y_train)
pred = lasso.predict(x_test)
mse_eval('Lasso(alpha=0.01)', pred, y_test)

In [None]:
# ElasticNet with alpha=0.5 and l1_ratio=0.8 on the unscaled features.
elasticnet = ElasticNet(alpha=0.5, l1_ratio=0.8).fit(x_train, y_train)
pred = elasticnet.predict(x_test)
mse_eval('ElasticNet(l1_ratio=0.8)', pred, y_test)

In [None]:
# Standardize the features, then fit ElasticNet.
elasticnet_pipeline = make_pipeline(StandardScaler(), ElasticNet(alpha=0.1, l1_ratio=0.2))
elasticnet_pipeline.fit(x_train, y_train)
elasticnet_pred = elasticnet_pipeline.predict(x_test)
mse_eval('Standard ElasticNet', elasticnet_pred, y_test)

In [None]:
# Degree-2 polynomial expansion -> standardization -> ElasticNet.
poly_steps = [
    PolynomialFeatures(degree=2, include_bias=False),
    StandardScaler(),
    ElasticNet(alpha=0.1, l1_ratio=0.2),
]
poly_pipeline = make_pipeline(*poly_steps)
poly_pipeline.fit(x_train, y_train)
poly_pred = poly_pipeline.predict(x_test)
mse_eval('Poly ElasticNet', poly_pred, y_test)

In [None]:
###앙상블 시작!! -- 보팅

In [None]:
from sklearn.ensemble import VotingRegressor, VotingClassifier

In [None]:
# (name, estimator) pairs of the already-fitted single models used for voting.
single_models = list(zip(
    ['linear_reg', 'ridge', 'lasso', 'elasticnet_pipeline', 'poly_pipeline'],
    [linear_reg, ridge, lasso, elasticnet_pipeline, poly_pipeline],
))

In [None]:
# Voting ensemble over the single models, using all available cores.
voting_regressor = VotingRegressor(estimators=single_models, n_jobs=-1)

In [None]:
# Fit the voting ensemble on the training split.
voting_regressor.fit(X=x_train, y=y_train)

In [None]:
# Ensemble predictions for the held-out split.
voting_pred = voting_regressor.predict(X=x_test)

In [None]:
# Record and chart the voting ensemble's test MSE.
mse_eval(name_='Voting Ensemble', pred=voting_pred, actual=y_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

In [None]:
# Random forest with default hyperparameters.
# Seeded like every other model in this notebook so that re-running the cell
# reproduces the same predictions (the original was unseeded).
rfr = RandomForestRegressor(random_state=42)
rfr_pred = rfr.fit(x_train, y_train).predict(x_test)

In [None]:
# Record and chart the default random forest's test MSE.
mse_eval(name_='RandomForest Ensemble', pred=rfr_pred, actual=y_test)

In [None]:
# Random forest with hand-picked hyperparameters; rebinds `rfr` to the tuned model.
rfr = RandomForestRegressor(
    random_state=42,
    n_estimators=1000,
    max_depth=7,
    max_features=0.8,
)
rfr.fit(x_train, y_train)
rfr_pred = rfr.predict(x_test)

In [None]:
# Record and chart the tuned random forest's test MSE.
mse_eval(name_='RandomForest Ensemble w/ Tunning', pred=rfr_pred, actual=y_test)

In [None]:
# GradientBoost
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Gradient boosting with default hyperparameters.
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(x_train, y_train)
gbr_pred = gbr.predict(x_test)
mse_eval('GradientBoosting ensemble', gbr_pred, y_test)

In [None]:
# Gradient boosting with a lower learning rate and the default number of estimators.
gbr = GradientBoostingRegressor(learning_rate=0.01, random_state=42)
gbr.fit(x_train, y_train)
gbr_pred = gbr.predict(x_test)
mse_eval('GradientBoosting ensemble(lr = 0.01)', gbr_pred, y_test)

In [None]:
# Gradient boosting: low learning rate with 1000 estimators.
gbr = GradientBoostingRegressor(
    learning_rate=0.01,
    n_estimators=1000,
    random_state=42,
)
gbr.fit(x_train, y_train)
gbr_pred = gbr.predict(x_test)
mse_eval('GradientBoosting ensemble(lr = 0.01, est=1000)', gbr_pred, y_test)

In [None]:
# Gradient boosting: low learning rate, 1000 estimators, and subsample=0.8.
gbr = GradientBoostingRegressor(
    learning_rate=0.01,
    n_estimators=1000,
    subsample=0.8,
    random_state=42,
)
gbr.fit(x_train, y_train)
gbr_pred = gbr.predict(x_test)
mse_eval('GradientBoosting ensemble(lr = 0.01, est=1000, subsample=0.8)', gbr_pred, y_test)

In [None]:
from xgboost import XGBRegressor

In [None]:
# XGBoost with default hyperparameters.
xgb = XGBRegressor(random_state=42)
xgb.fit(x_train, y_train)
xgb_pred = xgb.predict(x_test)
mse_eval('XGBoost', xgb_pred, y_test)

In [None]:
# XGBoost with hand-picked hyperparameters; rebinds `xgb` to the tuned model.
# `max_features` is a scikit-learn tree parameter that XGBoost does not
# recognise, so the original setting had no effect. Column subsampling is
# expressed here with `colsample_bytree`, matching the LightGBM cell.
xgb = XGBRegressor(
    random_state=42,
    learning_rate=0.01,
    n_estimators=1000,
    subsample=0.8,
    colsample_bytree=0.8,
    max_depth=7,
)
xgb_pred = xgb.fit(x_train, y_train).predict(x_test)
mse_eval('XGBoost w/ Tuning', xgb_pred, y_test)

In [None]:
from lightgbm import LGBMRegressor

In [None]:
# LightGBM with default hyperparameters.
lgbm = LGBMRegressor(random_state=42)
lgbm.fit(x_train, y_train)
lgbm_pred = lgbm.predict(x_test)
mse_eval('LGBM', lgbm_pred, y_test)

In [None]:
# LightGBM with hand-picked hyperparameters; rebinds `lgbm` to the tuned model.
# Row subsampling in LightGBM only takes effect when `subsample_freq` is
# positive; with the default of 0 the original `subsample=0.8` was ignored.
lgbm = LGBMRegressor(
    random_state=42,
    learning_rate=0.01,
    n_estimators=2000,
    colsample_bytree=0.8,
    subsample=0.8,
    subsample_freq=1,
    max_depth=7,
)
lgbm_pred = lgbm.fit(x_train, y_train).predict(x_test)
mse_eval('LGBM w/ Tuning', lgbm_pred, y_test)

In [None]:
from sklearn.ensemble import StackingRegressor

In [None]:
# Base estimators for stacking, as an ordered list of (name, estimator) pairs.
# The original used a set literal, which has no defined order and is not the
# list that StackingRegressor's `estimators` parameter expects.
stack_models = [
    ('elasticent', poly_pipeline),
    ('randomForest', rfr),
    ('gbr', gbr),
    ('lgbm', lgbm),
]

In [None]:
# Stack the base models, with the tuned XGBoost regressor as the final estimator.
stack_reg = StackingRegressor(estimators=stack_models, final_estimator=xgb, n_jobs=-1)
stack_reg.fit(x_train, y_train)
stack_reg_pred = stack_reg.predict(x_test)
mse_eval('Stacking Ensemble', stack_reg_pred, y_test)

In [None]:
# Test-set predictions of the models that take part in the weighted blend.
final_outputs = dict(
    elasticnet=poly_pred,
    randomforest=rfr_pred,
    gbr=gbr_pred,
    xgb=xgb_pred,
    lgbm=lgbm_pred,
    stacking=stack_reg_pred,
)

In [None]:
# Weighted blend of the individual predictions (weights: 0.1, 0.1, 0.2, 0.2, 0.2, 0.2).
final_prediction = (
    0.1 * final_outputs['elasticnet']
    + 0.1 * final_outputs['randomforest']
    + 0.2 * final_outputs['gbr']
    + 0.2 * final_outputs['xgb']
    + 0.2 * final_outputs['lgbm']
    + 0.2 * final_outputs['stacking']
)

In [None]:
# Record and chart the blended prediction's test MSE.
mse_eval(name_='Weighted Blending', pred=final_prediction, actual=y_test)

In [None]:
#교차검증

#Cross Validation

"""
모델을 평가하는 하나의 방법
K-겹 교차검증을 많이 활용

K-겹 교차검증이란?
- 모든 데이터가 최소 한 번은 테스트셋으로 쓰이도록 합니다.

EX)
- Estimation 1일때
학습 데이터 [B,C,D,E] 검증데이터 [A]
- Estimation 2일때
학습 데이터 [A,C,D,E] 검증데이터[B]
"""

In [59]:
from sklearn.model_selection import KFold

In [60]:
# 5-fold splitter with shuffling and a fixed seed.
n_splits = 5
kfold = KFold(n_splits, shuffle=True, random_state=42)

In [61]:
# Feature matrix and target vector as plain NumPy arrays for index-based fold slicing.
# `columns=` replaces the positional `axis` argument (`df.drop('MEDV', 1)`),
# which newer pandas versions no longer accept.
X = np.array(df.drop(columns='MEDV'))
Y = np.array(df['MEDV'])

In [62]:
# LightGBM regressor with a fixed seed; it is refit on every fold of the K-fold loop.
lgbm_fold = LGBMRegressor(random_state=42)

In [63]:
# Manual K-fold cross-validation: refit on each training fold, score on the
# held-out fold, then report the mean MSE over all folds.
total_error = 0
fold_count = 0
for fold_count, (train_index, test_index) in enumerate(kfold.split(X), start=1):
    x_train_fold, x_test_fold = X[train_index], X[test_index]
    y_train_fold, y_test_fold = Y[train_index], Y[test_index]
    lgbm_pred_fold = lgbm_fold.fit(x_train_fold, y_train_fold).predict(x_test_fold)
    # scikit-learn's signature is (y_true, y_pred).
    error = mean_squared_error(y_test_fold, lgbm_pred_fold)
    print('Fold = {}, prediction score = {:.2f}'.format(fold_count, error))
    total_error += error
print('---'*10)
# Divide by the number of folds actually evaluated.
print('Average Error: %s' % (total_error / fold_count))

Fold = 1, prediction score = 8.34
Fold = 2, prediction score = 10.40
Fold = 3, prediction score = 17.58
Fold = 4, prediction score = 6.94
Fold = 5, prediction score = 12.16
------------------------------
Average Error: 11.083201392666322
