In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import scipy.stats as stats
from scipy import special

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, VotingRegressor

from sklearn.metrics import r2_score, mean_squared_error

In [2]:
def label(train, test, object_columns):
    """Label-encode ``object_columns`` in ``train`` and ``test``.

    One ``LabelEncoder`` is fitted per column on the training values. Values
    that occur only in ``test`` are appended to the encoder's classes, so they
    receive codes after the last training code instead of raising.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain every column in ``object_columns``.
    object_columns : iterable of column labels
        Columns to encode.

    Returns
    -------
    (train_label_df, test_label_df) : tuple of pandas.DataFrame
        The inputs with the original object columns dropped and the encoded
        columns appended at the end. The row index of each input is preserved.
    """
    object_columns = list(object_columns)
    train_encoded = dict()
    test_encoded = dict()

    for column in object_columns:
        encoder = LabelEncoder()
        train_encoded[column] = encoder.fit_transform(train[column])

        # Labels present only in the test split (unseen during fit).
        known = set(encoder.classes_)
        unseen = [value for value in np.unique(test[column]) if value not in known]
        if unseen:
            encoder.classes_ = np.append(encoder.classes_, unseen)

        test_encoded[column] = encoder.transform(test[column])

    # Build the encoded frames on the source index: concat(axis=1) aligns on
    # index labels, so a fresh RangeIndex would misalign rows (and pad with NaN)
    # whenever the input frame does not carry the default 0..n-1 index.
    train_label_df = pd.concat(
        [train.drop(columns=object_columns), pd.DataFrame(train_encoded, index=train.index)],
        axis=1,
    )
    test_label_df = pd.concat(
        [test.drop(columns=object_columns), pd.DataFrame(test_encoded, index=test.index)],
        axis=1,
    )

    return train_label_df, test_label_df

# 데이터

In [3]:
# Read the benz train and test CSV files (paths are relative to the working directory).
benz, benz_test = map(pd.read_csv, ('./Data/benz/train.csv', './Data/benz/test.csv'))

In [4]:
# Select the int64-typed columns, then bucket them by their number of distinct values.
int_columns = benz.columns[(benz.dtypes == 'int64').to_numpy()]

zero_col, cate_col, cont_col = [], [], []

for col in int_columns:
    distinct = len(benz[col].unique())
    if distinct == 1:
        bucket = zero_col   # a single value: the column is constant
    elif distinct == 2:
        bucket = cate_col   # exactly two values: treated as categorical
    else:
        bucket = cont_col   # anything else: treated as continuous
    bucket.append(col)

In [5]:
# Select the columns stored with the object dtype.
object_columns = benz.columns[(benz.dtypes == object).to_numpy()]

In [6]:
# Work on copies so the frames loaded into `benz` / `benz_test` stay untouched.
train, test = benz.copy(), benz_test.copy()

In [7]:
# Label-encode the object columns of both splits.
train_label_df, test_label_df = label(train=train, test=test, object_columns=object_columns)

In [8]:
# Box-Cox transform of the target, with and without the rows where y >= 200.
train_box = train_label_df.copy()

# .copy() makes the filtered frame independent of train_box, so assigning to
# its 'y' column below is well defined (no SettingWithCopyWarning).
train_box_out = train_box[train_box['y'] < 200].copy()

# Keep one fitted lambda per dataset; each is needed to invert its own transform.
train_box['y'], maxlog_box = stats.boxcox(train_box['y'])
train_box_out['y'], maxlog_box_out = stats.boxcox(train_box_out['y'])

# `maxlog` previously ended up holding the lambda of the outlier-free fit; keep that name bound.
maxlog = maxlog_box_out

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_box_out['y'], maxlog = stats.boxcox(train_box_out['y'])


In [9]:
# Candidate training sets, keyed by the name used in the score tables.
datas = dict(box=train_box, box_out=train_box_out)

# 여러가지 모델 학습
(XGB, LightGBM, RandomForest, Ridge, Lasso, DecisionTree, LinearRegression)

## base model

In [10]:
def regression(data, reg):
    """Return the mean 5-fold cross-validated R^2 of ``reg`` on ``data``.

    ``data`` must contain a ``y`` column, which is used as the target; all
    remaining columns are used as features. The mean is rounded to 4 decimals.
    """
    target = data['y']
    features = data.drop(columns='y')

    fold_scores = cross_val_score(reg, features, target, cv=5, scoring='r2')

    return np.round(np.mean(fold_scores), 4)

In [11]:
# Baseline estimators with (mostly) default settings, keyed by a short name.
regs = {
    'LR': LinearRegression(),
    'Lasso': Lasso(alpha=0.1),
    'Ridge': Ridge(),
    'DT': DecisionTreeRegressor(),
    'RF': RandomForestRegressor(),
    'XGB': XGBRegressor(),
    'LGBM': LGBMRegressor(),
}

# Keep the individual estimator names bound as well (same objects as in `regs`).
lr_reg, lasso, ridge, dt_reg, rf_reg, xgb_reg, lgbm_reg = regs.values()

# Filled with [dataset name, estimator class name, mean r2] rows.
scores = []

In [12]:
# Score every baseline estimator on every dataset; one row per (dataset, estimator).
scores.extend(
    [data_name, type(model).__name__, regression(frame, model)]
    for data_name, frame in datas.items()
    for model in regs.values()
)

In [13]:
# Rank the baseline results from best to worst mean R^2.
score_columns = ['key', 'reg', 'mean_test_score']
df = (
    pd.DataFrame(scores, columns=score_columns)
    .sort_values('mean_test_score', ascending=False)
)
df.reset_index(drop=True)

Unnamed: 0,key,reg,mean_test_score
0,box_out,LGBMRegressor,0.6314
1,box_out,Ridge,0.6289
2,box,Ridge,0.6265
3,box,LGBMRegressor,0.6244
4,box_out,LinearRegression,0.6214
5,box,LinearRegression,0.6192
6,box_out,XGBRegressor,0.5331
7,box,XGBRegressor,0.5135
8,box_out,RandomForestRegressor,0.5028
9,box,RandomForestRegressor,0.4304


결론
1. regression model에서 Lasso, DecisionTree 제외 (두 데이터셋 모두에서 mean r2 score가 최하위권)

## 하이퍼파라미터 튜닝

In [14]:
def grid_regression(data, reg, params = None, score = 'r2'):
    """Grid-search ``reg`` on ``data`` with 5-fold CV and return per-candidate scores.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus a ``y`` column used as the target.
    reg : estimator
        Estimator handed to ``GridSearchCV``.
    params : dict or list of dict, optional
        Parameter grid. ``None`` is treated as an empty grid, i.e. a single
        candidate using the estimator's current settings.
    score : str or iterable of str, optional
        Scorer name(s). ``'r2'`` and ``'neg_mean_squared_error'`` are always
        evaluated in addition to whatever is requested, because both are
        returned.

    Returns
    -------
    (r2, mse_score, para)
        Mean test R^2 per candidate, mean test negative MSE per candidate, and
        the list of parameter dicts, all in the same candidate order.
    """
    # With a single string scorer GridSearchCV reports `mean_test_score` only;
    # the `mean_test_<name>` keys read below exist only for multi-metric
    # scoring, so always pass a list that contains both returned metrics.
    required = ('r2', 'neg_mean_squared_error')
    if score is None:
        metrics = []
    elif isinstance(score, str):
        metrics = [score]
    else:
        metrics = list(score)
    metrics = metrics + [name for name in required if name not in metrics]

    X = data.drop(['y'], axis=1)
    y = data.y

    # refit=False: only the CV scores are needed, no final model is trained.
    grid = GridSearchCV(
        reg,
        param_grid={} if params is None else params,
        scoring=metrics,
        n_jobs=-1,
        cv=5,
        refit=False,
    )
    grid.fit(X, y)

    para = grid.cv_results_['params']
    r2 = grid.cv_results_["mean_test_r2"]
    mse_score = grid.cv_results_["mean_test_neg_mean_squared_error"]

    return r2, mse_score, para

In [15]:
# Estimators kept for tuning, keyed by the same short names as their grids below.
regs = {
    'Ridge': Ridge(),
    'RF': RandomForestRegressor(),
    'XGB': XGBRegressor(),
    'LGBM': LGBMRegressor(),
}

# Keep the individual estimator names bound as well (same objects as in `regs`).
ridge, rf_reg, xgb_reg, lgbm_reg = regs.values()

# Hyper-parameter grid searched for each estimator.
reg_params = {
    'Ridge': {
        'alpha': [20, 30, 40, 50],
    },
    'RF': {
        'n_estimators': [10, 20, 30, 50],
        'max_depth': [3, 4, 5, 6],
        'min_samples_split': [16, 32, 64],
    },
    'XGB': {
        'n_estimators': [20, 25, 30],
        'max_depth': [3, 4, 5, 6],
    },
    'LGBM': {
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 4, 5, 6],
    },
}

In [16]:
# Grid-search every tuned estimator on every dataset and collect all candidates
# in one table.
scoring = ['r2','neg_mean_squared_error']
score_frames = []

for key, val in datas.items():
    for reg, estimator in regs.items():
        r2, mse_score, para = grid_regression(data=val, reg=estimator, score=scoring, params=reg_params[reg])

        n_candidates = len(r2)
        score_frames.append(pd.DataFrame({
            'data': [key] * n_candidates,
            'reg': [reg] * n_candidates,
            'params': para,
            'r2 score': r2,
            'mse score': mse_score,
        }))

# Concatenate once instead of growing the frame inside the loop (repeated
# concat re-copies all earlier rows and starts from an empty frame). Each
# block keeps its own 0..k-1 index, matching the previous result.
total_scores = pd.concat(score_frames) if score_frames else pd.DataFrame()

In [17]:
# Five best grid-search candidates overall, by mean R^2.
total_scores.sort_values('r2 score', ascending=False).head(5)

Unnamed: 0,data,reg,params,r2 score,mse score
4,box_out,LGBM,"{'learning_rate': 0.05, 'max_depth': 3}",0.662493,-8e-06
5,box_out,LGBM,"{'learning_rate': 0.05, 'max_depth': 4}",0.657856,-8e-06
8,box_out,LGBM,"{'learning_rate': 0.1, 'max_depth': 3}",0.657482,-8e-06
16,box_out,RF,"{'max_depth': 4, 'min_samples_split': 32, 'n_e...",0.656408,-8e-06
22,box_out,RF,"{'max_depth': 4, 'min_samples_split': 64, 'n_e...",0.655966,-8e-06


In [18]:
# Best XGB candidate by mean R^2.
total_scores.loc[total_scores['reg'].eq('XGB')].sort_values('r2 score', ascending=False).head(1)

Unnamed: 0,data,reg,params,r2 score,mse score
0,box_out,XGB,"{'max_depth': 3, 'n_estimators': 20}",0.651161,-8e-06


In [19]:
# Best Ridge candidate by mean R^2.
total_scores.loc[total_scores['reg'].eq('Ridge')].sort_values('r2 score', ascending=False).head(1)

Unnamed: 0,data,reg,params,r2 score,mse score
1,box_out,Ridge,{'alpha': 30},0.639215,-9e-06


In [20]:
# RF candidates ranked by mean R^2, with the params dict expanded into one column per parameter.
rf_ranked = (
    total_scores
    .loc[total_scores['reg'].eq('RF')]
    .sort_values('r2 score', ascending=False)
)
pa = pd.json_normalize(rf_ranked['params'])

# Both parts get a fresh 0..n-1 index so they line up row by row.
r2_rf = pd.concat(
    [
        rf_ranked.drop(columns=['params']).reset_index(drop=True),
        pa.reset_index(drop=True),
    ],
    axis=1,
)

r2_rf.head(1)


Unnamed: 0,data,reg,r2 score,mse score,max_depth,min_samples_split,n_estimators
0,box_out,RF,0.656408,-8e-06,4,32,10


## stacking

In [21]:
# Try each tuned model in turn as the stacking meta-model, with the remaining
# four acting as base models.
# NOTE(review): the RF settings here (n_estimators=50, min_samples_split=64)
# differ from the top RF row of the tuning table (10 / 32) - confirm this is intended.
models = [
    ('xgb', XGBRegressor(max_depth=3, n_estimators=20)),
    ('lgbm', LGBMRegressor(learning_rate=0.05, max_depth=3)),
    ('ridge', Ridge(alpha=30)),
    ('rf', RandomForestRegressor(n_estimators=50, max_depth=4, min_samples_split=64)),
    ('lr', LinearRegression()),
]

y = train_box_out['y']
X = train_box_out.drop(columns='y')

for meta_name, meta_estimator in models:
    base_models = [entry for entry in models if entry[0] != meta_name]
    sr = StackingRegressor(estimators=base_models, final_estimator=meta_estimator)

    scores = cross_val_score(sr, X, y, cv=5, scoring='r2')

    print(f'meta model : {meta_name}, mean r2 score : {np.round(np.mean(scores), 4)}\n')

meta model : xgb, mean r2 score : 0.6396

meta model : lgbm, mean r2 score : 0.6577

meta model : ridge, mean r2 score : 0.0019

meta model : rf, mean r2 score : 0.6608

meta model : lr, mean r2 score : 0.6634



## voting

In [22]:
# Average the predictions of the five tuned models with a VotingRegressor.
models = [
    ('lr', LinearRegression()),
    ('rf', RandomForestRegressor(n_estimators=50, max_depth=4, min_samples_split=64)),
    ('xgb', XGBRegressor(max_depth=3, n_estimators=20)),
    ('ridge', Ridge(alpha=30)),
    ('lgbm', LGBMRegressor(learning_rate=0.05, max_depth=3)),
]

y = train_box_out['y']
X = train_box_out.drop(columns='y')

vt = VotingRegressor(estimators=models)

scores = cross_val_score(vt, X, y, cv=5, scoring='r2')
mean_r2 = np.round(np.mean(scores), 4)

print(f'mean r2 score : {mean_r2}')

mean r2 score : 0.6586


최적의 모델
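1. data : object type columns -> label encoding / target -> Box-Cox transform / outlier (y >= 200) -> drop  
2. estimator : StackingRegressor (meta model : LinearRegression / base models : XGB, LGBM, Ridge, RF)  
mean r2 score : 0.6634  
(단일 모델 기준 최고 : LGBM(learning rate : 0.05, max_depth : 3), mean r2 score : 0.6625)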