In [51]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import scipy.stats as stats
from scipy import special

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, VotingRegressor

from sklearn.metrics import r2_score, mean_squared_error

In [52]:
def one_hot(data, cols):
    oh_encoder = OneHotEncoder()

    oh = oh_encoder.fit_transform(data[cols])
    oh_df = pd.DataFrame(oh.toarray().astype(int), columns=oh_encoder.get_feature_names_out())
    
    return pd.concat([data.drop(cols, axis =1), oh_df], axis=1)

# 데이터

In [53]:
benz= pd.read_csv('./Data/benz/train.csv')
benz_test = pd.read_csv('./Data/benz/test.csv')

In [54]:
# type이 int인 column 추출
int_columns = benz.loc[:, benz.dtypes == 'int64'].columns

zero_col = list()
cont_col = list()
cate_col = list()

for col in int_columns:
    if len(benz[col].unique()) == 1:
        zero_col.append(col)
    elif len(benz[col].unique()) == 2:
        cate_col.append(col)
    else:
        cont_col.append(col)

In [55]:
# type이 object인 column 추출
object_columns = benz.loc[:, benz.dtypes == object].columns

In [56]:
test = benz_test.copy()
train = benz.copy()

In [57]:
none_train = list()

for ob in object_columns:
    if len(set(benz_test[ob]) - set(benz[ob])) > 0:
        none_train.append(ob)

train = train.drop(none_train, axis=1)
object_col = object_columns.drop(none_train)

In [58]:
# object type의 변수들 one hot encoding
train_oh_df = one_hot(train, object_col)
test_oh_df = one_hot(test, object_col)

In [59]:
# box cox scaling
train_box = train_oh_df.copy()

train_box_out = train_box[train_box['y'] < 200]

train_box['y'], maxlog = stats.boxcox(train_box['y'])
train_box_out['y'], maxlog = stats.boxcox(train_box_out['y'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_box_out['y'], maxlog = stats.boxcox(train_box_out['y'])


In [60]:
datas = {'box' : train_box, 'box_out' : train_box_out}

# 여러가지 모델 학습
(XGB, lightGBM, Randomforest, Ridge, Lasso, DecisionTree, LinearRegression)

## base model

In [61]:
def regression(data, reg):
    X = data.drop(['y'], axis=1)
    y = data.y

    scores = cross_val_score(reg, X, y, scoring='r2', cv=5)

    return np.round(np.mean(scores), 4)

In [62]:
lr_reg = LinearRegression()
ridge = Ridge()
lasso = Lasso(alpha=0.1)
dt_reg = DecisionTreeRegressor()
rf_reg = RandomForestRegressor()
xgb_reg = XGBRegressor()
lgbm_reg = LGBMRegressor()

regs = {'LR':lr_reg, 'Lasso' : lasso, 'Ridge' : ridge, 'DT':dt_reg, 'RF' : rf_reg, 'XGB' : xgb_reg, 'LGBM' : lgbm_reg}
scores = list()

In [63]:
for key, val in datas.items():
   for reg in regs.values():
        scores.append([key, reg.__class__.__name__, regression(val, reg)])

In [64]:
df = pd.DataFrame(scores, columns=['key', 'reg','mean_test_score']).sort_values('mean_test_score', ascending=False)
df.reset_index(drop=True)

Unnamed: 0,key,reg,mean_test_score
0,box_out,LGBMRegressor,0.6376
1,box,LGBMRegressor,0.6294
2,box_out,Ridge,0.6254
3,box,Ridge,0.623
4,box_out,LinearRegression,0.6154
5,box,LinearRegression,0.6129
6,box,XGBRegressor,0.6072
7,box_out,RandomForestRegressor,0.5035
8,box_out,XGBRegressor,0.4856
9,box,RandomForestRegressor,0.4114


결론
1. regression model에서 Lasso, DecisionTree 제외

## 하이퍼파라미터 튜닝

In [65]:
def grid_regression(data, reg, params = None, score = 'r2'):
    X = data.drop(['y'], axis=1)
    y = data.y

    grid = GridSearchCV(reg, param_grid=params, scoring=score, n_jobs = -1, cv = 5, refit=False)
        
    grid.fit(X, y)

    para = grid.cv_results_['params']
    r2 = grid.cv_results_["mean_test_r2"]
    mse_score = grid.cv_results_["mean_test_neg_mean_squared_error"]

    return r2, mse_score, para

In [66]:
ridge = Ridge()
rf_reg = RandomForestRegressor()
xgb_reg = XGBRegressor()
lgbm_reg = LGBMRegressor()

regs = {'Ridge' : ridge, 'RF' : rf_reg, 'XGB' : xgb_reg, 'LGBM' : lgbm_reg}

reg_params = {'Ridge' : {'alpha':[20, 30, 40, 50]}, 'RF' : {'n_estimators':[10, 20, 30, 50], 'max_depth':[3, 4, 5, 6], 'min_samples_split':[32, 64, 128]}, 
              'XGB' : {'n_estimators':[20, 25, 30], 'max_depth':[3, 4, 5, 6]}, 'LGBM' : {'learning_rate':[0.01, 0.05, 0.1], 'max_depth':[3, 4, 5, 6]}}

In [67]:
total_scores = pd.DataFrame()
scoring = ['r2','neg_mean_squared_error']

for key, val in datas.items():
    for reg in regs.keys():
        r2, mse_score, para = grid_regression(data = datas[key], reg=regs[reg], score=scoring, params=reg_params[reg])
    
        temp = pd.DataFrame({'data':[key for _ in range(len(r2))], 'reg':[reg  for _ in range(len(r2))], 'params': para, 'r2 score':r2, 'mse score':mse_score})
        total_scores = pd.concat([total_scores, temp])

In [68]:
total_scores.sort_values('r2 score', ascending=False)[:5]

Unnamed: 0,data,reg,params,r2 score,mse score
1,box_out,XGB,"{'max_depth': 3, 'n_estimators': 25}",0.664137,-8e-06
4,box_out,LGBM,"{'learning_rate': 0.05, 'max_depth': 3}",0.663955,-8e-06
5,box_out,LGBM,"{'learning_rate': 0.05, 'max_depth': 4}",0.659373,-8e-06
20,box_out,RF,"{'max_depth': 4, 'min_samples_split': 128, 'n_...",0.658904,-8e-06
8,box_out,LGBM,"{'learning_rate': 0.1, 'max_depth': 3}",0.657644,-8e-06


In [69]:
total_scores[total_scores['reg'] == 'XGB'].sort_values('r2 score', ascending=False)[:1]

Unnamed: 0,data,reg,params,r2 score,mse score
1,box_out,XGB,"{'max_depth': 3, 'n_estimators': 25}",0.664137,-8e-06


In [70]:
total_scores[total_scores['reg'] == 'LGBM'].sort_values('r2 score', ascending=False)[:1]

Unnamed: 0,data,reg,params,r2 score,mse score
4,box_out,LGBM,"{'learning_rate': 0.05, 'max_depth': 3}",0.663955,-8e-06


In [71]:
total_scores[total_scores['reg'] == 'Ridge'].sort_values('r2 score', ascending=False)[:1]

Unnamed: 0,data,reg,params,r2 score,mse score
2,box_out,Ridge,{'alpha': 40},0.639189,-9e-06


In [72]:
r2_rf = total_scores[total_scores['reg'] == 'RF'].sort_values('r2 score', ascending=False)
pa = pd.json_normalize(r2_rf['params'])

r2_rf = pd.concat([r2_rf.drop(columns=['params']).reset_index(drop=True), pa.reset_index(drop=True)], axis=1)

r2_rf[:1]


Unnamed: 0,data,reg,r2 score,mse score,max_depth,min_samples_split,n_estimators
0,box_out,RF,0.658904,-8e-06,4,128,10


## stacking

In [73]:
models = [
    ('xgb', XGBRegressor(max_depth = 3, n_estimators = 25)),
    ('lgbm', LGBMRegressor(learning_rate = 0.05, max_depth = 3)),
    ('ridge', Ridge(alpha = 40)),
    ('rf', RandomForestRegressor(n_estimators = 10, max_depth = 4, min_samples_split = 128)),
    ('lr', LinearRegression())
]

X = train_box_out.drop('y', axis = 1)
y = train_box_out.y

for i in models:
    base_models = [x for x in models if x != i]
    sr = StackingRegressor(estimators=base_models, final_estimator=i[1])

    scores = cross_val_score(sr, X, y, scoring='r2', cv=5)

    print(f'meta model : {i[0]}, mean r2 score : {np.round(np.mean(scores), 4)}\n')

meta model : xgb, mean r2 score : 0.6608

meta model : lgbm, mean r2 score : 0.6589

meta model : ridge, mean r2 score : 0.0001

meta model : rf, mean r2 score : 0.658

meta model : lr, mean r2 score : 0.6664



## voting

In [74]:
models = [
    ('lr', LinearRegression()),
    ('rf', RandomForestRegressor(n_estimators = 10, max_depth = 4, min_samples_split = 128)),
    ('xgb', XGBRegressor(max_depth = 3, n_estimators = 20)),
    ('ridge', Ridge(alpha = 40)),
    ('lgbm', LGBMRegressor(learning_rate = 0.05, max_depth = 3))
]

X = train_box_out.drop('y', axis = 1)
y = train_box_out.y

vt = VotingRegressor(estimators=models)

scores = cross_val_score(vt, X, y, scoring='r2', cv=5)

print(f'mean r2 score : {np.round(np.mean(scores), 4)}')

mean r2 score : 0.6598


최적의 모델
1. data : object type columns -> drop & one hot encoding  / target -> box_cox scaler / outlier expect value -> drop  
2. estimator : stacking (base model : Random forest, XGB, lightGBM, Ridge / meta model : LinearRegressor)  

mean r2 score : 0.6664