<h1>Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#KFold" data-toc-modified-id="KFold-1">KFold</a></span></li><li><span><a href="#GridSearch" data-toc-modified-id="GridSearch-2">GridSearch</a></span></li><li><span><a href="#RandomSearch" data-toc-modified-id="RandomSearch-3">RandomSearch</a></span></li><li><span><a href="#Model-Ensemble" data-toc-modified-id="Model-Ensemble-4">Model Ensemble</a></span></li></ul></div>

In [1]:
import os
import warnings
from tqdm import tqdm

import pandas as pd
import numpy as np

from sklearn.model_selection import (
    KFold, StratifiedKFold, GridSearchCV, RandomizedSearchCV, RepeatedKFold
)
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import (
    BaggingRegressor, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
)

# Suppress every warning for the rest of the session; note that this also hides
# any warnings raised by the model-selection cells further down.
warnings.filterwarnings('ignore')

In [2]:
# List the files in ./data and display them.
# NOTE(review): os.listdir returns entries in arbitrary order, so any positional
# lookup such as data_list[i] depends on this exact listing — confirm before
# re-running on another machine or after files are added to the directory.
data_list = os.listdir('./data')
data_list

['test_csv.csv',
 'train_pickle.pkl',
 'FIFA_train.csv',
 'train_csv.csv',
 'train_feather.ftr',
 'submission_baseline_rf.csv',
 'FIFA_test.csv',
 'submission.csv',
 'submission_baseline_rf_final.csv']

In [3]:
# Load the datasets by explicit file name rather than by position in data_list:
# os.listdir order is arbitrary, and the directory contents change once this
# notebook writes its own submission file.
train = pd.read_feather('./data/train_feather.ftr')
test = pd.read_csv('./data/test_csv.csv')

In [4]:
# Split the training frame into the target (y) and the feature columns (X)
y_train = train['value']
X_train = train.drop(columns='value')

# KFold
- 아래 for문을 돌리면 마지막 모델만 저장이 됨
- 리스트에 append 하는 방식으로 각각의 모델을 저장하여 사용할 수도 있다.

In [5]:
# 7-fold splitter; rows are shuffled with a fixed seed before splitting
kfold = KFold(n_splits=7, shuffle=True, random_state=120)
# StratifiedKFold: used for classification problems; keeps the class distribution even across folds
# stratifiedkfold = StratifiedKFold()

In [6]:
# Manual cross-validation: fit one forest per fold and report its validation RMSE.
# `rf` is rebound on every pass, so only the last fold's model remains afterwards.
for i, (train_idx, valid_idx) in enumerate(kfold.split(train)):
    # Rows belonging to this fold's training and validation parts
    fold_train, fold_valid = train.iloc[train_idx], train.iloc[valid_idx]

    # Separate features from the target column
    x_tr, y_tr = fold_train.drop(columns='value'), fold_train['value']
    x_val, y_val = fold_valid.drop(columns='value'), fold_valid['value']

    # Train the model for this fold
    rf = RandomForestRegressor(n_estimators=300, random_state=130)
    rf.fit(x_tr, y_tr)

    # Apply expm1 to both prediction and truth before scoring
    # (presumably the target was log1p-transformed upstream — confirm)
    pred = np.expm1(rf.predict(x_val))
    y_val = np.expm1(y_val)

    # Root mean squared error on the validation part
    rmse = np.sqrt(mean_squared_error(y_val, pred))

    print(f'{i+1}번 모델 rmse: {rmse}')

1번 모델 rmse: 687844.2488371134
2번 모델 rmse: 625916.0549405463
3번 모델 rmse: 856642.1689139444
4번 모델 rmse: 468062.7167838183
5번 모델 rmse: 609892.461082129
6번 모델 rmse: 876997.4335459874
7번 모델 rmse: 1455918.1601901588


# GridSearch

In [7]:
# Base estimator for the grid search; only the seed is fixed here
rf = RandomForestRegressor(random_state=120)

In [8]:
# Candidate values for the exhaustive grid search (3 x 3 x 3 combinations)
params = dict(
    n_estimators=[300, 400, 500],
    min_samples_split=[2, 3, 4],
    min_samples_leaf=[1, 2, 3],
)

In [9]:
# 3-fold shuffled splitter with a fixed seed, used as the CV scheme for the grid search
cv = KFold(n_splits=3, shuffle=True, random_state=120)

In [10]:
# Exhaustive search over `params`, scored by negative MSE and run on all available cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
    verbose=2,
)

In [11]:
# Run the grid search on the full training features and target
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   49.8s finished


GridSearchCV(cv=KFold(n_splits=3, random_state=120, shuffle=True),
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=120,
                                             verbo

In [12]:
# Print the winning parameters, the corresponding estimator, its CV score and its candidate index
for result_attr in ('best_params_', 'best_estimator_', 'best_score_', 'best_index_'):
    print(getattr(grid_search, result_attr))

{'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=400, n_jobs=None, oob_score=False,
                      random_state=120, verbose=0, warm_start=False)
-0.009021674662347183
1


In [13]:
# Build (and display) a fresh forest from the searched hyper-parameters only.
# best_params_ holds just the grid keys, so settings fixed on the original estimator
# (e.g. random_state) are not carried over; the result is not assigned to a name.
RandomForestRegressor(**grid_search.best_params_)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=400, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [14]:
# Train the best estimator found by the search on the full training data.
# NOTE(review): with GridSearchCV's default refit behaviour best_estimator_ is normally
# already fitted on the full training set, so this extra fit is likely redundant — confirm.
grid_search.best_estimator_.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=400, n_jobs=None, oob_score=False,
                      random_state=120, verbose=0, warm_start=False)

In [15]:
# Predict on the test set with the fitted model.
# The values are shown exactly as the model returns them; no np.expm1 is applied in this cell.
grid_search.best_estimator_.predict(test)

array([17.70900388, 18.15445908, 18.01956852, ..., 10.99046923,
       10.68916581, 10.82281539])

# RandomSearch

In [16]:
# Base estimator for the randomized search; only the seed is fixed here
rf = RandomForestRegressor(random_state=130)

In [17]:
from scipy.stats import randint, uniform, loguniform

In [18]:
# Sampling distributions for the randomized search.
# scipy's randint(low, high) draws integers from [low, high), i.e. high is exclusive.
params = {
    'n_estimators': randint(100, 600),      # 100..599 trees
    # Lower bound is 2: an integer min_samples_split of 1 is rejected by
    # scikit-learn, so sampling it would only produce failed fits.
    'min_samples_split': randint(2, 8),     # 2..7
    'min_samples_leaf': randint(1, 5)       # 1..4
}

In [19]:
# RepeatedKFold: runs KFold repeatedly, n_repeats times (here 3 splits, repeated 3 times)
cv = RepeatedKFold(n_splits=3, random_state=120, n_repeats=3)

In [22]:
# Randomized search: evaluate 20 parameter settings drawn from `params`,
# scored by negative MSE and run on all available cores.
# random_state pins which candidates get sampled, so a re-run of the notebook
# evaluates the same 20 settings instead of a fresh random set each time.
random_search = RandomizedSearchCV(rf,
                                   param_distributions=params,
                                   cv=cv,
                                   n_iter=20,
                                   scoring='neg_mean_squared_error',
                                   n_jobs=-1,
                                   random_state=130)

In [23]:
# Run the randomized search on the full training features and target
random_search.fit(X_train, y_train)

RandomizedSearchCV(cv=RepeatedKFold(n_repeats=3, n_splits=3, random_state=120),
                   error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction...
                   param_distrib

In [24]:
# Print the best estimator, its parameters, its candidate index and its CV score
for result_attr in ('best_estimator_', 'best_params_', 'best_index_', 'best_score_'):
    print(getattr(random_search, result_attr))

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=3, min_weight_fraction_leaf=0.0,
                      n_estimators=343, n_jobs=None, oob_score=False,
                      random_state=130, verbose=0, warm_start=False)
{'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 343}
10
-0.009534631415567311


# Model Ensemble
- train_data에서 train_data 개수만큼 무작위로 복원추출(bootstrap)한 표본으로 모델을 학습시키는 과정을 여러 번 반복한 뒤, 각 모델 예측값의 평균을 구함
- lightgbm 앙상블이 가장 좋음

In [25]:
# The two ensemble members: the winners of the grid search and the randomized search.
# These names refer to the search objects' own estimators (no copy is made).
rf_1, rf_2 = grid_search.best_estimator_, random_search.best_estimator_

In [26]:
# Fit both tuned forests on the full training set; the cell displays the value
# returned by the last call.
rf_1.fit(X_train, y_train)
rf_2.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=3, min_weight_fraction_leaf=0.0,
                      n_estimators=343, n_jobs=None, oob_score=False,
                      random_state=130, verbose=0, warm_start=False)

In [27]:
# Test-set predictions from each ensemble member, in the same order as the models
pred_1, pred_2 = (member.predict(test) for member in (rf_1, rf_2))

In [28]:
# Equal-weight blend of the two models' predictions
weight_1, weight_2 = 0.5, 0.5
pred = weight_1 * pred_1 + weight_2 * pred_2
pred

array([17.70697775, 18.15654872, 18.0181266 , ..., 10.99191611,
       10.70213227, 10.82235723])

In [30]:
# Ensemble model + bagging: train several forests on bootstrap resamples of the
# training set and keep each one's test predictions.
prediction_list = []  # one array of test predictions per bootstrap round
np.random.seed(123)   # seeds NumPy's global RNG, which drives the resampling below

n_samples = X_train.shape[0]  # loop-invariant, so computed once outside the loop

for _ in tqdm(range(10)):
    # Draw n_samples row positions with replacement (a bootstrap sample).
    # Passing the integer makes choice sample from range(n_samples) directly,
    # so no explicit index list has to be rebuilt on every round.
    random_index = np.random.choice(n_samples, size=n_samples, replace=True)

    # New forest using the hyper-parameters found by the randomized search
    rf = RandomForestRegressor(**random_search.best_params_)
    rf.fit(X_train.iloc[random_index], y_train.iloc[random_index])

    # expm1 maps the predictions back from the model's output scale
    # (presumably the target was log1p-transformed upstream — confirm)
    pred = np.expm1(rf.predict(test))
    prediction_list.append(pred)

100%|██████████| 10/10 [00:55<00:00,  5.60s/it]


In [31]:
# Element-wise mean across the bagging rounds: the per-round arrays are stacked into
# shape (n_rounds, n_test_rows) and averaged over axis 0, leaving one value per test row.
# Wrapped in list() so `prediction` stays a plain list of NumPy floats.
prediction = list(np.mean(prediction_list, axis=0))

In [32]:
# Same averaging as above, done through a DataFrame: one column per bagging round
# (p0, p1, ...). The columns are generated from prediction_list itself, so the frame
# follows however many rounds were run instead of hard-coding ten of them.
df = pd.DataFrame({
    f'p{round_no}': round_pred
    for round_no, round_pred in enumerate(prediction_list)
})

In [33]:
# Peek at the first rows of the per-round prediction table
df.head()

Unnamed: 0,p0,p1,p2,p3,p4,p5,p6,p7,p8,p9
0,36637530.0,78063300.0,48049900.0,72312210.0,71040730.0,31870720.0,44824000.0,56084240.0,54443880.0,38177700.0
1,81226930.0,77081080.0,73828140.0,88009210.0,81500670.0,71740640.0,83111450.0,81427380.0,82219100.0,65555220.0
2,69826830.0,60365410.0,65407050.0,62641880.0,61882410.0,68353470.0,73765920.0,73466750.0,70795880.0,62088710.0
3,70203670.0,65304410.0,70078620.0,65434600.0,67095160.0,70684200.0,73253030.0,74561450.0,75727490.0,63143260.0
4,63997350.0,55523830.0,55291000.0,62394910.0,58299680.0,54230240.0,61320280.0,68717760.0,70126130.0,60779700.0


In [37]:
# Row-wise average over the per-round prediction columns
df_mean = df.mean(axis=1)
df_mean.head()

0    5.315042e+07
1    7.856998e+07
2    6.685943e+07
3    6.954859e+07
4    6.106809e+07
dtype: float64

In [34]:
# Sanity check: mean of the first row of the prediction table
np.mean(df.iloc[0])

53150419.65165807

In [35]:
# Sanity check: first averaged prediction from the loop-based computation,
# for comparison with the DataFrame-based value
prediction[0]

53150419.65165807

In [38]:
# Fill the submission template with the averaged predictions and write it out.
# The template is opened by name: indexing data_list by position is unreliable because
# os.listdir order is arbitrary and this cell itself adds a file to ./data.
submission = pd.read_csv('./data/submission.csv')
submission['value'] = prediction
submission.to_csv('./data/submission_ensemble_rf.csv', index=False)