In [None]:
import sklearn
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from datetime import datetime


pd.options.display.max_rows = 500
pd.options.display.max_columns= 500
pd.options.display.expand_frame_repr=False

In [None]:
# Paths of the training and test CSV files, both derived from the dataset tag.
datatag = 'bikedemand'
train_data_path = 'data/{}_train.csv'.format(datatag)
test_data_path = 'data/{}_test.csv'.format(datatag)

In [None]:
# Load the CSV files into pandas DataFrames (training file first, then the
# blind test file, which is kept for the submission).
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)

# Show the column index of the training frame, then the frame itself.
print(df_train.columns)
print(df_train)

# Feature and target column names used by the model trained without
# preprocessing.
feature = [
    'season', 'holiday', 'workingday', 'weather',
    'temp', 'atemp', 'humidity', 'windspeed',
]
target = ['count']

In [None]:
# Baseline without preprocessing: hold out 25% of the training frame.
train_x, test_x, train_y, test_y = train_test_split(
    df_train[feature],
    df_train[target],
    test_size=0.25,
    random_state=0,
)

# Manual grid search: 3 depths x 3 tree counts x 2 learning rates.
for md in (5, 7, 9):
    for ne in (200, 400, 600):
        for lr in (0.02, 0.03):
            model_kwargs = dict(
                # tree_method='gpu_hist',
                objective='reg:squarederror',
                booster='gbtree',
                max_depth=md,
                n_estimators=ne,
                learning_rate=lr,
                random_state=1,
            )
            holdout = [(test_x[feature], test_y[target])]

            # fit() returns the estimator itself, so construction and fitting
            # can be chained.
            gbm = xgb.XGBRegressor(**model_kwargs).fit(
                train_x[feature],
                train_y[target],
                eval_set=holdout,
                eval_metric=['gamma-deviance', 'rmse', 'mae'],
                verbose=False,
            )

            # One row per boosting round, one column per metric, measured on
            # the hold-out split; report the best round of each metric.
            evals_result = gbm.evals_result()
            df_eval = pd.DataFrame(evals_result['validation_0'])
            print(ne, md, lr, df_eval['rmse'].min(), df_eval['mae'].min())
    

In [None]:
# PREPROCESSING


# 전처리는 재활용목적으로 미리미리 함수로 작성해둔다.
def preprocessing(df_input):
    """Return a copy of ``df_input`` with calendar and one-hot columns added.

    The input frame is left untouched. The returned copy gains:

    * ``hour``, ``day`` (pandas ``dayofweek``, Monday=0) and ``month``,
      extracted from the ``datetime`` column;
    * ``year``, encoded as 0 for 2011 and 1 for 2012 (any other year maps
      to NaN);
    * one ``season_<value>`` and one ``weather_<value>`` indicator column for
      every distinct value present in ``season`` / ``weather``.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame containing at least ``datetime``, ``season`` and ``weather``.

    Returns
    -------
    pandas.DataFrame
        The augmented copy.
    """
    df = df_input.copy()

    # Parse the timestamps once and use the vectorised field accessors instead
    # of re-parsing the column and looping in Python for every field. The
    # int64 cast keeps the same dtype a list of Python ints would produce.
    stamps = pd.DatetimeIndex(df["datetime"])
    df["hour"] = stamps.hour.astype("int64").to_numpy()
    df["day"] = stamps.dayofweek.astype("int64").to_numpy()
    df["month"] = stamps.month.astype("int64").to_numpy()
    df["year"] = stamps.year.astype("int64").to_numpy()
    df["year"] = df["year"].map({2011: 0, 2012: 1})

    # NOTE(review): get_dummies only creates columns for values that occur in
    # this frame, so a category missing from the input yields no column --
    # confirm callers that select fixed column names can rely on all values
    # being present.
    season = pd.get_dummies(df["season"], prefix="season")
    weather = pd.get_dummies(df["weather"], prefix="weather")

    return pd.concat([df, season, weather], axis=1)



# Apply the same preprocessing to the training frame and then to the blind
# test frame.
df_train_preprocessed, df_test_preprocessed = map(preprocessing, (df_train, df_test))

In [None]:
# Check the newly generated feature columns: summary statistics before and
# after preprocessing, separated by a rule, then the first rows of each frame.
print(
    df_train.describe(),
    '-'*100,
    df_train_preprocessed.describe(),
    sep='\n',
)

print(df_train.head(5), df_train_preprocessed.head(5), sep='\n')

In [None]:
# Feature columns for the preprocessed frame: the raw columns, the calendar
# columns, then the one-hot season and weather indicators.
feature = (
    ['season', 'holiday', 'workingday', 'weather',
     'temp', 'atemp', 'humidity', 'windspeed']
    + ['hour', 'day', 'month', 'year']
    + ['season_1', 'season_2', 'season_3', 'season_4']
    + ['weather_1', 'weather_2', 'weather_3', 'weather_4']
)
target = ['count']

In [None]:
# Same 75/25 split as before, this time on the preprocessed frame.
train_x, test_x, train_y, test_y = train_test_split(
    df_train_preprocessed[feature],
    df_train_preprocessed[target],
    test_size=0.25,
    random_state=0,
)

# Manual grid search over depth, number of trees and learning rate.
for md in (5, 7, 9):
    for ne in (200, 400, 600):
        for lr in (0.02, 0.03):
            model_kwargs = dict(
                # tree_method='gpu_hist',
                objective='reg:squarederror',
                booster='gbtree',
                max_depth=md,
                n_estimators=ne,
                learning_rate=lr,
                random_state=1,
            )
            holdout = [(test_x[feature], test_y[target])]

            # fit() returns the estimator itself, so construction and fitting
            # can be chained.
            gbm = xgb.XGBRegressor(**model_kwargs).fit(
                train_x[feature],
                train_y[target],
                eval_set=holdout,
                eval_metric=['rmsle', 'gamma-deviance', 'rmse', 'mae'],
                verbose=False,
            )

            # Best per-round value of each metric on the hold-out split; the
            # last printed field is the best RMSE squared.
            evals_result = gbm.evals_result()
            df_eval = pd.DataFrame(evals_result['validation_0'])
            best_rmse = df_eval['rmse'].min()
            print(ne, md, lr, df_eval['rmsle'].min(), best_rmse, df_eval['mae'].min(), float(best_rmse)**2)

In [None]:
# GRID SEARCH BY GRIDSEARCHCV
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate values for each hyper-parameter.
param_grid = dict(
    max_depth=[5, 7, 9],
    n_estimators=[200, 400, 600],
    learning_rate=[0.02, 0.03],
    # max_features=['auto', 'sqrt', 'log2'],
)

# GRID SEARCH WITH CROSS VALIDATION
# Cross validation splits the training dataset evenly into n parts and
# validates them against each other, so a separate train/test split is not
# needed.

# CHECK AVAILABLE SCORING FUNCTIONS
# for e in sorted(sklearn.metrics.SCORERS.keys()):
#     print(e)

search = GridSearchCV(
    estimator=xgb.XGBRegressor(),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=2,
)
search.fit(train_x[feature], train_y[target])

# Print the best configuration found.
report_line = '%-40s: %s'
print(report_line % ('BEST ESTIMATOR:', search.best_estimator_))
print(report_line % ('BEST SCORE:', search.best_score_))
print(report_line % ('BEST INDEX:', search.best_index_))
print(report_line % ('BEST PARAMS:', search.best_params_))

# print('='*100)

# print('='*100)



In [None]:
# Compare the manual grid-search result with the cross-validated one.
# Strictly speaking, CV-based tuning should be scored on a separately held-out
# blind dataset; split one off if time permits.

from sklearn.metrics import mean_squared_error

pred_y = search.predict(test_x[feature])
# scikit-learn metrics take (y_true, y_pred). Squared error is symmetric, so
# the printed value is the same either way, but the documented order avoids
# surprises if this is swapped for an asymmetric metric or given sample_weight.
print(mean_squared_error(test_y, pred_y))

## BAYESIAN OPTIMIZATION !!!

In [None]:
# GRID SEARCH 를 자동으로 진행하는 방법.
# 이론상세는 다음 링크를 참조
# https://www.kdnuggets.com/2019/07/xgboost-random-forest-bayesian-optimisation.html

from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score

In [None]:
def bayesian_optimization(dataset, function, parameters):
    """Maximise ``function`` over ``parameters`` with Bayesian optimisation.

    Parameters
    ----------
    dataset : object
        Not used by the search itself; accepted so existing callers that pass
        the ``(train_x, test_x, train_y, test_y)`` tuple keep working.
    function : callable
        Objective handed to ``BayesianOptimization``; its return value is
        maximised.
    parameters : dict
        Search bounds handed to ``BayesianOptimization``.

    Returns
    -------
    The optimiser's ``max`` attribute (the caller reads its ``"params"``
    entry).
    """
    n_iterations = 5
    gp_params = {"alpha": 1e-4}

    BO = BayesianOptimization(function, parameters)
    # Configure the Gaussian process through set_gp_params() rather than
    # forwarding the settings through maximize(**gp_params): newer bayes_opt
    # releases no longer accept extra keyword arguments on maximize().
    BO.set_gp_params(**gp_params)
    BO.maximize(n_iter=n_iterations)

    return BO.max

In [None]:
def xgb_optimization(cv_splits, eval_set):
    """Build the objective and search bounds for tuning an XGBoost regressor.

    Parameters
    ----------
    cv_splits
        Passed to ``cross_val_score`` as ``cv``.
    eval_set
        Passed to each fit as ``eval_set`` (used for early stopping).

    Returns
    -------
    (function, parameters)
        ``function`` is the objective to maximise and ``parameters`` maps each
        tunable name to its ``(lower, upper)`` bound. The argument names of
        ``function`` match the keys of ``parameters`` exactly, because the
        optimiser passes the sampled values by keyword.

    Notes
    -----
    The objective reads ``train_x`` and ``train_y`` from the enclosing module
    scope at call time.
    """
    def function(learning_rate, max_depth, n_estimators):
        """Mean cross-validated negative MSE for one hyper-parameter sample."""
        # The optimiser samples floats; integer-valued settings are truncated.
        model = xgb.XGBRegressor(
            objective='reg:squarederror',
            n_estimators=int(max(n_estimators, 0)),
            learning_rate=max(learning_rate, 0),
            max_depth=int(max_depth),
            seed=42,
            nthread=-1,
        )
        return cross_val_score(
            model,
            X=train_x,
            y=train_y,
            cv=cv_splits,
            scoring="neg_mean_squared_error",
            fit_params={
                "early_stopping_rounds": 10,
                "eval_metric": "rmse",
                "eval_set": eval_set},
            n_jobs=-1).mean()

    # ========================================================
    # Ranges of the hyper-parameters to tune are set below.
    # ========================================================
    parameters = {"learning_rate": (0.001, 0.04),
                  "max_depth": (2, 12),
                  "n_estimators": (100, 2000)}

    return function, parameters

In [None]:
def train(train_x, test_x, train_y, test_y, function, parameters):
    """Tune hyper-parameters, refit once with the best ones and print scores.

    Runs ``bayesian_optimization`` with ``function`` and ``parameters``,
    prints every tuned parameter, fits an ``XGBRegressor`` on the training
    split with the tuned ``max_depth``, ``n_estimators`` and
    ``learning_rate``, and prints the best per-round metrics measured on the
    test split. Column selection uses the module-level ``feature`` and
    ``target`` lists. Nothing is returned.
    """
    best_solution = bayesian_optimization(
        (train_x, test_x, train_y, test_y), function, parameters
    )
    params = best_solution["params"]

    # ======================================================================================
    # Print the best hyper-parameters found by Bayesian optimisation.
    # ======================================================================================
    for name in params:
        print(name, params[name])

    # The optimiser works with floats; depth and tree count must be integers.
    depth = int(params['max_depth'])
    n_trees = int(params['n_estimators'])
    rate = params['learning_rate']

    # ======================================================================================
    # Train with those hyper-parameters and print the resulting scores.
    # ======================================================================================
    model_kwargs = dict(
        # tree_method='gpu_hist',
        objective='reg:squarederror',
        max_depth=depth,
        n_estimators=n_trees,
        learning_rate=rate,
        random_state=1,
    )
    # fit() returns the estimator itself.
    gbm = xgb.XGBRegressor(**model_kwargs).fit(
        train_x[feature],
        train_y[target],
        eval_set=[(test_x[feature], test_y[target])],
        eval_metric=['rmsle', 'gamma-deviance', 'rmse', 'mae'],
        verbose=False,
    )

    # Best per-round value of each metric; the last field is best RMSE squared.
    df_eval = pd.DataFrame(gbm.evals_result()['validation_0'])
    best_rmse = df_eval['rmse'].min()
    print(n_trees, depth, rate, df_eval['rmsle'].min(), best_rmse, df_eval['mae'].min(), float(best_rmse)**2)


In [None]:
# Run the Bayesian-optimisation search and the final fit: build the objective
# and its bounds with 5 CV splits and the hold-out split as eval set, then
# hand both to train().
holdout_eval_set = [(test_x[feature], test_y[target])]
objective_fn, search_space = xgb_optimization(5, holdout_eval_set)
train(train_x, test_x, train_y, test_y, objective_fn, search_space)

In [None]:
# ======================================================================================
## MACHINE LEARNING SUMMARY!
# ======================================================================================


1) 무엇보다 "모델 튜닝"보다 전처리가 "훨씬" 중요하다.
2) 알고리즘별로 적절한 SCORING METHOD를 골라야 한다. (구글링 키워드: scoring metrics)
3) xgboost는 max_depth, n_estimators, learning_rate 외에도 많은 params를 튜닝할 수 있다.
   공식 문서를 참조하여 각 params의 의미를 파악하고 적절한 parameter를 선택한다.


