In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

In [2]:
# Read the training and test tables from the working directory.
train, test = (
    pd.read_csv(csv_path) for csv_path in ('train_data.csv', 'test_data.csv')
)

In [3]:
# Integer-encode the EQUIP1..EQUIP4 columns of both frames.
#
# The encoder is fitted once per column on the combined train + test values,
# so a given category maps to the same integer code in both frames.
# Fitting separately on each frame (as was done before) assigns codes
# independently, and the same category can end up with different codes in
# train and test whenever the two category sets differ.
le = LabelEncoder()
for i in range(4):
    equip_col = 'EQUIP' + str(i + 1)
    le.fit(pd.concat([train[equip_col], test[equip_col]], ignore_index=True))
    train[equip_col] = le.transform(train[equip_col])
    test[equip_col] = le.transform(test[equip_col])

In [4]:
# Separate the regression target ('OUT') from the feature columns.
label = train['OUT']
data = train.drop(columns=['OUT'])

In [5]:
def rmse(label, pred):
    """Return the root-mean-squared error between targets and predictions.

    Both inputs are converted to flat float arrays and compared element by
    element *by position*. A pandas index on either argument is ignored, so
    two Series with different (e.g. shuffled) indexes are not silently
    re-aligned, and NaN values propagate into the result instead of being
    dropped from the numerator only.

    Parameters
    ----------
    label : array-like
        Ground-truth values.
    pred : array-like
        Predicted values; must contain the same number of elements as `label`.

    Returns
    -------
    numpy.float64
        sqrt(mean((label - pred) ** 2)); NaN for empty input.

    Raises
    ------
    ValueError
        If `label` and `pred` do not contain the same number of elements.
    """
    actual = np.asarray(label, dtype=float).ravel()
    predicted = np.asarray(pred, dtype=float).ravel()
    if actual.size != predicted.size:
        raise ValueError(
            "label and pred must have the same number of elements "
            "({} != {})".format(actual.size, predicted.size)
        )
    return np.sqrt(np.mean(np.square(actual - predicted)))

In [6]:
# Hold out part of the labelled data for validation; the seed is fixed so the
# split is reproducible.
train_data, val_data, train_label, val_label = train_test_split(
    data,
    label,
    random_state=100,
)

In [7]:
# Base estimator handed to the grid search; the remaining hyper-parameters
# are left at the library defaults and tuned by the search.
model = XGBRegressor(
    learning_rate=0.1,
    n_jobs=-1,
    random_state=0,
)

In [9]:
# Hyper-parameter grid: 2 * 3 * 3 = 18 candidate combinations.
param = dict(
    base_score=[0.4, 0.5],
    max_depth=[9, 10, 11],
    n_estimators=[800, 850, 900],
)

In [10]:
# Exhaustive 3-fold cross-validated search over `param`. The scorer is the
# negated RMSE, so scores are <= 0 and values closer to zero are better.
grid = GridSearchCV(
    model,
    param_grid=param,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=True,
)

In [11]:
# Run the search on the training split only; the validation split stays unseen.
grid.fit(train_data, train_label)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:  3.4min finished


GridSearchCV(cv=3,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=0.1, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=-1,
                                    num_parallel_tree=None, predictor=None,
                                    random_state=0, reg_alpha=None,
                                    reg_lambda=None, scale_pos_weight=None,
             

In [12]:
# Summarise the search, then score the winning estimator on the hold-out split.
grid_best = grid.best_estimator_
print('xgboost best parameters : ', grid.best_params_)
# best_score_ is the mean cross-validated score under the negated-RMSE scorer,
# hence the negative sign in the printed value.
print('xgboost best score : ', grid.best_score_)
pred = grid_best.predict(val_data)
print("xgboost : {}".format(rmse(val_label, pred)))

xgboost best parameters :  {'base_score': 0.5, 'max_depth': 9, 'n_estimators': 900}
xgboost best score :  -0.28489352165276727
xgboost : 0.2667962608954327


In [13]:
# Configuration marked as "best" by the author. NOTE(review): these values
# differ from the grid search's printed best_params_
# (base_score=0.5, max_depth=9, n_estimators=900) — confirm this is intended.
model = XGBRegressor(
    base_score=0.4,
    learning_rate=0.1,
    max_depth=10,
    n_estimators=800,
    n_jobs=-1,
    random_state=0,
)

In [15]:
# Fit the current `model` on the training split and print its RMSE on the
# hold-out validation split.
model.fit(train_data, train_label)
pred = model.predict(val_data)
print(rmse(val_label, pred))

0.2655772653773409


In [11]:
# Refit on every labelled row (training + validation) before predicting on
# the test set.
model.fit(data, label)

XGBRegressor(base_score=0.4, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=10, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=800, n_jobs=-1,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [12]:
# Predict the target for the test frame.
# NOTE(review): assumes `test` has exactly the feature columns of `data` — confirm.
pred = model.predict(test)

In [13]:
# Start from the sample answer file and fill its 'OUT' column with the
# test-set predictions.
submit = pd.read_csv('answer_sample.csv').assign(OUT=pred)

In [14]:
# Write the submission file; the DataFrame index is not written.
submit.to_csv(path_or_buf='XGBOOST_answer.csv', index=False)