In [None]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

from tqdm import tqdm

In [None]:
import pandas  as pd
import xgboost as xgb

#===========================================================================
# read in the data
# Original kernel: https://www.kaggle.com/carlmcbrideellis/very-simple-xgboost-regression
#===========================================================================
# Load the competition's train and test tables, in that order, from the Kaggle input directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-jan-2021/train.csv',
        '../input/tabular-playground-series-jan-2021/test.csv',
    )
)

#===========================================================================
# select some features of interest ("ay, there's the rub", Shakespeare)
#===========================================================================
# The fourteen continuous columns 'cont1' .. 'cont14'.
features = [f'cont{i}' for i in range(1, 15)]

#===========================================================================
# build the training matrix / target and the matrix to predict on
#===========================================================================
X_train, y_train = train_data[features], train_data["target"]
final_X_test = test_data[features]

#===========================================================================
# XGBoost regression: 
# Parameters: 
# n_estimators  "Number of gradient boosted trees. Equivalent to number 
#                of boosting rounds."
# learning_rate "Boosting learning rate (xgb’s “eta”)"
# max_depth     "Maximum depth of a tree. Increasing this value will make 
#                the model more complex and more likely to overfit." 
#===========================================================================
# regressor=xgb.XGBRegressor(n_estimators  = 500,
#                            learning_rate = 0.1,
#                            max_depth     = 5)
# regressor.fit(X_train, y_train)

#===========================================================================
# To use early_stopping_rounds: 
# "Validation metric needs to improve at least once in every 
# early_stopping_rounds round(s) to continue training."
#===========================================================================
# Hold out 10% of the rows (fixed seed) for early stopping. X_train / y_train
# are rebound to the remaining 90%; X_test / y_test are the held-out part,
# not the competition test set (that one is final_X_test).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=0,
)

# params for XGB are taken from this great kernel https://www.kaggle.com/hamzaghanmi/xgboost-hyperparameter-tuning-using-optuna 
# by Hamza Ghanmi
#
# Two arguments from the original call were removed because XGBoost does not
# use them and only reports them as parameters that "might not be used":
#   * metric_period -- looks like a CatBoost option, not an XGBoost one
#   * silent        -- superseded by `verbosity` in XGBoost (the original
#                      author had already noted that it produced a warning)
# NOTE(review): if per-iteration logging is wanted, use `verbose=` in fit()
# or `verbosity=` here -- confirm against the installed XGBoost version.
regressor = xgb.XGBRegressor(
    colsample_bytree=0.5,
    alpha=0.01563,          # L1 regularisation; XGBoost documents `alpha` as the alias of `reg_alpha`
    learning_rate=0.01,
    max_depth=15,
    min_child_weight=257,
    n_estimators=4000,      # upper bound; early stopping below usually ends training sooner
    reg_lambda=0.003,
    subsample=0.7,
    random_state=2020,
)

# Train with early stopping on the held-out split: boosting stops once the
# validation metric has not improved for 6 consecutive rounds.
# NOTE(review): newer XGBoost releases expect `early_stopping_rounds` in the
# constructor instead of fit() -- confirm against the installed version.
regressor.fit(X_train, y_train, early_stopping_rounds=6, eval_set=[(X_test, y_test)], verbose=0)

In [None]:
#===========================================================================
# use the fitted XGB model to predict the target for the test data
# NOTE(review): `predictions` is reassigned further down before anything is
# written to disk -- confirm whether these XGB predictions are still needed.
#===========================================================================
predictions = regressor.predict(final_X_test)

In [None]:
# Feature matrices (everything except the id / target columns) and the target.
X = train_data.drop(columns=['id', 'target'])
Xtest = test_data.drop(columns=['id'])
y = train_data['target']

# Ordered (unshuffled) holdout: the first 85% of the rows train, the rest validate.
train = int(len(X)*0.85)
Xtrain, ytrain = X.iloc[:train], y.iloc[:train]
Xval, yval = X.iloc[train:], y.iloc[train:]

In [None]:
# params from this kernel https://www.kaggle.com/kailex/tabular-playground

# Hyper-parameters passed to LGBMRegressor(**params).
# Fix: the key was spelled 'max_dept', which LightGBM does not recognise;
# it is now 'max_depth' (-1 means "no depth limit" in LightGBM).
# NOTE(review): some keys look like LightGBM aliases of each other
# ('min_data_in_leaf' vs 'min_child_samples', and 'n_estimators' vs the
# 'num_iterations' key that is added later) -- confirm which value wins.
params = {
    'random_state': 33,
    'n_estimators': 5000,
    'min_data_per_group': 5,
    'boosting_type': 'gbdt',
    'num_leaves': 256,
    'max_depth': -1,
    'learning_rate': 0.02,
    'subsample_for_bin': 200000,
    'lambda_l1': 1.074622455507616e-05,
    'lambda_l2': 2.0521330798729704e-06,
    'n_jobs': -1,
    'cat_smooth': 1.0,
    'silent': True,
    'importance_type': 'split',
    'metric': 'rmse',
    'feature_pre_filter': False,
    'bagging_fraction': 0.8206341150202605,
    'min_data_in_leaf': 100,
    'min_sum_hessian_in_leaf': 0.001,
    'bagging_freq': 6,
    'feature_fraction': 0.5,
    'min_gain_to_split': 0.0,
    'min_child_samples': 20,
}

In [None]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold

N_FOLDS = 7

# K-fold cross-validation comparing the tuned LightGBM configuration with a
# default LGBMRegressor. Out-of-fold (OOF) predictions are collected for both
# models, and the tuned model's test predictions are averaged over the folds.
kf = KFold(n_splits=N_FOLDS)
oof = np.zeros(len(y))
oof_vanilla = np.zeros(len(y))
preds = np.zeros(len(Xtest))

# NOTE: these assignments modify the shared `params` dict in place, so any
# later code that reuses `params` sees the overrides too.
params['learning_rate'] = 0.005
params['num_iterations'] = 5000

# kf.split() is a generator, so tqdm needs `total` to show a real progress bar.
for train_ind, test_ind in tqdm(kf.split(X), total=N_FOLDS):
    # KFold yields positional indices, hence .iloc.
    Xtrain, Xval = X.iloc[train_ind], X.iloc[test_ind]
    ytrain, yval = y.iloc[train_ind], y.iloc[test_ind]

    model = LGBMRegressor(**params)
    vanilla_model = LGBMRegressor()

    # eval_set is given as a list of (X, y) pairs, the documented form.
    # NOTE(review): newer LightGBM releases replace the early_stopping_rounds /
    # verbose fit arguments with callbacks -- confirm against the installed version.
    model.fit(Xtrain, ytrain, eval_set=[(Xval, yval)], early_stopping_rounds=50, verbose=0)
    vanilla_model.fit(Xtrain, ytrain)

    p = model.predict(Xval)
    p_vanilla = vanilla_model.predict(Xval)
    oof[test_ind] = p
    oof_vanilla[test_ind] = p_vanilla

    # Each fold contributes 1/N_FOLDS of the final test prediction.
    preds += model.predict(Xtest)/N_FOLDS

# The reported score is the ROOT mean squared error. It is computed as
# sqrt(MSE) so it does not depend on the `squared=` argument, which recent
# scikit-learn versions no longer accept.
rmse_vanilla = np.sqrt(mean_squared_error(y, oof_vanilla))
rmse_tuned = np.sqrt(mean_squared_error(y, oof))
print(f'root mean square error on out-of-fold training data (vanilla model): {np.round(rmse_vanilla, 5)}')
print(f'root mean square error on out-of-fold training data (with optuna tuning): {np.round(rmse_tuned, 5)}')

In [None]:
# Rebuild the feature matrices and the target from the raw tables.
X = train_data.drop(columns=['id', 'target'])
Xtest = test_data.drop(columns=['id'])
y = train_data['target']

In [None]:

import random

# Tag every training row with a cross-validation fold id in 1..7.
# Rows are handled in consecutive blocks of 7; within each block the fold ids
# are a fresh random permutation, so every fold gets exactly one row per block.
tmp = X.copy()
fold_list = [1, 2, 3, 4, 5, 6, 7]

# Private, seeded generator: the assignment is reproducible across runs and
# the global `random` state is left untouched.
fold_rng = random.Random(2021)

n_blocks, n_leftover = divmod(len(tmp), len(fold_list))
folds = []
for _ in range(n_blocks):
    fold_rng.shuffle(fold_list)
    folds.extend(fold_list)

# Rows that do not fill a whole block go to folds 1..n_leftover. With exactly
# one leftover row this matches the previous hard-coded `+ [1]`, but it no
# longer raises a length mismatch for other row counts.
folds.extend(range(1, n_leftover + 1))

tmp['fold'] = folds
tmp.head(7)

In [None]:
# Train one LightGBM model per fold (using the fold ids stored in tmp['fold'])
# and average the models' predictions on the test features.
# NOTE(review): `params` may have been modified by earlier cells (learning
# rate / iteration count overrides) -- confirm the intended values.
n_folds = tmp['fold'].nunique()
predictions = np.zeros(len(Xtest))
for fold in range(1, n_folds + 1):
    # These are index LABELS of tmp, which shares its index with X and y,
    # so rows are selected with .loc rather than positional .iloc.
    train_index_list = tmp[tmp['fold'] != fold].index
    test_index_list = tmp[tmp['fold'] == fold].index

    X_train, y_train = X.loc[train_index_list], y.loc[train_index_list]
    X_val, y_val = X.loc[test_index_list], y.loc[test_index_list]

    model = LGBMRegressor(**params)
    eval_set = [(X_val, y_val)]
    # No early stopping here: every model trains for the full number of
    # iterations; the eval set is only used for metric tracking.
    model.fit(X_train, y_train, eval_metric='rmse', eval_set=eval_set, verbose=False)
    predictions += model.predict(Xtest)

# Average over the folds.
predictions = predictions / n_folds

In [None]:
# Attach the averaged predictions to the test frame, in place, for inspection.
Xtest.loc[:, "target"] = predictions

In [None]:
# Preview the first five rows of the test frame.
Xtest.head(n=5)

In [None]:
# Build the submission table (test ids next to their predicted targets) and
# write it as CSV without the index column.
submission_columns = {"id": test_data["id"], "target": predictions}
output = pd.DataFrame(submission_columns)
output.to_csv('submission7.csv', index=False)

In [None]:
# Display the submission table as the cell's output.
output