In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

from tqdm import tqdm

In [2]:
import pandas  as pd
import xgboost as xgb

#===========================================================================
# read in the data
# Original kernel: https://www.kaggle.com/carlmcbrideellis/very-simple-xgboost-regression
#===========================================================================
train_data = pd.read_csv('../input/tabular-playground-series-jan-2021/train.csv')
test_data  = pd.read_csv('../input/tabular-playground-series-jan-2021/test.csv')

#===========================================================================
# select some features of interest ("ay, there's the rub", Shakespeare)
#===========================================================================
features = ['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
       'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13', 'cont14']

#===========================================================================
#===========================================================================
X_train = train_data[features]
y_train = train_data["target"]
final_X_test = test_data[features]

#===========================================================================
# XGBoost regression: 
# Parameters: 
# n_estimators  "Number of gradient boosted trees. Equivalent to number 
#                of boosting rounds."
# learning_rate "Boosting learning rate (xgb’s “eta”)"
# max_depth     "Maximum depth of a tree. Increasing this value will make 
#                the model more complex and more likely to overfit." 
#===========================================================================
# regressor=xgb.XGBRegressor(n_estimators  = 500,
#                            learning_rate = 0.1,
#                            max_depth     = 5)
# regressor.fit(X_train, y_train)

#===========================================================================
# To use early_stopping_rounds: 
# "Validation metric needs to improve at least once in every 
# early_stopping_rounds round(s) to continue training."
#===========================================================================
# perform a test/train split 
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_train, y_train, test_size=0.1, random_state=0)

# params for XGB are taked from this great kernel https://www.kaggle.com/hamzaghanmi/xgboost-hyperparameter-tuning-using-optuna 
# by Hamza Ghanmi

regressor = xgb.XGBRegressor(
                 colsample_bytree=0.5,
                 alpha=0.01563,
                 #gamma=0.0,
                 learning_rate=0.01,
                 max_depth=15,
                 min_child_weight=257,
                 n_estimators=4000,                                                                  
                 #reg_alpha=0.9,
                 reg_lambda=0.003,
                 subsample=0.7,
                 random_state=2020,
                 metric_period=100,
                 silent=0)#silent 1にするとwarningがでてくる

regressor.fit(X_train, y_train, early_stopping_rounds=6, eval_set=[(X_test, y_test)], verbose=0)

Parameters: { metric_period, silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-rmse:7.37014
[1]	validation_0-rmse:7.29718
[2]	validation_0-rmse:7.22493
[3]	validation_0-rmse:7.15340
[4]	validation_0-rmse:7.08261
[5]	validation_0-rmse:7.01254
[6]	validation_0-rmse:6.94320
[7]	validation_0-rmse:6.87452
[8]	validation_0-rmse:6.80658
[9]	validation_0-rmse:6.73929
[10]	validation_0-rmse:6.67271
[11]	validation_0-rmse:6.60677
[12]	validation_0-rmse:6.54150
[13]	validation_0-rmse:6.47690
[14]	validation_0-rmse:6.41297
[15]	validation_0-rmse:6.34965
[16]	validation_0-rmse:6.28699
[17]	validation_0-rmse:6.22495
[18]	validation_0-rmse:6.16354
[19]	validation_0-rmse:6.10277
[20]	validation_0-rmse:6.04260
[21]	validation_0-rmse:5.98306
[22]	validation_0-rmse:5.92411


[249]	validation_0-rmse:0.92568
[250]	validation_0-rmse:0.92174
[251]	validation_0-rmse:0.91788
[252]	validation_0-rmse:0.91405
[253]	validation_0-rmse:0.91030
[254]	validation_0-rmse:0.90661
[255]	validation_0-rmse:0.90299
[256]	validation_0-rmse:0.89942
[257]	validation_0-rmse:0.89591
[258]	validation_0-rmse:0.89246
[259]	validation_0-rmse:0.88905
[260]	validation_0-rmse:0.88570
[261]	validation_0-rmse:0.88241
[262]	validation_0-rmse:0.87918
[263]	validation_0-rmse:0.87599
[264]	validation_0-rmse:0.87287
[265]	validation_0-rmse:0.86979
[266]	validation_0-rmse:0.86675
[267]	validation_0-rmse:0.86376
[268]	validation_0-rmse:0.86082
[269]	validation_0-rmse:0.85793
[270]	validation_0-rmse:0.85509
[271]	validation_0-rmse:0.85229
[272]	validation_0-rmse:0.84955
[273]	validation_0-rmse:0.84685
[274]	validation_0-rmse:0.84419
[275]	validation_0-rmse:0.84157
[276]	validation_0-rmse:0.83901
[277]	validation_0-rmse:0.83649
[278]	validation_0-rmse:0.83400
[279]	validation_0-rmse:0.83156
[280]	va

[506]	validation_0-rmse:0.69916
[507]	validation_0-rmse:0.69912
[508]	validation_0-rmse:0.69909
[509]	validation_0-rmse:0.69905
[510]	validation_0-rmse:0.69902
[511]	validation_0-rmse:0.69899
[512]	validation_0-rmse:0.69895
[513]	validation_0-rmse:0.69891
[514]	validation_0-rmse:0.69888
[515]	validation_0-rmse:0.69884
[516]	validation_0-rmse:0.69881
[517]	validation_0-rmse:0.69878
[518]	validation_0-rmse:0.69875
[519]	validation_0-rmse:0.69872
[520]	validation_0-rmse:0.69869
[521]	validation_0-rmse:0.69866
[522]	validation_0-rmse:0.69863
[523]	validation_0-rmse:0.69860
[524]	validation_0-rmse:0.69857
[525]	validation_0-rmse:0.69854
[526]	validation_0-rmse:0.69852
[527]	validation_0-rmse:0.69849
[528]	validation_0-rmse:0.69846
[529]	validation_0-rmse:0.69843
[530]	validation_0-rmse:0.69840
[531]	validation_0-rmse:0.69838
[532]	validation_0-rmse:0.69835
[533]	validation_0-rmse:0.69832
[534]	validation_0-rmse:0.69829
[535]	validation_0-rmse:0.69827
[536]	validation_0-rmse:0.69824
[537]	va

[763]	validation_0-rmse:0.69632
[764]	validation_0-rmse:0.69632
[765]	validation_0-rmse:0.69632
[766]	validation_0-rmse:0.69632
[767]	validation_0-rmse:0.69632
[768]	validation_0-rmse:0.69632
[769]	validation_0-rmse:0.69631
[770]	validation_0-rmse:0.69631
[771]	validation_0-rmse:0.69630
[772]	validation_0-rmse:0.69630
[773]	validation_0-rmse:0.69630
[774]	validation_0-rmse:0.69630
[775]	validation_0-rmse:0.69630
[776]	validation_0-rmse:0.69629
[777]	validation_0-rmse:0.69629
[778]	validation_0-rmse:0.69629
[779]	validation_0-rmse:0.69628
[780]	validation_0-rmse:0.69628
[781]	validation_0-rmse:0.69627
[782]	validation_0-rmse:0.69627
[783]	validation_0-rmse:0.69627
[784]	validation_0-rmse:0.69627
[785]	validation_0-rmse:0.69627
[786]	validation_0-rmse:0.69626
[787]	validation_0-rmse:0.69626
[788]	validation_0-rmse:0.69626
[789]	validation_0-rmse:0.69625
[790]	validation_0-rmse:0.69625
[791]	validation_0-rmse:0.69625
[792]	validation_0-rmse:0.69624
[793]	validation_0-rmse:0.69624
[794]	va

XGBRegressor(alpha=0.01563, base_score=0.5, booster='gbtree',
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.5,
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=15, metric_period=100, min_child_weight=257, missing=nan,
             monotone_constraints='()', n_estimators=4000, n_jobs=4,
             num_parallel_tree=1, random_state=2020, reg_alpha=0.0156299993,
             reg_lambda=0.003, scale_pos_weight=1, silent=1, subsample=0.7,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [3]:
#===========================================================================
# use the model XGB to predict the prices for the test data
#===========================================================================
predictions = regressor.predict(final_X_test)

In [4]:
X = train_data.drop(['id','target'], axis=1)
Xtest = test_data.drop(['id'], axis=1)
y = train_data['target']

train = int(len(X)*0.85)
Xtrain, Xval = X.iloc[:train], X.iloc[train:]
ytrain, yval = y.iloc[:train], y.iloc[train:]

In [5]:
# params from this kernel https://www.kaggle.com/kailex/tabular-playground

params={'random_state': 33,'n_estimators':5000,
 'min_data_per_group': 5,
 'boosting_type': 'gbdt',
 'num_leaves': 256,
 'max_dept': -1,
 'learning_rate': 0.02,
 'subsample_for_bin': 200000,
 'lambda_l1': 1.074622455507616e-05,
 'lambda_l2': 2.0521330798729704e-06,
 'n_jobs': -1,
 'cat_smooth': 1.0,
 'silent': True,
 'importance_type': 'split',
 'metric': 'rmse',
 'feature_pre_filter': False,
 'bagging_fraction': 0.8206341150202605,
 'min_data_in_leaf': 100,
 'min_sum_hessian_in_leaf': 0.001,
 'bagging_freq': 6,
 'feature_fraction': 0.5,
 'min_gain_to_split': 0.0,
 'min_child_samples': 20}

In [6]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold

N_FOLDS = 7

kf = KFold(n_splits = N_FOLDS)
oof = np.zeros(len(y))
oof_vanilla = np.zeros(len(y))
preds = np.zeros(len(Xtest))
params['learning_rate'] = 0.005
params['num_iterations'] = 5000
for train_ind, test_ind in tqdm(kf.split(X)):
    Xtrain = X.iloc[train_ind]
    Xval = X.iloc[test_ind]
    ytrain = y.iloc[train_ind]
    yval = y.iloc[test_ind]

    model = LGBMRegressor(**params)
    vanilla_model = LGBMRegressor()
    
    model.fit(Xtrain, ytrain, eval_set = ((Xval,yval)), early_stopping_rounds = 50, verbose = 0)
    vanilla_model.fit(Xtrain, ytrain)
    p = model.predict(Xval)
    p_vanilla = vanilla_model.predict(Xval)
    oof[test_ind] = p
    oof_vanilla[test_ind] = p_vanilla
    
    preds += model.predict(Xtest)/N_FOLDS
    
print(f'mean square error on training data (vanilla model): {np.round(mean_squared_error(y, oof_vanilla, squared=False),5)}')    
print(f'mean square error on training data (with optuna tuning): {np.round(mean_squared_error(y, oof, squared=False),5)}')





1it [03:23, 203.22s/it]



2it [07:00, 211.47s/it]



3it [10:43, 216.65s/it]



4it [14:56, 231.10s/it]



5it [18:23, 222.47s/it]



6it [22:31, 231.17s/it]



7it [25:23, 217.59s/it]

mean square error on training data (vanilla model): 0.70252
mean square error on training data (with optuna tuning): 0.69566





In [7]:
X = train_data.drop(['id','target'], axis=1)
Xtest = test_data.drop(['id'], axis=1)
y = train_data['target']

In [9]:

tmp=X.copy()
import random
fold_list = [1,2,3,4,5,6,7]
folds = []
for i in range(int((tmp.shape[0])/7)):
    random.shuffle(fold_list)
    folds.extend(fold_list)
folds=folds+[1]
tmp['fold'] = folds
tmp.head(7)

Unnamed: 0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,fold
0,0.67039,0.8113,0.643968,0.291791,0.284117,0.855953,0.8907,0.285542,0.558245,0.779418,0.921832,0.866772,0.878733,0.305411,2
1,0.388053,0.621104,0.686102,0.501149,0.64379,0.449805,0.510824,0.580748,0.418335,0.432632,0.439872,0.434971,0.369957,0.369484,1
2,0.83495,0.227436,0.301584,0.293408,0.606839,0.829175,0.506143,0.558771,0.587603,0.823312,0.567007,0.677708,0.882938,0.303047,4
3,0.820708,0.160155,0.546887,0.726104,0.282444,0.785108,0.752758,0.823267,0.574466,0.580843,0.769594,0.818143,0.914281,0.279528,7
4,0.935278,0.421235,0.303801,0.880214,0.66561,0.830131,0.487113,0.604157,0.874658,0.863427,0.983575,0.900464,0.935918,0.435772,3
5,0.352623,0.258867,0.327373,0.802627,0.284219,0.296886,0.209743,0.27371,0.308018,0.235851,0.27876,0.251406,0.339135,0.293129,6
6,0.259096,0.803934,0.5809,0.322884,0.984705,0.378247,0.432821,0.562059,0.290965,0.316543,0.219192,0.326977,0.458653,0.2443,5


In [12]:
predictions = np.zeros(len(Xtest))
for fold in range(1,8):
    train_index_list = tmp[tmp['fold'] != fold].index
    test_index_list = tmp[tmp['fold'] == fold].index
                          
    X_train = X.iloc[train_index_list]
    y_train = y.iloc[train_index_list]
    X_val = X.iloc[test_index_list]
    y_val = y.iloc[test_index_list]

    model=LGBMRegressor(**params)
    eval_set = [(X_val, y_val)]
    model.fit(X_train, y_train,eval_metric='rmse', eval_set=eval_set, verbose=False)#,early_stopping_rounds=5,
    predictions += model.predict(Xtest)
predictions = predictions/7





























In [13]:
Xtest["target"]=predictions

In [17]:
Xtest.head()

Unnamed: 0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,target
0,0.3536,0.73878,0.600939,0.293377,0.285691,0.458006,0.620704,0.422249,0.369203,0.435727,0.55054,0.699134,0.286864,0.364515,7.91818
1,0.907222,0.189756,0.215531,0.869915,0.301333,0.528958,0.390351,0.521112,0.794779,0.79858,0.446475,0.449037,0.916964,0.513002,7.903751
2,0.179287,0.355353,0.623972,0.437812,0.282476,0.320826,0.386789,0.776422,0.222268,0.229102,0.211913,0.222651,0.327164,0.827941,7.925541
3,0.359385,0.181049,0.551368,0.206386,0.280763,0.482076,0.506677,0.362793,0.379737,0.345686,0.445276,0.518485,0.299028,0.598166,8.277034
4,0.335791,0.682607,0.676481,0.219465,0.282861,0.581721,0.748639,0.350158,0.448915,0.506878,0.817721,0.805895,0.790591,0.249275,8.083853


In [18]:
output=pd.DataFrame({"id":test_data.id, "target":predictions})
output.to_csv('submission7.csv', index=False)

In [19]:
output

Unnamed: 0,id,target
0,0,7.918180
1,2,7.903751
2,6,7.925541
3,7,8.277034
4,10,8.083853
...,...,...
199995,499984,8.189553
199996,499985,8.166483
199997,499987,8.214797
199998,499988,8.039639
