In [1]:


from pathlib import Path

import numpy as np
import optuna
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor



In [2]:
# Directory holding the competition CSV files. Kept in one constant so the
# notebook can be pointed at another copy of the data by editing a single line.
DATA_DIR = Path(r'C:\Users\Professional\Desktop\Kaggle competitions\30-days-of-ml\data')

df = pd.read_csv(DATA_DIR / 'train_folds.csv')
df_test = pd.read_csv(DATA_DIR / 'test.csv')
sample_submission = pd.read_csv(DATA_DIR / 'sample_submission.csv')

# Every column except `id`, `target` and `kfold` is used as a model input.
useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
# Columns whose name starts with "cat" are the ones that get encoded below.
object_cols = [col for col in useful_features if col.startswith("cat")]
# .copy() gives df_test its own data: new columns are assigned to it further
# down, and assigning into a column selection of another frame is what
# triggers pandas' SettingWithCopyWarning.
df_test = df_test[useful_features].copy()

# Out-of-fold target encoding: for each categorical column, the rows of a fold
# receive the mean `target` of their category computed on the *other* folds
# only, so a row's own label never contributes to its encoded value.
# The fold ids are read from the data instead of assuming exactly five folds.
fold_ids = sorted(df.kfold.unique())

for col in object_cols:
    enc_name = f"tar_enc_{col}"
    encoded_folds = []
    test_enc_total = 0.0
    for fold in fold_ids:
        xtrain = df[df.kfold != fold]
        xvalid = df[df.kfold == fold].reset_index(drop=True)
        category_means = xtrain.groupby(col)["target"].mean().to_dict()
        # Categories that do not occur in the training folds map to NaN.
        xvalid[enc_name] = xvalid[col].map(category_means)
        encoded_folds.append(xvalid)
        # Accumulate this fold's encoding of the test rows.
        test_enc_total = test_enc_total + df_test[col].map(category_means)

    # Test rows get the average of the per-fold encodings. assign() returns a
    # new frame, so nothing is written into a slice of an earlier DataFrame.
    df_test = df_test.assign(**{enc_name: test_enc_total / len(fold_ids)})
    # Re-assemble the training frame (now carrying the new column) from its
    # folds; ignore_index avoids the duplicated 0..k index labels that plain
    # concatenation of the reset-index pieces would leave behind.
    df = pd.concat(encoded_folds, ignore_index=True)


# Refresh the feature lists so they include the new `tar_enc_*` columns.
useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if col.startswith("cat")]
df_test = df_test[useful_features].copy()

In [25]:
def run(trial, fold=0):
    """Optuna objective: validation RMSE of an XGBoost regressor on one fold.

    Samples one hyper-parameter set from ``trial``, fits the model on every row
    of the module-level ``df`` whose ``kfold`` differs from ``fold`` and scores
    it on the rows belonging to ``fold``.

    Args:
        trial: the Optuna trial used to sample hyper-parameters.
        fold: value of ``kfold`` held out for validation (defaults to 0, the
            fold this objective has always used).

    Returns:
        Root-mean-squared error of the predictions on the held-out fold.
    """
    # Scale-type parameters are sampled on a log scale. suggest_float(log=True)
    # is the same distribution as the deprecated suggest_loguniform and matches
    # how learning_rate is already sampled.
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 8)

    train_rows = df[df.kfold != fold].reset_index(drop=True)
    valid_rows = df[df.kfold == fold].reset_index(drop=True)

    ytrain = train_rows.target
    yvalid = valid_rows.target

    # .copy() so the encoded columns are written into independent frames rather
    # than into column selections of train_rows / valid_rows.
    xtrain = train_rows[useful_features].copy()
    xvalid = valid_rows[useful_features].copy()

    # Fit the encoder on the training rows only, then reuse it for validation.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])

    model = XGBRegressor(
        random_state=42,
        n_estimators=6000,
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
        n_jobs=6,
    )
    # NOTE(review): newer XGBoost releases expect early_stopping_rounds on the
    # constructor instead of fit() -- confirm against the installed version.
    model.fit(
        xtrain,
        ytrain,
        early_stopping_rounds=300,
        eval_set=[(xvalid, yvalid)],
        verbose=1000,
    )
    preds_valid = model.predict(xvalid)
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): identical
    # value, but does not depend on the `squared` keyword being available.
    rmse = float(np.sqrt(mean_squared_error(yvalid, preds_valid)))
    return rmse

In [26]:
# Seed the sampler so that re-running the notebook proposes the same sequence
# of hyper-parameter sets; an unseeded sampler gives a different search each run.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(run, n_trials=8)

[32m[I 2021-08-24 07:45:53,154][0m A new study created in memory with name: no-name-f1f3dfa2-0cf4-48b2-b507-585f241c15ea[0m


[0]	validation_0-rmse:6.50685
[479]	validation_0-rmse:0.72288


[32m[I 2021-08-24 07:47:00,070][0m Trial 0 finished with value: 0.7210591242526004 and parameters: {'learning_rate': 0.1653269327255586, 'reg_lambda': 8.353280532307176, 'reg_alpha': 0.0001447811909275867, 'subsample': 0.9061099267930357, 'colsample_bytree': 0.4765556457257669, 'max_depth': 7}. Best is trial 0 with value: 0.7210591242526004.[0m


[0]	validation_0-rmse:7.23118
[1000]	validation_0-rmse:0.71977
[2000]	validation_0-rmse:0.71857
[2325]	validation_0-rmse:0.71865


[32m[I 2021-08-24 07:48:55,647][0m Trial 1 finished with value: 0.718552617982699 and parameters: {'learning_rate': 0.0712313421486503, 'reg_lambda': 6.777973493371879, 'reg_alpha': 0.870457671903124, 'subsample': 0.28994786387244836, 'colsample_bytree': 0.5948540994947439, 'max_depth': 3}. Best is trial 1 with value: 0.718552617982699.[0m


[0]	validation_0-rmse:7.65189
[1000]	validation_0-rmse:0.73671
[2000]	validation_0-rmse:0.73340
[3000]	validation_0-rmse:0.73172
[4000]	validation_0-rmse:0.73073
[5000]	validation_0-rmse:0.72996
[5999]	validation_0-rmse:0.72931


[32m[I 2021-08-24 07:51:06,124][0m Trial 2 finished with value: 0.7293099112558643 and parameters: {'learning_rate': 0.01661210345451068, 'reg_lambda': 17.680408959189815, 'reg_alpha': 4.6067136102608774e-07, 'subsample': 0.363266541485849, 'colsample_bytree': 0.11764261052511464, 'max_depth': 1}. Best is trial 1 with value: 0.718552617982699.[0m


[0]	validation_0-rmse:7.62018
[1000]	validation_0-rmse:0.72410
[2000]	validation_0-rmse:0.72040
[3000]	validation_0-rmse:0.71903
[4000]	validation_0-rmse:0.71833
[5000]	validation_0-rmse:0.71799
[5999]	validation_0-rmse:0.71780


[32m[I 2021-08-24 07:56:05,980][0m Trial 3 finished with value: 0.7177815537119963 and parameters: {'learning_rate': 0.020724578804648803, 'reg_lambda': 6.794234536729048e-05, 'reg_alpha': 39.3317897605401, 'subsample': 0.2817675275106523, 'colsample_bytree': 0.4572411228170934, 'max_depth': 4}. Best is trial 3 with value: 0.7177815537119963.[0m


[0]	validation_0-rmse:7.60746
[1000]	validation_0-rmse:0.72446
[2000]	validation_0-rmse:0.72097
[3000]	validation_0-rmse:0.71944
[4000]	validation_0-rmse:0.71894
[5000]	validation_0-rmse:0.71856
[5999]	validation_0-rmse:0.71827


[32m[I 2021-08-24 08:00:13,604][0m Trial 4 finished with value: 0.7182369209881364 and parameters: {'learning_rate': 0.022370036248167958, 'reg_lambda': 3.000622744499094e-07, 'reg_alpha': 7.1734812537127794, 'subsample': 0.1648841114679282, 'colsample_bytree': 0.7788245091401795, 'max_depth': 3}. Best is trial 3 with value: 0.7177815537119963.[0m


[0]	validation_0-rmse:7.45829
[1000]	validation_0-rmse:0.72303
[1067]	validation_0-rmse:0.72320


[32m[I 2021-08-24 08:01:10,472][0m Trial 5 finished with value: 0.722648855123444 and parameters: {'learning_rate': 0.041727750091254776, 'reg_lambda': 5.370949892237831e-06, 'reg_alpha': 0.002375450970327109, 'subsample': 0.18974639798865434, 'colsample_bytree': 0.1805406125178555, 'max_depth': 7}. Best is trial 3 with value: 0.7177815537119963.[0m


[0]	validation_0-rmse:7.44269
[709]	validation_0-rmse:0.72521


[32m[I 2021-08-24 08:02:40,814][0m Trial 6 finished with value: 0.724191940303906 and parameters: {'learning_rate': 0.04375325539857733, 'reg_lambda': 1.4505483326737958e-07, 'reg_alpha': 1.7195101541425414e-08, 'subsample': 0.2713495636177378, 'colsample_bytree': 0.7580441840859904, 'max_depth': 8}. Best is trial 3 with value: 0.7177815537119963.[0m


[0]	validation_0-rmse:7.67920
[1000]	validation_0-rmse:0.73164
[2000]	validation_0-rmse:0.72804
[3000]	validation_0-rmse:0.72579
[4000]	validation_0-rmse:0.72417
[5000]	validation_0-rmse:0.72293
[5999]	validation_0-rmse:0.72192


[32m[I 2021-08-24 08:06:56,329][0m Trial 7 finished with value: 0.7219235382948443 and parameters: {'learning_rate': 0.013056958342082241, 'reg_lambda': 5.572751376423066e-05, 'reg_alpha': 7.359808315206786e-08, 'subsample': 0.6129431526147202, 'colsample_bytree': 0.5052920743525672, 'max_depth': 2}. Best is trial 3 with value: 0.7177815537119963.[0m


In [27]:


# Display the hyper-parameter set of the trial with the lowest objective value
# (the study was created with direction="minimize").
study.best_params



{'learning_rate': 0.020724578804648803,
 'reg_lambda': 6.794234536729048e-05,
 'reg_alpha': 39.3317897605401,
 'subsample': 0.2817675275106523,
 'colsample_bytree': 0.4572411228170934,
 'max_depth': 4}

In [32]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [33]:
# NOTE(review): re-reading the CSVs replaces `df` / `df_test`, so the
# `tar_enc_*` columns added earlier in this notebook are no longer present from
# here on, although the Optuna search above was run with them -- confirm this
# is intended.
# Directory holding the competition CSV files, kept in a single constant.
DATA_DIR = Path(r'C:\Users\Professional\Desktop\Kaggle competitions\30-days-of-ml\data')

df = pd.read_csv(DATA_DIR / 'train_folds.csv')
df_test = pd.read_csv(DATA_DIR / 'test.csv')
sample_submission = pd.read_csv(DATA_DIR / 'sample_submission.csv')

In [34]:
# Model inputs: every column of df other than `id`, `target` and `kfold`.
non_feature_cols = {"id", "target", "kfold"}
useful_features = [name for name in df.columns if name not in non_feature_cols]
# Columns named "cat..." are the ones handed to the ordinal encoder later on.
object_cols = [name for name in useful_features if name.startswith("cat")]
# Keep only those columns in the test frame, in the same order as training.
df_test = df_test.loc[:, useful_features]

In [36]:
# Hyper-parameters copied from the `study.best_params` output above. They are
# the same for every fold, so the dict is built once outside the loop.
params = {
    'learning_rate': 0.020724578804648803,
    'reg_lambda': 6.794234536729048e-05,
    'reg_alpha': 39.3317897605401,
    'subsample': 0.2817675275106523,
    'colsample_bytree': 0.4572411228170934,
    'max_depth': 4,
}

# Cross-validated training: one model per fold, each scored on its held-out
# fold and used to predict the test set.
final_predictions = []  # one array of test predictions per fold
scores = []             # validation RMSE per fold
# Fold ids are read from the data instead of assuming exactly five folds.
for fold in sorted(df.kfold.unique()):
    train_rows = df[df.kfold != fold].reset_index(drop=True)
    valid_rows = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = train_rows.target
    yvalid = valid_rows.target

    # .copy() so the encoded columns are written into independent frames rather
    # than into column selections of train_rows / valid_rows.
    xtrain = train_rows[useful_features].copy()
    xvalid = valid_rows[useful_features].copy()

    # The encoder is fitted on this fold's training rows only and then applied
    # unchanged to the validation and test rows.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    model = XGBRegressor(
        random_state=0,
        #tree_method='gpu_hist',
        #gpu_id=0,
        #predictor="gpu_predictor",
        n_estimators=6000,
        n_jobs=6,
        **params
    )
    # NOTE(review): newer XGBoost releases expect early_stopping_rounds on the
    # constructor instead of fit() -- confirm against the installed version.
    model.fit(
        xtrain,
        ytrain,
        early_stopping_rounds=300,
        eval_set=[(xvalid, yvalid)],
        verbose=1000,
    )
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): identical
    # value, but does not depend on the `squared` keyword being available.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the per-fold validation RMSE.
print(np.mean(scores), np.std(scores))

[0]	validation_0-rmse:7.62020
[1000]	validation_0-rmse:0.72430
[2000]	validation_0-rmse:0.72052
[3000]	validation_0-rmse:0.71915
[4000]	validation_0-rmse:0.71844
[5000]	validation_0-rmse:0.71811
[5999]	validation_0-rmse:0.71788
0 0.7178746913977467
[0]	validation_0-rmse:7.61691
[1000]	validation_0-rmse:0.72373
[2000]	validation_0-rmse:0.71994
[3000]	validation_0-rmse:0.71864
[4000]	validation_0-rmse:0.71803
[5000]	validation_0-rmse:0.71777
[5730]	validation_0-rmse:0.71770
1 0.717688930711451
[0]	validation_0-rmse:7.61459
[1000]	validation_0-rmse:0.72549
[2000]	validation_0-rmse:0.72181
[3000]	validation_0-rmse:0.72052
[4000]	validation_0-rmse:0.72008
[5000]	validation_0-rmse:0.71987
[5999]	validation_0-rmse:0.71970
2 0.7196892025167398
[0]	validation_0-rmse:7.61698
[1000]	validation_0-rmse:0.72533
[2000]	validation_0-rmse:0.72171
[3000]	validation_0-rmse:0.72047
[4000]	validation_0-rmse:0.71992
[5000]	validation_0-rmse:0.71964
[5999]	validation_0-rmse:0.71943
3 0.7194236419292745
[0]	v

In [37]:
# Average the per-fold test predictions row by row (one column per fold).
preds = np.mean(np.column_stack(final_predictions), axis=1)
# Fail early if the prediction count does not match the submission template.
assert len(preds) == len(sample_submission), "prediction count does not match sample_submission"
# Bracket assignment always sets the DataFrame column; attribute-style
# assignment only does so when the column already exists.
sample_submission["target"] = preds
sample_submission.to_csv("sub_final.csv", index=False)

In [38]:
# Largest averaged prediction, as a quick sanity check of the output range.
# ndarray.max() is the vectorised equivalent of the builtin max() here.
preds.max()

9.042635