In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [2]:
# Read the three competition tables in one pass: the training set with its
# pre-assigned "kfold" column, the unlabeled test set, and the submission
# template that the final predictions are written into.
df, df_test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/30days-10folds/train_10_folds.csv",
        "../input/30-days-of-ml/test.csv",
        "../input/30-days-of-ml/sample_submission.csv",
    )
)

In [3]:
# Model inputs: every training column except the row id, the label and the
# fold marker. Column order from the training frame is preserved.
useful_features = list(
    filter(lambda name: name not in ("id", "target", "kfold"), df.columns)
)

# Columns whose name starts with "cat" are the ones passed to the ordinal
# encoder in the training loop.
object_cols = [name for name in useful_features if name.startswith("cat")]

# Keep only the model's input columns in the test frame, in the same order
# as the training features.
df_test = df_test.loc[:, useful_features]

In [9]:
# Cross-validated XGBoost training.
#
# For each fold: fit an ordinal encoder and an XGBRegressor on the rows outside
# the fold, score on the rows inside it, and predict the test set. The per-fold
# test predictions are collected in `final_predictions`; the per-fold validation
# RMSEs are collected in `scores`.

# Hyperparameters are identical for every fold, so they are built once instead
# of on each iteration.
params = {
    'learning_rate': 0.07853392035787837,
    'reg_lambda': 1.7549293092194938e-05,
    'reg_alpha': 14.68267919457715,
    'subsample': 0.8031450486786944,
    'colsample_bytree': 0.170759104940733,
    'max_depth': 3,
}

final_predictions = []
scores = []

# NOTE(review): the training file is named "train_10_folds.csv", yet only folds
# 0-4 are used as validation folds here; rows from any other fold would always
# stay in the training split. Confirm that running 5 folds is intentional.
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # Take explicit copies: the encoded columns are assigned back into these
    # frames below, and writing into a column subset that was not copied
    # triggers pandas' SettingWithCopyWarning.
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # The encoder is fitted on the training split only and then reused for the
    # validation and test frames, so all three share one category mapping.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    # NOTE(review): `tree_method='gpu_hist'`, `gpu_id`, `predictor` and passing
    # `early_stopping_rounds` to fit() are the older XGBoost spellings; check
    # them against the installed XGBoost version before upgrading.
    model = XGBRegressor(
        random_state=42,
        tree_method='gpu_hist',
        gpu_id=0,
        predictor="gpu_predictor",
        n_estimators=5000,
        **params
    )
    model.fit(
        xtrain,
        ytrain,
        early_stopping_rounds=300,
        eval_set=[(xvalid, yvalid)],
        verbose=1000,
    )

    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)

    # RMSE as the square root of the MSE: gives the same value as
    # `squared=False` for a single target, without relying on a keyword that
    # newer scikit-learn releases deprecate and remove.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the validation RMSE across the folds.
print(np.mean(scores), np.std(scores))

[0]	validation_0-rmse:7.18401
[1000]	validation_0-rmse:0.71910
[2000]	validation_0-rmse:0.71745
[3000]	validation_0-rmse:0.71718
[3155]	validation_0-rmse:0.71720
0 0.717079142970714
[0]	validation_0-rmse:7.16574
[1000]	validation_0-rmse:0.71885
[2000]	validation_0-rmse:0.71700
[3000]	validation_0-rmse:0.71671
[3601]	validation_0-rmse:0.71667
1 0.7165983833016576
[0]	validation_0-rmse:7.17171
[1000]	validation_0-rmse:0.71732
[2000]	validation_0-rmse:0.71611
[2436]	validation_0-rmse:0.71610
2 0.7160192440212191
[0]	validation_0-rmse:7.17158
[1000]	validation_0-rmse:0.72012
[2000]	validation_0-rmse:0.71847
[3000]	validation_0-rmse:0.71807
[3410]	validation_0-rmse:0.71810
3 0.7180348501993807
[0]	validation_0-rmse:7.16581
[1000]	validation_0-rmse:0.72446
[2000]	validation_0-rmse:0.72269
[3000]	validation_0-rmse:0.72231
[3558]	validation_0-rmse:0.72226
4 0.7222181778581386
0.717989959670222 0.002214932785576633


In [10]:
# Blend the folds: stack the per-fold test predictions as columns and average
# across them, giving one prediction per test row.
preds = np.mean(np.column_stack(final_predictions), axis=1)

# Assign with bracket syntax: attribute-style assignment
# (`sample_submission.target = ...`) only updates a column that already exists
# and would otherwise set a plain Python attribute, leaving the written CSV
# without the predictions.
sample_submission["target"] = preds
sample_submission.to_csv("submission.csv", index=False)