In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/30days-folds/train_folds.csv
/kaggle/input/30daysofmlraw/sample_submission.csv
/kaggle/input/30daysofmlraw/train.csv
/kaggle/input/30daysofmlraw/test.csv


In [2]:
import optuna
train = pd.read_csv("../input/30days-folds/train_folds.csv")
test = pd.read_csv("../input/30daysofmlraw/test.csv")
sample_submission = pd.read_csv("../input/30daysofmlraw/sample_submission.csv")

In [3]:
#target encoding
df=train.copy()
df_test=test.copy()
useful_features=[col for col in df.columns if col not in ('id','kfold','target')]
object_cols = [col for col in useful_features if 'cat' in col]
df_test = df_test[useful_features]
for col in object_cols:
    temp_df = []
    temp_test_feat = None
    
    for fold in range(5):
        xtrain=df[df.kfold!=fold].reset_index(drop=True)
        xvalid=df[df.kfold==fold].reset_index(drop=True)
        feat=xtrain.groupby(col)['target'].agg("mean")
        feat=feat.to_dict()
        xvalid.loc[:,f'tar_enc_{col}']=xvalid[col].map(feat)
        temp_df.append(xvalid)
        if temp_test_feat is None:
            temp_test_feat=df_test[col].map(feat)
        else:
            temp_test_feat += df_test[col].map(feat)
     
    temp_test_feat /= 5
    df_test.loc[:, f"tar_enc_{col}"] = temp_test_feat
    df = pd.concat(temp_df)


In [4]:
#parameters
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# Model hyperparameters
xgb_params = {
'lambda': 67.79737006663706,
'alpha': 40.12405005448161,
'colsample_bytree': 0.061613774851329205,
'subsample': 0.9556736521337416,
'learning_rate': 0.17024722721525629,
'n_estimators': 9489,
'max_depth': 3,
'booster': 'gbtree',
'min_child_weight': 155,
'seed' : 38,
    #'tree_method':'gpu_hist'
}


In [None]:
#hyperparameter optimization
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBRegressor
# Model hyperparameters
def run(trial):
    fold = 4
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
    reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)

    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    ordinal_encoder = OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    model = XGBRegressor(
        random_state=42,
        tree_method="gpu_hist",
        gpu_id=1,
        predictor="gpu_predictor",
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
        n_estimators=9849
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = model.predict(xvalid)
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    return rmse

In [None]:
study = optuna.create_study(direction="minimize")
study.optimize(run, n_trials=30)

In [None]:
study.best_params

In [None]:
params={'learning_rate': 0.024995588074037102,
 'reg_lambda': 0.02404294288572649,
 'reg_alpha': 6.366939727261077,
 'subsample': 0.8852191911768938,
 'colsample_bytree': 0.16262421334504829,
 'max_depth': 3}
xgb_params_hyp={'n_estimators': 7000,
                **params,
                'random_state': 0
               }
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBRegressor

preds = 0
scores = []
numerical_cols = [col for col in useful_features if col.startswith("cont")]
useful_features=[col for col in df.columns if col not in ('id','kfold','target')]

for fold in range(5):
    xtrain =  df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target
    
    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]
    
    ordinal_encoder = OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])
    
    scaler = StandardScaler()
    xtrain[numerical_cols] = scaler.fit_transform(xtrain[numerical_cols])
    xvalid[numerical_cols] = scaler.transform(xvalid[numerical_cols])
    xtest[numerical_cols] = scaler.transform(xtest[numerical_cols])
    #print(xtrain.shape,xvalid.shape,xtest.shape)
    model = XGBRegressor(**xgb_params_hyp,eval_metric = "rmse")
    model.fit(xtrain, ytrain,early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = model.predict(xvalid)
    
    #Mean of the predictions
    preds += model.predict(xtest) / 5# Splits
    
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)


# BLENDING (USING 4 MODELS)

# MODEL 1

In [14]:
#blending
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBRegressor

final_test_predictions = []
final_valid_predictions = {}
scores = []

useful_features=[col for col in df.columns if col not in ('id','kfold','target')]
object_cols = [col for col in useful_features if  col.startswith('cat')]
numerical_cols = [col for col in useful_features if col.startswith("cont")]

#model_1
params={'learning_rate': 0.024995588074037102,
 'reg_lambda': 0.02404294288572649,
 'reg_alpha': 6.366939727261077,
 'subsample': 0.8852191911768938,
 'colsample_bytree': 0.16262421334504829,
 'max_depth': 3}
xgb_params_hyp={'n_estimators': 7000,
                **params
               }
for fold in range(5):
    xtrain =  df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()
    
    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target
    
    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]
    xtest  = xtest[useful_features]
    
    ordinal_encoder = OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])
    #scaler
    scaler = StandardScaler()
    xtrain[numerical_cols] = scaler.fit_transform(xtrain[numerical_cols])
    xvalid[numerical_cols] = scaler.transform(xvalid[numerical_cols])
    xtest[numerical_cols] = scaler.transform(xtest[numerical_cols])
    
    model = XGBRegressor(
        random_state=fold,
#         tree_method='gpu_hist',
#         gpu_id=0,
#         predictor="gpu_predictor",
        **xgb_params_hyp
        
    )
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores))
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_1"]
final_valid_predictions.to_csv("train_pred_1.csv", index=False)

sample_submission.target = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_submission.columns = ["id", "pred_1"]
sample_submission.to_csv("test_pred_1.csv", index=False)

0 0.7162172508614747
1 0.7161453514849473
2 0.7181290054144798
3 0.7178460933001035
4 0.7163238041108974
0.7169323010343805




# MODEL 2

In [6]:
!pip uninstall -y lightgbm
!apt-get install -y libboost-all-dev
!git clone --recursive https://github.com/Microsoft/LightGBM

Found existing installation: lightgbm 3.2.1.99
Uninstalling lightgbm-3.2.1.99:
  Successfully uninstalled lightgbm-3.2.1.99
Reading package lists... Done
Building dependency tree       
Reading state information... Done
libboost-all-dev is already the newest version (1.65.1.0ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 12 not upgraded.
Cloning into 'LightGBM'...
remote: Enumerating objects: 23316, done.[K
remote: Counting objects: 100% (995/995), done.[K
remote: Compressing objects: 100% (548/548), done.[K
remote: Total 23316 (delta 613), reused 732 (delta 427), pack-reused 22321[K
Receiving objects: 100% (23316/23316), 18.00 MiB | 22.82 MiB/s, done.
Resolving deltas: 100% (17012/17012), done.
Submodule 'include/boost/compute' (https://github.com/boostorg/compute) registered for path 'external_libs/compute'
Submodule 'eigen' (https://gitlab.com/libeigen/eigen.git) registered for path 'external_libs/eigen'
Submodule 'external_libs/fast_double_parser' (https://github.com/

In [7]:
%%bash
cd LightGBM
rm -r build
mkdir build
cd build
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ ..
make -j$(nproc)

-- The C compiler identification is GNU 7.5.0
-- The CXX compiler identification is GNU 7.5.0
-- Check for working C compiler: /usr/bin/cc
-- Check for working C compiler: /usr/bin/cc -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Detecting C compile features
-- Detecting C compile features - done
-- Check for working CXX compiler: /usr/bin/c++
-- Check for working CXX compiler: /usr/bin/c++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Found OpenMP_C: -fopenmp (found version "4.5") 
-- Found OpenMP_CXX: -fopenmp (found version "4.5") 
-- Found OpenMP: TRUE (found version "4.5")  
-- Looking for CL_VERSION_2_2
-- Looking for CL_VERSION_2_2 - not found
-- Looking for CL_VERSION_2_1
-- Looking for CL_VERSION_2_1 - not found
-- Looking for CL_VERSION_2_0
-- Looking for CL_VERSION_2_0 - not found
-- Looking for CL_VERSION_1_2
-- Looking

rm: cannot remove 'build': No such file or directory
cmake: /opt/conda/lib/libcurl.so.4: no version information available (required by cmake)
/usr/bin/cmake: /opt/conda/lib/libcurl.so.4: no version information available (required by /usr/bin/cmake)
/usr/bin/cmake: /opt/conda/lib/libcurl.so.4: no version information available (required by /usr/bin/cmake)
/usr/bin/cmake: /opt/conda/lib/libcurl.so.4: no version information available (required by /usr/bin/cmake)
/usr/bin/cmake: /opt/conda/lib/libcurl.so.4: no version information available (required by /usr/bin/cmake)
/usr/bin/cmake: /opt/conda/lib/libcurl.so.4: no version information available (required by /usr/bin/cmake)
/usr/bin/cmake: /opt/conda/lib/libcurl.so.4: no version information available (required by /usr/bin/cmake)
/usr/bin/cmake: /opt/conda/lib/libcurl.so.4: no version information available (required by /usr/bin/cmake)
/usr/bin/cmake: /opt/conda/lib/libcurl.so.4: no version information available (required by /usr/bin/cmake)
/u

In [8]:
!cd LightGBM/python-package/;python setup.py install --precompile

running install
running build
running build_py
creating build
creating build/lib
creating build/lib/lightgbm
copying lightgbm/callback.py -> build/lib/lightgbm
copying lightgbm/compat.py -> build/lib/lightgbm
copying lightgbm/sklearn.py -> build/lib/lightgbm
copying lightgbm/libpath.py -> build/lib/lightgbm
copying lightgbm/basic.py -> build/lib/lightgbm
copying lightgbm/dask.py -> build/lib/lightgbm
copying lightgbm/plotting.py -> build/lib/lightgbm
copying lightgbm/engine.py -> build/lib/lightgbm
copying lightgbm/__init__.py -> build/lib/lightgbm
running egg_info
creating lightgbm.egg-info
writing lightgbm.egg-info/PKG-INFO
writing dependency_links to lightgbm.egg-info/dependency_links.txt
writing requirements to lightgbm.egg-info/requires.txt
writing top-level names to lightgbm.egg-info/top_level.txt
writing manifest file 'lightgbm.egg-info/SOURCES.txt'
reading manifest template 'MANIFEST.in'
no previously-included directories found matching 'build'
writing manifest file 'lightgbm.e

In [9]:
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
!rm -r LightGBM

In [None]:
df_test.head()

In [12]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OrdinalEncoder
final_valid_predictions = {}
final_test_predictions=[]
scores = []

df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30daysofmlraw/test.csv")
sample_submission = pd.read_csv("../input/30daysofmlraw/sample_submission.csv")


useful_features=[col for col in df.columns if col not in ('id','kfold','target')]
object_cols = [col for col in useful_features if  col.startswith('cat')]
numerical_cols = [col for col in useful_features if col.startswith("cont")]



import lightgbm as lgb
for fold in range(5):
    xtrain =  df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target
    
    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]
    xtest  = xtest[useful_features]
    
    ordinal_encoder = OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])
    param = {
        "objective": "regression",
        "metric": "rmse",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "n_estimators": 10000,
        "early_stopping_round": 300,
        "device": "gpu",
        "gpu_platform_id": 0,
        "gpu_device_id": 0,
    }
    
    param2 = {
        'lambda_l1': 0.00472279780583036, 
        'lambda_l2': 2.9095205689488508e-05, 
        'num_leaves': 158, 
        'feature_fraction': 0.7386878356648194, 
        'bagging_fraction': 0.8459744550725283, 
        'bagging_freq': 2, 
        'max_depth': 2, 
        'max_bin': 249, 
        'learning_rate': 0.044738463593017294,
        'min_child_samples': 13
    }
    param.update(param2)
#     param = {
#     "random_state": 0,
#     "metric": "rmse",
#     "n_jobs": 6,
#     "early_stopping_round": 200,
#     "reg_alpha": 9.03513073170552,
#     "reg_lambda": 0.024555737897445917,
#     "colsample_bytree": 0.2185112060137363,
#     "learning_rate": 0.003049106861273527,
#     "max_depth": 65,
#     "num_leaves": 51,
#     "min_child_samples": 177,
#     "n_estimators": 8000,
#     "cat_smooth": 93.60968300634175,
#     "max_bin": 537,
#     "min_data_per_group": 117,
#     "bagging_freq": 1,
#     "bagging_fraction": 0.6709049555262285,
#     "cat_l2": 7.5586732660804445,
#     "verbose": -1}
    lgb_train = lgb.Dataset(xtrain, ytrain)
    lgb_valid = lgb.Dataset(xvalid, yvalid, reference=lgb_train)

    model = lgb.train(param, lgb_train, valid_sets=[lgb_valid], verbose_eval=1000)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)
    
print(np.mean(scores), np.std(scores))
print(np.mean(scores), np.std(scores))
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_2"]
final_valid_predictions.to_csv("train_pred_2.csv", index=False)

sample_submission.target = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_submission.columns = ["id", "pred_2"]
sample_submission.to_csv("test_pred_2.csv", index=False)



Training until validation scores don't improve for 300 rounds
[1000]	valid_0's rmse: 0.725479
[2000]	valid_0's rmse: 0.72169
[3000]	valid_0's rmse: 0.719718
[4000]	valid_0's rmse: 0.718642
[5000]	valid_0's rmse: 0.718104
[6000]	valid_0's rmse: 0.717781
[7000]	valid_0's rmse: 0.717624
[8000]	valid_0's rmse: 0.717495
Early stopping, best iteration is:
[8326]	valid_0's rmse: 0.717471
0 0.7174711663323653




Training until validation scores don't improve for 300 rounds
[1000]	valid_0's rmse: 0.725113
[2000]	valid_0's rmse: 0.721468
[3000]	valid_0's rmse: 0.719609
[4000]	valid_0's rmse: 0.718685
[5000]	valid_0's rmse: 0.718256
[6000]	valid_0's rmse: 0.718053
Early stopping, best iteration is:
[6252]	valid_0's rmse: 0.71799
1 0.717990209419654




Training until validation scores don't improve for 300 rounds
[1000]	valid_0's rmse: 0.727329
[2000]	valid_0's rmse: 0.723491
[3000]	valid_0's rmse: 0.721569
[4000]	valid_0's rmse: 0.720583
[5000]	valid_0's rmse: 0.720018
[6000]	valid_0's rmse: 0.719776
[7000]	valid_0's rmse: 0.719641
Early stopping, best iteration is:
[7028]	valid_0's rmse: 0.719636
2 0.7196357506965607




Training until validation scores don't improve for 300 rounds
[1000]	valid_0's rmse: 0.726758
[2000]	valid_0's rmse: 0.723115
[3000]	valid_0's rmse: 0.721407
[4000]	valid_0's rmse: 0.720507
[5000]	valid_0's rmse: 0.720028
[6000]	valid_0's rmse: 0.719738
[7000]	valid_0's rmse: 0.719675
Early stopping, best iteration is:
[7081]	valid_0's rmse: 0.719663
3 0.7196632415072806




Training until validation scores don't improve for 300 rounds
[1000]	valid_0's rmse: 0.726165
[2000]	valid_0's rmse: 0.722073
[3000]	valid_0's rmse: 0.720065
[4000]	valid_0's rmse: 0.719127
[5000]	valid_0's rmse: 0.718603
[6000]	valid_0's rmse: 0.718323
[7000]	valid_0's rmse: 0.718168
Early stopping, best iteration is:
[7262]	valid_0's rmse: 0.71813
4 0.718129515675863
0.7185779767263447 0.0009020313060687815
0.7185779767263447 0.0009020313060687815


In [None]:
final_predictions = []
scores = []

final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_2"]
final_valid_predictions.to_csv("train_pred_2.csv", index=False)

sample_submission.target = np.mean(np.column_stack(final_predictions), axis=1)
sample_submission.columns = ["id", "pred_2"]
sample_submission.to_csv("test_pred_2.csv", index=False)


# MODEL 3

In [None]:
#model_3 #training model only with continuous features
#only cat features
useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
model_3_features = [col for col in useful_features if col.startswith("cont")]
final_test_predictions = []
final_valid_predictions = {}
scores = []

for fold in range(5):
    xtrain =  df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()
    
    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target
    
    xtrain = xtrain[model_3_features]
    xvalid = xvalid[model_3_features]
    xtest  = xtest[model_3_features]
    #numerical_cols and model_3_features col are exactly same
    
    
    
    model = XGBRegressor(
        random_state=fold,
        tree_method='gpu_hist',
        gpu_id=0,
        predictor="gpu_predictor",
        **xgb_params_hyp
        
    )
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_3"]
final_valid_predictions.to_csv("train_pred_3.csv", index=False)

sample_submission.target = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_submission.columns = ["id", "pred_3"]
sample_submission.to_csv("test_pred_3.csv", index=False)

    


# MODEL 4

In [None]:
#model_4 only numerical cols

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
model_4_features = [col for col in useful_features if col.startswith('cat')]
final_test_predictions = []
final_valid_predictions = {}
scores = []

for fold in range(5):
    xtrain =  df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()
    
    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target
    xtest  = xtest[model_4_features]
    xtrain = xtrain[model_4_features]
    xvalid = xvalid[model_4_features]
    #object_cols and model_3_features col are exactly same
    ordinal_encoder = OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])
    
    model = XGBRegressor(
        random_state=fold,
#         tree_method='gpu_hist',
#         gpu_id=0,
#         predictor="gpu_predictor",
        **xgb_params_hyp
        
    )
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_4"]
final_valid_predictions.to_csv("train_pred_4.csv", index=False)

sample_submission.target = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_submission.columns = ["id", "pred_4"]
sample_submission.to_csv("test_pred_4.csv", index=False)

    


# STACKING ALL OF THEM INTO MAIN TABLE

In [15]:
#stacking all the models validations prediction side by side to form a new table
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30daysofmlraw/test.csv")
sample_submission = pd.read_csv("../input/30daysofmlraw/sample_submission.csv")


df1 = pd.read_csv("train_pred_1.csv")
df2 = pd.read_csv("train_pred_2.csv")
# df3 = pd.read_csv("train_pred_3.csv")
# df4 = pd.read_csv("train_pred_4.csv")

df_test1 = pd.read_csv("test_pred_1.csv")
df_test2 = pd.read_csv("test_pred_2.csv")
# df_test3 = pd.read_csv("test_pred_3.csv")
# df_test4 = pd.read_csv("test_pred_4.csv")

df = df.merge(df1, on="id", how="left")
df = df.merge(df2, on="id", how="left")
# df = df.merge(df3, on="id", how="left")
# df = df.merge(df4, on="id", how="left")

df_test = df_test.merge(df_test1, on="id", how="left")
df_test = df_test.merge(df_test2, on="id", how="left")
# df_test = df_test.merge(df_test3, on="id", how="left")
# df_test = df_test.merge(df_test4, on="id", how="left")

df.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont8,cont9,cont10,cont11,cont12,cont13,target,kfold,pred_1,pred_2
0,1,B,B,B,C,B,B,A,E,C,...,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634,0,8.431372,8.123327
1,2,B,B,A,A,B,D,A,F,A,...,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233,2,8.452841,
2,3,A,A,A,C,B,D,A,D,A,...,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351,4,8.205483,
3,4,B,B,A,C,B,D,A,E,C,...,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253,3,8.31698,
4,6,A,A,A,C,B,D,A,E,A,...,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226,1,8.290789,


In [16]:
useful_features = ["pred_1", "pred_2"]
df_test = df_test[useful_features]

final_predictions = []
scores = []
def optimize_blending(trial):
    
    for fold in range(5):
        learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
        reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
        reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
        subsample = trial.suggest_float("subsample", 0.1, 1.0)
        colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
        max_depth = trial.suggest_int("max_depth", 1, 7)

        xtrain =  df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)
        xtest = df_test.copy()

        ytrain = xtrain.target
        yvalid = xvalid.target

        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]
        
        model = XGBRegressor(
        random_state=fold,
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
        n_estimators=7000,tree_method='gpu_hist',
        gpu_id=0,
        predictor="gpu_predictor",
            
    )
        model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
        preds_valid = model.predict(xvalid)
        rmse = mean_squared_error(yvalid, preds_valid, squared=False)
        print(fold, rmse)
        scores.append(rmse)
        return rmse
#print(np.mean(scores), np.std(scores))

In [17]:
study = optuna.create_study(direction="minimize")
study.optimize(optimize_blending, n_trials=15)

[32m[I 2021-08-30 19:53:46,114][0m A new study created in memory with name: no-name-25b4c95f-a0e3-4481-a920-97329f7e2a1d[0m


[0]	validation_0-rmse:7.63215
[840]	validation_0-rmse:0.71607


[32m[I 2021-08-30 19:53:47,968][0m Trial 0 finished with value: 0.7160591650271125 and parameters: {'learning_rate': 0.019166258780540017, 'reg_lambda': 22.784776028696673, 'reg_alpha': 0.8717395894027744, 'subsample': 0.7932580575546805, 'colsample_bytree': 0.5530513712998397, 'max_depth': 3}. Best is trial 0 with value: 0.7160591650271125.[0m


0 0.7160591650271125
[0]	validation_0-rmse:7.69510
[1000]	validation_0-rmse:0.71604
[1389]	validation_0-rmse:0.71605


[32m[I 2021-08-30 19:53:49,643][0m Trial 1 finished with value: 0.7160343450482172 and parameters: {'learning_rate': 0.010999850553622608, 'reg_lambda': 15.638311879225217, 'reg_alpha': 0.0079352273475891, 'subsample': 0.3126700722837664, 'colsample_bytree': 0.8066622591449545, 'max_depth': 2}. Best is trial 1 with value: 0.7160343450482172.[0m


0 0.7160343450482172
[0]	validation_0-rmse:6.26168
[364]	validation_0-rmse:0.71642


[32m[I 2021-08-30 19:53:50,544][0m Trial 2 finished with value: 0.716167169419351 and parameters: {'learning_rate': 0.19729871650760875, 'reg_lambda': 27.372392345636978, 'reg_alpha': 4.4456151044560363e-07, 'subsample': 0.3182248539647216, 'colsample_bytree': 0.20286759992169767, 'max_depth': 6}. Best is trial 1 with value: 0.7160343450482172.[0m


0 0.716167169419351
[0]	validation_0-rmse:7.43122
[498]	validation_0-rmse:0.71631


[32m[I 2021-08-30 19:53:52,425][0m Trial 3 finished with value: 0.7162066131852972 and parameters: {'learning_rate': 0.04524476456668035, 'reg_lambda': 7.734231152213524, 'reg_alpha': 0.0028983776824857703, 'subsample': 0.662002820403254, 'colsample_bytree': 0.5528299972799496, 'max_depth': 7}. Best is trial 1 with value: 0.7160343450482172.[0m


0 0.7162066131852972
[0]	validation_0-rmse:7.48215
[538]	validation_0-rmse:0.71616


[32m[I 2021-08-30 19:53:53,371][0m Trial 4 finished with value: 0.7160924566867437 and parameters: {'learning_rate': 0.038615773754976074, 'reg_lambda': 1.4408202948284945e-06, 'reg_alpha': 0.3120222444095932, 'subsample': 0.765569539443997, 'colsample_bytree': 0.785962460046084, 'max_depth': 4}. Best is trial 1 with value: 0.7160343450482172.[0m


0 0.7160924566867437
[0]	validation_0-rmse:7.31607
[441]	validation_0-rmse:0.71618


[32m[I 2021-08-30 19:53:54,157][0m Trial 5 finished with value: 0.7161040642308326 and parameters: {'learning_rate': 0.06017602354621338, 'reg_lambda': 9.394905842498815e-06, 'reg_alpha': 1.3890478944497628e-07, 'subsample': 0.6767171650639213, 'colsample_bytree': 0.33144738110033367, 'max_depth': 4}. Best is trial 1 with value: 0.7160343450482172.[0m


0 0.7161040642308326
[0]	validation_0-rmse:6.82354
[1000]	validation_0-rmse:0.71613
[1319]	validation_0-rmse:0.71613


[32m[I 2021-08-30 19:53:55,743][0m Trial 6 finished with value: 0.7161203150478576 and parameters: {'learning_rate': 0.12419114841165702, 'reg_lambda': 1.2983443140877307, 'reg_alpha': 75.56892023536537, 'subsample': 0.4269025293544081, 'colsample_bytree': 0.3665002411893955, 'max_depth': 1}. Best is trial 1 with value: 0.7160343450482172.[0m


0 0.7161203150478576
[0]	validation_0-rmse:7.66211
[852]	validation_0-rmse:0.71624


[32m[I 2021-08-30 19:53:58,097][0m Trial 7 finished with value: 0.7162038259371549 and parameters: {'learning_rate': 0.015267614981624942, 'reg_lambda': 1.1313296608687397e-06, 'reg_alpha': 0.01427002701166256, 'subsample': 0.27406020836176537, 'colsample_bytree': 0.8689054919408253, 'max_depth': 6}. Best is trial 1 with value: 0.7160343450482172.[0m


0 0.7162038259371549
[0]	validation_0-rmse:7.25316
[433]	validation_0-rmse:0.71618


[32m[I 2021-08-30 19:53:58,893][0m Trial 8 finished with value: 0.7160879053212202 and parameters: {'learning_rate': 0.06833341304337696, 'reg_lambda': 0.3163352872095286, 'reg_alpha': 1.0436863289846183e-05, 'subsample': 0.823998912720003, 'colsample_bytree': 0.8877662489446271, 'max_depth': 4}. Best is trial 1 with value: 0.7160343450482172.[0m


0 0.7160879053212202
[0]	validation_0-rmse:5.85759
[331]	validation_0-rmse:0.71654


[32m[I 2021-08-30 19:53:59,407][0m Trial 9 finished with value: 0.7160689013319208 and parameters: {'learning_rate': 0.24972945181361336, 'reg_lambda': 7.841954180535813e-06, 'reg_alpha': 0.00023532744290084227, 'subsample': 0.14769351519215373, 'colsample_bytree': 0.7375766501516121, 'max_depth': 3}. Best is trial 1 with value: 0.7160343450482172.[0m


0 0.7160689013319208
[0]	validation_0-rmse:7.69954
[1000]	validation_0-rmse:0.71660
[2000]	validation_0-rmse:0.71619
[3000]	validation_0-rmse:0.71615
[4000]	validation_0-rmse:0.71613
[4506]	validation_0-rmse:0.71613


[32m[I 2021-08-30 19:54:04,056][0m Trial 10 finished with value: 0.7161324885412395 and parameters: {'learning_rate': 0.01042383589159202, 'reg_lambda': 0.007884641414042314, 'reg_alpha': 34.545202066889665, 'subsample': 0.12837300282853242, 'colsample_bytree': 0.6717516554028448, 'max_depth': 1}. Best is trial 1 with value: 0.7160343450482172.[0m


0 0.7161324885412395
[0]	validation_0-rmse:7.62357
[993]	validation_0-rmse:0.71607


[32m[I 2021-08-30 19:54:05,293][0m Trial 11 finished with value: 0.7160613907853649 and parameters: {'learning_rate': 0.020269026666155494, 'reg_lambda': 0.014586149680824636, 'reg_alpha': 0.6217028532340929, 'subsample': 0.9584336551474979, 'colsample_bytree': 0.5412789795558409, 'max_depth': 2}. Best is trial 1 with value: 0.7160343450482172.[0m


0 0.7160613907853649
[0]	validation_0-rmse:7.60127
[1000]	validation_0-rmse:0.71606
[1020]	validation_0-rmse:0.71605


[32m[I 2021-08-30 19:54:06,670][0m Trial 12 finished with value: 0.7160431665676945 and parameters: {'learning_rate': 0.023192650969160875, 'reg_lambda': 50.70802376653459, 'reg_alpha': 1.028711419792025, 'subsample': 0.5053060482267447, 'colsample_bytree': 0.5677608106672491, 'max_depth': 2}. Best is trial 1 with value: 0.7160343450482172.[0m


0 0.7160431665676945
[0]	validation_0-rmse:7.69707
[1000]	validation_0-rmse:0.71606
[1415]	validation_0-rmse:0.71605


[32m[I 2021-08-30 19:54:08,384][0m Trial 13 finished with value: 0.7160534143573599 and parameters: {'learning_rate': 0.010752572983615948, 'reg_lambda': 62.823764956423275, 'reg_alpha': 0.026103427697755352, 'subsample': 0.5017561160607915, 'colsample_bytree': 0.6452851348843129, 'max_depth': 2}. Best is trial 1 with value: 0.7160343450482172.[0m


0 0.7160534143573599
[0]	validation_0-rmse:7.56890
[901]	validation_0-rmse:0.71606


[32m[I 2021-08-30 19:54:09,505][0m Trial 14 finished with value: 0.7160348327005465 and parameters: {'learning_rate': 0.027363737776892067, 'reg_lambda': 1.646766250459745e-08, 'reg_alpha': 0.0002005968608931572, 'subsample': 0.3641608513914987, 'colsample_bytree': 0.9752119542583748, 'max_depth': 2}. Best is trial 1 with value: 0.7160343450482172.[0m


0 0.7160348327005465


In [18]:
study.best_params

{'learning_rate': 0.010999850553622608,
 'reg_lambda': 15.638311879225217,
 'reg_alpha': 0.0079352273475891,
 'subsample': 0.3126700722837664,
 'colsample_bytree': 0.8066622591449545,
 'max_depth': 2}

In [19]:
blend_parms={'learning_rate': 0.010999850553622608,
 'reg_lambda': 15.638311879225217,
 'reg_alpha': 0.0079352273475891,
 'subsample': 0.3126700722837664,
 'colsample_bytree': 0.8066622591449545,
 'max_depth': 2}
xgb_blend_params={'n_estimators': 7000,
                **blend_parms,
                'random_state': 0
               }

final_test_predictions = []
#final_valid_predictions = {}
scores = []
for fold in range(5):

        xtrain =  df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)
        xtest = df_test.copy()

        ytrain = xtrain.target
        yvalid = xvalid.target

        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]
        
        model = XGBRegressor(**xgb_blend_params)
        model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
        preds_valid = model.predict(xvalid)
        test_preds = model.predict(xtest)
        final_predictions.append(test_preds)
        rmse = mean_squared_error(yvalid, preds_valid, squared=False)
        print(fold, rmse)
        scores.append(rmse)


[0]	validation_0-rmse:7.69508
[1000]	validation_0-rmse:0.71605
[1537]	validation_0-rmse:0.71606
0 0.7160476905210753
[0]	validation_0-rmse:7.69182
[1000]	validation_0-rmse:0.71588
[1238]	validation_0-rmse:0.71588
1 0.7158813670125022
[0]	validation_0-rmse:7.68956
[1000]	validation_0-rmse:0.71791
[1024]	validation_0-rmse:0.71791
2 0.7178996950648469
[0]	validation_0-rmse:7.69187
[1000]	validation_0-rmse:0.71771
[1194]	validation_0-rmse:0.71771
3 0.7177046943248524
[0]	validation_0-rmse:7.69731
[1000]	validation_0-rmse:0.71594
[1622]	validation_0-rmse:0.71593
4 0.7159209642589889


In [20]:
#preds=model.predict(X_test)
sample_submission.target = np.mean(np.column_stack(final_predictions), axis=1)
sample_submission.to_csv("submission_cpu_takur_optuna_blending_2models.csv", index = False)
print("Sent")


Sent
