In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the competition CSVs; the first column of each file becomes the index.
X = pd.read_csv(
    '../input/tabular-playground-series-feb-2021/train.csv',
    encoding='utf-8',
    index_col=0,
)
test = pd.read_csv(
    '../input/tabular-playground-series-feb-2021/test.csv',
    encoding='utf-8',
    index_col=0,
)

# Split the regression target off the training frame: `y` holds the 'target'
# column and `X` keeps every remaining column as a feature.
y = X.pop('target')

In [3]:
# Positional indices of every non-float64 column; they are passed to LightGBM
# as `categorical_feature` when the models are fitted.
categorical_feature = np.where(X.dtypes != 'float64')[0].tolist()
# The same columns by name, used to integer-encode them below.
categorical_feature_columns = X.select_dtypes(exclude=['float64']).columns

label = LabelEncoder()
for column in categorical_feature_columns:
    # Fit on the union of train and test values: LabelEncoder.transform raises
    # ValueError on labels it has not seen, so fitting on train alone would
    # crash if a category occurred only in the test set. When test contains
    # no extra categories the resulting codes are the same as a train-only fit.
    label.fit(pd.concat([X[column], test[column]]))
    X[column] = label.transform(X[column])
    test[column] = label.transform(test[column])

The LightGBM parameters below are slightly tuned from those in this kernel: https://www.kaggle.com/awwalmalhi/extreme-fine-tuning-lgbm-using-7-step-training

In [4]:
# Keyword arguments unpacked into LGBMRegressor(**lgbm_parameters) for every fold.
lgbm_parameters = dict(
    # evaluation metric and parallelism
    metric='rmse',
    n_jobs=-1,
    # upper bound on boosting rounds; training is cut short by early stopping
    n_estimators=50000,
    # regularisation
    reg_alpha=10.924491968127692,
    reg_lambda=17.396730654687218,
    # column / row sampling
    # NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0,
    # which is not set here - confirm whether row bagging was intended.
    colsample_bytree=0.21497646795452627,
    subsample=0.7582562557431147,
    # step size and tree shape
    learning_rate=0.009985133666265425,
    max_depth=18,
    num_leaves=63,
    min_child_samples=27,
    max_bin=523,
    # L2 regularisation used in categorical splits
    cat_l2=0.025083670064082797,
)

In [5]:
# K-fold cross-validated LightGBM: each fold fits a fresh model with early
# stopping on its held-out part; the submission is the mean of the per-fold
# test predictions.
N_SPLITS = 10
SEED = 42  # fixes the shuffled fold assignment so reruns produce the same splits

lgbm_val_pred = np.zeros(len(y))      # out-of-fold predictions, positionally aligned with y
lgbm_test_pred = np.zeros(len(test))  # running average of the fold models' test predictions
mse = []                              # validation MSE of each fold
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# kf.split() is a generator, so tqdm needs `total` to display real progress
# instead of an unknown-length counter.
for trn_idx, val_idx in tqdm(kf.split(X, y), total=N_SPLITS):
    x_train_idx = X.iloc[trn_idx]
    y_train_idx = y.iloc[trn_idx]
    x_valid_idx = X.iloc[val_idx]
    y_valid_idx = y.iloc[val_idx]

    lgbm_model = LGBMRegressor(**lgbm_parameters)
    # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords match
    # the LightGBM version this notebook was run with; newer releases expect
    # callbacks instead - confirm before upgrading.
    lgbm_model.fit(
        x_train_idx,
        y_train_idx,
        eval_set=[(x_valid_idx, y_valid_idx)],
        verbose=-1,
        early_stopping_rounds=400,
        categorical_feature=categorical_feature,
    )

    # Store the held-out predictions so an overall out-of-fold score can be
    # computed after the loop (val_idx are positional indices into y).
    lgbm_val_pred[val_idx] = lgbm_model.predict(x_valid_idx)
    lgbm_test_pred += lgbm_model.predict(test) / N_SPLITS
    mse.append(mean_squared_error(y_valid_idx, lgbm_val_pred[val_idx]))

# The test frame's index is written out as the 'id' column of the submission.
pd.DataFrame({'id': test.index, 'target': lgbm_test_pred}).to_csv('submission.csv', index=False)

# Report the cross-validation score explicitly; a bare expression in the middle
# of a cell is evaluated and discarded, so it was never displayed.
oof_rmse = np.sqrt(mean_squared_error(y, lgbm_val_pred))
print(f'mean fold MSE: {np.mean(mse):.6f} | out-of-fold RMSE: {oof_rmse:.6f}')

0it [00:00, ?it/s]

Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[3257]	valid_0's rmse: 0.842359


1it [01:28, 88.53s/it]

Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[3358]	valid_0's rmse: 0.833311


2it [02:58, 89.57s/it]

Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[4247]	valid_0's rmse: 0.840687


3it [04:48, 98.71s/it]

Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[3096]	valid_0's rmse: 0.841742


4it [06:13, 93.15s/it]

Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[3222]	valid_0's rmse: 0.845787


5it [07:40, 91.18s/it]

Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[2603]	valid_0's rmse: 0.842103


6it [08:55, 85.46s/it]

Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[2950]	valid_0's rmse: 0.84482


7it [10:17, 84.48s/it]

Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[3233]	valid_0's rmse: 0.83974


8it [11:45, 85.61s/it]

Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[2793]	valid_0's rmse: 0.846024


9it [13:03, 83.30s/it]

Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[3146]	valid_0's rmse: 0.840959


10it [14:29, 86.93s/it]
