In [1]:
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from pandas.api.types import CategoricalDtype

from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder    
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Set Matplotlib defaults.
# The seaborn style sheets were renamed with a "seaborn-v0_8-" prefix in
# Matplotlib 3.6 and the un-prefixed names were removed in later releases, so
# use whichever spelling this installation actually provides instead of
# hard-coding one. If neither is available the default style is kept.
for style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid"):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# Mute warnings (note: this silences every warning category for the whole session)
warnings.filterwarnings('ignore')

In [2]:
import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

/kaggle/input/30-days-of-ml/sample_submission.csv
/kaggle/input/30-days-of-ml/train.csv
/kaggle/input/30-days-of-ml/test.csv


In [3]:
# Read the competition's train and test tables, using the "id" column as the index
X_train = pd.read_csv("/kaggle/input/30-days-of-ml/train.csv", index_col="id")
X_test = pd.read_csv("/kaggle/input/30-days-of-ml/test.csv", index_col="id")

# Discard training rows whose target is missing. The 'target' column itself
# stays inside X_train; it is not separated from the predictors in this cell.
X_train = X_train.dropna(axis=0, subset=['target'])

In [4]:
# Names of the ten categorical feature columns: cat0 through cat9
categorical_columns = [f'cat{i}' for i in range(10)]

# Ordinal-encode the categorical columns. The encoder is fitted on the training
# data only, and the fitted encoder is then reused to transform the test data.
encoder = OrdinalEncoder()
encoder.fit(X_train[categorical_columns])
X_train[categorical_columns] = encoder.transform(X_train[categorical_columns])
X_test[categorical_columns] = encoder.transform(X_test[categorical_columns])

In [5]:
# Count the missing values in each training column and show only the columns that have any
missing_val_count_by_column = X_train.isna().sum()
has_missing = missing_val_count_by_column > 0
print(missing_val_count_by_column[has_missing])

Series([], dtype: int64)


In [6]:
# Alternative hyperparameter set, kept for reference only (not used by the run below).
# parameters =  {
#                 'n_estimators' : 10000,
#                 'lambda': 0.002614400059630947, 
#                 'alpha': 13.146201819473635, 
#                 'colsample_bytree': 0.3,
#                 'subsample': 0.8, 
#                 'learning_rate': 0.018, 
#                 'max_depth': 3, 
#                 'min_child_weight': 62,
#                 'tree_method' : 'gpu_hist'
#               }
# model = xgb.XGBRegressor(**parameters)
model = xgb.XGBRegressor(n_estimators=10000, learning_rate=0.025, 
                         max_depth = 3,colsample_bytree = 0.6, gamma = 0.5, 
                         subsample = 0.99, reg_alpha = 50, reg_lambda = 0.8)
kf = KFold(n_splits=10, random_state=42, shuffle=True)

# One array of X_test predictions per fold is collected here.
results = []
for train_index, test_index in kf.split(X=X_train):
    print(train_index, test_index)
    train = X_train.iloc[train_index]
    test = X_train.iloc[test_index]

    # Split predictors from the target once per fold instead of re-dropping the
    # 'target' column for every fit/predict call.
    train_features, train_target = train.drop('target', axis=1), train['target']
    valid_features, valid_target = test.drop('target', axis=1), test['target']

    # The held-out fold is passed as eval_set with early_stopping_rounds=100, so
    # it both drives early stopping and is scored below; the printed validation
    # error is therefore measured on data that influenced the stopping point.
    # NOTE(review): newer XGBoost releases expect early_stopping_rounds on the
    # estimator constructor rather than fit() - confirm against the installed version.
    model.fit(train_features, train_target, early_stopping_rounds=100,
              eval_set=[(valid_features, valid_target)], verbose=False
             )
    test_prediction = model.predict(valid_features)
    train_prediction = model.predict(train_features)

    # RMSE is computed as sqrt(MSE) rather than via `squared=False`, which newer
    # scikit-learn releases deprecate and then remove; the value is the same.
    print('validation error ', np.sqrt(mean_squared_error(valid_target, test_prediction)))
    print('training error ', np.sqrt(mean_squared_error(train_target, train_prediction)))
    print("--------------------")

    # Predict the real test set with this fold's model.
    fold_prediction = model.predict(X_test)
    results.append(fold_prediction)

[     1      2      3 ... 299997 299998 299999] [     0     16     22 ... 299962 299973 299983]
validation error  0.7164173827766601
training error  0.7054335100477276
--------------------
[     0      1      2 ... 299997 299998 299999] [     6     11     12 ... 299946 299957 299992]
validation error  0.7159051980757563
training error  0.706226054218701
--------------------
[     0      1      2 ... 299996 299997 299998] [     4     10     31 ... 299945 299970 299999]
validation error  0.7153785847848396
training error  0.7074827122460118
--------------------
[     0      1      2 ... 299996 299998 299999] [    38     39     46 ... 299928 299982 299997]
validation error  0.7174136034372304
training error  0.7057557135833066
--------------------
[     0      2      3 ... 299997 299998 299999] [     1      9     20 ... 299986 299988 299991]
validation error  0.7216183651325462
training error  0.7045652790443777
--------------------
[     0      1      2 ... 299997 299998 299999] [    23 

In [7]:
# Average the per-fold test predictions: one column per fold, mean taken across each row
fold_matrix = np.column_stack(results)
predictions = fold_matrix.mean(axis=1)

# Write the submission file: test ids alongside the averaged predictions
output = pd.DataFrame({'id': X_test.index, 'target': predictions})
output.to_csv('submission.csv', index=False)

In [8]:
# from sklearn.model_selection import GridSearchCV
# parameters =  {
#                 'lambda': 0.002614400059630947, 
#                 'alpha': 13.146201819473635, 
#                 'colsample_bytree': 0.3,
#                 'subsample': 0.8, 
#                 'learning_rate': 0.018, 
#                 'max_depth': 3, 
#                 'min_child_weight': 62,
#                 'tree_method' : 'gpu_hist'
#               }
# # model = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.3, max_depth = 3, 
# #                          colsample_bytree = 0.6, gamma = 0.5, 
# #                          reg_alpha = 50, reg_lambda = 0.4)
# model = xgb.XGBRegressor(**parameters)
# kf = KFold(n_splits=5, random_state=5, shuffle=True)
# params = {
#         'n_estimators': [5000,6000,7000,8000,9000,10000]
#         }
# # define search
# search = GridSearchCV(model, 
#                       params, 
#                       scoring='neg_root_mean_squared_error', 
#                       cv=kf, 
#                       verbose=3)
# # execute search
# search.fit(X_train.drop('target', axis=1), X_train['target'])

In [9]:
# print('\n Best estimator:')
# print(search.best_estimator_)
# print('\n Best hyperparameters:')
# print(search.best_params_)
# print(search.best_score_)

In [10]:
# Hyperparameter tuning using optuna
# import optuna

In [11]:
# def objective(trial,data=X_train.drop('target', axis=1),target=X_train['target']):
    
#     train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.15,random_state=42)
#     param = {
#         'tree_method':'gpu_hist',  # this parameter means using the GPU when training our model to speedup the training process
#         'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
#         'alpha': trial.suggest_loguniform('alpha', 1e-3, 100.0),
#         'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]),
#         'subsample': trial.suggest_categorical('subsample', [0.8,0.9,1.0]),
#         'learning_rate': trial.suggest_categorical('learning_rate', [0.01,0.012,0.014,0.016,0.018,0.02]),
#         'n_estimators': 10000,
#         'max_depth': trial.suggest_categorical('max_depth', [3,4,5,6]),
#         'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
#     }
#     model = xgb.XGBRegressor(**param)  
    
#     model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)
    
#     preds = model.predict(test_x)
    
#     rmse = mean_squared_error(test_y, preds,squared=False)
    
#     return rmse

In [12]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials = 1000)
# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)

In [13]:
# print('Best trial:', study.best_trial)

In [14]:
# study.trials_dataframe()