In [2]:
import numpy as np
import pandas as pd
import os
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

warnings.filterwarnings('ignore')

In [3]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [4]:
categorical_columns = train_data.select_dtypes(include=['object', 'category']).columns.tolist()
test_categorical_columns = test_data.select_dtypes(include=['object']).columns

In [5]:
for col in test_categorical_columns:
    category_list = sorted(train_data[col].dropna().unique())
    col_dtype = pd.CategoricalDtype(categories=category_list, ordered=False)
    
    train_data.loc[~train_data[col].isin(category_list), col] = np.nan
    test_data.loc[~test_data[col].isin(category_list), col] = np.nan
    
    train_data[col] = train_data[col].astype(col_dtype)
    test_data[col] = test_data[col].astype(col_dtype)

In [6]:
X_train_set = train_data.drop(columns=["id", "price"])
y_train_set = train_data["price"]


In [7]:
X_test_set = test_data.drop(columns=["id"])

In [8]:
def perform_cross_validation(model_class, X_train_set, y_train_set, parameters, n_folds=5):
    kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    validation_scores = []
    test_predictions = np.zeros((X_test_set.shape[0], n_folds), dtype=np.float32)
    
    # Loop over each fold
    for fold_num, (train_idx, valid_idx) in enumerate(kfold.split(X_train_set)):
        X_train_fold = X_train_set.iloc[train_idx]
        y_train_fold = y_train_set.iloc[train_idx]
        X_valid_fold = X_train_set.iloc[valid_idx]
        y_valid_fold = y_train_set.iloc[valid_idx]
        
        # Initialize and train the model
        regressor = model_class(**parameters, enable_categorical=True)
        regressor.fit(X_train_fold, y_train_fold, eval_set=[(X_valid_fold, y_valid_fold)], verbose=500, early_stopping_rounds=50)
        
        # Store predictions
        test_predictions[:, fold_num] = regressor.predict(X_test_set)
        
        print(f"Completed fold {fold_num + 1}")
        print('-' * 50)
    
    # Average the predictions over the folds
    final_predictions = np.mean(test_predictions, axis=1)
    return regressor, final_predictions

In [10]:
xgboost_params = {
    'lambda': 0.03880258557285165,
    'alpha': 0.02129832295514386,
    'colsample_bytree': 0.4,
    'subsample': 0.7,
    'learning_rate': 0.014,
    'max_depth': 17,
    'random_state': 2020,
    'min_child_weight': 85,
    'n_estimators': 10000,
    #'tree_method': 'gpu_hist'
}

print('Running XGBoost Cross-Validation...\n')
# Perform cross-validation with the XGBoost model
xgb_trained_model, xgb_test_predictions = perform_cross_validation(XGBRegressor, X_train_set, y_train_set, xgboost_params)

Running XGBoost Cross-Validation...

[0]	validation_0-rmse:74419.29006
[366]	validation_0-rmse:67883.05468
Completed fold 1
--------------------------------------------------
[0]	validation_0-rmse:74718.92972
[389]	validation_0-rmse:68677.14311
Completed fold 2
--------------------------------------------------
[0]	validation_0-rmse:79476.68630
[307]	validation_0-rmse:73917.39141
Completed fold 3
--------------------------------------------------
[0]	validation_0-rmse:82309.87939
[416]	validation_0-rmse:76439.98025
Completed fold 4
--------------------------------------------------
[0]	validation_0-rmse:82069.43187
[359]	validation_0-rmse:76258.41618
Completed fold 5
--------------------------------------------------


In [11]:
submission_df = pd.read_csv('sample_submission.csv')
submission_df['price'] = xgb_test_predictions.astype(np.float32)

submission_df.to_csv('submission_second.csv', index=False)