In [1]:
# import pandas as pd
# import numpy as np
# from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, StackingRegressor
# from catboost import CatBoostRegressor
# from lightgbm import LGBMRegressor
# from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error
# from sklearn.preprocessing import StandardScaler

# # Load the data
# train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
# test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from scipy.stats import randint, uniform

# Load the data
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

# Save the 'Id' column from the test set for submission
test_ids = test['Id']

# Preprocessing
# Keep only the columns that are fully populated in the TRAINING data. The
# selection is decided from train alone and then applied to test, so both
# frames describe the same features. Calling dropna(axis=1) on each frame
# independently can remove a different set of columns from test; the later
# align(..., fill_value=0) would then silently rebuild those real features as
# all-zero columns at prediction time.
train = train.dropna(axis=1)

# Separate features and target
X = train.drop(['Id', 'SalePrice'], axis=1)
y = train['SalePrice']

# Restrict test to exactly the training feature columns (this also removes 'Id',
# which was saved above). reindex keeps the training column order.
test = test.reindex(columns=X.columns)

# Fill gaps that exist only in test with statistics computed from the training
# features (median for numeric columns, most frequent value for the rest), so
# no information from the test set leaks into preprocessing.
numeric_cols = X.select_dtypes(include='number').columns
categorical_cols = X.columns.difference(numeric_cols)
if len(numeric_cols) > 0:
    test = test.fillna(X[numeric_cols].median())
if len(categorical_cols) > 0:
    # mode() returns one row per tied mode; row 0 always holds a value for every column
    test = test.fillna(X[categorical_cols].mode().iloc[0])

# One-hot encode categorical variables
X = pd.get_dummies(X)
test = pd.get_dummies(test)

# Ensure the test set has the same columns as the training set.
# join='left' keeps X's columns: dummy levels seen only in test are dropped,
# and levels seen only in train are added to test as all-zero indicators.
X, test = X.align(test, join='left', axis=1, fill_value=0)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# =============================================
# XGBoost with Hyperparameter Tuning
# =============================================
print("Training XGBoost Model with Hyperparameter Tuning...")

# Base estimator; the search clones it for every sampled configuration
xgb_model = XGBRegressor(random_state=42, n_jobs=-1)

# Sampling distributions for the randomized search (not a fixed grid).
# scipy conventions: randint(low, high) draws integers in [low, high), and
# uniform(loc, scale) draws floats in [loc, loc + scale].
param_dist = {
    'n_estimators': randint(100, 1000),    # Number of trees: 100..999
    'max_depth': randint(3, 10),           # Maximum depth of a tree: 3..9
    'learning_rate': uniform(0.01, 0.3),   # Learning rate: 0.01..0.31
    'subsample': uniform(0.6, 0.4),        # Row subsample ratio per tree: 0.6..1.0
    'colsample_bytree': uniform(0.6, 0.4), # Column subsample ratio per tree: 0.6..1.0
    'gamma': uniform(0, 0.5),              # Minimum loss reduction required to make a split
    'reg_alpha': uniform(0, 1),            # L1 regularization term
    'reg_lambda': uniform(0, 1)            # L2 regularization term
}

# Randomized Search for hyperparameter tuning
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=50,  # Number of parameter settings sampled
    scoring='neg_mean_squared_error',  # sklearn maximizes scores, hence negated MSE
    cv=5,  # 5-fold cross-validation
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Fit the Randomized Search
random_search.fit(X_train, y_train)

# Best hyperparameters
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

# With the default refit=True, the search has already retrained a clone of
# xgb_model (same random_state / n_jobs) with best_params on all of X_train.
# Reuse that fitted estimator instead of building and fitting an identical
# model a second time.
best_xgb_model = random_search.best_estimator_

# Predict on the validation set
y_pred_xgb = best_xgb_model.predict(X_val)

# Evaluate XGBoost using RMSE
rmse_xgb = np.sqrt(mean_squared_error(y_val, y_pred_xgb))
print(f'XGBoost RMSE: {rmse_xgb}')

# =============================================
# Predict on the Test Set
# =============================================
# Score the competition test features with the tuned XGBoost model
test_pred_xgb = best_xgb_model.predict(test)

# Assemble the submission table: the saved test Ids first, then the
# predicted sale price for each of those rows
submission = pd.DataFrame({'Id': test_ids}).assign(SalePrice=test_pred_xgb)

# Write the submission to disk without the index column
submission.to_csv('submission_xgboost_tuned.csv', index=False)
print("Submission file saved as 'submission_xgboost_tuned.csv'.")

Training XGBoost Model with Hyperparameter Tuning...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Hyperparameters: {'colsample_bytree': 0.7173952698872152, 'gamma': 0.007039911357542228, 'learning_rate': 0.06965272122664154, 'max_depth': 5, 'n_estimators': 518, 'reg_alpha': 0.7712703466859457, 'reg_lambda': 0.07404465173409036, 'subsample': 0.7433862914177091}
XGBoost RMSE: 25323.559430992984
Submission file saved as 'submission_xgboost_tuned.csv'.
