# Below is my best score of .95 on xgboost, gotten after 50 attempts at hyperparameter tuning

In [10]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import matplotlib.pyplot as plt

# Import the functions from the module
from data_processing_module import load_data, preprocess_data, split_data

# Cell 2: Use the imported functions
file_path = 'merged3.csv'  # Update this path to the location of your file
target_column = 'SalePrice_x'
drop_columns = ['Unnamed: 0', 'PID', 'Index']

# Load the data
data = load_data(file_path)

# Preprocess the data
X_preprocessed, y, preprocessor = preprocess_data(data, target_column, drop_columns)

# Split the data
X_train, X_test, y_train, y_test = split_data(X_preprocessed, y)

# Define the alpha values for Lasso regression
alpha_values = [0.001, 0.01, 0.1, 1]

# Define the parameter grid for XGBoost
param_grid_xgb = {
    'n_estimators': [7000],
    'learning_rate': [0.03, 0.001],
    'max_depth': [3],
    'subsample': [0.8],
    'colsample_bytree': [0.9]
}

# Initialize the results dictionary
results = {}

for alpha in alpha_values:
    print(f"Running Lasso with alpha={alpha}")

    # Define and fit the Lasso model
    lasso = Lasso(alpha=alpha, max_iter=10000)
    lasso.fit(X_train, y_train)

    # Get the selected features
    selected_features = np.where(lasso.coef_ != 0)[0]
    X_train_selected = X_train[:, selected_features]
    X_test_selected = X_test[:, selected_features]

    # Define the XGBoost model
    xgb_model = XGBRegressor(random_state=42)

    # Hyperparameter tune the XGBoost model
    grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid_xgb, scoring='r2', cv=5, n_jobs=-1, verbose=2)
    grid_search_xgb.fit(X_train_selected, y_train)

    best_xgb = grid_search_xgb.best_estimator_
    print(f"Best XGBoost Parameters for alpha={alpha}: {grid_search_xgb.best_params_}")

    # Make predictions with the best XGBoost model
    y_pred_xgb = best_xgb.predict(X_test_selected)

    # Evaluate the XGBoost model
    mse_xgb = mean_squared_error(y_test, y_pred_xgb)
    r2_xgb = r2_score(y_test, y_pred_xgb)

    print(f'XGBoost Model Mean Squared Error for alpha={alpha}: {mse_xgb}')
    print(f'XGBoost Model R-squared for alpha={alpha}: {r2_xgb}')

    # Store the results
    results[alpha] = {
        'best_params': grid_search_xgb.best_params_,
        'mse': mse_xgb,
        'r2': r2_xgb
    }

# Optionally, you can plot the results or further analyze them
for alpha, result in results.items():
    print(f"Alpha: {alpha}")
    print(f"Best Parameters: {result['best_params']}")
    print(f"MSE: {result['mse']}")
    print(f"R2: {result['r2']}")
    print("\n")




Running Lasso with alpha=0.001
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best XGBoost Parameters for alpha=0.001: {'colsample_bytree': 0.9, 'learning_rate': 0.03, 'max_depth': 3, 'n_estimators': 7000, 'subsample': 0.8}
XGBoost Model Mean Squared Error for alpha=0.001: 320937885.95468134
XGBoost Model R-squared for alpha=0.001: 0.9460580348968506
Running Lasso with alpha=0.01
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best XGBoost Parameters for alpha=0.01: {'colsample_bytree': 0.9, 'learning_rate': 0.03, 'max_depth': 3, 'n_estimators': 7000, 'subsample': 0.8}
XGBoost Model Mean Squared Error for alpha=0.01: 309527158.7336001
XGBoost Model R-squared for alpha=0.01: 0.9479759335517883
Running Lasso with alpha=0.1


  model = cd_fast.enet_coordinate_descent(


Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best XGBoost Parameters for alpha=0.1: {'colsample_bytree': 0.9, 'learning_rate': 0.03, 'max_depth': 3, 'n_estimators': 7000, 'subsample': 0.8}
XGBoost Model Mean Squared Error for alpha=0.1: 303793238.5887984
XGBoost Model R-squared for alpha=0.1: 0.9489396810531616
Running Lasso with alpha=1


  model = cd_fast.enet_coordinate_descent(


Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best XGBoost Parameters for alpha=1: {'colsample_bytree': 0.9, 'learning_rate': 0.03, 'max_depth': 3, 'n_estimators': 7000, 'subsample': 0.8}
XGBoost Model Mean Squared Error for alpha=1: 294400067.5396449
XGBoost Model R-squared for alpha=1: 0.9505184292793274
Alpha: 0.001
Best Parameters: {'colsample_bytree': 0.9, 'learning_rate': 0.03, 'max_depth': 3, 'n_estimators': 7000, 'subsample': 0.8}
MSE: 320937885.95468134
R2: 0.9460580348968506


Alpha: 0.01
Best Parameters: {'colsample_bytree': 0.9, 'learning_rate': 0.03, 'max_depth': 3, 'n_estimators': 7000, 'subsample': 0.8}
MSE: 309527158.7336001
R2: 0.9479759335517883


Alpha: 0.1
Best Parameters: {'colsample_bytree': 0.9, 'learning_rate': 0.03, 'max_depth': 3, 'n_estimators': 7000, 'subsample': 0.8}
MSE: 303793238.5887984
R2: 0.9489396810531616


Alpha: 1
Best Parameters: {'colsample_bytree': 0.9, 'learning_rate': 0.03, 'max_depth': 3, 'n_estimators': 7000, 'subsample': 0.8}
