# House Prices - Advanced Regression Techniques

* Author: [John Adeojo](https://www.john-adeojo.com/)
* Blog: [more projects on my medium blog](https://medium.com/@johnadeojo)
* LinkedIn: [Follow me](https://www.linkedin.com/in/john-adeojo/)

# Import Data 

In [1]:
import pandas as pd
import numpy as np

In [2]:
df_train = pd.read_csv(r"https://raw.githubusercontent.com/john-adeojo/kaggle_advanced_regression/main/data/01_raw/train%20(3).csv")
df_test = pd.read_csv(r"https://raw.githubusercontent.com/john-adeojo/kaggle_advanced_regression/main/data/01_raw/test%20(2).csv")

In [3]:
# do some quick data profiling with ydata profiling
# from ydata_profiling import ProfileReport

# profile = ProfileReport(df_train, title="Pandas Profiling Report: House Price Data")
# profile.to_file(r"C:\Users\johna\anaconda3\envs\kaggle-env\kaggle_advanced_regression\data\02_reports\testprofile_report.html")

In [4]:
# profile = ProfileReport(df_test, title="Pandas Profiling Report: House Price Data (Test)")
# profile.to_file(r"C:\Users\johna\anaconda3\envs\kaggle-env\kaggle_advanced_regression\data\02_reports\testprofile_report.html")

# Data Wrangling & Cleaning

In [5]:
# define funtion te rplace missing vairbales

import pandas as pd

def impute_missing(df, test=False):
    
    if test == False:
        id_df = df['Id']
        y = df['SalePrice']
        df = df.drop(columns=['Id', 'SalePrice'])
    else:
        id_df = df['Id']
        df = df.drop(columns=['Id'])
    
    
    df['LotFrontage'] = df['LotFrontage'].fillna(0)
    df['Alley'] = df['Alley'].fillna('No Alley')
    df['MasVnrType'] = df['MasVnrType'].fillna('None')
    df['MasVnrArea'] = df['MasVnrArea'].fillna(0)
    df['BsmtQual'] = df['BsmtQual'].fillna('None')
    df['BsmtCond'] = df['BsmtCond'].fillna('None')
    df['BsmtExposure'] = df['BsmtExposure'].fillna('None')
    df['BsmtFinType1'] = df['BsmtFinType1'].fillna('None')
    df['BsmtFinType2'] = df['BsmtFinType2'].fillna('None')
    df['Electrical'] = df['Electrical'].fillna('SBrkr')
    df['FireplaceQu'] = df['FireplaceQu'].fillna('None')
    df['GarageType'] = df['GarageType'].fillna('None')
    df['GarageYrBlt'] = df['GarageYrBlt'].fillna(9999)
    df['GarageFinish'] = df['GarageFinish'].fillna('None')
    df['GarageQual'] = df['GarageQual'].fillna('None')
    df['GarageCond'] = df['GarageCond'].fillna('None')
    df['PoolQC'] = df['PoolQC'].fillna('None')
    df['Fence'] = df['Fence'].fillna('None')
    df['MiscFeature'] = df['MiscFeature'].fillna('None')
    
    df['MSZoning'] = df['MSZoning'].fillna('RL')
    df['Utilities'] = df['Utilities'].fillna('AllPub')
    df['Exterior1st'] = df['Exterior1st'].fillna('VinylSd') 
    df['Exterior2nd'] = df['Exterior2nd'].fillna('VinylSd') 
    df['BsmtFinSF1'] = df['BsmtFinSF1'].fillna(0) 
    df['BsmtFinSF2'] = df['BsmtFinSF2'].fillna(0)
    df['BsmtUnfSF'] = df['BsmtUnfSF'].fillna(0)
    df['BsmtUnfSF'] = df['BsmtUnfSF'].fillna(0)
    df['TotalBsmtSF'] = df['TotalBsmtSF'].fillna(0)
    df['BsmtFullBath'] = df['BsmtFullBath'].fillna(0)
    df['BsmtHalfBath'] = df['BsmtHalfBath'].fillna(0)
    df['KitchenQual'] = df['KitchenQual'].fillna('TA')
    df['Functional'] = df['Functional'].fillna('Typ')
    df['Functional'] = df['Functional'].fillna('Typ')
    df['GarageCars'] = df['GarageCars'].fillna(0)
    df['GarageArea'] = df['GarageArea'].fillna(0) 
    df['SaleType'] = df['SaleType'].fillna('WD') 
    
    # convert int to object
    df['YearBuilt'] = df['YearBuilt'].astype('object')
    df['YearRemodAdd'] = df['YearRemodAdd'].astype('object')
    df['GarageYrBlt'] = df['GarageYrBlt'].astype('object')
    df['YrSold'] = df['YrSold'].astype('object')


    
    if test == False:
        return df, y, id_df
    else:
        return df, id_df



In [6]:
X, y, train_id = impute_missing(df_train, test=False)
X_test, test_id = impute_missing(df_test, test=True)

In [7]:
# Identify categorical and numeric columns
cat_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
num_features = X.select_dtypes(include=['number']).columns.tolist()

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from umap import UMAP
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor



preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
    ])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(tree_method='gpu_hist', objective='reg:squarederror'))
])



In [9]:
from scipy.stats import uniform, randint

# Helper function
def generate_uniform_distribution(min_val, max_val, size=1, seed=19):
    # create a uniform distribution object
    dist = uniform(loc=min_val, scale=max_val-min_val)
    return dist


In [10]:
# split data 
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.25, random_state=42)

In [11]:
from skopt.space import Real, Integer, Categorical


param_grid = {
    'regressor__learning_rate': Real(0.01, 0.3, prior='log-uniform'),  # or use lower values like Real(0.001, 0.1, prior='log-uniform') for finer search
    'regressor__n_estimators': Integer(50, 1000),  # Number of boosting rounds
    'regressor__max_depth': Integer(3, 10),  # Maximum depth of a tree
    'regressor__min_child_weight': Integer(1, 10),  # Minimum sum of instance weight needed in a child
    'regressor__gamma': Real(0, 5),  # Minimum loss reduction required to make a further partition on a leaf node of the tree
    'regressor__subsample': Real(0.5, 1),  # Subsample ratio of the training instances
    'regressor__colsample_bytree': Real(0.5, 1),  # Subsample ratio of columns when constructing each tree
    'regressor__reg_alpha': Real(0, 100, prior='log-uniform'),  # L1 regularization term on weights
    'regressor__reg_lambda': Real(1, 100, prior='log-uniform'),  # L2 regularization term on weights
}


In [12]:
from sklearn.model_selection import RandomizedSearchCV
from skopt import BayesSearchCV

model_random_search = BayesSearchCV(pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_iter=30, n_jobs=1)
model_random_search.fit(X_train, y_train)

In [13]:
best_params = model_random_search.best_params_

# Print the best parameters
print("Best parameters found: ", best_params)

Best parameters found:  OrderedDict([('regressor__colsample_bytree', 0.95826792083501), ('regressor__gamma', 4.696482949998402), ('regressor__learning_rate', 0.036472672926974405), ('regressor__max_depth', 5), ('regressor__min_child_weight', 2), ('regressor__n_estimators', 382), ('regressor__reg_alpha', 4.5076049909087335), ('regressor__reg_lambda', 5.0), ('regressor__subsample', 0.5)])


In [14]:
validation_score = model_random_search.score(X_validation, y_validation)

# Print the validation score
print("Validation score: ", validation_score)

Validation score:  -38387.856446736114
