# House Prices - Advanced Regression Techniques

* Author: [John Adeojo](https://www.john-adeojo.com/)
* Blog: [more projects on my medium blog](https://medium.com/@johnadeojo)
* LinkedIn: [Follow me](https://www.linkedin.com/in/john-adeojo/)

# Import Data 

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Both raw Kaggle files live in the same folder of the project's GitHub repo,
# so the shared prefix is spelled out once and each file name appended to it.
RAW_DATA_URL = "https://raw.githubusercontent.com/john-adeojo/kaggle_advanced_regression/main/data/01_raw"

df_train = pd.read_csv(f"{RAW_DATA_URL}/train%20(3).csv")
df_test = pd.read_csv(f"{RAW_DATA_URL}/test%20(2).csv")

In [3]:
# do some quick data profiling with ydata profiling
# from ydata_profiling import ProfileReport

# profile = ProfileReport(df_train, title="Pandas Profiling Report: House Price Data")
# profile.to_file(r"C:\Users\johna\anaconda3\envs\kaggle-env\kaggle_advanced_regression\data\02_reports\testprofile_report.html")

In [4]:
# profile = ProfileReport(df_test, title="Pandas Profiling Report: House Price Data (Test)")
# profile.to_file(r"C:\Users\johna\anaconda3\envs\kaggle-env\kaggle_advanced_regression\data\02_reports\testprofile_report.html")

# Data Wrangling & Cleaning

In [5]:
# Define a function to replace missing values

import pandas as pd

def impute_missing(df, test=False):
    """Split off the id/target columns and fill missing values with fixed defaults.

    The input frame is not modified: the ``drop`` call returns a new frame and
    all fills are applied to that new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw house-price data. Must contain ``Id``, every column listed in the
        fill table below and the four year columns; ``SalePrice`` is also
        required when ``test`` is false. A missing column raises ``KeyError``.
    test : bool, default False
        ``False`` for the training file (has ``SalePrice``), ``True`` for the
        test file (no target column).

    Returns
    -------
    tuple
        ``(features, y, ids)`` when ``test`` is false, ``(features, ids)``
        otherwise. ``features`` has ``Id`` (and ``SalePrice``) removed.
    """
    # Column -> replacement for missing entries. Numeric columns get 0 (or the
    # 9999 sentinel for GarageYrBlt); text columns get either an explicit
    # 'None' / 'No Alley' label or a fixed category level (presumably the most
    # common level in the data -- TODO confirm).
    fill_values = {
        'LotFrontage': 0,
        'Alley': 'No Alley',
        'MasVnrType': 'None',
        'MasVnrArea': 0,
        'BsmtQual': 'None',
        'BsmtCond': 'None',
        'BsmtExposure': 'None',
        'BsmtFinType1': 'None',
        'BsmtFinType2': 'None',
        'Electrical': 'SBrkr',
        'FireplaceQu': 'None',
        'GarageType': 'None',
        'GarageYrBlt': 9999,
        'GarageFinish': 'None',
        'GarageQual': 'None',
        'GarageCond': 'None',
        'PoolQC': 'None',
        'Fence': 'None',
        'MiscFeature': 'None',
        'MSZoning': 'RL',
        'Utilities': 'AllPub',
        'Exterior1st': 'VinylSd',
        'Exterior2nd': 'VinylSd',
        'BsmtFinSF1': 0,
        'BsmtFinSF2': 0,
        'BsmtUnfSF': 0,
        'TotalBsmtSF': 0,
        'BsmtFullBath': 0,
        'BsmtHalfBath': 0,
        'KitchenQual': 'TA',
        'Functional': 'Typ',
        'GarageCars': 0,
        'GarageArea': 0,
        'SaleType': 'WD',
    }
    # Year columns are cast to object so that a later dtype-based column
    # selection treats them as categorical rather than numeric.
    year_columns = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']

    id_df = df['Id']
    if test:
        y = None
        df = df.drop(columns=['Id'])
    else:
        y = df['SalePrice']
        df = df.drop(columns=['Id', 'SalePrice'])

    # Column-by-column so that a missing column still raises KeyError instead
    # of being silently skipped.
    for column, value in fill_values.items():
        df[column] = df[column].fillna(value)

    df = df.astype({column: 'object' for column in year_columns})

    if test:
        return df, id_df
    return df, y, id_df



In [6]:
# Clean both files with the same imputation rules. The training call also
# returns the SalePrice target; both calls return the Id column separately.
X, y, train_id = impute_missing(df_train, test=False)
X_test, test_id = impute_missing(df_test, test=True)

In [7]:
# Partition the training columns by dtype: object/category columns are treated
# as categorical features, anything numeric as a numeric feature.
categorical_frame = X.select_dtypes(include=['object', 'category'])
numeric_frame = X.select_dtypes(include=['number'])

cat_features = categorical_frame.columns.tolist()
num_features = numeric_frame.columns.tolist()

In [8]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class LogTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that applies ``log(1 + x)`` element-wise."""

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``np.log1p(X)``; log1p is used so that zeros map to 0."""
        logged = np.log1p(X)
        return logged


In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import Ridge, LinearRegression, ElasticNet
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor


# Feature preprocessing shared by every model pipeline.
# NOTE(review): 'logtransform' and 'standardize' are two separate branches over
# the same numeric columns, so the model receives both a log1p copy and a
# standardized copy of each numeric feature (not log-then-scale) -- confirm
# this is intended.
numeric_and_categorical_steps = [
    ('logtransform', LogTransformer(), num_features),
    ('standardize', StandardScaler(), num_features),
    ('onehotencode', OneHotEncoder(handle_unknown='ignore'), cat_features),
]
preprocessor = ColumnTransformer(transformers=numeric_and_categorical_steps)


def _with_preprocessing(regressor):
    """Wrap *regressor* in a two-step pipeline behind the shared preprocessor."""
    return Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', regressor),
    ])


# NOTE(review): tree_method='gpu_hist' presumably requires a GPU-enabled
# XGBoost build -- confirm against the installed version.
xgb_pipeline = _with_preprocessing(
    XGBRegressor(tree_method='gpu_hist', objective='reg:squarederror', random_state=42)
)

rf_pipeline = _with_preprocessing(RandomForestRegressor(random_state=42))

ridge_pipeline = _with_preprocessing(Ridge(random_state=42))



In [10]:
# Hold out 25% of the training rows for validation (fixed seed).
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, random_state=42, test_size=0.25
)

# Models are fitted on the natural log of the sale price.
y_train, y_validation = np.log(y_train), np.log(y_validation)

In [11]:
from skopt.space import Real, Integer, Categorical

def _for_regressor(space):
    """Prefix each name with 'regressor__' so it targets the pipeline's final step."""
    return {f"regressor__{name}": dimension for name, dimension in space.items()}


# Search space for XGBoost
xgb_param_grid = _for_regressor({
    'learning_rate': Real(0.01, 0.3, prior='log-uniform'),
    'n_estimators': Integer(50, 2000),
    'max_depth': Integer(3, 50),
    'min_child_weight': Integer(1, 20),
    'gamma': Real(0, 5),
    'subsample': Real(0.5, 1),
    'colsample_bytree': Real(0.5, 1),
    'reg_alpha': Real(0.1, 1000, prior='log-uniform'),
    'reg_lambda': Real(0.1, 1000, prior='log-uniform'),
})

# Search space for RandomForest
rf_param_grid = _for_regressor({
    'n_estimators': Integer(50, 2000),
    'max_depth': Integer(3, 50),
    'min_samples_split': Integer(2, 20),
    'min_samples_leaf': Integer(1, 20),
})

# Search space for Ridge
ridge_param_grid = _for_regressor({
    'alpha': Real(0.1, 1000, prior='log-uniform'),
})




In [None]:
from skopt import BayesSearchCV


def _tune_pipeline(pipeline, search_space, features, target):
    """Run a seeded Bayesian hyper-parameter search and return the fitted tuner.

    Every model uses the same settings: 5-fold CV, 30 sampled configurations,
    negative RMSE as the score, and all available cores. ``random_state`` is
    fixed so repeated runs sample the same configurations; the models and the
    train/validation split in this notebook are seeded with 42 as well.
    """
    tuner = BayesSearchCV(
        pipeline,
        search_space,
        cv=5,
        scoring='neg_root_mean_squared_error',
        n_iter=30,
        n_jobs=-1,
        random_state=42,
    )
    tuner.fit(features, target)
    return tuner


# Tune XGBoost model
xgb_tuner = _tune_pipeline(xgb_pipeline, xgb_param_grid, X_train, y_train)

# Tune RandomForest model
rf_tuner = _tune_pipeline(rf_pipeline, rf_param_grid, X_train, y_train)

# Tune Ridge model
ridge_tuner = _tune_pipeline(ridge_pipeline, ridge_param_grid, X_train, y_train)





In [None]:
# Score each tuned model on the held-out validation set. The tuners score with
# negative RMSE, so the sign is flipped before printing; the targets were
# log-transformed, so this is an RMSE on log prices.
for message, tuner in (
    ("Validation score xgb: ", xgb_tuner),
    ("Validation score random forest: ", rf_tuner),
    ("Validation score ridge: ", ridge_tuner),
):
    validation_score = tuner.score(X_validation, y_validation)
    print(message, -validation_score)


In [None]:
from sklearn.linear_model import LinearRegression

# Base learners: the best pipeline found by each tuner, keyed by a short name.
tuned_searches = {'xgb': xgb_tuner, 'rf': rf_tuner, 'ridge': ridge_tuner}
models = [(label, search.best_estimator_) for label, search in tuned_searches.items()]

# Stack the base learners under a plain linear regression, using 5-fold CV and
# all available cores.
stacking_regressor = StackingRegressor(
    estimators=models,
    final_estimator=LinearRegression(),
    n_jobs=-1,
    cv=5,
)

# Fit the stacked model on the training split (log-transformed target)
stacking_regressor.fit(X_train, y_train)

In [None]:
# Test stacking model on validation set 
from sklearn.metrics import mean_squared_error

# RMSE of the stacked model on the validation split (targets are log prices).
y_pred = stacking_regressor.predict(X_validation)
validation_mse = mean_squared_error(y_validation, y_pred)
stacked_rmse = np.sqrt(validation_mse)
print("Validation score stacked model: ", stacked_rmse)


In [None]:
# Refit on the full training set before predicting the test set. The target
# must be log-transformed here as well: the next step applies np.exp to the
# predictions, so fitting on raw SalePrice would make exp() overflow.
stacking_regressor.fit(X, np.log(y))


In [None]:
# Predict on the test features, then undo the log with exp to get prices.
log_price_pred = stacking_regressor.predict(X_test)
test_pred = np.exp(log_price_pred)
test_pred

In [None]:
# Submission frame: the test Ids plus the predicted price rounded to a whole number.
rounded_prices = test_pred.round()
submission = pd.DataFrame(test_id)
submission['SalePrice'] = rounded_prices

In [None]:
# Write the submission file. The original author's Desktop folder is used when
# it exists, so behaviour on that machine is unchanged; anywhere else the file
# goes to the current working directory instead of a machine-specific path.
from pathlib import Path

_preferred_dir = Path(r"C:\Users\johna\OneDrive\Desktop")
_output_dir = _preferred_dir if _preferred_dir.is_dir() else Path.cwd()
submission.to_csv(_output_dir / "submission.csv", index=False)