# House Prices Regression

In [168]:

import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from scipy.stats import randint, uniform
from sklearn.metrics import root_mean_squared_error, r2_score
import numpy as np
from sklearn.preprocessing import OneHotEncoder


In [94]:
model_df = pd.read_csv("/Users/sa17/Desktop/house-prices-regression/data/processed/newtrain.csv")

model_df

Unnamed: 0,SalePrice,MSSubClass,LotArea,OverallQual,GrLivArea,GarageCars,1stFlrSF,FullBath,TotRmsAbvGrd,YearBuilt,...,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker
0,208500,60,8450,7,1710,2,856,2,8,2003,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,181500,20,9600,6,1262,2,1262,2,6,1976,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,223500,60,11250,7,1786,2,920,2,6,2001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,140000,70,9550,7,1717,3,961,1,7,1915,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,250000,60,14260,8,2198,3,1145,2,9,2000,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1394,175000,60,7917,6,1647,2,953,2,7,1999,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1395,210000,20,13175,6,2073,2,2073,2,7,1978,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1396,266500,70,9042,7,2340,1,1188,2,9,1941,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1397,142125,20,9717,5,1078,1,1078,1,5,1950,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Linear Regression Model

In [95]:
# Declare feature vector and target variable 
X = model_df.drop(["SalePrice"], axis=1)

y = model_df["SalePrice"]

# Split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression Model
linear = LinearRegression()
linear.fit(X_train, y_train)
linear.score(X_test, y_test)

# Prediction Model
linear_pred = linear.predict(X_test)

# Evaluate Model
linear_rmse = root_mean_squared_error(y_test, linear_pred)
linear_r2 = r2_score(y_test, linear_pred)

print(f"Linear Regression RMSE: {linear_rmse:.2f}")
print(f"Linear Regression R2 Score: {linear_r2:.2f}")



Linear Regression RMSE: 22093.61
Linear Regression R2 Score: 0.85


Root Mean Squared Error measures how far the model’s predictions are, on average, from the actual SalePrice. 


R2 measures how well the model explains the variance in the data. R2 ranges from 0 t0 1 and the closer it is to 1 the better the model is.

The Linear Regression model predictions are off by about 22,093 on average.

0.85 = 85% of the variation in house prices. 

## Hyperparameter Tuning

In [96]:
# For Linear Regression use ElasticNet (combination of Lasso (L1) and Ridge (L2) to regularized the model)
elastic = ElasticNet()

# Hyperparameter Tuning with RandomizedSearchCV  
param_dist = {
    "alpha": np.logspace(-3, 2, 100),      # Regularization strength 
    "l1_ratio": np.linspace(0.1, 1.0, 12)  # Determines the mix between Lasso (L1) and Ridge (L2)
}

elastic_search = RandomizedSearchCV(
    estimator=elastic,                      # Model
    param_distributions=param_dist,         # Hyperparameter
    n_iter=10,                              # Number of parameter combinations to try
    cv=5,                                   # 5-fold cross-validation
    scoring="neg_root_mean_squared_error",  # set to negative mean squared error for minimization
    random_state=42,
    n_jobs=-1,                              # using all processors
    verbose=2                               # the computation time for each fold and parameter is displayed
)

# Tuned Model
elastic_search.fit(X_train, y_train)
linear_tuned = elastic_search.best_estimator_
print(f"\nBest Parameters: {linear_tuned}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END alpha=49.770235643321136, l1_ratio=0.9181818181818181; total time=   0.0s
[CV] END alpha=49.770235643321136, l1_ratio=0.9181818181818181; total time=   0.0s
[CV] END alpha=49.770235643321136, l1_ratio=0.9181818181818181; total time=   0.0s
[CV] END alpha=49.770235643321136, l1_ratio=0.9181818181818181; total time=   0.0s
[CV] END alpha=49.770235643321136, l1_ratio=0.9181818181818181; total time=   0.0s
[CV] END alpha=3.8535285937105273, l1_ratio=0.7545454545454545; total time=   0.0s
[CV] END alpha=3.8535285937105273, l1_ratio=0.7545454545454545; total time=   0.0s
[CV] END alpha=3.8535285937105273, l1_ratio=0.7545454545454545; total time=   0.0s
[CV] END alpha=3.8535285937105273, l1_ratio=0.7545454545454545; total time=   0.0s
[CV] END alpha=3.8535285937105273, l1_ratio=0.7545454545454545; total time=   0.0s
[CV] END alpha=55.90810182512222, l1_ratio=0.26363636363636367; total time=   0.0s
[CV] END alpha=55.90810182

In [97]:
# Prediction Model
linear_tuned_pred = linear_tuned.predict(X_test)

# Evaluate Model
linear_tuned_rmse = root_mean_squared_error(y_test, linear_tuned_pred)
linear_tuned_r2 = r2_score(y_test, linear_tuned_pred)

print(f"Tuned Linear Regression (Elastic) RMSE: {linear_tuned_rmse:.2f}")
print(f"Tuned Linear Regression (Elastic) R2 Score: {linear_tuned_r2:.2f}")

Tuned Linear Regression (Elastic) RMSE: 21887.15
Tuned Linear Regression (Elastic) R2 Score: 0.85



The Tuned Linear Regression model predictions are off by about 21,887 on average.

0.85 = 85% of the variation in house prices.

The perfomance of the model didn't really change.

## Random Forest Regressor Model

In [98]:
# Random Forest Regressor Model
random = RandomForestRegressor()
random.fit(X_train, y_train)
random.score(X_test, y_test)

# Prediction Model
random_pred = random.predict(X_test)

# Evaluate Model
random_rmse = root_mean_squared_error(y_test, random_pred)
random_r2 = r2_score(y_test, random_pred)

print(f"Random Forest Regressor RMSE: {random_rmse:.2f}")
print(f"Random Forest Regressor R2 Score: {random_r2:.2f}")

Random Forest Regressor RMSE: 22301.23
Random Forest Regressor R2 Score: 0.84


The Random Forest Regressor model predictions are off by about 22,301 on average.

0.84 = 84% of the variation in house prices. 

## Hyperparamater Tuning

In [99]:
# Hyperparameter Tuning with RandomizedSearchCV  
param_dist = {
    "n_estimators": randint(50, 500),       # Number of trees
    "max_depth": randint(3, 30),            # Max depth of each tree
    "min_samples_split": randint(2, 10),    # Minimum samples to split at internal node
    "min_samples_leaf": randint(1, 10),     # Minimum samples at leaf node
    "max_features": ["sqrt", "log2"],       # Number of features to consider at each split
    "bootstrap": [True, False]              # Whether bootstrap samples are used
}

random_search = RandomizedSearchCV(
    estimator=random,                       
    param_distributions=param_dist,        
    n_iter=10,                           
    cv=5,                                  
    scoring="neg_root_mean_squared_error",  
    random_state=42,
    n_jobs=-1,                             
    verbose=2                              
)

# Tuned Model
random_search.fit(X_train, y_train)
random_tuned = random_search.best_estimator_
print(f"\nBest Parameters: {random_tuned}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END bootstrap=True, max_depth=22, max_features=sqrt, min_samples_leaf=8, min_samples_split=6, n_estimators=70; total time=   0.1s
[CV] END bootstrap=True, max_depth=22, max_features=sqrt, min_samples_leaf=8, min_samples_split=6, n_estimators=70; total time=   0.1s
[CV] END bootstrap=True, max_depth=22, max_features=sqrt, min_samples_leaf=8, min_samples_split=6, n_estimators=70; total time=   0.1s
[CV] END bootstrap=True, max_depth=22, max_features=sqrt, min_samples_leaf=8, min_samples_split=6, n_estimators=70; total time=   0.1s
[CV] END bootstrap=True, max_depth=22, max_features=sqrt, min_samples_leaf=8, min_samples_split=6, n_estimators=70; total time=   0.1s
[CV] END bootstrap=True, max_depth=28, max_features=sqrt, min_samples_leaf=7, min_samples_split=4, n_estimators=137; total time=   0.1s
[CV] END bootstrap=True, max_depth=28, max_features=sqrt, min_samples_leaf=7, min_samples_split=4, n_estimators=137; total time=

In [100]:
# Prediction Model
random_tuned_pred = random_tuned.predict(X_test)

# Evaluate Model
random_tuned_rmse = root_mean_squared_error(y_test, random_tuned_pred)
random_tuned_r2 = r2_score(y_test, random_tuned_pred)

print(f"Tuned Random Forest Regressor RMSE: {random_tuned_rmse:.2f}")
print(f"Tuned Random Forest Regressor R2 Score: {random_tuned_r2:.2f}")

Tuned Random Forest Regressor RMSE: 20846.57
Tuned Random Forest Regressor R2 Score: 0.86


The Tuned Random Forest Regressor model predictions are off by about 20,846 on average.

0.86 = 86% of the variation in house prices. 

The perfomance of the model has increased.

## Gradient Boosting Regressor Model

In [101]:
# Gradient Boosting Regressor Model
gradient = GradientBoostingRegressor()
gradient.fit(X_train, y_train)
gradient.score(X_test, y_test)

# Prediction Model
gradient_pred = gradient.predict(X_test)

# Evaluate Model
gradient_rmse = root_mean_squared_error(y_test, gradient_pred)
gradient_r2 = r2_score(y_test, gradient_pred)

print(f"Gradient Boosting Regressor RMSE: {gradient_rmse:.2f}")
print(f"Gradient Boosting Regressor R2 Score: {gradient_r2:.2f}")

Gradient Boosting Regressor RMSE: 22064.10
Gradient Boosting Regressor R2 Score: 0.85


The Gradient Boosting Regressor model predictions are off by about 22,064 on average.

0.85 = 85% of the variation in house prices. 

## Hyperparameter Tuning

In [102]:
# Hyperparameter Tuning with RandomizedSearchCV  
param_dist = {
    "n_estimators": randint(50, 500),  
    "learning_rate": uniform(0.01, 0.10),    # step size shrinkage
    "max_depth": randint(3, 30),            
    "min_samples_split": randint(2, 10),    
    "min_samples_leaf": randint(1, 10),     
    "max_features": ["sqrt", "log2"],                
}

gradient_search = RandomizedSearchCV(
    estimator=gradient,                      
    param_distributions=param_dist,         
    n_iter=10,                              
    cv=5,                                  
    scoring="neg_root_mean_squared_error",  
    random_state=42,
    n_jobs=-1,                              
    verbose=2                               
)

# Tuned Model
gradient_search.fit(X_train, y_train)
gradient_tuned = gradient_search.best_estimator_
print(f"\nBest Parameters: {gradient_tuned}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END learning_rate=0.047454011884736254, max_depth=17, max_features=sqrt, min_samples_leaf=8, min_samples_split=6, n_estimators=70; total time=   0.1s
[CV] END learning_rate=0.047454011884736254, max_depth=17, max_features=sqrt, min_samples_leaf=8, min_samples_split=6, n_estimators=70; total time=   0.1s
[CV] END learning_rate=0.047454011884736254, max_depth=17, max_features=sqrt, min_samples_leaf=8, min_samples_split=6, n_estimators=70; total time=   0.1s
[CV] END learning_rate=0.047454011884736254, max_depth=17, max_features=sqrt, min_samples_leaf=8, min_samples_split=6, n_estimators=70; total time=   0.1s
[CV] END learning_rate=0.047454011884736254, max_depth=17, max_features=sqrt, min_samples_leaf=8, min_samples_split=6, n_estimators=70; total time=   0.1s
[CV] END learning_rate=0.025601864044243652, max_depth=21, max_features=sqrt, min_samples_leaf=8, min_samples_split=6, n_estimators=149; total time=   0.1s
[CV] END

In [103]:
# Prediction Model
gradient_tuned_pred = gradient_tuned.predict(X_test)

# Evaluate Model
gradient_tuned_rmse = root_mean_squared_error(y_test, gradient_tuned_pred)
gradient_tuned_r2 = r2_score(y_test, gradient_tuned_pred)

print(f"Tuned Gradient Boosting Regressor RMSE: {gradient_tuned_rmse:.2f}")
print(f"Tuned Gradient Boosting Regressor R2 Score: {gradient_tuned_r2:.2f}")

Tuned Gradient Boosting Regressor RMSE: 19906.37
Tuned Gradient Boosting Regressor R2 Score: 0.88


The Tuned Gradient Boosting Regressor model predictions are off by about 19,906 on average.

0.88 = 88% of the variation in house prices. 

The performace of the model has increased tremendously. 

## XGBoost Regressor Model

In [104]:
# XGBoost Regressor Model
xgboost = XGBRegressor()
xgboost.fit(X_train, y_train)
xgboost.score(X_test, y_test)

# Prediction Model
xgboost_pred = xgboost.predict(X_test)

# Evaluate Model
xgboost_rmse = root_mean_squared_error(y_test, xgboost_pred)
xgboost_r2 = r2_score(y_test, xgboost_pred)

print(f"XGboost Regressor RMSE: {xgboost_rmse:.2f}")
print(f"XGboost Regressor R2 Score: {xgboost_r2:.2f}")

XGboost Regressor RMSE: 24978.33
XGboost Regressor R2 Score: 0.80


The XGBoost Regressor model predictions are off by about 24,978 on average.

0.80 = 80% of the variation in house prices. 

## Hyperparameter Tuning

In [105]:
# Hyperparameter Tuning with RandomizedSearchCV  
param_dist = {
    "n_estimators": randint(50, 500),  
    "learning_rate": uniform(0.01, 0.10),   
    "max_depth": randint(3, 30), 
    "min_child_weight": randint(1, 10),      # Minimum samples at child node (controls overfitting)      
    "subsample": uniform(0.5, 0.5),          # Fraction of the training data to randomly sample (prevents overfitting)
    "colsample_bytree": uniform(0.5, 0.5)    # Fraction of features to randomly sample for each tree           
}

xgboost_search = RandomizedSearchCV(
    estimator=xgboost,                      
    param_distributions=param_dist,         
    n_iter=10,                              
    cv=5,                                  
    scoring="neg_root_mean_squared_error",  
    random_state=42,
    n_jobs=-1,                              
    verbose=2                               
)

# Tuned Model
xgboost_search.fit(X_train, y_train)
xgboost_tuned = xgboost_search.best_estimator_
print(f"\nBest Parameters: {xgboost_tuned}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END colsample_bytree=0.6872700594236812, learning_rate=0.10507143064099161, max_depth=13, min_child_weight=8, n_estimators=238, subsample=0.7984250789732434; total time=   0.3s
[CV] END colsample_bytree=0.6872700594236812, learning_rate=0.10507143064099161, max_depth=13, min_child_weight=8, n_estimators=238, subsample=0.7984250789732434; total time=   0.3s
[CV] END colsample_bytree=0.6872700594236812, learning_rate=0.10507143064099161, max_depth=13, min_child_weight=8, n_estimators=238, subsample=0.7984250789732434; total time=   0.3s
[CV] END colsample_bytree=0.6872700594236812, learning_rate=0.10507143064099161, max_depth=13, min_child_weight=8, n_estimators=238, subsample=0.7984250789732434; total time=   0.3s
[CV] END colsample_bytree=0.6872700594236812, learning_rate=0.10507143064099161, max_depth=13, min_child_weight=8, n_estimators=238, subsample=0.7984250789732434; total time=   0.3s
[CV] END colsample_bytree=0.7

In [106]:
# Prediction Model
xgboost_tuned_pred = xgboost_tuned.predict(X_test)

# Evaluate Model
xgboost_tuned_rmse = root_mean_squared_error(y_test, xgboost_tuned_pred)
xgboost_tuned_r2 = r2_score(y_test, xgboost_tuned_pred)

print(f"Tuned XGBoost Regressor RMSE: {xgboost_tuned_rmse:.2f}")
print(f"Tuned XGBoost Regressor R2 Score: {xgboost_tuned_r2:.2f}")

Tuned XGBoost Regressor RMSE: 20210.29
Tuned XGBoost Regressor R2 Score: 0.87


The Tuned XGBoost Regressor model predictions are off by about 20,210 on average.

0.87 = 87% of the variation in house prices. 

The performace of the model has increased tremendously as well.

## Model's Summary

In [110]:
print(f"Linear Regression RMSE: {linear_rmse:.2f}")
print(f"Linear Regression R2 Score: {linear_r2:.2f}\n")

print(f"Tuned Linear Regression (Elastic) RMSE: {linear_tuned_rmse:.2f}")
print(f"Tuned Linear Regression (Elastic) R2 Score: {linear_tuned_r2:.2f}\n")

print(f"Random Forest Regressor RMSE: {random_rmse:.2f}")
print(f"Random Forest Regressor R2 Score: {random_r2:.2f}\n")

print(f"Tuned Random Forest Regressor RMSE: {random_tuned_rmse:.2f}")
print(f"Tuned Random Forest Regressor R2 Score: {random_tuned_r2:.2f}\n")

print(f"Gradient Boosting Regressor RMSE: {gradient_rmse:.2f}")
print(f"Gradient Boosting Regressor R2 Score: {gradient_r2:.2f}\n")

print(f"Tuned Gradient Boosting Regressor RMSE: {gradient_tuned_rmse:.2f}")
print(f"Tuned Gradient Boosting Regressor R2 Score: {gradient_tuned_r2:.2f}\n")

print(f"XGboost Regressor RMSE: {xgboost_rmse:.2f}")
print(f"XGboost Regressor R2 Score: {xgboost_r2:.2f}\n")

print(f"Tuned XGBoost Regressor RMSE: {xgboost_tuned_rmse:.2f}")
print(f"Tuned XGBoost Regressor R2 Score: {xgboost_tuned_r2:.2f}")

Linear Regression RMSE: 22093.61
Linear Regression R2 Score: 0.85

Tuned Linear Regression (Elastic) RMSE: 21887.15
Tuned Linear Regression (Elastic) R2 Score: 0.85

Random Forest Regressor RMSE: 22301.23
Random Forest Regressor R2 Score: 0.84

Tuned Random Forest Regressor RMSE: 20846.57
Tuned Random Forest Regressor R2 Score: 0.86

Gradient Boosting Regressor RMSE: 22064.10
Gradient Boosting Regressor R2 Score: 0.85

Tuned Gradient Boosting Regressor RMSE: 19906.37
Tuned Gradient Boosting Regressor R2 Score: 0.88

XGboost Regressor RMSE: 24978.33
XGboost Regressor R2 Score: 0.80

Tuned XGBoost Regressor RMSE: 20210.29
Tuned XGBoost Regressor R2 Score: 0.87


The best-performing model was the Tuned Gradient Boosting Regressor. It achieved the most accurate predictions, with an average error (RMSE) of about $19,906. Additionally, it explained 88% of the variation in house prices (R2 = 0.88), indicating a strong fit between the model and the data.

## Apply Tuned Gradient Boosting Regressor to Test Dataset

In [201]:
df = pd.read_csv("/Users/sa17/Desktop/house-prices-regression/data/raw/test.csv")

# Use the same columns from model dataset
columns = ["MSSubClass", "LotArea", "OverallQual", "GrLivArea", "GarageCars", "1stFlrSF", "FullBath", "TotRmsAbvGrd", "YearBuilt", "YearRemodAdd", "Foundation", "Fireplaces", "ExterQual", "Neighborhood"]

test_df = df[columns]

test_df

Unnamed: 0,MSSubClass,LotArea,OverallQual,GrLivArea,GarageCars,1stFlrSF,FullBath,TotRmsAbvGrd,YearBuilt,YearRemodAdd,Foundation,Fireplaces,ExterQual,Neighborhood
0,20,11622,5,896,1.0,896,1,5,1961,1961,CBlock,0,TA,NAmes
1,20,14267,6,1329,1.0,1329,1,6,1958,1958,CBlock,0,TA,NAmes
2,60,13830,5,1629,2.0,928,2,6,1997,1998,PConc,1,TA,Gilbert
3,60,9978,6,1604,2.0,926,2,7,1998,1998,PConc,1,TA,Gilbert
4,120,5005,8,1280,2.0,1280,2,5,1992,1992,PConc,0,Gd,StoneBr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,1936,4,1092,0.0,546,1,5,1970,1970,CBlock,0,TA,MeadowV
1455,160,1894,4,1092,1.0,546,1,6,1970,1970,CBlock,0,TA,MeadowV
1456,20,20000,5,1224,2.0,1224,1,7,1960,1996,CBlock,1,TA,Mitchel
1457,85,10441,5,970,0.0,970,1,6,1992,1992,PConc,0,TA,Mitchel


In [202]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MSSubClass    1459 non-null   int64  
 1   LotArea       1459 non-null   int64  
 2   OverallQual   1459 non-null   int64  
 3   GrLivArea     1459 non-null   int64  
 4   GarageCars    1458 non-null   float64
 5   1stFlrSF      1459 non-null   int64  
 6   FullBath      1459 non-null   int64  
 7   TotRmsAbvGrd  1459 non-null   int64  
 8   YearBuilt     1459 non-null   int64  
 9   YearRemodAdd  1459 non-null   int64  
 10  Foundation    1459 non-null   object 
 11  Fireplaces    1459 non-null   int64  
 12  ExterQual     1459 non-null   object 
 13  Neighborhood  1459 non-null   object 
dtypes: float64(1), int64(10), object(3)
memory usage: 159.7+ KB


In [203]:
test_df.describe()

Unnamed: 0,MSSubClass,LotArea,OverallQual,GrLivArea,GarageCars,1stFlrSF,FullBath,TotRmsAbvGrd,YearBuilt,YearRemodAdd,Fireplaces
count,1459.0,1459.0,1459.0,1459.0,1458.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0
mean,57.378341,9819.161069,6.078821,1486.045922,1.766118,1156.534613,1.570939,6.385195,1971.357779,1983.662783,0.58122
std,42.74688,4955.517327,1.436812,485.566099,0.775945,398.16582,0.55519,1.508895,30.390071,21.130467,0.64742
min,20.0,1470.0,1.0,407.0,0.0,407.0,0.0,3.0,1879.0,1950.0,0.0
25%,20.0,7391.0,5.0,1117.5,1.0,873.5,1.0,5.0,1953.0,1963.0,0.0
50%,50.0,9399.0,6.0,1432.0,2.0,1079.0,2.0,6.0,1973.0,1992.0,0.0
75%,70.0,11517.5,7.0,1721.0,2.0,1382.5,2.0,7.0,2001.0,2004.0,1.0
max,190.0,56600.0,10.0,5095.0,5.0,5095.0,4.0,15.0,2010.0,2010.0,4.0


In [204]:
test_df.shape

(1459, 14)

In [205]:
# Checking for null values 
test_df.isna().sum()

MSSubClass      0
LotArea         0
OverallQual     0
GrLivArea       0
GarageCars      1
1stFlrSF        0
FullBath        0
TotRmsAbvGrd    0
YearBuilt       0
YearRemodAdd    0
Foundation      0
Fireplaces      0
ExterQual       0
Neighborhood    0
dtype: int64

In [207]:
# Checking the values for garage cars to see what best to fill null values with
test_df["GarageCars"].value_counts()

GarageCars
2.0    770
1.0    407
3.0    193
0.0     76
4.0     11
5.0      1
Name: count, dtype: int64

In [208]:
# Filling with 2.0 as it’s the most common value by far so it won’t skew the data
test_df.loc[:, "GarageCars"] = test_df["GarageCars"].fillna(2.0)

In [209]:
# One-hot encode all categorical columns 
cat_columns = test_df.select_dtypes(include=["object"]).columns

encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore") 

# Fit, transform and create a new dataframe 
onehot = encoder.fit_transform(test_df[cat_columns])

onehot_df = pd.DataFrame(onehot, columns=encoder.get_feature_names_out(cat_columns), index=test_df.index)

# Add the one-hot encoded columns to the original DataFrame and drop original categorical columns
encoded_df = pd.concat([test_df.drop(columns=cat_columns), onehot_df], axis=1)

encoded_df


Unnamed: 0,MSSubClass,LotArea,OverallQual,GrLivArea,GarageCars,1stFlrSF,FullBath,TotRmsAbvGrd,YearBuilt,YearRemodAdd,...,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker
0,20,11622,5,896,1.0,896,1,5,1961,1961,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,20,14267,6,1329,1.0,1329,1,6,1958,1958,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,60,13830,5,1629,2.0,928,2,6,1997,1998,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,60,9978,6,1604,2.0,926,2,7,1998,1998,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,120,5005,8,1280,2.0,1280,2,5,1992,1992,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,1936,4,1092,0.0,546,1,5,1970,1970,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1455,160,1894,4,1092,1.0,546,1,6,1970,1970,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1456,20,20000,5,1224,2.0,1224,1,7,1960,1996,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1457,85,10441,5,970,0.0,970,1,6,1992,1992,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [213]:
# Use Tuned Gradient Boosting Regressor and save sale price pedictions to csv file 
sale_predictions = gradient_tuned.predict(encoded_df)

final = pd.DataFrame({
    "Id": df["Id"],
    "SalePrice": sale_predictions
})

final.to_csv("/Users/sa17/Desktop/house-prices-regression/data/final/sale_predictions.csv", index=False)