### Objective: To build and evaluate machine learning models to accurately predict house prices based on various features of residential homes in Ames, Iowa.

### Overview of the dataset

In [85]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('/Users/user/Desktop/Data Science/python_files/files_for_data_wrangling/AmesHousing.csv')

df.columns

Index(['Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual',
       'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style',
       'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual',
       'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1',
       'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF',
       'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF',
       '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath',
       'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
       'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
       'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
      

In [87]:
df.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [89]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 82 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Order            2930 non-null   int64  
 1   PID              2930 non-null   int64  
 2   MS SubClass      2930 non-null   int64  
 3   MS Zoning        2930 non-null   object 
 4   Lot Frontage     2440 non-null   float64
 5   Lot Area         2930 non-null   int64  
 6   Street           2930 non-null   object 
 7   Alley            198 non-null    object 
 8   Lot Shape        2930 non-null   object 
 9   Land Contour     2930 non-null   object 
 10  Utilities        2930 non-null   object 
 11  Lot Config       2930 non-null   object 
 12  Land Slope       2930 non-null   object 
 13  Neighborhood     2930 non-null   object 
 14  Condition 1      2930 non-null   object 
 15  Condition 2      2930 non-null   object 
 16  Bldg Type        2930 non-null   object 
 17  House Style   

In [90]:
df.describe(include='all')

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
count,2930.0,2930.0,2930.0,2930,2440.0,2930.0,2930,198,2930,2930,...,2930.0,13,572,106,2930.0,2930.0,2930.0,2930,2930,2930.0
unique,,,,7,,,2,2,4,4,...,,4,4,5,,,,10,6,
top,,,,RL,,,Pave,Grvl,Reg,Lvl,...,,Ex,MnPrv,Shed,,,,WD,Normal,
freq,,,,2273,,,2918,120,1859,2633,...,,4,330,95,,,,2536,2413,
mean,1465.5,714464500.0,57.387372,,69.22459,10147.921843,,,,,...,2.243345,,,,50.635154,6.216041,2007.790444,,,180796.060068
std,845.96247,188730800.0,42.638025,,23.365335,7880.017759,,,,,...,35.597181,,,,566.344288,2.714492,1.316613,,,79886.692357
min,1.0,526301100.0,20.0,,21.0,1300.0,,,,,...,0.0,,,,0.0,1.0,2006.0,,,12789.0
25%,733.25,528477000.0,20.0,,58.0,7440.25,,,,,...,0.0,,,,0.0,4.0,2007.0,,,129500.0
50%,1465.5,535453600.0,50.0,,68.0,9436.5,,,,,...,0.0,,,,0.0,6.0,2008.0,,,160000.0
75%,2197.75,907181100.0,70.0,,80.0,11555.25,,,,,...,0.0,,,,0.0,8.0,2009.0,,,213500.0


### Feature Engineering

In [93]:
# Create a new feature for the total square footage of the house
df['TotalSF'] = df['1st Flr SF'] + df['2nd Flr SF'] + df['Total Bsmt SF']

# Create a new feature for the age of the house at the time of sale
df['House_Age'] = df['Yr Sold'] - df['Year Built']

# Create a new feature for the age of the remodel (if any) at the time of sale
df['Remodel_Age'] = df['Yr Sold'] - df['Year Remod/Add']

# Handle cases where Remodel_Age is negative (no remodel)
df['Remodel_Age'] = df['Remodel_Age'].apply(lambda x: x if x >= 0 else 0)

# Create a new feature indicating if the house was remodeled
df['Is_Remodeled'] = df['Year Remod/Add'] != df['Year Built']

# Interaction feature between Overall Quality and Total Square Footage
df['Qual_TotalSF'] = df['Overall Qual'] * df['TotalSF']

# Display the first few rows with the new features
display(df[['SalePrice', 'TotalSF', 'House_Age', 'Remodel_Age', 'Is_Remodeled', 'Qual_TotalSF']].head())

Unnamed: 0,SalePrice,TotalSF,House_Age,Remodel_Age,Is_Remodeled,Qual_TotalSF
0,215000,2736.0,50,50,False,16416.0
1,105000,1778.0,49,49,False,8890.0
2,172000,2658.0,52,52,False,15948.0
3,244000,4220.0,42,42,False,29540.0
4,189900,2557.0,13,12,True,12785.0


### Data Preprocessing

In [95]:
from sklearn.impute import SimpleImputer

# Identify numerical and categorical columns (excluding the target and potentially ID columns)
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
categorical_cols = df.select_dtypes(include='object').columns.tolist()

# Remove 'SalePrice', 'PID', and 'Order' from features
if 'SalePrice' in numerical_cols:
    numerical_cols.remove('SalePrice')
if 'PID' in numerical_cols:
    numerical_cols.remove('PID')
if 'Order' in numerical_cols:
    numerical_cols.remove('Order')

In [96]:
# Impute missing values

# Numerical columns: Impute with mean
numerical_imputer = SimpleImputer(strategy='mean')
df[numerical_cols] = numerical_imputer.fit_transform(df[numerical_cols])

# Categorical columns: Impute with most frequent value
categorical_imputer = SimpleImputer(strategy='most_frequent')
df[categorical_cols] = categorical_imputer.fit_transform(df[categorical_cols])


# Encode categorical features (One-Hot Encoding)
df = pd.get_dummies(df, columns=categorical_cols, dummy_na=False)

# Separate target variable
y = df['SalePrice']
X = df.drop('SalePrice', axis=1)

In [97]:
# Scale numerical features
from sklearn.preprocessing import StandardScaler

# Ensure we only scale columns that are now numerical and are features
X_numerical_cols = X.select_dtypes(include=np.number).columns.tolist()

scaler = StandardScaler()
X[X_numerical_cols] = scaler.fit_transform(X[X_numerical_cols])

display(X.head())
display(y.head())

Unnamed: 0,Order,PID,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,...,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD,Sale Condition_Abnorml,Sale Condition_AdjLand,Sale Condition_Alloca,Sale Condition_Family,Sale Condition_Normal,Sale Condition_Partial
0,-1.73146,-0.997164,-0.877005,3.366911,2.744381,-0.067254,-0.506718,-0.375537,-1.163488,0.056639,...,False,False,False,True,False,False,False,False,True,False
1,-1.730277,-0.996904,-0.877005,0.505463,0.187097,-0.776079,0.393091,-0.342468,-1.115542,-0.571242,...,False,False,False,True,False,False,False,False,True,False
2,-1.729095,-0.996899,-0.877005,0.552372,0.522814,-0.067254,0.393091,-0.441674,-1.25938,0.034215,...,False,False,False,True,False,False,False,False,True,False
3,-1.727913,-0.996888,-0.877005,1.11528,0.128458,0.641571,-0.506718,-0.110988,-0.779919,-0.571242,...,False,False,False,True,False,False,False,False,True,False
4,-1.726731,-0.992903,0.061285,0.22401,0.467348,-0.776079,-0.506718,0.848,0.658466,-0.571242,...,False,False,False,True,False,False,False,False,True,False


0    215000
1    105000
2    172000
3    244000
4    189900
Name: SalePrice, dtype: int64

In [98]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Model Training

In [101]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb

# Define different regression models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42),
    'XGBoost Regressor': xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
}

In [102]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Define parameter distributions for each model
param_dists = {
    'Linear Regression': {},                       # Linear Regression usually doesn't require extensive tuning of core parameters
    'Random Forest Regressor': {
        'n_estimators': randint(100, 500),
        'max_features': ['sqrt', 'log2', None],
        'max_depth': randint(3, 10),
        'min_samples_split': randint(2, 20),
        'min_samples_leaf': randint(1, 20)
    },
    'Gradient Boosting Regressor': {
        'n_estimators': randint(100, 500),
        'learning_rate': uniform(0.01, 0.2),
        'max_depth': randint(3, 10),
        'subsample': uniform(0.6, 0.4),
        'max_features': ['sqrt', 'log2', None]
    },
    'XGBoost Regressor': {
        'n_estimators': randint(100, 500),
        'learning_rate': uniform(0.01, 0.2),
        'max_depth': randint(3, 10),
        'subsample': uniform(0.6, 0.4),
        'colsample_bytree': uniform(0.6, 0.4)
    }
}

tuned_models1 = {}
for name, model in models.items():
    if name in param_dists and param_dists[name]:
        random_search = RandomizedSearchCV(model, param_distributions=param_dists[name],
                                           n_iter=20, cv=3, random_state=42, n_jobs=-1, scoring='r2')
        random_search.fit(X_train, y_train)
        tuned_models1[name] = random_search.best_estimator_
        print(f"Best parameters for {name}: {random_search.best_params_}")
        print(f"Best cross-validation R-squared for {name}: {random_search.best_score_}")
    else:
        # Train models that don't require extensive tuning directly
        model.fit(X_train, y_train)
        tuned_models1[name] = model
        print(f"No specific hyperparameters tuned for {name}. Model trained directly.")
    print("-" * 30)

print("\nTuning complete for all models.")

No specific hyperparameters tuned for Linear Regression. Model trained directly.
------------------------------
Best parameters for Random Forest Regressor: {'max_depth': 7, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 13, 'n_estimators': 393}
Best cross-validation R-squared for Random Forest Regressor: 0.8774691308241941
------------------------------
Best parameters for Gradient Boosting Regressor: {'learning_rate': 0.14606150771755597, 'max_depth': 3, 'max_features': None, 'n_estimators': 373, 'subsample': 0.9795542149013333}
Best cross-validation R-squared for Gradient Boosting Regressor: 0.9118234599633556
------------------------------
Best parameters for XGBoost Regressor: {'colsample_bytree': 0.7123738038749523, 'learning_rate': 0.1185392166316497, 'max_depth': 3, 'n_estimators': 256, 'subsample': 0.9208787923016158}
Best cross-validation R-squared for XGBoost Regressor: 0.90961225827535
------------------------------

Tuning complete for all models.


In [104]:
from sklearn.model_selection import GridSearchCV

param_grids = {
    'Linear Regression': {},

    'Random Forest Regressor': {
        'n_estimators': [100, 300],
        'max_features': ['sqrt', 'log2', None],
        'max_depth': [5, 10],
        'min_samples_split': [5, 10],
        'min_samples_leaf': [3, 5]
    },

    'Gradient Boosting Regressor': {
        'n_estimators': [100, 300],
        'learning_rate': [0.01, 0.05],
        'max_depth': [3, 7],
        'subsample': [0.6, 1.0],
        'max_features': ['sqrt', 'log2', None]
    },

    'XGBoost Regressor': {
        'n_estimators': [100, 300],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 7],
        'subsample': [0.6, 0.8],
        'colsample_bytree': [0.6, 0.8]
    }
}

tuned_models2 = {}
for name, model in models.items():
    if name in param_grids and param_grids[name]:
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grids[name],
            cv=3,
            n_jobs=-1,
            scoring='r2'
        )
        grid_search.fit(X_train, y_train)
        tuned_models2[name] = grid_search.best_estimator_
        print(f"Best parameters for {name}: {grid_search.best_params_}")
        print(f"Best cross-validation R-squared for {name}: {grid_search.best_score_}")
    else:
        model.fit(X_train, y_train)
        tuned_models2[name] = model
        print(f"No specific hyperparameters tuned for {name}. Model trained directly.")
    
    print("-" * 30)

print("\nTuning complete for all models.")

No specific hyperparameters tuned for Linear Regression. Model trained directly.
------------------------------
Best parameters for Random Forest Regressor: {'max_depth': 10, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 300}
Best cross-validation R-squared for Random Forest Regressor: 0.8850255212313888
------------------------------
Best parameters for Gradient Boosting Regressor: {'learning_rate': 0.05, 'max_depth': 3, 'max_features': None, 'n_estimators': 300, 'subsample': 1.0}
Best cross-validation R-squared for Gradient Boosting Regressor: 0.9133758910914613
------------------------------
Best parameters for XGBoost Regressor: {'colsample_bytree': 0.6, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.8}
Best cross-validation R-squared for XGBoost Regressor: 0.9138712286949158
------------------------------

Tuning complete for all models.


In [108]:
#Check for the best model
from sklearn.metrics import mean_squared_error, r2_score
print("Comparing Evaluation Results of Best Models from RandomizedSearchCV and GridSearchCV Across All Model Types:")
print("-" * 80)

evaluation_results_rs = {}
print("Results from RandomizedSearchCV:")
for name, model in tuned_models1.items():
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    evaluation_results_rs[name] = {'MSE': mse, 'RMSE': rmse, 'R-squared': r2}
    print(f"  {name}:")
    print(f"    Mean Squared Error (MSE): {mse:.2f}")
    print(f"    Root Mean Squared Error (RMSE): {rmse:.2f}")
    print(f"    R-squared (R2): {r2:.2f}")

print("-" * 80)

evaluation_results_gs = {}
print("Results from GridSearchCV:")
for name, model in tuned_models.items():
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    evaluation_results_gs[name] = {'MSE': mse, 'RMSE': rmse, 'R-squared': r2}
    print(f"  {name}:")
    print(f"    Mean Squared Error (MSE): {mse:.2f}")
    print(f"    Root Mean Squared Error (RMSE): {rmse:.2f}")
    print(f"    R-squared (R2): {r2:.2f}")

print("-" * 80)

# Summarize and compare
print("Summary Comparison:")
print("-" * 20)

for name in models.keys():
    print(f"Model: {name}")
    if name in evaluation_results_rs:
        print(f"  RandomizedSearchCV - R2: {evaluation_results_rs[name]['R-squared']:.2f}, RMSE: {evaluation_results_rs[name]['RMSE']:.2f}")
    if name in evaluation_results_gs:
         print(f"  GridSearchCV       - R2: {evaluation_results_gs[name]['R-squared']:.2f}, RMSE: {evaluation_results_gs[name]['RMSE']:.2f}")
    print("-" * 20)

Comparing Evaluation Results of Best Models from RandomizedSearchCV and GridSearchCV Across All Model Types:
--------------------------------------------------------------------------------
Results from RandomizedSearchCV:
  Linear Regression:
    Mean Squared Error (MSE): 892627815.38
    Root Mean Squared Error (RMSE): 29876.88
    R-squared (R2): 0.89
  Random Forest Regressor:
    Mean Squared Error (MSE): 657977095.65
    Root Mean Squared Error (RMSE): 25651.06
    R-squared (R2): 0.92
  Gradient Boosting Regressor:
    Mean Squared Error (MSE): 450927228.21
    Root Mean Squared Error (RMSE): 21235.05
    R-squared (R2): 0.94
  XGBoost Regressor:
    Mean Squared Error (MSE): 432554816.00
    Root Mean Squared Error (RMSE): 20797.95
    R-squared (R2): 0.95
--------------------------------------------------------------------------------
Results from GridSearchCV:
  Linear Regression:
    Mean Squared Error (MSE): 892627815.38
    Root Mean Squared Error (RMSE): 29876.88
    R-sq

### Prediction using the best model

In [112]:
# Get the best models from both tuning processes
best_random_search_model = tuned_models1['Gradient Boosting Regressor'] # Based on previous evaluation, Gradient Boosting was slightly better with RS
best_grid_search_model = tuned_models2['XGBoost Regressor'] # Based on previous evaluation, XGBoost was slightly better with GS

# Make predictions on the test set using the best model from RandomizedSearchCV
predictions_rs = best_random_search_model.predict(X_test)

# Make predictions on the test set using the best model from GridSearchCV
predictions_gs = best_grid_search_model.predict(X_test)

# Display a sample of the predictions and actual values
predictions_comparison = pd.DataFrame({
    'Actual SalePrice': y_test,
    'RandomizedSearchCV Predictions': predictions_rs,
    'GridSearchCV Predictions': predictions_gs
})

display(predictions_comparison.head())

Unnamed: 0,Actual SalePrice,RandomizedSearchCV Predictions,GridSearchCV Predictions
1357,161000,170367.647505,173618.0
2367,116000,110946.076698,110784.273438
2822,196500,191631.740832,191798.9375
2126,123600,123322.709433,124857.140625
1544,126000,119396.925807,123381.804688


### Conclusion:
The project successfully built and optimized regression models for house price prediction.
After tuning and evaluation, the Gradient Boosting Regressor (tuned with RandomizedSearchCV) and the XGBoost Regressor (tuned with GridSearchCV) emerged as the top-performing models. They achieved the highest R-squared values (around 0.94), indicating they explain about 94% of the variance in house prices, and the lowest RMSE values (around $21,000 - $$21,000 - $22,000), representing the typical error in predictions.