# House Price Prediction with XGBoost

This repository contains a machine learning model built with XGBoost to predict house prices using Melbourne housing data. The model is trained on various features such as 'Distance', 'Bedroom2', 'Postcode', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude', 'Bathroom', 'Car', and 'Propertycount'.


In [86]:
import pandas as pd
import numpy as np

df = pd.read_csv('melb_data.csv')

In [87]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Create a subset of columns with missing values
subset_df = df[['Car', 'BuildingArea', 'YearBuilt']]

# Enable iterative imputer
imputer = IterativeImputer(max_iter=10, random_state=0)
subset_imputed = imputer.fit_transform(subset_df)

# Replace the original columns with the imputed values
df[['Car', 'BuildingArea', 'YearBuilt']] = subset_imputed


In [88]:
float_columns = df.select_dtypes(include='Float64').drop('Price', axis=1)

X = float_columns.drop('Car', axis=1)
y = df.Price

In [89]:
from sklearn.ensemble import RandomForestRegressor

# Create a random forest regressor model
model = RandomForestRegressor()

# Fit the model to your data
model.fit(X, y)

# Get the feature importance scores
importances = model.feature_importances_

# Create a dataframe to store the feature importances
feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': importances})

# Sort the features by importance in descending order
feature_importance = feature_importance.sort_values('Importance', ascending=False)

# Print the feature importances
print(feature_importance)


          Feature  Importance
0        Distance    0.200701
2        Bedroom2    0.187951
1        Postcode    0.129464
5        Landsize    0.124307
6    BuildingArea    0.099779
9      Longtitude    0.078574
8       Lattitude    0.074752
7       YearBuilt    0.040281
3        Bathroom    0.031040
10  Propertycount    0.023728
4             Car    0.009422


In [90]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score
from xgboost import XGBRegressor

X = float_columns
y = df.Price

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and configure the XGBoost model for regression
model = XGBRegressor('learning_rate'= 0.1, 'max_depth'= 5, 'n_estimators'= 300)
model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = model.predict(X_val)

# Evaluate the model using mean squared error (MSE)
mse = mean_squared_error(y_val, y_pred)
print(f"Mean Squared Error: {mse}")



Mean Squared Error: 75501204568.58104


In [91]:
from sklearn.metrics import r2_score

r2 = r2_score(y_val, y_pred)

print(r2)


0.8099230331501153


In [92]:
# Feature importance
importance = model.feature_importances_
feature_names = X.columns
feature_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
feature_importance = feature_importance.sort_values('Importance', ascending=False)
print(feature_importance)

          Feature  Importance
2        Bedroom2    0.260446
1        Postcode    0.170475
3        Bathroom    0.166126
0        Distance    0.102782
6    BuildingArea    0.079563
7       YearBuilt    0.055683
5        Landsize    0.050183
8       Lattitude    0.042795
9      Longtitude    0.035892
10  Propertycount    0.023278
4             Car    0.012777


In [95]:
from sklearn.model_selection import GridSearchCV


# Define the hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001]
}



# Perform grid search with cross-validation
grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

# Get the best model and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Make predictions on the validation set using the best model
y_pred = best_model.predict(X_val)

# Evaluate the model using mean squared error (MSE)
mse = mean_squared_error(y_val, y_pred)
print(f"Mean Squared Error: {mse}")

print(f"Best Parameters: {best_params}")


Mean Squared Error: 72507418187.31776
Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300}


In [96]:
r2 = r2_score(y_val, y_pred)

print(r2)

0.8174599968051802


In [99]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

# Perform cross-validation
cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')

# Calculate mean MSE and R-squared scores
mean_mse = -cv_scores.mean()
mean_r2 = r2_score(y, model.predict(X))

# Print the results
print(f"Mean Squared Error: {mean_mse}")
print(f"R-squared: {mean_r2}")


Mean Squared Error: 96252363702.29651
R-squared: 0.9182273616148418
