In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from geopy.geocoders import Nominatim
from geopy import distance
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import scipy.stats as stats

# About Regression Models used

This notebook presents a thorough regression analysis of a real estate transaction dataset for the year 2023. After cleaning and exploring the data, I applied four regression techniques: linear regression, polynomial regression, decision tree regression, and random forest regression. The goal was to predict property prices based on various features and compare the performance of these models.

I then implemented linear regression to capture the linear relationship between independent variables and property prices. To account for potential nonlinear relationships, I also employed polynomial regression. Additionally, decision tree regression was utilized to partition the feature space and make predictions. Finally, random forest regression was implemented to combine multiple decision trees and enhance prediction accuracy, with hyperparameter tuning for optimization.

In [3]:
# Load the CSV file into a Pandas DataFrame
df = pd.read_csv(r'C:\Users\Karan\Desktop\Real Estate Price Prediction\406.csv')

df.head(10)

Unnamed: 0.1,Unnamed: 0,Area,Property Type,Amount,Transaction Size (sq.m),Property Size (sq.m),Room(s),Parking,Nearest Metro,Nearest Mall,...,mall_x,mall_y,Landmark,landmark_x,landmark_y,Metro_Dist,Mall_Dist,Landmark_Dist,Registration type_Ready,Is Free Hold?_Non Free Hold
0,0,BUSINESS BAY,Unit,2631000.0,105.75,105.75,2,1,Business Bay Metro Station,Dubai Mall,...,25.197438,55.276923,"Downtown Dubai, Dubai, United Arab Emirates",25.194128,55.267727,1.808766,1.4814,1.256166,0,0
1,1,JUMEIRAH LAKES TOWERS,Unit,1170137.0,99.64,99.64,2,1,Damac Properties,Marina Mall,...,25.076352,55.139384,"Burj Al Arab, Dubai, United Arab Emirates",25.141555,55.183691,0.882072,1.559598,7.418886,0,0
2,2,JUMEIRAH VILLAGE CIRCLE,Unit,590000.0,63.95,63.95,1,1,Dubai Internet City,Marina Mall,...,25.076352,55.139384,"Sports City Swimming Academy, Dubai, United Ar...",25.041833,55.216391,5.656309,9.926351,10.058487,0,0
3,3,JUMEIRAH VILLAGE CIRCLE,Unit,600000.0,61.36,61.36,1,1,Dubai Internet City,Marina Mall,...,25.076352,55.139384,"Sports City Swimming Academy, Dubai, United Ar...",25.041833,55.216391,5.656309,9.926351,10.058487,0,0
4,4,DUBAI CREEK HARBOUR,Unit,1340888.0,64.09,64.09,1,1,Creek Metro Station,City Centre Mirdif,...,25.216319,55.40522,"Dubai International Airport, Dubai, United Ara...",25.253175,55.363098,1.198724,6.074031,5.131121,0,0
5,5,DUBAI CREEK HARBOUR,Unit,2261888.0,97.23,97.23,2,1,Creek Metro Station,City Centre Mirdif,...,25.216319,55.40522,"Dubai International Airport, Dubai, United Ara...",25.253175,55.363098,1.198724,6.074031,5.131121,0,0
6,7,DUBAI CREEK HARBOUR,Unit,2299888.0,97.23,97.23,2,1,Creek Metro Station,City Centre Mirdif,...,25.216319,55.40522,"Dubai International Airport, Dubai, United Ara...",25.253175,55.363098,1.198724,6.074031,5.131121,0,0
7,8,DUBAI CREEK HARBOUR,Unit,2296888.0,100.15,100.15,2,1,Creek Metro Station,City Centre Mirdif,...,25.216319,55.40522,"Dubai International Airport, Dubai, United Ara...",25.253175,55.363098,1.198724,6.074031,5.131121,0,0
8,9,BUSINESS BAY,Unit,3216000.0,133.39,133.39,2,1,Business Bay Metro Station,Dubai Mall,...,25.197438,55.276923,"Downtown Dubai, Dubai, United Arab Emirates",25.194128,55.267727,9.824972,7.784286,8.778089,0,0
9,10,DUBAI CREEK HARBOUR,Unit,1594888.0,75.42,75.42,1,1,Creek Metro Station,City Centre Mirdif,...,25.216319,55.40522,"Dubai International Airport, Dubai, United Ara...",25.253175,55.363098,1.198724,6.074031,5.131121,0,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18174 entries, 0 to 18173
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   18174 non-null  int64  
 1   Area                         18174 non-null  object 
 2   Property Type                18174 non-null  object 
 3   Amount                       18174 non-null  float64
 4   Transaction Size (sq.m)      18174 non-null  float64
 5   Property Size (sq.m)         18174 non-null  float64
 6   Room(s)                      18174 non-null  int64  
 7   Parking                      18174 non-null  int64  
 8   Nearest Metro                18174 non-null  object 
 9   Nearest Mall                 18174 non-null  object 
 10  Nearest Landmark             18174 non-null  object 
 11  Project                      18174 non-null  object 
 12  Date                         18174 non-null  object 
 13  Month           

In [5]:
#Creating a dataframe that includes columns to be used as independent variables

df2 = df[['Transaction Size (sq.m)', 'lat', 'lon', 'Metro_Dist', 'Mall_Dist', 'Landmark_Dist', 'Room(s)', 'Registration type_Ready', 'Is Free Hold?_Non Free Hold', 'Parking']]

#### Multiple Linear Regression

In [6]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df2, df['Amount'].values, test_size=0.2, random_state=42)

# Create the Linear Regression model
ml_regressor = LinearRegression()

# Train the model
ml_regressor.fit(X_train, y_train)

# Make predictions on the test set
y_pred_multiple_linear = ml_regressor.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred_multiple_linear)
print("Mean Squared Error:", mse)

# Calculate R-squared value
r2 = r2_score(y_test, y_pred_multiple_linear)
print("R-squared:", r2)

Mean Squared Error: 5068526624299.885
R-squared: 0.6503291511530205


#### Polynomial Regression

In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df2, df['Amount'].values, test_size=0.2, random_state=42)

# Create polynomial features
degree = 2  # degree of polynomial features
poly_features = PolynomialFeatures(degree=degree)
X_train_poly = poly_features.fit_transform(X_train)
X_test_poly = poly_features.transform(X_test)

# Create the Linear Regression model
p_regressor = LinearRegression()

# Train the model
p_regressor.fit(X_train_poly, y_train)

# Make predictions on the test set
y_pred_polynomial = p_regressor.predict(X_test_poly)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred_polynomial)
print("Mean Squared Error:", mse)

# Calculate R-squared value
r2 = r2_score(y_test, y_pred_polynomial)
print("R-squared:", r2)


Mean Squared Error: 3683830942127.073
R-squared: 0.7458574477311202


#### Decision Tree Regression

In [8]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df2, df['Amount'], test_size=0.2, random_state=42)

# Create the decision tree regressor
dt_regressor = DecisionTreeRegressor()

# Train the model
dt_regressor.fit(X_train, y_train)

# Make predictions on the test set
y_pred_decision_tree = dt_regressor.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred_decision_tree)
print("Mean Squared Error:", mse)

# Calculate R-squared value
r2 = r2_score(y_test, y_pred_decision_tree)
print("R-squared:", r2)


Mean Squared Error: 2787245361633.1025
R-squared: 0.8077116835345267


#### Random Forest Regression

In [9]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df2, df['Amount'].values, test_size=0.2, random_state=42)

# Create the Random Forest regressor
rf_regressor = RandomForestRegressor()

# Train the model
rf_regressor.fit(X_train, y_train)

# Make predictions on the test set
y_pred_random_forest = rf_regressor.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred_random_forest)
print("Mean Squared Error:", mse)

# Calculate R-squared value
r2 = r2_score(y_test, y_pred_random_forest)
print("R-squared:", r2)

Mean Squared Error: 2409448598995.0474
R-squared: 0.8337753751110777


#### Hyperparameter Tuning

In [11]:
from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf_regressor.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [12]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)
{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [14]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [15]:
rf_random.best_params_

{'n_estimators': 400,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': False}

#### Comparing the two Random Forest Regression models

In [16]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

In [18]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

basemodel_accuracy=evaluate(rf_regressor,X_test,y_test)

print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - basemodel_accuracy) / basemodel_accuracy))

Model Performance
Average Error: 329815.1761 degrees.
Accuracy = 85.15%.
Model Performance
Average Error: 337439.8457 degrees.
Accuracy = 85.07%.
Improvement of 0.09%.


In [19]:
#Storing the model so we can use it in the Real Estate Price Prediction application

from joblib import dump,load
dump(rf_random,'housepriceprediction.joblib')

['housepriceprediction.joblib']

Next, I have developed an app for predicting real estate prices, leveraging the power of the random forest regression model. This app serves as a valuable tool for individuals seeking reliable property price estimates. I have designed the app using Flask, a Python web framework, and HTML for the user interface. By harnessing the capabilities of the random forest regression model within a user-friendly interface, users can input relevant property features and obtain price predictions. The app streamlines the prediction process, empowering users to make informed decisions about buying or selling real estate based on reliable and up-to-date estimations.