The dataset was sourced from: https://www.kaggle.com/datasets/yasserh/housing-prices-dataset

In [5]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

## Data Inspection

In [7]:
# Location of the housing CSV. Set the HOUSING_DATA_PATH environment variable
# to run the notebook on another machine; when it is unset the original
# author's local path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    "HOUSING_DATA_PATH",
    "/Users/kehindeslaptop/Code/projects/home-price-prediction/backend/data/housing-dataset.csv",
)
df = pd.read_csv(DATA_PATH)

# Preview only the first few rows; a 50-row dump bloats the notebook without
# adding information.
df.head(10)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,total_rooms,area_per_room,amenity_count
0,9310000,6550,4,2,2,1,0,0,0,1,1,1,1,6,1091.666667,2
1,9240000,3500,4,2,2,1,0,0,1,0,2,0,2,6,583.333333,2
2,9240000,7800,3,2,2,1,0,0,0,0,0,1,1,5,1560.0,1
3,9100000,6000,4,1,2,1,0,1,0,0,2,0,1,5,1200.0,2
4,9100000,6600,4,2,2,1,1,1,0,1,1,1,0,6,1100.0,4
5,8960000,8500,3,2,4,1,0,0,0,1,2,0,2,5,1700.0,2
6,8890000,4600,3,2,2,1,1,0,0,1,2,0,2,5,920.0,3
7,8855000,6420,3,2,2,1,0,0,0,1,1,1,1,5,1284.0,2
8,8750000,4320,3,1,2,1,0,1,1,0,2,0,1,4,1080.0,3
9,8680000,7155,3,2,1,1,1,1,0,1,2,0,0,5,1431.0,4


In [90]:
# (number of rows, number of columns) of the frame.
df.shape

In [91]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
# `info` is a method: without the call parentheses the cell only displays the
# bound-method repr and never produces the summary.
df.info()

In [92]:
# List the column labels of the frame.
df.columns

In [93]:
# Show the dtype of each column.
df.dtypes

In [94]:
# Count the missing values in each column.
df.isna().sum()

In [95]:
# Convert the yes/no flag columns to integers (yes -> 1, no -> 0).
binary_columns = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
yes_no_codes = {'yes': 1, 'no': 0}
df[binary_columns] = df[binary_columns].replace(yes_no_codes).astype(int)

In [96]:
# Encode furnishing status as an integer:
#   unfurnished -> 0, semi-furnished -> 1, furnished -> 2.
# The result is assigned back to the column instead of calling
# replace(inplace=True) on `df["furnishingstatus"]`: that form mutates a
# selected column object, which raises a chained-assignment warning in
# pandas 2.x and leaves `df` unchanged under Copy-on-Write.
furnishing_codes = {"unfurnished": 0, "semi-furnished": 1, "furnished": 2}
# Cast explicitly so the column is an integer dtype, like the yes/no columns.
df["furnishingstatus"] = df["furnishingstatus"].replace(furnishing_codes).astype(int)

In [97]:
# Preview the first five rows (the default for head()).
df.head()

In [98]:
# Show the dtype of each column again, to check the result of the encoding steps.
df.dtypes

## Feature Engineering

In [99]:
# total_rooms: bedrooms plus bathrooms.
df['total_rooms'] = df['bedrooms'].add(df['bathrooms'])

# area_per_room: area divided by the room total computed above.
df['area_per_room'] = df['area'].div(df['total_rooms'])

# amenity_count: row-wise sum of the amenity flag columns listed below.
amenities = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning']
df['amenity_count'] = df.loc[:, amenities].sum(axis=1)

In [106]:
# Flag outliers with z-scores: a row is an outlier when its price or its area
# lies more than Z_THRESHOLD standard deviations from that column's mean.
# `stats` comes from the notebook's import cell, so it is not re-imported here.
Z_THRESHOLD = 2.5

z_scores = stats.zscore(df[['price', 'area']])
abs_z_scores = abs(z_scores)
outliers = (abs_z_scores > Z_THRESHOLD).any(axis=1)  # one boolean per row
outlier_data = df[outliers]

print(f"Outliers based on Z-scores:\n{outlier_data}")
# Count the rows already selected above instead of filtering df a second time.
print("Number of Outliers:", len(outlier_data))


In [107]:
# Remove the rows flagged by the `outliers` mask from the dataset.
outlier_labels = df.index[outliers]
df = df.drop(outlier_labels)

## Checking Correlation Between Features

In [108]:
# Pairwise correlation of every column, drawn as an annotated heatmap on a
# 20x20-inch figure.
corr = df.corr()
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr, annot=True, ax=ax)

In [109]:
# Keep the columns whose absolute correlation with price exceeds 0.2.
corr_target = corr["price"].abs()
correlated_features = corr_target.loc[corr_target > 0.2]

# Collect the column labels and take out the target itself.
names = correlated_features.index.tolist()
names.remove("price")

print(names)

In [8]:
# Show the dtype of each column.
df.dtypes

price                 int64
area                  int64
bedrooms              int64
bathrooms             int64
stories               int64
mainroad              int64
guestroom             int64
basement              int64
hotwaterheating       int64
airconditioning       int64
parking               int64
prefarea              int64
furnishingstatus      int64
total_rooms           int64
area_per_room       float64
amenity_count         int64
dtype: object