The dataset was sourced from: https://www.kaggle.com/datasets/yasserh/housing-prices-dataset

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

## Data Inspection

In [89]:
# Load the housing CSV into the working frame and preview its first 50 rows
RAW_DATA_PATH = "../../backend/data/housing-dataset.csv"
df = pd.read_csv(RAW_DATA_PATH)
df.head(50)

In [90]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [91]:
# Column names, non-null counts and dtypes.
# `info` is a method: without the call parentheses the cell only shows the
# bound-method repr instead of the summary.
df.info()

In [92]:
# Column labels of the loaded frame
df.columns

In [93]:
# Per-column dtypes before any encoding is applied
df.dtypes

In [94]:
# Count of missing values in each column
df.isna().sum()

In [95]:
# Convert the yes/no string columns to integers (yes -> 1, no -> 0)
binary_columns = [
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'prefarea',
]
yes_no_codes = {'yes': 1, 'no': 0}
df[binary_columns] = df[binary_columns].replace(yes_no_codes).astype(int)

In [96]:
# Encoding furnishing status from string to int (0 for unfurnished, 1 for semi-furnished, 2 for furnished)
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `df["furnishingstatus"]`: in-place methods on a
# selected column are chained assignment, which pandas warns about and which does
# not update `df` when Copy-on-Write is enabled.
furnishing_codes = {"unfurnished": 0, "semi-furnished": 1, "furnished": 2}
# astype(int) matches the yes/no encoding above and fails loudly if a label
# outside the mapping is left in the column.
df["furnishingstatus"] = df["furnishingstatus"].replace(furnishing_codes).astype(int)

In [97]:
# Preview the first five rows after encoding
df.head()

In [98]:
# Re-check the dtypes now that the string columns have been encoded
df.dtypes

## Feature Engineering

In [99]:
# Engineered features derived from the existing columns.
# `amenities` lists the 0/1-encoded columns that are counted per row.
amenities = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning']

# Bedrooms plus bathrooms
df['total_rooms'] = df['bedrooms'].add(df['bathrooms'])

# Area divided by the room total computed above
df['area_per_room'] = df['area'].div(df['total_rooms'])

# Row-wise sum of the amenity flags
df['amenity_count'] = df.loc[:, amenities].sum(axis='columns')

In [106]:
# We will use z-score to find and remove outliers.
# A row is flagged when its price or its area lies more than
# Z_SCORE_THRESHOLD standard deviations from that column's mean.
# (`stats` comes from the notebook's import cell; no re-import is needed here.)
Z_SCORE_THRESHOLD = 2.5

z_scores = stats.zscore(df[['price', 'area']])
abs_z_scores = abs(z_scores)
# True for every row where at least one of the two columns exceeds the threshold
outliers = (abs_z_scores > Z_SCORE_THRESHOLD).any(axis=1)
outlier_data = df[outliers]

print(f"Outliers based on Z-scores:\n{outlier_data}")
print("Number of Outliers:", len(outlier_data))


In [107]:
# Drop the rows flagged as outliers by the z-score check.
# Dropping by label with errors="ignore" (rather than indexing `df.index` with the
# boolean mask) keeps this cell safe to re-run: once the rows are gone the mask no
# longer matches the frame's length, whereas already-removed labels are skipped.
df = df.drop(index=outlier_data.index, errors="ignore")

## Checking Correlation Between Features

In [108]:
# Pairwise correlation of every column, drawn as an annotated heatmap
corr = df.corr()
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr, annot=True, ax=ax)

In [109]:
# Features whose absolute correlation with price exceeds 0.2
corr_target = corr["price"].abs()
correlated_features = corr_target[corr_target > 0.2]

# Keep the feature labels only; price itself is excluded
names = [label for label in correlated_features.index if label != "price"]

print(names)

## Preprocessing

In [119]:
# Saving dataset after changes
# NOTE(review): this is the same path the notebook loads its input from, so a later
# Restart & Run All starts from already-processed rows and applies the z-score
# filter to data that was filtered before. Consider writing to a separate file once
# the consumers of this path are confirmed.
df.to_csv('../../backend/data/housing-dataset.csv', index=False)

# Creating training dataset
# Reading the file back gives the training frame a fresh default index.
train_df = pd.read_csv('../../backend/data/housing-dataset.csv')
# index=False keeps the row index out of the file, consistent with the save above;
# otherwise it is stored as an extra unnamed first column.
train_df.to_csv('../../backend/data/training-housing-dataset.csv', index=False)


In [120]:
# Preview the training frame as read back from disk
train_df.head()

In [121]:
# Scaling Data
# Min-max scale the listed columns. The scaler is fitted on the same frame the
# result is written to; `fit_transform` returns a plain array that is assigned
# positionally, so reading from a different frame would silently depend on both
# frames having identical row order and length.
# NOTE(review): the scaler is fitted before the train/test split below, so test
# rows influence the min/max. "price" (the target) is scaled as well, which puts
# downstream error metrics in scaled units.
scaled_columns = ["price", "area", "area_per_room"]
scaler = MinMaxScaler()
train_df[scaled_columns] = scaler.fit_transform(train_df[scaled_columns])

In [122]:
# Saving scaling
# index=False keeps the row index out of the file; otherwise it is written as an
# extra unnamed first column.
# NOTE(review): confirm no reader of this file relies on that index column.
train_df.to_csv('../../backend/data/training-housing-dataset.csv', index=False)

In [123]:
# Preview the training frame after scaling
train_df.head()

In [124]:
# Separate the target from the predictors, then hold out 15% of the rows for testing
target_column = "price"
Y = train_df[target_column]
X = train_df.drop(columns=[target_column])

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=42,  # fixed seed so the split is the same on every run
)

In [126]:
# Finding best alpha for model
# 5-fold cross-validation over the candidate regularisation strengths below; the
# selected value is available as `Enet_cv.alpha_` once the fit completes.
# (ElasticNetCV is imported in the notebook's import cell.)
Enet_cv = ElasticNetCV(alphas=[0.01, 0.1, 1.0, 10.0, 100.0], cv=5)
Enet_cv.fit(X_train, Y_train)
print(f"Optimal alpha: {Enet_cv.alpha_}")

In [129]:
# Training the model
# Use the alpha selected by the cross-validation cell above rather than a
# hard-coded number, so the model stays in sync if the data or the alpha grid changes.
# (ElasticNet is imported in the notebook's import cell.)
model = ElasticNet(alpha=Enet_cv.alpha_)
model.fit(X_train, Y_train)

In [131]:
# Making predictions with the model
Y_pred = model.predict(X_test)

# Evaluating model performance on the held-out test split.
# "price" was min-max scaled earlier in the notebook, so the MSE is expressed in
# scaled units rather than in the original price units.
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)

print("mean squared error:", mse)
# r2_score is the coefficient of determination (R^2), so it is labelled as such
print("R^2 score:", r2)