In [388]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.preprocessing import StandardScaler

# Load the cleaned home-price dataset; the path is relative to the notebook's
# working directory.
DATA_FILE = "final_cleaned_home_price_data.csv"
df = pd.read_csv(DATA_FILE)

In [389]:
# Trim the extreme high end of the target: keep only the rows whose `Price`
# is strictly below the 99th percentile.
# NOTE: this cell rebinds `df`, so executing it a second time recomputes the
# quantile on the already-trimmed frame and drops additional rows. Re-load the
# CSV before re-running it.
upper_limit = df['Price'].quantile(0.99)

# .copy() turns the filtered result into an independent frame, so the column
# assignments performed on `df` afterwards do not raise SettingWithCopyWarning.
df = df[df['Price'] < upper_limit].copy()

In [390]:
# Store the two count columns as strings (object dtype) instead of numbers --
# presumably so they are handled as categorical variables by the encoders used
# later; confirm this is the intent.
for count_col in ("Bathroom_Count", "Room_Count"):
    df[count_col] = df[count_col].astype(str)

# Summarise column dtypes and non-null counts after the conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19412 entries, 0 to 19608
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Net_Area                19412 non-null  int64  
 1   Gross_Area              19412 non-null  float64
 2   Room_Count              19412 non-null  object 
 3   Floor_Location          19412 non-null  object 
 4   Building_Age            19412 non-null  object 
 5   Heating_Type            19412 non-null  object 
 6   Price                   19412 non-null  float64
 7   City                    19412 non-null  object 
 8   Occupancy_Status        19412 non-null  object 
 9   Investment_Eligibility  19412 non-null  object 
 10  Title_Deed_Status       19412 non-null  object 
 11  Bathroom_Count          19412 non-null  object 
dtypes: float64(2), int64(1), object(9)
memory usage: 1.9+ MB


I will first apply ordinal encoding to the categorical features that have a natural rank. This type of ordinal encoding orders the categories by the mean of the target, so the assigned ranks are informative rather than arbitrary.

Before encoding, I first need to split the data so that the encoders are fit on the training set only, which avoids data leakage.


In [391]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the target column.
target_col = 'Price'
x = df.drop(target_col, axis=1)
y = df[target_col]

# First split: 70% of the rows for training, 30% held out. The held-out part
# is divided into validation and test sets in a later cell.
x_train, x_temp, y_train, y_temp = train_test_split(
    x, y, test_size=0.3, random_state=100
)

In [392]:
# Second split: divide the held-out rows in half, giving a validation set and
# a test set of roughly equal size.
holdout_split = train_test_split(x_temp, y_temp, test_size=0.5, random_state=100)
x_val, x_test, y_val, y_test = holdout_split

In [393]:
from feature_engine.encoding import OrdinalEncoder

# Categorical columns to rank-encode. With encoding_method='ordered' the
# integer assigned to each category follows the mean of the target for that
# category, as computed on the data passed to fit().
features = ['Room_Count', 'Floor_Location', 'Building_Age','Heating_Type','City','Bathroom_Count']
ordinal_encoder = OrdinalEncoder(encoding_method='ordered',
                                 variables=features)

# Fit on the training split only, so no target information from the
# validation/test rows leaks into the encoding.
ordinal_encoder.fit(x_train, y_train)
ord_train_t = ordinal_encoder.transform(x_train)
ord_test_t = ordinal_encoder.transform(x_test)
ord_val_t = ordinal_encoder.transform(x_val)

# A category that never appeared in the training split has no learned rank and
# comes out of transform() as NaN. Which columns are affected depends on how
# the rows happened to be split, so the -1 sentinel is applied to every encoded
# column rather than to a hand-picked subset. -1 lies outside the learned
# ranks (which start at 0), so "unseen category" stays distinguishable.
nan_features = features
ord_train_t[nan_features] = ord_train_t[nan_features].fillna(-1)
ord_test_t[nan_features] = ord_test_t[nan_features].fillna(-1)
ord_val_t[nan_features] = ord_val_t[nan_features].fillna(-1)



I will now label-encode the remaining categorical features; the continuous features are left untouched.

In [394]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Integer code given to a label that appears in validation/test but not in the
# training split. LabelEncoder.transform() would raise ValueError on such a
# label; mapping it to a sentinel keeps the cell running.
UNSEEN_LABEL_CODE = -1

label_cols = ['Occupancy_Status', 'Investment_Eligibility', 'Title_Deed_Status']

# Fitted encoder per column, kept so the same mapping can be saved and reused.
label_encoders = {}

for col in label_cols:
    label = LabelEncoder()
    # Read the raw labels from `x` (the un-encoded feature frame) instead of
    # from x_train itself: x_train[col] is overwritten below, so fitting on it
    # would make a second execution of this cell fit on integer codes and
    # store a meaningless encoder.
    label.fit(x.loc[x_train.index, col])

    # Same mapping LabelEncoder.transform() applies: position in classes_.
    code_of = {category: code for code, category in enumerate(label.classes_)}

    for split in (x_train, x_test, x_val):
        raw_labels = x.loc[split.index, col]
        split[col] = (
            raw_labels.map(code_of)          # unseen labels become NaN here
            .fillna(UNSEEN_LABEL_CODE)
            .astype('int64')
        )

    label_encoders[col] = label



In [395]:
import pickle

# Persist the fitted encoders so the identical mappings can be re-applied
# outside this notebook. Each object is written to its own pickle file.
encoder_artifacts = {
    "ordinal_encoders.pkl": ordinal_encoder,
    "label_encoders.pkl": label_encoders,
}

for artifact_path, artifact in encoder_artifacts.items():
    with open(artifact_path, "wb") as handle:
        pickle.dump(artifact, handle)

In [396]:
# Columns whose values are taken from the ordinal-encoded frames.
ordinal_cols = ['Room_Count', 'Floor_Location', 'Building_Age', 'Heating_Type', 'City', 'Bathroom_Count']

# Build the model inputs: start from each label-encoded split and replace the
# ordinal columns with their encoded counterparts. assign() returns a new
# frame, so x_train / x_test / x_val themselves are not modified here.
x_train_final = x_train.assign(**{col: ord_train_t[col] for col in ordinal_cols})
x_test_final = x_test.assign(**{col: ord_test_t[col] for col in ordinal_cols})
x_val_final = x_val.assign(**{col: ord_val_t[col] for col in ordinal_cols})

In [397]:
import xgboost as xgb
from xgboost import XGBRegressor

# Hyperparameters of the gradient-boosted regressor, collected in one place.
xgb_params = dict(
    objective='reg:squarederror',
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    random_state=42,
    n_jobs=-1,
)
model = XGBRegressor(**xgb_params)

# Train on the encoded training split, then predict prices for the test split.
model.fit(x_train_final, y_train)
y_pred = model.predict(x_test_final)

In [398]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Test-set performance of the fitted model. RMSE and MAE are in the same
# units as `Price`.
test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(
    f"RMSE: {rmse:,.2f}",
    f"MAE: {mae:,.2f}",
    f"R-squared: {r2:.4f}",
    sep="\n",
)

RMSE: 1,205,360.03
MAE: 691,651.04
R-squared: 0.6623


In [399]:
# Score the same fitted model on the validation split.
# NOTE: `r2` and `rmse` are rebound here, replacing the values computed for
# the test split.
y_pred_val = model.predict(x_val_final)
mse = mean_squared_error(y_val, y_pred_val)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, y_pred_val)

print(
    "\n## Evaluation Metrics ##",
    f"R-squared (R2): {r2:.4f}",
    f"Mean Squared Error (MSE): {mse:.4f}",
    f"Root Mean Squared Error (RMSE): {rmse:.4f}",
    sep="\n",
)


## Evaluation Metrics ##
R-squared (R2): 0.6371
Mean Squared Error (MSE): 1526387084307.3955
Root Mean Squared Error (RMSE): 1235470.3899
