# Real Estate Price Prediction with Aqar Dataset


In this notebook, we will:
1. Load and inspect the dataset
2. Preprocess data (handle missing values, encode categorical variables, scale features)
3. Train and evaluate regression models to predict property price
4. Train and evaluate classification models to predict if the property is for sale
5. Visualize model performance

### Import Libraries

In [37]:
import sys

# Make the project's helper modules (imported in later cells) importable.
# The path is relative to the notebook's working directory.
SRC_DIR = '../src'
# Guarded so that re-running this cell does not keep appending duplicate entries.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [38]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from model import train_and_evaluate

In [39]:
import importlib
# Bound under an alias: later cells assign notebook variables called `metrics`,
# which would otherwise replace the module object under that name.
import metrics as metrics_module

# Re-execute the module so edits made to it are picked up without restarting
# the kernel.
importlib.reload(metrics_module)

# Must come after the reload so these names bind to the refreshed functions.
from metrics import regression_metrics, classification_metrics

### Load Dataset

In [40]:
# Relative to the notebook's working directory instead of an absolute,
# user-specific path, so the notebook runs on any checkout of the project.
# NOTE(review): assumes data/ sits next to src/ (the same layout the '../src'
# import path relies on) — adjust if the project layout differs.
file_path = "../data/aqar_fm_listings.csv"
data = pd.read_csv(file_path)

In [41]:
# Display the loaded listings table.
data

Unnamed: 0,id,title,url,price,meter_price,price_2_payments,price_4_payments,price_12_payments,rnpl_monthly_price,rent_period,...,rega_licensed,plan_no,parcel_no,user_type,user_verified,company_name,user_paid_tier,description,images,videos
0,6502998,ÿØŸàÿ± ŸÑŸÑÿ®Ÿäÿπ ŸÅŸä ÿ¥ÿßÿ±ÿπ ÿ¥ŸáŸäÿØ ÿßŸÑÿØŸäŸÜ ÿ´ŸÖ ÿßŸÑŸàÿ∑ŸÜ ŸÅŸáÿØ ÿ•ÿ®ÿ±ÿß...,https://sa.aqar.fm/ÿØŸàÿ±-ŸÑŸÑÿ®Ÿäÿπ/ÿßŸÑÿ±Ÿäÿßÿ∂/ÿ¥ŸÖÿßŸÑ-ÿßŸÑÿ±Ÿäÿß...,1500000.0,,,,,,,...,True,3783,407 + 408,0,True,,5.0,ŸÖÿ¥ÿ±Ÿàÿπ ÿßÿØŸàÿßÿ± ÿ≥ŸêŸÄŸÄŸÄÿØÿ±ÿß ŸÖÿ¥ÿ±Ÿàÿπ Ÿäÿ™ŸÖŸäÿ≤ ÿ®ŸÖŸàŸÇÿπŸá ÿßŸÑÿßÿ≥ÿ™ÿ±...,"['039762754_1765464943659.jpg', '039762758_176...",['039762752_1765465117982']
1,6428644,"ŸÅŸäŸÑÿß ŸÑŸÑÿ®Ÿäÿπ ŸÅŸä ÿ¥ÿßÿ±ÿπ ÿ£ÿ≠ŸÖÿØ ÿ®ŸÜ ÿ≠ŸäÿßŸÜ, ÿ≠Ÿä ÿßŸÑŸÖŸÑŸÇÿß, ŸÖÿØ...",https://sa.aqar.fm/ŸÅŸÑŸÑ-ŸÑŸÑÿ®Ÿäÿπ/ÿßŸÑÿ±Ÿäÿßÿ∂/ÿ¥ŸÖÿßŸÑ-ÿßŸÑÿ±Ÿäÿß...,13500000.0,,,,,,,...,True,3776,177 / 2,1,True,ÿ¥ÿ±ŸÉÿ© ÿ£ÿ´ÿ±Ÿâ ÿßŸÑŸÖÿ™ÿ≠ÿØÿ© ÿßŸÑÿπŸÇÿßÿ±Ÿäÿ©,2.0,ŸÑŸÑÿ®Ÿäÿπ ŸÅŸäŸÑÿß ÿ®ÿ™ÿµŸÖŸäŸÖ ŸÖŸÖŸäÿ≤ ÿ®ŸÖŸÑŸÇÿß ÿßŸÑÿ≥ŸÑŸàÿØŸäÿ± \n\n( Ÿä...,"['002902001_1764757893800.jpg', '002902004_176...",[]
2,6516181,"ÿØŸàÿ± ŸÑŸÑÿ•Ÿäÿ¨ÿßÿ± ŸÅŸä ÿ¥ÿßÿ±ÿπ ÿ¥ÿØÿßÿØ ÿ®ŸÜ ÿ£Ÿàÿ≥, ÿ≠Ÿä ÿßŸÑÿπŸÑŸäÿß, ŸÖÿØ...",https://sa.aqar.fm/ÿØŸàÿ±-ŸÑŸÑÿ•Ÿäÿ¨ÿßÿ±/ÿßŸÑÿ±Ÿäÿßÿ∂/ÿ¥ŸÖÿßŸÑ-ÿßŸÑÿ±...,105000.0,,,,,0.0,,...,True,772,121,0,True,ÿ¥ÿ±ŸÉÿ© ÿßŸÑÿ≥ŸàÿßÿπÿØ ÿßŸÑÿπŸÇÿßÿ±Ÿäÿ©,2.0,ŸÑŸÑÿ•Ÿäÿ¨ÿßÿ± ŸÅÿ±ÿµÿ© ÿ≥ŸÉŸÜŸäÿ© ŸÖŸÖŸäÿ≤ÿ©\n\nüè° ŸÜŸàÿπ ÿßŸÑÿπŸÇÿßÿ±: ÿØŸàÿ± ...,"['043760574_1766388486688.jpg', '043760579_176...",['043760577_1766389086357']
3,6495984,"ÿ¥ŸÇÿ© ŸÑŸÑÿ®Ÿäÿπ ŸÅŸä ÿ¥ÿßÿ±ÿπ ÿßŸÑÿ∫ÿßÿ¶ÿ±, ÿ≠Ÿä ÿßŸÑŸÖÿ±Ÿàÿ¨, ŸÖÿØŸäŸÜÿ© ÿ£ÿ®Ÿá...",https://sa.aqar.fm/ÿ¥ŸÇŸÇ-ŸÑŸÑÿ®Ÿäÿπ/ÿßÿ®Ÿáÿß/ÿ≠Ÿä-ÿØÿ±ÿ©-ÿßŸÑŸÖŸÜÿ≥...,635000.0,,,,,,,...,True,1022 / 1424ŸáŸÄ / ÿπ / 1,499,0,True,ŸÖŸÉÿ™ÿ® ÿÆŸäÿßŸÑ ÿßŸÑÿ£ÿ±ŸÉÿßŸÜ ŸÑŸÑÿπŸÇÿßÿ±,4.0,üè† ÿ¥ŸÇŸÇ ÿØŸàÿ®ŸÑŸÉÿ≥ ŸÜÿ∏ÿßŸÖ ÿ™ÿßŸàŸÜ ŸáÿßŸàÿ≥üè†\nŸÑŸÑÿ®Ÿäÿπ ÿ¥ŸÇŸÇ ÿØŸàÿ®ŸÑŸÉÿ≥...,"['000635467_1765094224978.jpg', '000635463_176...",['000635463_1765094375794']
4,6513148,"ÿ¥ŸÇÿ© ŸÑŸÑÿ®Ÿäÿπ ŸÅŸä ÿ¥ÿßÿ±ÿπ ÿßŸÑÿ≥ÿßŸÑŸÖŸäÿ©, ÿ≠Ÿä ÿßŸÑÿ®ÿ≠Ÿäÿ±ÿ©, ŸÖÿØŸäŸÜÿ© ...",https://sa.aqar.fm/ÿ¥ŸÇŸÇ-ŸÑŸÑÿ®Ÿäÿπ/ÿßÿ®Ÿáÿß/ÿ≠Ÿä-ÿßÿ®Ÿáÿß-ÿßŸÑÿ¨ÿØ...,510000.0,,,,,,,...,True,152 / ÿ£ / 1,17,0,True,ŸÖŸÉÿ™ÿ® ÿÆŸäÿßŸÑ ÿßŸÑÿ£ÿ±ŸÉÿßŸÜ ŸÑŸÑÿπŸÇÿßÿ±,4.0,üåüŸÑŸÑÿ®Ÿäÿπ ÿ¥ŸÇŸÇ ÿ™ŸÖŸÑŸäŸÉ ŸÅŸä ÿßÿ±ŸÇŸâ ÿßÿ≠Ÿäÿßÿ° ÿßÿ®Ÿáÿß ÿ≠Ÿä ÿßŸÑÿßÿ∑ŸÑÿßŸÑ...,"['000635468_1766167214019.jpg', '000635462_176...",['000635460_1766167341766']
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6255,6511657,"ÿ¥ŸÇÿ© ŸÑŸÑÿ•Ÿäÿ¨ÿßÿ± ŸÅŸä ÿ¥ÿßÿ±ÿπ ÿπŸÑŸÇŸÖÿ© ÿ®ŸÜ Ÿàÿßÿ¶ŸÑ, ÿ≠Ÿä ÿßŸÑŸÜÿ±ÿ¨ÿ≥, ...",https://sa.aqar.fm/ÿ¥ŸÇŸÇ-ŸÑŸÑÿ•Ÿäÿ¨ÿßÿ±/ÿßŸÑÿ±Ÿäÿßÿ∂/ÿ¥ŸÖÿßŸÑ-ÿßŸÑÿ±...,80000.0,,,,,0.0,,...,True,2737,3863,0,True,ÿ¥ÿ±ŸÉÿ© ŸÖÿ≥ÿ™ŸàŸäÿßÿ™ ÿßŸÑÿπŸÇÿßÿ±Ÿäÿ©,2.0,ŸÖÿ¥ÿ±Ÿàÿπ ÿ≥ŸäŸÑ 35 ÿ≠Ÿä ÿßŸÑŸÜÿ±ÿ¨ÿ≥ ( ÿ¥ŸÖÿßŸÑ ÿ≥ŸÑŸÖÿßŸÜ) \nÿßŸÑŸÖÿ¥ÿ±Ÿàÿπ...,"['005417420_1766051816215.jpg', '005417420_176...",['005417426_1766051881089']
6256,6479026,"ÿ¥ŸÇÿ© ŸÑŸÑÿ®Ÿäÿπ ŸÅŸä ÿ¥ÿßÿ±ÿπ ÿßŸÑŸÇÿ®ÿ©, ÿ≠Ÿä ÿ•ÿ¥ÿ®ŸäŸÑŸäÿ©, ŸÖÿØŸäŸÜÿ© ÿßŸÑÿ±...",https://sa.aqar.fm/ÿ¥ŸÇŸÇ-ŸÑŸÑÿ®Ÿäÿπ/ÿßŸÑÿ±Ÿäÿßÿ∂/ÿ¥ÿ±ŸÇ-ÿßŸÑÿ±Ÿäÿßÿ∂...,750000.0,,,,,,,...,True,2958/ÿ£,88 + 90 + 92,1,True,ÿ¥ÿ±ŸÉÿ© ÿßŸÑÿ•ŸÜÿ¥ÿßÿ° ÿßŸÑŸÖÿ™ŸÉÿßŸÖŸÑ ŸÑŸÑÿßÿ≥ÿ™ÿ´ŸÖÿßÿ±,2.0,ÿ¥ŸÇÿ© ŸÑŸÑÿ®Ÿäÿπ - ÿØŸàÿ± ÿ£ÿ±ÿ∂Ÿä \nÿπŸÖÿ± ÿßŸÑÿ¥ŸÇÿ© (18ÿ≥ŸÜÿ©)\nÿßŸÑŸÖŸà...,"['002508177_1763983175791.jpg', '002508176_176...",['002508176_1763983307296']
6257,6427826,"ŸÅŸäŸÑÿß ŸÑŸÑÿ®Ÿäÿπ ŸÅŸä ÿ¥ÿßÿ±ÿπ ÿ¨ÿ®ŸÑ ŸÜŸÖÿ±ÿ©, ÿ≠Ÿä ŸÇÿ±ÿ∑ÿ®ÿ©, ŸÖÿØŸäŸÜÿ© ÿß...",https://sa.aqar.fm/ŸÅŸÑŸÑ-ŸÑŸÑÿ®Ÿäÿπ/ÿßŸÑÿ±Ÿäÿßÿ∂/ÿ¥ÿ±ŸÇ-ÿßŸÑÿ±Ÿäÿßÿ∂...,2700000.0,,,,,,,...,True,2231 / ÿ®,134 / 1,1,True,ÿ¥ÿ±ŸÉÿ© ÿßŸÑÿ•ŸÜÿ¥ÿßÿ° ÿßŸÑŸÖÿ™ŸÉÿßŸÖŸÑ ŸÑŸÑÿßÿ≥ÿ™ÿ´ŸÖÿßÿ±,2.0,ŸÑŸÑÿ®Ÿäÿπ ŸÅŸäŸÑÿß ŸÜÿ∏ÿßŸÖ ÿßÿØŸàÿßÿ±\n\nÿØŸàÿ± + ÿØŸàÿ± + ÿ¥ŸÇÿ©\n\nŸÅŸä...,"['002508170_1760693045311.jpg', '002508176_176...",['002508177_1760693103724']
6258,6444496,"ÿ¥ŸÇÿ© ŸÑŸÑÿ®Ÿäÿπ ŸÅŸä ÿ¥ÿßÿ±ÿπ ÿ≥ÿßŸÑŸÖ ÿ®ŸÜ ÿπŸÖŸäÿ± ÿßŸÑÿßŸàÿ≥Ÿä, ÿ≠Ÿä ÿßŸÑÿπÿß...",https://sa.aqar.fm/ÿ¥ŸÇŸÇ-ŸÑŸÑÿ®Ÿäÿπ/ÿßŸÑÿ±Ÿäÿßÿ∂/ÿ¥ŸÖÿßŸÑ-ÿßŸÑÿ±Ÿäÿß...,1150000.0,,,,,,,...,True,2078 / ÿ£ ÿ®,1865 / ÿ£,0,,,2.0,ŸÑŸÑÿ®Ÿäÿπ ÿ¥ŸÇÿ© ÿ≠Ÿâ ÿßŸÑÿπÿßÿ±ÿ∂ ÿßŸÑŸÖÿ≥ÿßÿ≠ÿ© 148ŸÖ ÿßŸÑÿ™ŸÅÿßÿµŸäŸÑ 3 ÿ∫ÿ±...,['048647517_1761673158040.jpg'],['048647512_1761673221930']


### Quick Dataset Overview
We check column types, non-null counts, and numeric summary statistics to understand the dataset. We also count the missing values in each column, because they determine which imputation steps preprocessing needs.

In [42]:
# Per-column dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6260 entries, 0 to 6259
Data columns (total 79 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       6260 non-null   int64  
 1   title                    6251 non-null   object 
 2   url                      6260 non-null   object 
 3   price                    6260 non-null   float64
 4   meter_price              431 non-null    float64
 5   price_2_payments         617 non-null    float64
 6   price_4_payments         596 non-null    float64
 7   price_12_payments        598 non-null    float64
 8   rnpl_monthly_price       950 non-null    float64
 9   rent_period              835 non-null    float64
 10  area_sqm                 6260 non-null   int64  
 11  deed_area                6249 non-null   float64
 12  num_bedrooms             5487 non-null   float64
 13  num_bathrooms            3523 non-null   float64
 14  num_living_rooms        

In [43]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()


Unnamed: 0,id,price,meter_price,price_2_payments,price_4_payments,price_12_payments,rnpl_monthly_price,rent_period,area_sqm,deed_area,...,published_at,last_update,verified,boosted,premium,has_img,has_video,ad_license_number,user_type,user_paid_tier
count,6260.0,6260.0,431.0,617.0,596.0,598.0,950.0,835.0,6260.0,6249.0,...,6260.0,6260.0,6260.0,6260.0,6260.0,6260.0,6260.0,6250.0,6260.0,6068.0
mean,6443584.0,1080462.0,2175.690557,16633.387358,4694.127517,4443.946488,7628265.0,2.868263,1135.608,1035.925661,...,1761887000.0,1766943000.0,0.997604,0.027636,0.000479,1.0,0.949521,7196583000.0,0.206709,2.093935
std,71097.28,3267496.0,7112.493061,36445.059439,19959.835698,20372.317223,77176540.0,0.338407,19992.13,13777.533207,...,4281328.0,202837.4,0.048896,0.16394,0.021888,0.0,0.218949,19997730.0,0.404977,0.712559
min,6021534.0,37.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,...,1739142000.0,1762460000.0,0.0,0.0,0.0,1.0,0.0,7100142000.0,0.0,0.0
25%,6395056.0,299000.0,400.0,0.0,0.0,0.0,0.0,3.0,130.0,144.34,...,1758711000.0,1766984000.0,1.0,0.0,0.0,1.0,1.0,7200670000.0,0.0,2.0
50%,6465124.0,680000.0,800.0,0.0,0.0,0.0,3121.0,3.0,198.0,233.97,...,1763052000.0,1766998000.0,1.0,0.0,0.0,1.0,1.0,7200749000.0,0.0,2.0
75%,6502997.0,1130000.0,2000.0,0.0,0.0,0.0,5796.0,3.0,348.0,450.0,...,1765505000.0,1767005000.0,1.0,0.0,0.0,1.0,1.0,7200791000.0,0.0,2.0
max,6525619.0,182091400.0,100000.0,265000.0,220000.0,240000.0,1545860000.0,3.0,1146600.0,576433.0,...,1767012000.0,1767013000.0,1.0,1.0,1.0,1.0,1.0,7200816000.0,1.0,5.0


In [44]:
# Number of missing values in each column.
data.isna().sum()

id                   0
title                9
url                  0
price                0
meter_price       5829
                  ... 
company_name       318
user_paid_tier     192
description         34
images               0
videos               0
Length: 79, dtype: int64

###  Feature Selection & Preprocessing

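Drop columns that are not useful for modeling: identifiers (`id`, `deed_number`), links and media (`url`, `images`, `videos`), and free text (`title`, `description`).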

In [45]:
# Identifier, link, media and free-text columns that are not used as model features.
drop_cols = ['id','url','images','videos','description','title','deed_number']
# errors="ignore" keeps this cell re-runnable: `data` is overwritten here, so a
# second run would otherwise raise KeyError for columns that are already gone.
data = data.drop(columns=drop_cols, errors="ignore")

In [46]:
# Numeric input features; their missing values are median-imputed further down.
numeric_cols = [
    'area_sqm',
    'num_bedrooms',
    'num_rooms',
    'latitude',
    'longitude',
    'user_paid_tier',
]


In [47]:
# Categorical input features; these are one-hot encoded before modelling.
categorical_cols = [
    'furnished',
    'duplex',
    'ac',
    'pool',
    'city',
]


In [48]:
# Fill gaps in every numeric feature with that feature's own median.
column_medians = data[numeric_cols].median()
for feature in numeric_cols:
    data[feature] = data[feature].fillna(column_medians[feature])

In [49]:
# Missing categorical values become their own explicit 'Unknown' level.
data = data.assign(
    **{feature: data[feature].fillna('Unknown') for feature in categorical_cols}
)

In [50]:
# Build the model input: numeric features as-is, categorical features one-hot encoded.
# `columns=` names the categorical features explicitly. By default get_dummies
# only encodes object/string/category dtypes, so a categorical column that
# happened to load as numeric or bool would silently be left unencoded.
data_encoded = pd.get_dummies(
    data[numeric_cols + categorical_cols],
    columns=categorical_cols,
    drop_first=True,  # drop the first level of each feature to avoid a redundant dummy
)


In [51]:
# Regression target: the listing price column.
y = data['price']

In [52]:
# Work with log(1 + price) instead of raw price. Anything trained or scored on
# y_log is therefore expressed in log units, not in currency.
y_log = np.log1p(y)

In [53]:
# Standardise the encoded features (StandardScaler: zero mean, unit variance).
# NOTE(review): the scaler is fitted on every row. If train_and_evaluate splits
# train/test afterwards, test-set statistics leak into the scaling — confirm.
scaler = StandardScaler().fit(data_encoded)
X_scaled = scaler.transform(data_encoded)

### Define Target Variable

###  Scale features

### Train/Test split and model training

In [54]:
# Candidate regressors, keyed by the label used in the training log and results table.
models = dict(
    [
        ("Linear Regression", LinearRegression()),
        ("Random Forest Regressor", RandomForestRegressor(random_state=42)),
        ("SVR", SVR()),
    ]
)


In [55]:
# Train and evaluate every candidate model, collecting each model's metrics
# under its display label.
metrics_results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Stored as `model_metrics`, not `metrics`: `metrics` is also the name of
    # the project's metrics module, so the per-model result gets a distinct name.
    model_metrics = train_and_evaluate(
        model,
        X_scaled,
        y_log,  # log1p(price) target
        task="regression",
        plot=True
    )

    metrics_results[name] = model_metrics


Training Linear Regression...



Training Random Forest Regressor...



Training SVR...


In [56]:
# Tabulate the collected results: one column per model, one row per metric.
metrics = pd.DataFrame(metrics_results)

In [57]:
# Display the model comparison table.
metrics

Unnamed: 0,Linear Regression,Random Forest Regressor,SVR
MSE,1.411909,0.766283,1.210803
RMSE,1.188238,0.875376,1.100365


In [58]:
import os
from joblib import dump

MODEL_DIR = "models"
os.makedirs(MODEL_DIR, exist_ok=True)

# Choose the model with the lowest RMSE from the evaluation results instead of
# hard-coding a name, so the saved artifact always matches the metrics table.
best_model_name = min(metrics_results, key=lambda name: metrics_results[name]["RMSE"])
# NOTE(review): assumes train_and_evaluate fitted this estimator in place. If it
# fits a clone, the object saved here is unfitted — confirm.
best_model = models[best_model_name]

# Kept separate from best_model_name so that name stays a valid key of `models`.
artifact_name = f"{best_model_name}_aqar"
model_path = f"{MODEL_DIR}/{artifact_name.replace(' ', '_')}.joblib"
# NOTE(review): only the estimator is persisted. The fitted scaler and the
# encoded column layout are not saved, and the model was trained on y_log.
dump(best_model, model_path)

print(f"Saved {artifact_name} as '{model_path}'")

Saved Random Forest Regressor_aqar as 'models/Random_Forest_Regressor_aqar.joblib'
