In [1]:

import pandas as pd
import xgboost as xgb
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_csv("../data/HouseDataRaw.csv")

In [24]:
df.shape

(25155, 39)

In [3]:
df.drop(columns=["Unnamed: 0"], inplace=True)

In [25]:
df.nunique()

Unnamed: 0                  25155
district                       39
price                        1888
address                      1638
AdUpdateDate                  186
Category                        1
GrossSquareMeters             584
BuildingAge                    10
NumberFloorsofBuilding         55
UsingStatus                     3
EligibilityForInvestment        3
BuildStatus                     3
TitleStatus                     5
ItemStatus                      2
NumberOfBathrooms               7
NumberOfWCs                     7
AdCreationDate                728
Type                            1
NetSquareMeters               517
NumberOfRooms                  27
FloorLocation                  55
HeatingType                    15
CreditEligibility               3
InsideTheSite                   2
StructureType                   8
MortgageStatus                  2
Swap                            2
Balcony                         2
PriceStatus                     2
RentalIncome  

In [22]:
df.head()

Unnamed: 0.1,Unnamed: 0,district,price,address,AdUpdateDate,Category,GrossSquareMeters,BuildingAge,NumberFloorsofBuilding,UsingStatus,...,PriceStatus,RentalIncome,NumberOfBalconies,BalconyType,HallSquareMeters,WCSquareMeters,IsItVideoNavigable?,Subscription,BathroomSquareMeters,BalconySquareMeters
0,0,adalar,3100000.0,"['Anasayfa', 'Satılık Daire', 'İstanbul Satılı...",24 Şubat 2022,Satılık,160 m2,21 Ve Üzeri,3,Mülk Sahibi Oturuyor,...,Genel Fiyat,,,,,,,,,
1,1,adalar,1600000.0,"['Anasayfa', 'Satılık Daire', 'İstanbul Satılı...",02 Mart 2022,Satılık,120 m2,5-10,3,Mülk Sahibi Oturuyor,...,Genel Fiyat,,,,,,,,,
2,2,adalar,18500000.0,"['Anasayfa', 'Satılık Müstakil Ev', 'İstanbul ...",11 Şubat 2022,Satılık,350 m2,21 Ve Üzeri,2,Mülk Sahibi Oturuyor,...,Genel Fiyat,,,,,,,,,
3,3,adalar,9500000.0,"['Anasayfa', 'Satılık Bina', 'İstanbul Satılık...",11 Şubat 2022,Satılık,550 m2,11-15,3,Mülk Sahibi Oturuyor,...,Genel Fiyat,,,,,,,,,
4,4,adalar,25000000.0,"['Anasayfa', 'Satılık Köşk', 'İstanbul Satılık...",19 Ocak 2022,Satılık,840 m2,21 Ve Üzeri,4,Boş,...,Genel Fiyat,,,,,,,,,


In [4]:
# ValueError: could not convert string to float: '34550000arrow_downward%3'


df["price"] = (
    df["price"]
    .str.replace(",", "", regex=True)  # Remove commas
    .str.replace("TL", "", regex=True)  # Remove 'TL'
    .str.extract(r"(\d+\.?\d*)")[0]  # Extract only numeric values
    .astype(float)  # Convert to float
)

  .str.extract('(\d+\.?\d*)')[0]  # Extract only numeric values


In [5]:
df["Price_Category"] = pd.qcut(df["price"], q=3, labels=[0, 1, 2])

print("Price Thresholds (approximate):")
print(df.groupby("Price_Category")["price"].agg(["min", "max", "count"]))
df.drop(columns=["price"], inplace=True)

Price Thresholds (approximate):
                      min           max  count
Price_Category                                
0                 20000.0  9.200000e+05   8389
1                922500.0  2.100000e+06   8452
2               2118000.0  1.600000e+10   8314


  print(df.groupby('Price_Category')['price'].agg(['min', 'max', 'count']))


In [6]:
# The desitrbution of the target variable is balanced
df["Price_Category"].value_counts()

Price_Category
1    8452
0    8389
2    8314
Name: count, dtype: int64

In [None]:
categorical_cols = df.select_dtypes(include=["object"]).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

scaler = StandardScaler()
numerical_cols = df.select_dtypes(include=["int64", "float64"]).columns
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [56]:
X = df.drop("Price_Category", axis=1)
y = df["Price_Category"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [58]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.74      0.75      1672
           1       0.61      0.63      0.62      1689
           2       0.80      0.78      0.79      1670

    accuracy                           0.72      5031
   macro avg       0.72      0.72      0.72      5031
weighted avg       0.72      0.72      0.72      5031



In [55]:
X = df.drop(columns=["Price_Category"])
y = df["Price_Category"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# XGBoost model
model = xgb.XGBClassifier(
    objective="multi:softmax", num_class=3, eval_metric="mlogloss", random_state=42
)
model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.83      0.84      1678
           1       0.72      0.75      0.74      1690
           2       0.88      0.86      0.87      1663

    accuracy                           0.82      5031
   macro avg       0.82      0.82      0.82      5031
weighted avg       0.82      0.82      0.82      5031

