In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


#### Review The Data

In [2]:
# Load the cleaned listings once. `data` is the working frame for the encoding
# steps below; `city_tier` is an independent copy of the same rows (it is
# tiered and exported on its own later), so the CSV is parsed a single time
# instead of twice.
data = pd.read_csv('final_cleaned_home_price_data.csv')

city_tier = data.copy()

In [3]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,Net_Area,Gross_Area,Room_Count,Floor_Location,Building_Age,Heating_Type,Price,City,Occupancy_Status,Investment_Eligibility,Title_Deed_Status,Bathroom_Count
0,120,150.0,4.0,4.Kat,21 Ve Üzeri,Kombi Doğalgaz,950000.0,adana,Boş,Unknown,Kat Mülkiyeti,1.0
1,100,125.0,4.0,3.Kat,4,Kombi Doğalgaz,1250000.0,adana,Boş,Unknown,Unknown,1.0
2,89,95.0,3.0,4.Kat,0 (Yeni),Kombi Doğalgaz,1750000.0,adana,Boş,Uygun,Unknown,1.0
3,40,55.0,2.0,6.Kat,0 (Yeni),Kombi Doğalgaz,1300000.0,adana,Boş,Uygun,Unknown,1.0
4,140,150.0,4.0,Düz Giriş (Zemin),5-10,Klimalı,1700000.0,adana,Boş,Uygun,Kat Mülkiyeti,1.0


In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
data.describe()


Unnamed: 0,Net_Area,Gross_Area,Room_Count,Price,Bathroom_Count
count,19609.0,19609.0,19609.0,19609.0,19609.0
mean,127.370493,151.459687,3.692004,3035337.0,1.479117
std,74.944502,112.92999,1.102164,2966766.0,0.761345
min,30.0,32.0,1.0,230000.0,0.0
25%,90.0,100.0,3.0,1650000.0,1.0
50%,120.0,135.0,4.0,2300000.0,1.0
75%,150.0,167.0,4.0,3270000.0,2.0
max,2000.0,2500.0,12.0,52000000.0,6.0


In [5]:
# Column dtypes and non-null counts of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19609 entries, 0 to 19608
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Net_Area                19609 non-null  int64  
 1   Gross_Area              19609 non-null  float64
 2   Room_Count              19609 non-null  float64
 3   Floor_Location          19609 non-null  object 
 4   Building_Age            19609 non-null  object 
 5   Heating_Type            19609 non-null  object 
 6   Price                   19609 non-null  float64
 7   City                    19609 non-null  object 
 8   Occupancy_Status        19609 non-null  object 
 9   Investment_Eligibility  19609 non-null  object 
 10  Title_Deed_Status       19609 non-null  object 
 11  Bathroom_Count          19609 non-null  float64
dtypes: float64(4), int64(1), object(7)
memory usage: 1.8+ MB


In [6]:
# Print the distinct values of the two status columns, one section per column.
for col in ("Occupancy_Status", "Investment_Eligibility"):
    distinct_values = data[col].unique()
    print(f"\n--- {col} ---\n{distinct_values}")


--- Occupancy_Status ---
['Boş' 'Kiracı Oturuyor' 'Mülk Sahibi Oturuyor']

--- Investment_Eligibility ---
['Unknown' 'Uygun']


## Making the Hybrid Data (OHE and Ordinal)

####  1. Map Building Age, Floor Location (Ordinal) 

In [7]:
# Convert the Building_Age labels into a numeric (ordinal) age in years.
# Range labels are collapsed to a single representative value.
age_mapping = {
    '0 (Yeni)': 0,
    '1': 1,
    '2': 2,
    '3': 3,
    '4': 4,
    '5-10': 7,         # Midpoint of 5-10 is 7.5; rounded down to stay integer
    '11-15': 13,       # Midpoint of 11 and 15
    '16-20': 18,       # Midpoint of 16 and 20
    '21 Ve Üzeri': 25  # Open-ended "21+" bucket; 25 is a chosen stand-in
}

# Series.map() silently turns every label that is missing from the dict into
# NaN, so re-running this cell on the already-mapped integers would wipe the
# column. Skip the step when the column is already numeric, and fail loudly
# when a (non-missing) label is not covered by the mapping.
if not pd.api.types.is_numeric_dtype(data['Building_Age']):
    mapped_age = data['Building_Age'].map(age_mapping)
    # Rows that were already missing stay NaN; only real labels are validated.
    newly_missing = mapped_age.isna() & data['Building_Age'].notna()
    if newly_missing.any():
        unmapped_labels = data.loc[newly_missing, 'Building_Age'].unique().tolist()
        raise ValueError(f"Unmapped Building_Age values: {unmapped_labels}")
    data['Building_Age'] = mapped_age

In [8]:
# Distinct Building_Age values after the mapping; a NaN here would indicate a label missing from age_mapping.
data["Building_Age"].unique()

array([25,  4,  0,  7, 18,  3,  2, 13,  1])

In [9]:
# Preview the frame again now that Building_Age holds numbers instead of labels.
data.head()

Unnamed: 0,Net_Area,Gross_Area,Room_Count,Floor_Location,Building_Age,Heating_Type,Price,City,Occupancy_Status,Investment_Eligibility,Title_Deed_Status,Bathroom_Count
0,120,150.0,4.0,4.Kat,25,Kombi Doğalgaz,950000.0,adana,Boş,Unknown,Kat Mülkiyeti,1.0
1,100,125.0,4.0,3.Kat,4,Kombi Doğalgaz,1250000.0,adana,Boş,Unknown,Unknown,1.0
2,89,95.0,3.0,4.Kat,0,Kombi Doğalgaz,1750000.0,adana,Boş,Uygun,Unknown,1.0
3,40,55.0,2.0,6.Kat,0,Kombi Doğalgaz,1300000.0,adana,Boş,Uygun,Unknown,1.0
4,140,150.0,4.0,Düz Giriş (Zemin),7,Klimalı,1700000.0,adana,Boş,Uygun,Kat Mülkiyeti,1.0


In [10]:
# Snapshot of the frame after the Building_Age mapping; the next cell restores `data` from it.
data_copy = data.copy()

In [20]:
# Reset `data` to the saved snapshot (presumably so the floor mapping below can be retried from a clean state).
# NOTE(review): `data_copy` is reassigned later in the notebook, after the floor
# mapping, so which state this restores depends on the order the cells were run.
data = data_copy.copy()

In [21]:
# Confirm the dtypes after the reset: Building_Age should now be an integer column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19609 entries, 0 to 19608
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Net_Area                19609 non-null  int64  
 1   Gross_Area              19609 non-null  float64
 2   Room_Count              19609 non-null  float64
 3   Floor_Location          19609 non-null  object 
 4   Building_Age            19609 non-null  int64  
 5   Heating_Type            19609 non-null  object 
 6   Price                   19609 non-null  float64
 7   City                    19609 non-null  object 
 8   Occupancy_Status        19609 non-null  object 
 9   Investment_Eligibility  19609 non-null  object 
 10  Title_Deed_Status       19609 non-null  object 
 11  Bathroom_Count          19609 non-null  float64
dtypes: float64(4), int64(2), object(6)
memory usage: 1.8+ MB


In [None]:
# --- 2. Map Floor Location (Ordinal) ---
# Each Floor_Location label is translated into a numeric level.

# Ground-level, garden and detached-style entries (levels 0 to 1)
ground_like_levels = {
    'Düz Giriş (Zemin)': 0,
    'Yüksek Giriş': 0.5,
    'Bahçe Katı': 0,
    'Müstakil': 1,
    'Villa Tipi': 1,
    'Bahçe Dublex': 1,
}

# Basement entries get negative levels
basement_levels = {
    'Bodrum Kat': -1,
    'Kot 1 (-1).Kat': -1,
    'Kot 2 (-2).Kat': -2,
    'Kot 3 (-3).Kat': -3,
    'Kot 4 (-4).Kat': -4,
}

# Roof flats. Original rationale: 5 because the median height is 4 and a roof
# flat is usually the top floor; the roof duplex gets 6 to rank it above a
# standard roof flat.
roof_levels = {
    'Çatı Katı': 5,
    'Çatı Dubleks': 6,
}

# Numbered floors keep their own number: '1.Kat' .. '22.Kat', plus the two
# remaining labels '26.Kat' and '40+.Kat'.
numbered_levels = {f'{level}.Kat': level for level in range(1, 23)}
numbered_levels['26.Kat'] = 26
numbered_levels['40+.Kat'] = 40

floor_mapping = {
    **ground_like_levels,
    **basement_levels,
    **roof_levels,
    **numbered_levels,
}

# Replace every label by its level (labels missing from the dict become NaN)
data['Floor_Location'] = data['Floor_Location'].map(floor_mapping)

# Report how many values are NaN in each of the two mapped columns
unmapped_ages = data['Building_Age'].isna().sum()
unmapped_floors = data['Floor_Location'].isna().sum()
print("Unmapped Ages:", unmapped_ages)
print("Unmapped Floors:", unmapped_floors)

Unmapped Ages: 0
Unmapped Floors: 0


In [23]:
# Summary statistics after both ordinal mappings; Floor_Location and Building_Age are now numeric columns.
data.describe()

Unnamed: 0,Net_Area,Gross_Area,Room_Count,Floor_Location,Building_Age,Price,Bathroom_Count
count,19609.0,19609.0,19609.0,19609.0,19609.0,19609.0,19609.0
mean,127.370493,151.459687,3.692004,2.863303,7.719976,3035337.0,1.479117
std,74.944502,112.92999,1.102164,2.514035,8.394998,2966766.0,0.761345
min,30.0,32.0,1.0,-4.0,0.0,230000.0,0.0
25%,90.0,100.0,3.0,1.0,0.0,1650000.0,1.0
50%,120.0,135.0,4.0,2.0,7.0,2300000.0,1.0
75%,150.0,167.0,4.0,4.0,13.0,3270000.0,2.0
max,2000.0,2500.0,12.0,40.0,25.0,52000000.0,6.0


#### One-Hot Encoding (OHE) for the Remaining Columns

In [24]:
# Check which columns are still object dtype and therefore still need encoding.
data.info()

# Overwrite the earlier snapshot with the current state of `data`.
data_copy = data.copy()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19609 entries, 0 to 19608
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Net_Area                19609 non-null  int64  
 1   Gross_Area              19609 non-null  float64
 2   Room_Count              19609 non-null  float64
 3   Floor_Location          19609 non-null  float64
 4   Building_Age            19609 non-null  int64  
 5   Heating_Type            19609 non-null  object 
 6   Price                   19609 non-null  float64
 7   City                    19609 non-null  object 
 8   Occupancy_Status        19609 non-null  object 
 9   Investment_Eligibility  19609 non-null  object 
 10  Title_Deed_Status       19609 non-null  object 
 11  Bathroom_Count          19609 non-null  float64
dtypes: float64(5), int64(2), object(5)
memory usage: 1.8+ MB


In [25]:
# Summary statistics of the numeric columns (the saved output is identical to the previous describe() output).
data.describe()

Unnamed: 0,Net_Area,Gross_Area,Room_Count,Floor_Location,Building_Age,Price,Bathroom_Count
count,19609.0,19609.0,19609.0,19609.0,19609.0,19609.0,19609.0
mean,127.370493,151.459687,3.692004,2.863303,7.719976,3035337.0,1.479117
std,74.944502,112.92999,1.102164,2.514035,8.394998,2966766.0,0.761345
min,30.0,32.0,1.0,-4.0,0.0,230000.0,0.0
25%,90.0,100.0,3.0,1.0,0.0,1650000.0,1.0
50%,120.0,135.0,4.0,2.0,7.0,2300000.0,1.0
75%,150.0,167.0,4.0,4.0,13.0,3270000.0,2.0
max,2000.0,2500.0,12.0,40.0,25.0,52000000.0,6.0


In [27]:
# Median Price for each distinct City value.
# NOTE(review): the saved output lists Tier_1/Tier_2/Tier_3, so this cell was
# last executed after the tiering cell below had already overwritten City.
# On a fresh top-to-bottom run it would list the original city names instead.
data.groupby("City")["Price"].median()

City
Tier_1    2950000.0
Tier_2    2100000.0
Tier_3    1590000.0
Name: Price, dtype: float64

In [None]:
# City tiering: bucket the cities into three price tiers based on each city's
# median Price.

# Median Price for every City
city_medians = data.groupby("City")["Price"].median()

# Lower and upper quartile of those medians serve as the tier boundaries
q1, q3 = city_medians.quantile(0.25), city_medians.quantile(0.75)

# City names per tier: above q3 -> tier 1, within [q1, q3] -> tier 2, below q1 -> tier 3
tier_1 = city_medians.loc[city_medians.gt(q3)].index.tolist()
tier_2 = city_medians.loc[city_medians.between(q1, q3)].index.tolist()
tier_3 = city_medians.loc[city_medians.lt(q1)].index.tolist()


def map_city_tier(city):
    """Return the tier label for ``city``.

    Cities listed in ``tier_1`` / ``tier_2`` get the matching label; every
    other value (tier-3 cities and unknown names alike) falls back to "Tier_3".
    """
    for label, members in (("Tier_1", tier_1), ("Tier_2", tier_2)):
        if city in members:
            return label
    return "Tier_3"


# Overwrite the City column with the tier label (no new column is added)
data['City'] = data['City'].apply(map_city_tier)

# Show how many rows ended up in each tier
print(data['City'].value_counts())

City
Tier_2    12365
Tier_1     6085
Tier_3     1159
Name: count, dtype: int64


In [29]:
# One-hot encode the remaining categorical columns. drop_first=True leaves out
# the first level of every column; dtype=int yields 0/1 integer columns.
categorical_columns = [
    "Heating_Type",
    "City",
    "Occupancy_Status",
    "Investment_Eligibility",
    "Title_Deed_Status",
]
dum = pd.get_dummies(data[categorical_columns], drop_first=True, dtype=int)

In [30]:
# Inspect the generated dummy columns and their dtypes.
dum.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19609 entries, 0 to 19608
Data columns (total 22 columns):
 #   Column                                 Non-Null Count  Dtype
---  ------                                 --------------  -----
 0   Heating_Type_Doğalgaz Sobalı           19609 non-null  int64
 1   Heating_Type_Güneş Enerjisi            19609 non-null  int64
 2   Heating_Type_Isıtma Yok                19609 non-null  int64
 3   Heating_Type_Jeotermal                 19609 non-null  int64
 4   Heating_Type_Kat Kaloriferi            19609 non-null  int64
 5   Heating_Type_Klimalı                   19609 non-null  int64
 6   Heating_Type_Kombi Doğalgaz            19609 non-null  int64
 7   Heating_Type_Merkezi (Pay Ölçer)       19609 non-null  int64
 8   Heating_Type_Merkezi Doğalgaz          19609 non-null  int64
 9   Heating_Type_Merkezi Kömür             19609 non-null  int64
 10  Heating_Type_Sobalı                    19609 non-null  int64
 11  Heating_Type_Yerden Isıtma  

In [32]:
# Remove the original categorical columns; their one-hot versions live in `dum`.
columns_to_drop = [
    "Heating_Type",
    "City",
    "Occupancy_Status",
    "Investment_Eligibility",
    "Title_Deed_Status",
]
data = data.drop(columns=columns_to_drop)

In [33]:
# Append the dummy columns to the right of the remaining columns (axis=1 aligns rows on the index).
data = pd.concat([data, dum], axis=1)

In [34]:
# Preview the frame after swapping the categorical columns for their dummy columns.
data.head()

Unnamed: 0,Net_Area,Gross_Area,Room_Count,Floor_Location,Building_Age,Price,Bathroom_Count,Heating_Type_Doğalgaz Sobalı,Heating_Type_Güneş Enerjisi,Heating_Type_Isıtma Yok,...,City_Tier_2,City_Tier_3,Occupancy_Status_Kiracı Oturuyor,Occupancy_Status_Mülk Sahibi Oturuyor,Investment_Eligibility_Uygun,Title_Deed_Status_Hisseli Tapu,Title_Deed_Status_Kat Mülkiyeti,Title_Deed_Status_Kat İrtifakı,Title_Deed_Status_Müstakil Tapulu,Title_Deed_Status_Unknown
0,120,150.0,4.0,4.0,25,950000.0,1.0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
1,100,125.0,4.0,3.0,4,1250000.0,1.0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,89,95.0,3.0,4.0,0,1750000.0,1.0,0,0,0,...,1,0,0,0,1,0,0,0,0,1
3,40,55.0,2.0,6.0,0,1300000.0,1.0,0,0,0,...,1,0,0,0,1,0,0,0,0,1
4,140,150.0,4.0,0.0,7,1700000.0,1.0,0,0,0,...,1,0,0,0,1,0,1,0,0,0


In [35]:
# Confirm the column list and dtypes of the combined frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19609 entries, 0 to 19608
Data columns (total 29 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Net_Area                               19609 non-null  int64  
 1   Gross_Area                             19609 non-null  float64
 2   Room_Count                             19609 non-null  float64
 3   Floor_Location                         19609 non-null  float64
 4   Building_Age                           19609 non-null  int64  
 5   Price                                  19609 non-null  float64
 6   Bathroom_Count                         19609 non-null  float64
 7   Heating_Type_Doğalgaz Sobalı           19609 non-null  int64  
 8   Heating_Type_Güneş Enerjisi            19609 non-null  int64  
 9   Heating_Type_Isıtma Yok                19609 non-null  int64  
 10  Heating_Type_Jeotermal                 19609 non-null  int64  
 11  He

In [36]:
# Sanity check: list any columns that are still object dtype (an empty Index means none remain).
data.select_dtypes(include='object').columns


Index([], dtype='object')

In [37]:
# Save the hybrid-encoded frame; index=False keeps the row index out of the file.
data.to_csv("Hybrid_Encoding.csv", index=False)

#### Making a Version with All Categorical Columns One-Hot Encoded

In [38]:
# Reload the cleaned data from disk so this variant starts from the unmodified columns.
data2 = pd.read_csv("final_cleaned_home_price_data.csv")

In [39]:
# City tiering for `data2`: group the cities into three price tiers.

# Per-city median of Price
city_medians = data2.groupby("City")["Price"].median()

# 25th / 75th percentile of the city medians act as the tier cut-offs
q1 = city_medians.quantile(0.25)
q3 = city_medians.quantile(0.75)

# Boolean masks over the cities, then the matching city names as plain lists
above_q3 = city_medians > q3
below_q1 = city_medians < q1
within_band = (city_medians >= q1) & (city_medians <= q3)

tier_1 = city_medians[above_q3].index.tolist()
tier_2 = city_medians[within_band].index.tolist()
tier_3 = city_medians[below_q1].index.tolist()


def map_city_tier(city):
    """Map a city name to "Tier_1", "Tier_2" or "Tier_3" (the fallback)."""
    if city in tier_1:
        return "Tier_1"
    # Anything that is in neither list (tier-3 cities, unknown names) gets "Tier_3"
    return "Tier_2" if city in tier_2 else "Tier_3"


# Overwrite City with its tier label (no extra column is created)
data2['City'] = data2['City'].apply(map_city_tier)

# Show how many rows landed in each tier
print(data2['City'].value_counts())

City
Tier_2    12365
Tier_1     6085
Tier_3     1159
Name: count, dtype: int64


In [40]:
# One-hot encode every categorical column of `data2`, this time including
# Building_Age and Floor_Location. drop_first=True leaves out the first level
# of each column; dtype=int yields 0/1 integer columns.
dum = pd.get_dummies(
    data2[
        [
            "Heating_Type",
            "City",
            "Occupancy_Status",
            "Investment_Eligibility",
            "Title_Deed_Status",
            "Building_Age",
            "Floor_Location",
        ]
    ],
    drop_first=True,
    dtype=int,
)

In [41]:
# Swap the original categorical columns for their dummy columns in one step:
# drop the sources, then append `dum` on the right.
encoded_source_columns = [
    "Heating_Type",
    "City",
    "Occupancy_Status",
    "Investment_Eligibility",
    "Title_Deed_Status",
    "Building_Age",
    "Floor_Location",
]
data2 = pd.concat([data2.drop(columns=encoded_source_columns), dum], axis=1)

In [42]:
# Sanity check: list any columns that are still object dtype (an empty Index means none remain).
data2.select_dtypes(include='object').columns


Index([], dtype='object')

In [43]:
# Preview the fully one-hot-encoded frame.
data2.head()

Unnamed: 0,Net_Area,Gross_Area,Room_Count,Price,Bathroom_Count,Heating_Type_Doğalgaz Sobalı,Heating_Type_Güneş Enerjisi,Heating_Type_Isıtma Yok,Heating_Type_Jeotermal,Heating_Type_Kat Kaloriferi,...,Floor_Location_Düz Giriş (Zemin),Floor_Location_Kot 1 (-1).Kat,Floor_Location_Kot 2 (-2).Kat,Floor_Location_Kot 3 (-3).Kat,Floor_Location_Kot 4 (-4).Kat,Floor_Location_Müstakil,Floor_Location_Villa Tipi,Floor_Location_Yüksek Giriş,Floor_Location_Çatı Dubleks,Floor_Location_Çatı Katı
0,120,150.0,4.0,950000.0,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,100,125.0,4.0,1250000.0,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,89,95.0,3.0,1750000.0,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,40,55.0,2.0,1300000.0,1.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,140,150.0,4.0,1700000.0,1.0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [44]:
# Save the fully one-hot-encoded frame; index=False keeps the row index out of the file.
data2.to_csv("data_OHE.csv", index=False)

In [3]:
# City tiering for `city_tier`: replace each city by one of three price tiers.

# Median Price of every city
city_medians2 = city_tier.groupby("City")["Price"].median()

# Quartiles of the city medians mark the tier boundaries
q1 = city_medians2.quantile(0.25)
q3 = city_medians2.quantile(0.75)

# Names of the cities in each tier:
#   tier 1 -> median above q3, tier 2 -> median within [q1, q3], tier 3 -> median below q1
in_top_tier = city_medians2.gt(q3)
in_middle_tier = city_medians2.ge(q1) & city_medians2.le(q3)
in_bottom_tier = city_medians2.lt(q1)

tier_1 = city_medians2.loc[in_top_tier].index.tolist()
tier_2 = city_medians2.loc[in_middle_tier].index.tolist()
tier_3 = city_medians2.loc[in_bottom_tier].index.tolist()


def map_city_tier(city):
    """Return "Tier_1" or "Tier_2" for listed cities, otherwise "Tier_3"."""
    # Tier-3 cities and any unknown names share the same fallback label
    if city not in tier_1 and city not in tier_2:
        return "Tier_3"
    return "Tier_1" if city in tier_1 else "Tier_2"


# Overwrite City with the tier label (no new column is added)
city_tier['City'] = city_tier['City'].apply(map_city_tier)

# Show how many rows fall into each tier
print(city_tier['City'].value_counts())

City
Tier_2    12365
Tier_1     6085
Tier_3     1159
Name: count, dtype: int64


In [4]:
# Confirm that City now holds only tier labels.
city_tier["City"].unique()

array(['Tier_2', 'Tier_3', 'Tier_1'], dtype=object)

In [5]:
# Save the city-tiered frame; index=False keeps the row index out of the file.
city_tier.to_csv("city_tiered_data.csv", index=False)