### Real Estate Multi-Target Prediction Pipeline
#### Sequential approach: BHK Type Presence -> Unit Counts -> Carpet Areas -> Project Metrics

In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, mean_absolute_error, classification_report, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier, XGBRegressor
import warnings
import pickle
warnings.filterwarnings('ignore')

In [4]:
#Change current working directory to one folder above
import os
os.chdir('..')

In [5]:
df = pd.read_csv("processing_outputs/cleaned_data.csv")
df

Unnamed: 0,rera_ids,far,number_of_towers,total_unit_count,1_5BHK_count,1BHK_count,1RK_count,2_5BHK_count,2BHK_count,3_5BHK_count,...,5BHK,avg_price,airport_distance_kms,ksr_jn_distance_kms,yeshwantpur_jn_distance_kms,nearest_metro_dist_kms,nearest_metro_station,nearest_major_road_dist_kms,nearest_major_road,project_duration
0,PRM/KA/RERA/1251/446/PR/191022/005340,2.000000,1.0,126,0.0,0.0,0.0,0.0,61.0,0.0,...,0,8.50,30.743633,18.789358,24.737635,6.197776,Nallurahalli,4.717755,Outer Ring Road,3.819302
1,PRM/KA/RERA/1251/446/PR/150224/006619,2.250000,1.0,84,0.0,0.0,0.0,0.0,52.0,0.0,...,0,6.25,17.722085,16.062128,18.537574,4.735450,Singayyanapalya,1.975241,Old Madras Road,2.045175
2,PRM/KA/RERA/1251/446/PR/300924/007102,2.200000,1.0,60,0.0,0.0,0.0,0.0,28.0,0.0,...,0,8.02,16.838685,11.915246,11.929444,7.787598,Benniganahalli,2.774341,Nagawara Flyover,2.932238
3,PRM/KA/RERA/1251/310/PR/051224/007268,2.490000,1.0,126,0.0,2.0,0.0,0.0,0.0,0.0,...,0,9.00,37.695984,25.095496,31.528721,13.506856,Nallurahalli,11.878441,Outer Ring Road Underpass,3.496235
4,PRM/KA/RERA/1251/308/PR/210524/006935,1.560000,1.0,166,0.0,0.0,0.0,0.0,0.0,0.0,...,0,13.43,40.058996,18.659459,25.630953,12.408801,Yelachenahalli,2.580445,Hosur Road,4.996578
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
942,PRM/KA/RERA/1251/310/PR/180122/004645,2.175072,1.0,225,0.0,31.0,0.0,64.0,33.0,0.0,...,0,,30.122094,1.579987,7.486775,1.181454,Magadi Road,0.506145,"['BGS Flyover', 'Bala Gangadhara Swamy Flyover']",4.835044
943,PRM/KA/RERA/1251/309/PR/070222/004675,2.250000,1.0,104,0.0,0.0,0.0,0.0,18.0,0.0,...,0,,21.164953,13.711264,17.396020,1.161749,Singayyanapalya,0.123218,Old Madras Road,2.888433
944,PRM/KA/RERA/1251/446/PR/240122/004654,2.390000,1.0,384,0.0,2.0,48.0,0.0,0.0,0.0,...,0,,20.095579,21.528500,24.798591,3.257451,Whitefield (Kadugodi),5.437946,Old Madras Road,4.158795
945,PRM/KA/RERA/1251/446/PR/240122/004656,2.200200,1.0,312,0.0,0.0,0.0,0.0,14.0,0.0,...,0,,34.492445,17.614179,24.253059,10.198788,Kundalahalli,5.285258,Outer Ring Road,3.493498


In [None]:
df.columns

Index(['rera_ids', 'far', 'number_of_towers', 'total_unit_count',
       '1_5BHK_count', '1BHK_count', '1RK_count', '2_5BHK_count', '2BHK_count',
       '3_5BHK_count', '3BHK_count', '4_5BHK_count', '4BHK_count',
       '5BHK_count', '1_5BHK_mean_carpet_area', '1BHK_mean_carpet_area',
       '1RK_mean_carpet_area', '2_5BHK_mean_carpet_area',
       '2BHK_mean_carpet_area', '3_5BHK_mean_carpet_area',
       '3BHK_mean_carpet_area', '4_5BHK_mean_carpet_area',
       '4BHK_mean_carpet_area', '5BHK_mean_carpet_area', 'cost_of_land_inr',
       'district', 'latitude', 'longitude', 'project_completion_date',
       'project_name', 'project_start_date', 'project_type', 'taluk',
       'total_area_of_land_sqm', 'total_covered_area_sqm',
       'total_open_area_sqm', 'total_project_cost_inr', 'area', 'product_type',
       '1RK', '1BHK', '1_5BHK', '2BHK', '2_5BHK', '3BHK', '3_5BHK', '4BHK',
       '4_5BHK', '5BHK', 'avg_price', 'airport_distance_kms',
       'ksr_jn_distance_kms', 'yeshwantpur_

In [4]:
df["built_up_area"] = df["far"] * df["total_area_of_land_sqm"]

In [5]:
df= pd.get_dummies(df, columns=['product_type'], dtype=int)
df

Unnamed: 0,rera_ids,far,number_of_towers,total_unit_count,1_5BHK_count,1BHK_count,1RK_count,2_5BHK_count,2BHK_count,3_5BHK_count,...,nearest_metro_dist_kms,nearest_metro_station,nearest_major_road_dist_kms,nearest_major_road,project_duration,built_up_area,product_type_Apartment,product_type_Mixed,product_type_Row Houses,product_type_Villa
0,PRM/KA/RERA/1251/446/PR/191022/005340,2.000000,1.0,126,0.0,0.0,0.0,0.0,61.0,0.0,...,6.197776,Nallurahalli,4.717755,Outer Ring Road,3.819302,11934.000000,1,0,0,0
1,PRM/KA/RERA/1251/446/PR/150224/006619,2.250000,1.0,84,0.0,0.0,0.0,0.0,52.0,0.0,...,4.735450,Singayyanapalya,1.975241,Old Madras Road,2.045175,8534.250000,1,0,0,0
2,PRM/KA/RERA/1251/446/PR/300924/007102,2.200000,1.0,60,0.0,0.0,0.0,0.0,28.0,0.0,...,7.787598,Benniganahalli,2.774341,Nagawara Flyover,2.932238,5841.000000,1,0,0,0
3,PRM/KA/RERA/1251/310/PR/051224/007268,2.490000,1.0,126,0.0,2.0,0.0,0.0,0.0,0.0,...,13.506856,Nallurahalli,11.878441,Outer Ring Road Underpass,3.496235,21916.980000,1,0,0,0
4,PRM/KA/RERA/1251/308/PR/210524/006935,1.560000,1.0,166,0.0,0.0,0.0,0.0,0.0,0.0,...,12.408801,Yelachenahalli,2.580445,Hosur Road,4.996578,13894.920000,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
942,PRM/KA/RERA/1251/310/PR/180122/004645,2.175072,1.0,225,0.0,31.0,0.0,64.0,33.0,0.0,...,1.181454,Magadi Road,0.506145,"['BGS Flyover', 'Bala Gangadhara Swamy Flyover']",4.835044,22429.346923,1,0,0,0
943,PRM/KA/RERA/1251/309/PR/070222/004675,2.250000,1.0,104,0.0,0.0,0.0,0.0,18.0,0.0,...,1.161749,Singayyanapalya,0.123218,Old Madras Road,2.888433,14616.000000,1,0,0,0
944,PRM/KA/RERA/1251/446/PR/240122/004654,2.390000,1.0,384,0.0,2.0,48.0,0.0,0.0,0.0,...,3.257451,Whitefield (Kadugodi),5.437946,Old Madras Road,4.158795,24528.570000,1,0,0,0
945,PRM/KA/RERA/1251/446/PR/240122/004656,2.200200,1.0,312,0.0,0.0,0.0,0.0,14.0,0.0,...,10.198788,Kundalahalli,5.285258,Outer Ring Road,3.493498,24303.414109,1,0,0,0


In [6]:
# Create one-hot encoding for taluk
taluk_dummies = pd.get_dummies(df['taluk'], prefix='taluk', dtype=int)
base_features = [
        'total_area_of_land_sqm', 
        'far',
        #'built_up_area',
        'airport_distance_kms', 'ksr_jn_distance_kms', 'yeshwantpur_jn_distance_kms',
        'nearest_metro_dist_kms', 'nearest_major_road_dist_kms',
        'product_type_Apartment', 'product_type_Villa'
    ]
    
# Combine base features with taluk dummies
input_features = base_features + taluk_dummies.columns.tolist()
input_features

['total_area_of_land_sqm',
 'far',
 'airport_distance_kms',
 'ksr_jn_distance_kms',
 'yeshwantpur_jn_distance_kms',
 'nearest_metro_dist_kms',
 'nearest_major_road_dist_kms',
 'product_type_Apartment',
 'product_type_Villa',
 'taluk_Anekal',
 'taluk_Bengaluru East',
 'taluk_Bengaluru North',
 'taluk_Bengaluru South',
 'taluk_Devanahalli',
 'taluk_Hosakote',
 'taluk_Yelahanka']

In [7]:
bhk_presence_targets = ['1RK', '1BHK', '1_5BHK', '2BHK', '2_5BHK', '3BHK', '3_5BHK', '4BHK', '4_5BHK', '5BHK']
bhk_count_targets = ["1RK_count", "1BHK_count", "1_5BHK_count", "2BHK_count", 
                     "2_5BHK_count", "3BHK_count", "3_5BHK_count", "4BHK_count", 
                     "4_5BHK_count", "5BHK_count"]
bhk_mean_carpet_area_targets = ["1RK_mean_carpet_area", "1BHK_mean_carpet_area", "1_5BHK_mean_carpet_area", "2BHK_mean_carpet_area", 
                     "2_5BHK_mean_carpet_area", "3BHK_mean_carpet_area", "3_5BHK_mean_carpet_area", "4BHK_mean_carpet_area", 
                     "4_5BHK_mean_carpet_area", "5BHK_mean_carpet_area"]

In [8]:
df = pd.concat([df, taluk_dummies], axis=1)
df

Unnamed: 0,rera_ids,far,number_of_towers,total_unit_count,1_5BHK_count,1BHK_count,1RK_count,2_5BHK_count,2BHK_count,3_5BHK_count,...,product_type_Mixed,product_type_Row Houses,product_type_Villa,taluk_Anekal,taluk_Bengaluru East,taluk_Bengaluru North,taluk_Bengaluru South,taluk_Devanahalli,taluk_Hosakote,taluk_Yelahanka
0,PRM/KA/RERA/1251/446/PR/191022/005340,2.000000,1.0,126,0.0,0.0,0.0,0.0,61.0,0.0,...,0,0,0,0,1,0,0,0,0,0
1,PRM/KA/RERA/1251/446/PR/150224/006619,2.250000,1.0,84,0.0,0.0,0.0,0.0,52.0,0.0,...,0,0,0,0,1,0,0,0,0,0
2,PRM/KA/RERA/1251/446/PR/300924/007102,2.200000,1.0,60,0.0,0.0,0.0,0.0,28.0,0.0,...,0,0,0,0,1,0,0,0,0,0
3,PRM/KA/RERA/1251/310/PR/051224/007268,2.490000,1.0,126,0.0,2.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
4,PRM/KA/RERA/1251/308/PR/210524/006935,1.560000,1.0,166,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
942,PRM/KA/RERA/1251/310/PR/180122/004645,2.175072,1.0,225,0.0,31.0,0.0,64.0,33.0,0.0,...,0,0,0,0,0,0,1,0,0,0
943,PRM/KA/RERA/1251/309/PR/070222/004675,2.250000,1.0,104,0.0,0.0,0.0,0.0,18.0,0.0,...,0,0,0,0,0,1,0,0,0,0
944,PRM/KA/RERA/1251/446/PR/240122/004654,2.390000,1.0,384,0.0,2.0,48.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
945,PRM/KA/RERA/1251/446/PR/240122/004656,2.200200,1.0,312,0.0,0.0,0.0,0.0,14.0,0.0,...,0,0,0,0,1,0,0,0,0,0


In [9]:
# Handle missing values - combine base features with taluk dummies
X_base = df[base_features].fillna(0)
X = pd.concat([X_base, taluk_dummies], axis=1)

In [10]:
y_presence = df[bhk_presence_targets].fillna(0)

### MODEL 1 - PREDICT THE PRESENCE OF INVETORY

In [11]:
model_dir = "models"

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y_presence, test_size=0.2, random_state=42)
    
    # Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train model
xgb = XGBClassifier(objective='reg:squarederror', random_state=42)

# Hyperparameter space
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'max_depth': [3, 5, 7, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 1, 5],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 1.5, 2]
}

# Randomized Search
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=50,                   # Number of parameter settings sampled
    scoring='r2',                # You can use 'neg_mean_squared_error', 'r2', etc.
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)
random_search.fit(X_train_scaled, y_train)

model = random_search.best_estimator_

# Predictions
y_pred = model.predict(X_test_scaled)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"Overall Accuracy: {accuracy:.3f}")

# Per-BHK type accuracy
for i, col in enumerate(y_presence.columns):
    acc = accuracy_score(y_test.iloc[:, i], y_pred[:, i])
    print(f"{col} Accuracy: {acc:.3f}")

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Overall Accuracy: 0.405
1RK Accuracy: 0.984
1BHK Accuracy: 0.747
1_5BHK Accuracy: 0.995
2BHK Accuracy: 0.832
2_5BHK Accuracy: 0.884
3BHK Accuracy: 0.889
3_5BHK Accuracy: 0.963
4BHK Accuracy: 0.800
4_5BHK Accuracy: 0.995
5BHK Accuracy: 0.968


In [13]:
# Save model and scaler
with open(f'{model_dir}/bhk_presence_model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open(f'{model_dir}/bhk_presence_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

### MODEL 2 - PREDICT THE TOTAL NO. OF UNITS

In [14]:
X_enhanced = pd.concat([
    X.reset_index(drop=True),
    y_presence.reset_index(drop=True),
], axis=1)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_enhanced, df["total_unit_count"], test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

# Hyperparameter space
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'max_depth': [3, 5, 7, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 1, 5],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 1.5, 2]
}

# Randomized Search
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=50,                   # Number of parameter settings sampled
    scoring='r2',                # You can use 'neg_mean_squared_error', 'r2', etc.
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train_scaled, y_train)
# Train model
model = random_search.best_estimator_

# Best parameters and score
print("Best parameters:", random_search.best_params_)
print("Best CV R² score:", random_search.best_score_)

# Predictions
y_pred = model.predict(X_test_scaled)

# Evaluate
r2 = r2_score(y_test, y_pred)
print(f"R2 Score: {r2:.3f}")


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters: {'subsample': 0.6, 'reg_lambda': 2, 'reg_alpha': 0, 'n_estimators': 400, 'max_depth': 5, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.8}
Best CV R² score: 0.7222982883453369
R2 Score: 0.702


In [15]:
with open(f'{model_dir}/unit_count_model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open(f'{model_dir}/unit_count_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

### FEATURE ENGINEERING FOR BHK-LEVEL PREDICTIONS

In [16]:
bhk_mapping = {
    '1RK': 0.5,
    '1BHK': 1,
    '1_5BHK': 1.5,
    '2BHK': 2,
    '2_5BHK': 2.5,
    '3BHK': 3,
    '3_5BHK': 3.5,
    '4BHK': 4,
    '4_5BHK': 4.5,
    '5BHK': 5
}

bhk_count_cols = [f"{bhk}_count" for bhk in bhk_mapping.keys()]
bhk_carpet_cols = [f"{bhk}_mean_carpet_area" for bhk in bhk_mapping.keys()]
bhk_presence_cols = [f"{bhk}" for bhk in bhk_mapping.keys()]

In [17]:
count_melted = df.melt(
    id_vars=['rera_ids'], 
    value_vars=bhk_count_cols, 
    var_name='bhk_label', 
    value_name='bhk_count'
)
count_melted['bhk_str'] = count_melted['bhk_label'].str.replace('_count', '', regex=False)
count_melted['bhk_type'] = count_melted['bhk_str'].map(bhk_mapping)

carpet_melted = df.melt(
    id_vars=['rera_ids'], 
    value_vars=bhk_carpet_cols, 
    var_name='bhk_label', 
    value_name='mean_carpet_area'
)
carpet_melted['bhk_str'] = carpet_melted['bhk_label'].str.replace('_mean_carpet_area', '', regex=False)
carpet_melted['bhk_type'] = carpet_melted['bhk_str'].map(bhk_mapping)
carpet_melted

Unnamed: 0,rera_ids,bhk_label,mean_carpet_area,bhk_str,bhk_type
0,PRM/KA/RERA/1251/446/PR/191022/005340,1RK_mean_carpet_area,0.0,1RK,0.5
1,PRM/KA/RERA/1251/446/PR/150224/006619,1RK_mean_carpet_area,0.0,1RK,0.5
2,PRM/KA/RERA/1251/446/PR/300924/007102,1RK_mean_carpet_area,0.0,1RK,0.5
3,PRM/KA/RERA/1251/310/PR/051224/007268,1RK_mean_carpet_area,0.0,1RK,0.5
4,PRM/KA/RERA/1251/308/PR/210524/006935,1RK_mean_carpet_area,0.0,1RK,0.5
...,...,...,...,...,...
9465,PRM/KA/RERA/1251/310/PR/180122/004645,5BHK_mean_carpet_area,0.0,5BHK,5.0
9466,PRM/KA/RERA/1251/309/PR/070222/004675,5BHK_mean_carpet_area,0.0,5BHK,5.0
9467,PRM/KA/RERA/1251/446/PR/240122/004654,5BHK_mean_carpet_area,0.0,5BHK,5.0
9468,PRM/KA/RERA/1251/446/PR/240122/004656,5BHK_mean_carpet_area,0.0,5BHK,5.0


In [18]:
df_bhk = pd.merge(
    count_melted[['rera_ids', 'bhk_type', 'bhk_count']],
    carpet_melted[['rera_ids', 'bhk_type', 'mean_carpet_area']],
    on=['rera_ids', 'bhk_type'],
    how='inner'
)
df_bhk = df_bhk[df_bhk['bhk_count'].fillna(0) > 0].copy()
df_bhk.reset_index(drop=True, inplace=True)

df_bhk = df_bhk.merge(df.drop(columns=bhk_presence_cols+bhk_count_cols+bhk_carpet_cols), how='left', on='rera_ids')
df_bhk["total_carpet_area"] = df_bhk["bhk_count"] * df_bhk["mean_carpet_area"]
df_bhk

Unnamed: 0,rera_ids,bhk_type,bhk_count,mean_carpet_area,far,number_of_towers,total_unit_count,cost_of_land_inr,district,latitude,...,product_type_Row Houses,product_type_Villa,taluk_Anekal,taluk_Bengaluru East,taluk_Bengaluru North,taluk_Bengaluru South,taluk_Devanahalli,taluk_Hosakote,taluk_Yelahanka,total_carpet_area
0,PRM/KA/RERA/1250/303/PR/080525/007730,0.5,60.0,26.990000,2.499,16.0,1077,972000000,Bengaluru Rural,13.206809,...,0,0,0,0,0,0,1,0,0,1619.400000
1,PRM/KA/RERA/1251/446/PR/300924/007105,0.5,95.0,24.596526,2.247,8.0,698,1098268515,Bengaluru Urban,12.889460,...,0,0,0,1,0,0,0,0,0,2336.670000
2,PRM/KA/RERA/1251/309/PR/060324/006692,0.5,72.0,16.160000,3.240,2.0,358,377577120,Bengaluru Urban,13.147780,...,0,0,0,0,1,0,0,0,0,1163.520000
3,PRM/KA/RERA/1251/472/PR/080525/007728,0.5,44.0,24.800000,3.000,4.0,456,3543753837,Bengaluru Urban,13.122500,...,0,0,0,0,0,0,0,0,1,1091.200000
4,PRM/KA/RERA/1250/303/PR/110522/004869,0.5,176.0,19.731136,2.361,9.0,1001,273287989,Bengaluru Rural,13.210587,...,0,0,0,0,0,0,1,0,0,3472.680000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2165,PRM/KA/RERA/1251/446/PR/271123/006439,5.0,2.0,472.270000,2.248,1.0,44,127755245,Bengaluru Urban,13.068867,...,0,0,0,1,0,0,0,0,0,944.540000
2166,PRM/KA/RERA/1251/446/PR/281222/005565,5.0,2.0,500.000000,2.790,1.0,130,438031801,Bengaluru Urban,12.988918,...,0,0,0,1,0,0,0,0,0,1000.000000
2167,PRM/KA/RERA/1251/446/PR/070622/004966,5.0,1.0,180.220000,1.810,1.0,14,177586500,Bengaluru Urban,13.002872,...,0,0,0,1,0,0,0,0,0,180.220000
2168,PRM/KA/RERA/1251/446/PR/210422/004844,5.0,1.0,281.370000,2.750,1.0,35,874029534,Bengaluru Urban,12.984779,...,0,0,0,1,0,0,0,0,0,281.370000


In [19]:
input_features

['total_area_of_land_sqm',
 'far',
 'airport_distance_kms',
 'ksr_jn_distance_kms',
 'yeshwantpur_jn_distance_kms',
 'nearest_metro_dist_kms',
 'nearest_major_road_dist_kms',
 'product_type_Apartment',
 'product_type_Villa',
 'taluk_Anekal',
 'taluk_Bengaluru East',
 'taluk_Bengaluru North',
 'taluk_Bengaluru South',
 'taluk_Devanahalli',
 'taluk_Hosakote',
 'taluk_Yelahanka']

### MODEL 3: MEAN CARPET AREA PREDICTIONDS

In [20]:
X_bhk = df_bhk[input_features+["bhk_type",
                               #"total_unit_count"
                               ]]
y_bhk = df_bhk[["mean_carpet_area"]]
#print(X_enhanced)
# Split data
X_train, X_test, y_train, y_test = train_test_split(X_bhk, y_bhk, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Your base model
xgb = XGBRegressor(objective='reg:squarederror',
                            random_state=42)

param_dist = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
}

# Random Search
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=30,
    scoring='r2',         # For multi-output, this is averaged R²
    cv=5,
    verbose=1,
    n_jobs=-1
)

# Fit to data (X: features, Y: multiple targets as a DataFrame or 2D array)
random_search.fit(X_train_scaled, y_train)
print(random_search.best_score_)

model = random_search.best_estimator_
# Predictions
y_pred = model.predict(X_test_scaled)

# Evaluate
r2 = r2_score(y_test, y_pred)
print(f"R2_score {r2:.2f}")


Fitting 5 folds for each of 30 candidates, totalling 150 fits
0.7567290425300598
R2_score 0.67


In [21]:
with open(f'{model_dir}/carpet_area_model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open(f'{model_dir}/carpet_area_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

### MODEL 4: BHK UNIT COUNT PREDICTION

In [22]:
X_bhk = df_bhk[input_features+["bhk_type",
                               "total_unit_count",
                               "mean_carpet_area", 
                               ]]
y_bhk = df_bhk[["bhk_count"]]
#print(X_enhanced)
# Split data
X_train, X_test, y_train, y_test = train_test_split(X_bhk, y_bhk, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Your base model
xgb = XGBRegressor(objective='reg:squarederror',
                            random_state=42)

param_dist = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
}

# Random Search
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=30,
    scoring='r2',         # For multi-output, this is averaged R²
    cv=5,
    verbose=1,
    n_jobs=-1
)

# Fit to data (X: features, Y: multiple targets as a DataFrame or 2D array)
random_search.fit(X_train_scaled, y_train)

model = random_search.best_estimator_
print(random_search.best_score_)
# Predictions
y_pred = model.predict(X_test_scaled)

# Evaluate
r2 = r2_score(y_test, y_pred)
print(f"R2_score {r2:.2f}")


Fitting 5 folds for each of 30 candidates, totalling 150 fits
0.43599399328231814
R2_score 0.41


In [23]:
with open(f'{model_dir}/bhk_unit_count_model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open(f'{model_dir}/bhk_unit_count_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

### MODEL 5: AVG PRICE PREDICTION

In [24]:
X_bhk = df_bhk.dropna(subset=["avg_price"])[input_features+["bhk_type",
                               "mean_carpet_area"
                               ]]
y_bhk = df_bhk.dropna(subset=["avg_price"])[["avg_price"]]
#print(X_enhanced)
# Split data
X_train, X_test, y_train, y_test = train_test_split(X_bhk, y_bhk, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Your base model
xgb = XGBRegressor(objective='reg:squarederror',
                            random_state=42)

param_dist = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
}

# Random Search
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=30,
    scoring='r2',         # For multi-output, this is averaged R²
    cv=5,
    verbose=1,
    n_jobs=-1
)

# Fit to data (X: features, Y: multiple targets as a DataFrame or 2D array)
random_search.fit(X_train_scaled, y_train)

model = random_search.best_estimator_
print(random_search.best_score_)
# Predictions
y_pred = model.predict(X_test_scaled)

# Evaluate
r2 = r2_score(y_test, y_pred)
print(f"R2_score {r2:.2f}")


Fitting 5 folds for each of 30 candidates, totalling 150 fits
0.8603708148002625
R2_score 0.95


In [25]:
# Predicting missing avg_price_values
avg_price_input_features = input_features+["bhk_type",
                               "mean_carpet_area"
                               ]
missing_avg_prices = df_bhk[df_bhk["avg_price"].isna()][["rera_ids"]+avg_price_input_features]
missing_avg_prices["avg_price"] = model.predict(scaler.transform(missing_avg_prices[avg_price_input_features]))
missing_avg_prices

Unnamed: 0,rera_ids,total_area_of_land_sqm,far,airport_distance_kms,ksr_jn_distance_kms,yeshwantpur_jn_distance_kms,nearest_metro_dist_kms,nearest_major_road_dist_kms,product_type_Apartment,product_type_Villa,taluk_Anekal,taluk_Bengaluru East,taluk_Bengaluru North,taluk_Bengaluru South,taluk_Devanahalli,taluk_Hosakote,taluk_Yelahanka,bhk_type,mean_carpet_area,avg_price
2,PRM/KA/RERA/1251/309/PR/060324/006692,23598,3.2400,6.195783,22.402050,20.663908,16.363385,7.689442,1,0,0,0,1,0,0,0,0,0.5,16.160000,9.864559
9,PRM/KA/RERA/1251/309/PR/171123/006418,16630,2.9900,14.849597,13.768985,12.994268,9.597176,4.721571,1,0,0,0,1,0,0,0,0,0.5,30.560000,12.756759
10,PRM/KA/RERA/1251/310/PR/030325/007540,16139,1.6200,37.374600,9.182947,12.053858,0.771749,0.575341,0,0,0,0,0,1,0,0,0,0.5,23.780504,13.077938
11,PRM/KA/RERA/1251/310/PR/030225/007479,3507,1.5000,40.049109,13.861568,20.421902,5.126532,0.221533,1,0,0,0,0,1,0,0,0,0.5,24.041667,12.957695
12,PRM/KA/RERA/1251/310/PR/020824/006955,38546,0.8666,37.078529,13.693270,20.654913,7.560163,1.595020,1,0,0,0,0,1,0,0,0,0.5,25.730000,10.350155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2165,PRM/KA/RERA/1251/446/PR/271123/006439,11837,2.2480,14.618757,20.604765,22.287806,8.452677,5.696274,1,0,0,1,0,0,0,0,0,5.0,472.270000,11.108891
2166,PRM/KA/RERA/1251/446/PR/281222/005565,14530,2.7900,23.170633,16.342880,20.696421,0.986724,2.968998,1,0,0,1,0,0,0,0,0,5.0,500.000000,13.002255
2167,PRM/KA/RERA/1251/446/PR/070622/004966,1312,1.8100,24.571421,4.163935,7.495820,2.424686,1.782410,1,0,0,1,0,0,0,0,0,5.0,180.220000,13.966949
2168,PRM/KA/RERA/1251/446/PR/210422/004844,2748,2.7500,25.572541,4.930825,10.040568,1.319129,4.292345,1,0,0,1,0,0,0,0,0,5.0,281.370000,15.746445


In [26]:
#Apply the predicted avg_value to df by averaging the avg_price for each rera_ids and merging it with df
missing_avg_prices["rera_avg_price"] = missing_avg_prices.groupby("rera_ids")["avg_price"].transform("mean")
missing_avg_prices

Unnamed: 0,rera_ids,total_area_of_land_sqm,far,airport_distance_kms,ksr_jn_distance_kms,yeshwantpur_jn_distance_kms,nearest_metro_dist_kms,nearest_major_road_dist_kms,product_type_Apartment,product_type_Villa,...,taluk_Bengaluru East,taluk_Bengaluru North,taluk_Bengaluru South,taluk_Devanahalli,taluk_Hosakote,taluk_Yelahanka,bhk_type,mean_carpet_area,avg_price,rera_avg_price
2,PRM/KA/RERA/1251/309/PR/060324/006692,23598,3.2400,6.195783,22.402050,20.663908,16.363385,7.689442,1,0,...,0,1,0,0,0,0,0.5,16.160000,9.864559,10.217443
9,PRM/KA/RERA/1251/309/PR/171123/006418,16630,2.9900,14.849597,13.768985,12.994268,9.597176,4.721571,1,0,...,0,1,0,0,0,0,0.5,30.560000,12.756759,10.411276
10,PRM/KA/RERA/1251/310/PR/030325/007540,16139,1.6200,37.374600,9.182947,12.053858,0.771749,0.575341,0,0,...,0,0,1,0,0,0,0.5,23.780504,13.077938,12.926766
11,PRM/KA/RERA/1251/310/PR/030225/007479,3507,1.5000,40.049109,13.861568,20.421902,5.126532,0.221533,1,0,...,0,0,1,0,0,0,0.5,24.041667,12.957695,12.957695
12,PRM/KA/RERA/1251/310/PR/020824/006955,38546,0.8666,37.078529,13.693270,20.654913,7.560163,1.595020,1,0,...,0,0,1,0,0,0,0.5,25.730000,10.350155,10.565258
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2165,PRM/KA/RERA/1251/446/PR/271123/006439,11837,2.2480,14.618757,20.604765,22.287806,8.452677,5.696274,1,0,...,1,0,0,0,0,0,5.0,472.270000,11.108891,10.913790
2166,PRM/KA/RERA/1251/446/PR/281222/005565,14530,2.7900,23.170633,16.342880,20.696421,0.986724,2.968998,1,0,...,1,0,0,0,0,0,5.0,500.000000,13.002255,12.645586
2167,PRM/KA/RERA/1251/446/PR/070622/004966,1312,1.8100,24.571421,4.163935,7.495820,2.424686,1.782410,1,0,...,1,0,0,0,0,0,5.0,180.220000,13.966949,13.636357
2168,PRM/KA/RERA/1251/446/PR/210422/004844,2748,2.7500,25.572541,4.930825,10.040568,1.319129,4.292345,1,0,...,1,0,0,0,0,0,5.0,281.370000,15.746445,15.591856


In [27]:
cleaned_df = pd.read_csv("processing_outputs/cleaned_data.csv")
print(cleaned_df.shape)
cleaned_df = cleaned_df.merge(missing_avg_prices[["rera_ids", "rera_avg_price"]].drop_duplicates(subset=["rera_ids"]), on="rera_ids", how="left")
#combine columns rera_avg_price and avg_price such that if avg_price is null then rera_avg_price is used
cleaned_df["avg_price"] = cleaned_df["avg_price"].combine_first(cleaned_df["rera_avg_price"])
cleaned_df.drop(columns=["rera_avg_price"], inplace=True)
cleaned_df.dropna(subset=["avg_price"], inplace=True)
cleaned_df.to_csv("processing_outputs/price_predicted_data.csv", index=False)

(947, 58)


In [28]:
with open(f'{model_dir}/avg_price_model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open(f'{model_dir}/avg_price_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

### MODEL 6 - TOTAL PROJECT COST PREDICTION

In [29]:

X_enhanced = pd.concat([
    df[input_features].reset_index(drop=True),
    df[bhk_presence_targets].reset_index(drop=True),
    df[bhk_count_targets].reset_index(drop=True),
    df[bhk_mean_carpet_area_targets].reset_index(drop=True),   
    df["total_unit_count"].reset_index(drop=True)
], axis=1)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_enhanced, df[["total_project_cost_inr"]], test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

param_dist = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
}

# Random Search
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=30,
    scoring='r2',         # For multi-output, this is averaged R²
    cv=5,
    verbose=1,
    n_jobs=-1
)

random_search.fit(X_train_scaled, y_train)
print(random_search.best_score_)

model = random_search.best_estimator_
# Predictions
y_pred = model.predict(X_test_scaled)

# Evaluate
r2 = r2_score(y_test, y_pred)
print(f"Overall R2: {r2:.2f}")

Fitting 5 folds for each of 30 candidates, totalling 150 fits
0.7794438242912293
Overall R2: 0.83


In [30]:
with open(f'{model_dir}/total_project_cost_model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open(f'{model_dir}/total_project_cost_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

### MODEL 7 - OPEN AREA PREDICTION

In [31]:

X_enhanced = pd.concat([
    df[input_features].reset_index(drop=True),
    df[bhk_presence_targets].reset_index(drop=True),
    df[bhk_count_targets].reset_index(drop=True),
    df[bhk_mean_carpet_area_targets].reset_index(drop=True),   
    df["total_unit_count"].reset_index(drop=True)
], axis=1)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_enhanced, df[["total_open_area_sqm"]], test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

param_dist = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
}

# Random Search
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=30,
    scoring='r2',         # For multi-output, this is averaged R²
    cv=5,
    verbose=1,
    n_jobs=-1
)

random_search.fit(X_train_scaled, y_train)
print(random_search.best_score_)

model = random_search.best_estimator_
# Predictions
y_pred = model.predict(X_test_scaled)

# Evaluate
r2 = r2_score(y_test, y_pred)
print(f"Overall R2: {r2:.2f}")

Fitting 5 folds for each of 30 candidates, totalling 150 fits
0.8430047988891601
Overall R2: 0.97


In [32]:
with open(f'{model_dir}/open_area_model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open(f'{model_dir}/open_area_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [33]:
with open(f'{model_dir}/feature_names.pkl', 'wb') as f:
    pickle.dump(input_features, f)

print("✓ Feature names saved!")


✓ Feature names saved!


In [34]:
len(input_features), input_features

(16,
 ['total_area_of_land_sqm',
  'far',
  'airport_distance_kms',
  'ksr_jn_distance_kms',
  'yeshwantpur_jn_distance_kms',
  'nearest_metro_dist_kms',
  'nearest_major_road_dist_kms',
  'product_type_Apartment',
  'product_type_Villa',
  'taluk_Anekal',
  'taluk_Bengaluru East',
  'taluk_Bengaluru North',
  'taluk_Bengaluru South',
  'taluk_Devanahalli',
  'taluk_Hosakote',
  'taluk_Yelahanka'])