### Real Estate Multi-Target Prediction Pipeline
#### Sequential approach: BHK Type Presence -> Unit Counts -> Carpet Areas -> Project Metrics

In [106]:
# Data handling
import pandas as pd
import numpy as np
# scikit-learn: estimators, multi-output wrappers, splitting, metrics, scaling, search
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, mean_absolute_error, classification_report, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
# Gradient-boosted models
from xgboost import XGBClassifier, XGBRegressor
# Standard library
import warnings
import pickle
# NOTE(review): this silences every warning category for the rest of the session,
# including library deprecation / data-conversion warnings.
warnings.filterwarnings('ignore')

In [107]:
# Load the cleaned project-level table; the path is relative to the notebook's working directory.
df = pd.read_csv("processing_outputs/cleaned_data.csv")
# Display the frame as the cell output.
df

Unnamed: 0,rera_ids,far,number_of_towers,total_unit_count,1_5BHK_count,1BHK_count,1RK_count,2_5BHK_count,2BHK_count,3_5BHK_count,...,1RK_unit_portion,1BHK_unit_portion,1_5BHK_unit_portion,2BHK_unit_portion,2_5BHK_unit_portion,3BHK_unit_portion,3_5BHK_unit_portion,4BHK_unit_portion,4_5BHK_unit_portion,5BHK_unit_portion
0,PRM/KA/RERA/1251/446/PR/191022/005340,2.00000,1.0,126,0.0,0.0,0.0,0.0,61.0,0.0,...,0.0,0.000000,0.0,0.484127,0.0,0.515873,0.0,0.000000,0.0,0.0
1,PRM/KA/RERA/1251/446/PR/150224/006619,2.25000,1.0,84,0.0,0.0,0.0,0.0,52.0,0.0,...,0.0,0.000000,0.0,0.619048,0.0,0.380952,0.0,0.000000,0.0,0.0
2,PRM/KA/RERA/1251/446/PR/300924/007102,2.20000,1.0,60,0.0,0.0,0.0,0.0,28.0,0.0,...,0.0,0.000000,0.0,0.466667,0.0,0.533333,0.0,0.000000,0.0,0.0
3,PRM/KA/RERA/1251/310/PR/051224/007268,2.49000,1.0,126,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.015873,0.0,0.000000,0.0,0.936508,0.0,0.031746,0.0,0.0
4,PRM/KA/RERA/1251/308/PR/210524/006935,1.56000,1.0,166,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.0,0.493976,0.0,0.506024,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272,PRM/KA/RERA/1250/303/PR/300125/007458,2.08240,6.0,463,0.0,0.0,0.0,0.0,72.0,0.0,...,0.0,0.000000,0.0,0.155508,0.0,0.766739,0.0,0.077754,0.0,0.0
273,PRM/KA/RERA/1251/308/PR/091224/007276,1.48000,1.0,33,0.0,0.0,0.0,0.0,33.0,0.0,...,0.0,0.000000,0.0,1.000000,0.0,0.000000,0.0,0.000000,0.0,0.0
274,PRM/KA/RERA/1251/309/PR/180823/006187,2.55325,2.0,264,0.0,28.0,0.0,0.0,118.0,0.0,...,0.0,0.106061,0.0,0.446970,0.0,0.446970,0.0,0.000000,0.0,0.0
275,PRM/KA/RERA/1251/308/PR/230522/004901,2.74900,6.0,1104,0.0,0.0,0.0,0.0,24.0,0.0,...,0.0,0.000000,0.0,0.021739,0.0,0.978261,0.0,0.000000,0.0,0.0


In [108]:
# One-hot encode `product_type` into integer indicator columns
# (e.g. product_type_Apartment, product_type_Villa).
# get_dummies drops the source column and this cell overwrites `df`, so guard on the
# column's presence: without the guard, re-running the cell raises a KeyError.
if 'product_type' in df.columns:
    df = pd.get_dummies(df, columns=['product_type'], dtype=int)
df

Unnamed: 0,rera_ids,far,number_of_towers,total_unit_count,1_5BHK_count,1BHK_count,1RK_count,2_5BHK_count,2BHK_count,3_5BHK_count,...,1_5BHK_unit_portion,2BHK_unit_portion,2_5BHK_unit_portion,3BHK_unit_portion,3_5BHK_unit_portion,4BHK_unit_portion,4_5BHK_unit_portion,5BHK_unit_portion,product_type_Apartment,product_type_Villa
0,PRM/KA/RERA/1251/446/PR/191022/005340,2.00000,1.0,126,0.0,0.0,0.0,0.0,61.0,0.0,...,0.0,0.484127,0.0,0.515873,0.0,0.000000,0.0,0.0,1,0
1,PRM/KA/RERA/1251/446/PR/150224/006619,2.25000,1.0,84,0.0,0.0,0.0,0.0,52.0,0.0,...,0.0,0.619048,0.0,0.380952,0.0,0.000000,0.0,0.0,1,0
2,PRM/KA/RERA/1251/446/PR/300924/007102,2.20000,1.0,60,0.0,0.0,0.0,0.0,28.0,0.0,...,0.0,0.466667,0.0,0.533333,0.0,0.000000,0.0,0.0,1,0
3,PRM/KA/RERA/1251/310/PR/051224/007268,2.49000,1.0,126,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.936508,0.0,0.031746,0.0,0.0,1,0
4,PRM/KA/RERA/1251/308/PR/210524/006935,1.56000,1.0,166,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.493976,0.0,0.506024,0.0,0.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272,PRM/KA/RERA/1250/303/PR/300125/007458,2.08240,6.0,463,0.0,0.0,0.0,0.0,72.0,0.0,...,0.0,0.155508,0.0,0.766739,0.0,0.077754,0.0,0.0,1,0
273,PRM/KA/RERA/1251/308/PR/091224/007276,1.48000,1.0,33,0.0,0.0,0.0,0.0,33.0,0.0,...,0.0,1.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,1,0
274,PRM/KA/RERA/1251/309/PR/180823/006187,2.55325,2.0,264,0.0,28.0,0.0,0.0,118.0,0.0,...,0.0,0.446970,0.0,0.446970,0.0,0.000000,0.0,0.0,1,0
275,PRM/KA/RERA/1251/308/PR/230522/004901,2.74900,6.0,1104,0.0,0.0,0.0,0.0,24.0,0.0,...,0.0,0.021739,0.0,0.978261,0.0,0.000000,0.0,0.0,1,0


In [None]:
# Numeric / location inputs read straight from `df`, plus the product-type indicators.
base_features = [
    'total_area_of_land_sqm',
    'far',
    'airport_distance_kms',
    'ksr_jn_distance_kms',
    'yeshwantpur_jn_distance_kms',
    'nearest_metro_dist_kms',
    'nearest_major_road_dist_kms',
    'product_type_Apartment',
    'product_type_Villa',
]

# Integer indicator columns, one per taluk value present in `df`.
taluk_dummies = pd.get_dummies(df['taluk'], prefix='taluk', dtype=int)

# Full input schema: base features first, then the taluk indicators.
input_features = [*base_features, *taluk_dummies.columns]
input_features

['total_area_of_land_sqm',
 'far',
 'airport_distance_kms',
 'ksr_jn_distance_kms',
 'yeshwantpur_jn_distance_kms',
 'nearest_metro_dist_kms',
 'nearest_major_road_dist_kms',
 'product_type_Apartment',
 'product_type_Villa',
 'taluk_Anekal',
 'taluk_Bengaluru East',
 'taluk_Bengaluru North',
 'taluk_Bengaluru South',
 'taluk_Devanahalli',
 'taluk_Hosakote',
 'taluk_Yelahanka']

In [110]:
# Names of the per-BHK-type presence columns in `df` (the same list is defined again in the next cell).
bhk_presence_targets = [
    '1RK',
    '1BHK', '1_5BHK',
    '2BHK', '2_5BHK',
    '3BHK', '3_5BHK',
    '4BHK', '4_5BHK',
    '5BHK',
]

In [152]:
# BHK presence targets (binary); this list also fixes the order of every per-BHK family below.
bhk_presence_targets = [
    '1RK',
    '1BHK', '1_5BHK',
    '2BHK', '2_5BHK',
    '3BHK', '3_5BHK',
    '4BHK', '4_5BHK',
    '5BHK',
]

# Each per-BHK column family is named '<bhk type>_<suffix>'.
bhk_count_targets = [f'{bhk}_count' for bhk in bhk_presence_targets]
mean_carpet_area_targets = [f'{bhk}_mean_carpet_area' for bhk in bhk_presence_targets]
total_carpet_area_targets = [f'{bhk}_total_carpet_area' for bhk in bhk_presence_targets]
total_carpet_area_portion_targets = [
    f'{bhk}_total_carpet_area_portion' for bhk in bhk_presence_targets
]
unit_portion_targets = [f'{bhk}_unit_portion' for bhk in bhk_presence_targets]

# Project-level targets
project_targets = ['total_project_cost_inr', 'avg_price']
open_area_target = 'total_open_area_sqm'
total_units_target = "total_unit_count"

# Single-column names used with the per-BHK table (`df_bhk`)
carpet_area_target = "mean_carpet_area"
bhk_count_target = "bhk_count"
bhk_portion_target = "bhk_unit_portion"
total_carpet_area_target = "total_carpet_area"

In [112]:
# Build the input matrix for the project-level models.
# Missing values in the base feature columns are replaced with 0, then the taluk
# indicator columns are appended column-wise (axis=1).
# NOTE(review): the 0-fill also applies to the distance features — confirm that is intended.
X_base = df[base_features].fillna(0)
X = pd.concat([X_base, taluk_dummies], axis=1)

In [113]:
# Per-BHK-type targets taken from `df`, with missing entries treated as 0:
#   y_presence - presence columns (targets of MODEL 1)
#   y_counts   - unit-count columns
y_presence = df[bhk_presence_targets].fillna(0)
y_counts = df[bhk_count_targets].fillna(0)

In [114]:
# Open-area column with missing values replaced by the column median.
# Not referenced again in the visible part of this notebook.
y_open_area = df[open_area_target].copy().fillna(df[open_area_target].median())

In [115]:
# Per-BHK mean carpet areas, copied as-is (missing values are left in place).
y_mean_carpet_areas = df[mean_carpet_area_targets].copy()
# Project-level targets; each column's missing values are replaced by that column's median.
y_project = df[project_targets].fillna(df[project_targets].median())

In [116]:
# Per-BHK total carpet areas, copied without imputation.
# Not referenced again in the visible part of this notebook.
y_total_carpet_areas = df[total_carpet_area_targets].copy()

In [117]:
# Per-BHK total-carpet-area portion columns, copied without imputation.
# Not referenced again in the visible part of this notebook.
y_total_carpet_areas_portion = df[total_carpet_area_portion_targets].copy()

In [118]:
# Total unit count per row of `df` (a single column); this is the target of MODEL 2.
y_unit_count = df[total_units_target].copy()

In [119]:
# Per-BHK unit-portion columns, copied without imputation.
# Not referenced again in the visible part of this notebook.
y_unit_portion = df[unit_portion_targets].copy()

### MODEL 1 - PREDICT THE PRESENCE OF INVENTORY

In [120]:
import os

# Directory that receives every pickled model / scaler written by the save cells.
model_dir = "models"
# Create it up front so the `open(..., 'wb')` calls in the save cells do not fail
# with FileNotFoundError on a fresh checkout.
os.makedirs(model_dir, exist_ok=True)

In [121]:
# Hold out 20% of the rows for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y_presence, test_size=0.2, random_state=42
)

# Standardise the inputs using statistics learned on the training rows only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# One random forest per presence column, wrapped as a single multi-output classifier.
model = MultiOutputClassifier(
    RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
).fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

# With a 2-D target, accuracy_score is exact-match accuracy: a row counts only when
# every presence flag is predicted correctly, hence it is far below the per-column values.
accuracy = accuracy_score(y_test, y_pred)
print(f"Overall Accuracy: {accuracy:.3f}")

# Accuracy of each presence column taken on its own.
for col, col_pred in zip(y_presence.columns, y_pred.T):
    acc = accuracy_score(y_test[col], col_pred)
    print(f"{col} Accuracy: {acc:.3f}")

Overall Accuracy: 0.357
1RK Accuracy: 0.929
1BHK Accuracy: 0.786
1_5BHK Accuracy: 0.982
2BHK Accuracy: 0.839
2_5BHK Accuracy: 0.750
3BHK Accuracy: 0.964
3_5BHK Accuracy: 0.911
4BHK Accuracy: 0.732
4_5BHK Accuracy: 1.000
5BHK Accuracy: 0.929


In [122]:
# Persist the presence classifier together with the scaler fitted alongside it.
artifacts = (
    (f'{model_dir}/bhk_presence_model.pkl', model),
    (f'{model_dir}/bhk_presence_scaler.pkl', scaler),
)
for path, artifact in artifacts:
    with open(path, 'wb') as f:
        pickle.dump(artifact, f)

### MODEL 2 - PREDICT THE TOTAL No. of Units

In [123]:
# Base inputs augmented with the observed BHK presence flags, joined column-wise.
# (Without axis=1, pd.concat stacks the two frames as rows and pads the
# non-matching columns with NaN.)
# NOTE(review): this frame is not consumed below — the unit-count model is trained on
# `X` only. Switch the split to `X_enhanced` if presence flags are meant to be inputs.
X_enhanced = pd.concat([
    X.reset_index(drop=True),
    y_presence.reset_index(drop=True),
], axis=1)

# Split data: 80/20 hold-out with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y_unit_count, test_size=0.2, random_state=42)

# Scale features using statistics from the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

# Hyperparameter space sampled by the randomized search
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'max_depth': [3, 5, 7, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 1, 5],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 1.5, 2]
}

# Randomized search, seeded so the sampled candidates are reproducible
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=50,                   # Number of parameter settings sampled
    scoring='r2',                # You can use 'neg_mean_squared_error', 'r2', etc.
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train_scaled, y_train)

# Best candidate, refitted on the full training split by the search
model = random_search.best_estimator_

# Best parameters and cross-validated score
print("Best parameters:", random_search.best_params_)
print("Best CV R² score:", random_search.best_score_)

# Predictions on the held-out rows
y_pred = model.predict(X_test_scaled)

# Evaluate
r2 = r2_score(y_test, y_pred)
print(f"R2 Score: {r2:.3f}")


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters: {'subsample': 0.6, 'reg_lambda': 1, 'reg_alpha': 1, 'n_estimators': 100, 'max_depth': 7, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 1.0}
Best CV R² score: 0.7462587237358094
R2 Score: 0.666


In [171]:
# Persist the unit-count regressor and its scaler.
# Every training cell in this notebook rebinds the same `model` / `scaler` globals, so
# this cell pickles whatever was trained most recently. Check that the objects look like
# MODEL 2 (an XGBRegressor whose scaler was fitted on exactly the `input_features`
# columns) before writing, so an out-of-order run cannot save the wrong model.
expected_n_features = len(input_features)
if not isinstance(model, XGBRegressor) or scaler.n_features_in_ != expected_n_features:
    raise RuntimeError(
        f"Refusing to save unit-count artifacts: expected an XGBRegressor with a scaler "
        f"fitted on {expected_n_features} features, got {type(model).__name__} with "
        f"{scaler.n_features_in_} features. Re-run the MODEL 2 training cell first."
    )

with open(f'{model_dir}/unit_count_model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open(f'{model_dir}/unit_count_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

### MODEL 3 - UNIT COUNT AND CARPET AREA PREDICTOR

In [153]:
# Load the per-BHK table (it carries `rera_ids` and `bhk_type` columns, see the output below);
# the path is relative to the notebook's working directory.
df_bhk = pd.read_csv('processing_outputs/bhk_data.csv')
# Display the frame as the cell output.
df_bhk

Unnamed: 0,rera_ids,bhk_type,bhk_count,mean_carpet_area,far,number_of_towers,total_unit_count,cost_of_land_inr,district,latitude,...,avg_price,airport_distance_kms,ksr_jn_distance_kms,yeshwantpur_jn_distance_kms,nearest_metro_dist_kms,nearest_metro_station,nearest_major_road_dist_kms,nearest_major_road,bhk_unit_portion,total_carpet_area
0,PRM/KA/RERA/1250/303/PR/080525/007730,0.5,60.0,26.990000,2.499,16.0,1077,972000000,Bengaluru Rural,13.206809,...,11.060000,5.138203,27.243837,23.964028,22.972411,Krishnarajapura,10.024705,Bellary Road,0.055710,1619.400
1,PRM/KA/RERA/1251/446/PR/300924/007105,0.5,95.0,24.596526,2.247,8.0,698,1098268515,Bengaluru Urban,12.889460,...,20.215000,34.123025,17.588655,24.182315,9.807534,Kundalahalli,5.050651,Outer Ring Road,0.136103,2336.670
2,PRM/KA/RERA/1251/309/PR/060324/006692,0.5,72.0,16.160000,3.240,2.0,358,377577120,Bengaluru Urban,13.147780,...,11.285588,6.195783,22.402050,20.663908,16.363385,Krishnarajapura,7.689442,Bellary Road,0.201117,1163.520
3,PRM/KA/RERA/1251/472/PR/080525/007728,0.5,44.0,24.800000,3.000,4.0,456,3543753837,Bengaluru Urban,13.122500,...,13.430000,11.446540,17.547426,15.010138,14.274460,Yeshwantpur,2.313017,Bellary Road,0.096491,1091.200
4,PRM/KA/RERA/1250/303/PR/110522/004869,0.5,176.0,19.731136,2.361,9.0,1001,273287989,Bengaluru Rural,13.210587,...,10.250000,3.885663,28.164816,25.132968,23.314115,Krishnarajapura,11.169648,Bellary Road,0.175824,3472.680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
724,PRM/KA/RERA/1251/472/PR/171224/007305,5.0,32.0,259.853437,1.000,5.0,44,100731621,Bengaluru Urban,13.198990,...,12.870000,17.056838,24.700970,18.909049,17.501096,Manjunathanagara,10.629116,Bellary Road,0.727273,8315.310
725,PRM/KA/RERA/1251/310/PR/220324/006728,5.0,2.0,370.260000,2.240,2.0,232,72200771,Bengaluru Urban,12.872198,...,12.640000,37.571633,12.269992,19.093212,4.901586,Yelachenahalli,2.891765,NICE Peripheral Ring Road,0.008621,740.520
726,PRM/KA/RERA/1251/446/PR/170524/006883,5.0,2.0,337.880000,2.250,1.0,176,1807284478,Bengaluru Urban,12.956625,...,23.160000,27.459114,8.601718,14.614251,2.562136,Indiranagar,3.577113,Outer Ring Road,0.011364,675.760
727,PRM/KA/RERA/1251/446/PR/210325/007608,5.0,2.0,249.566000,3.250,2.0,142,474325208,Bengaluru Urban,13.034503,...,11.500000,18.335341,18.935374,21.767994,4.980628,Whitefield (Kadugodi),2.686520,Old Madras Road,0.014085,499.132


In [154]:
# One-hot encode `product_type` and `taluk` into integer indicator columns so that
# `df_bhk` exposes the same feature names as `input_features`.
# get_dummies drops each source column and this cell overwrites `df_bhk`, so guard on
# the columns' presence: without the guards, re-running the cell raises a KeyError.
if 'product_type' in df_bhk.columns:
    df_bhk = pd.get_dummies(df_bhk, columns=['product_type'], dtype=int)
if 'taluk' in df_bhk.columns:
    df_bhk = pd.get_dummies(df_bhk, columns=['taluk'], prefix='taluk', dtype=int)
df_bhk

Unnamed: 0,rera_ids,bhk_type,bhk_count,mean_carpet_area,far,number_of_towers,total_unit_count,cost_of_land_inr,district,latitude,...,total_carpet_area,product_type_Apartment,product_type_Villa,taluk_Anekal,taluk_Bengaluru East,taluk_Bengaluru North,taluk_Bengaluru South,taluk_Devanahalli,taluk_Hosakote,taluk_Yelahanka
0,PRM/KA/RERA/1250/303/PR/080525/007730,0.5,60.0,26.990000,2.499,16.0,1077,972000000,Bengaluru Rural,13.206809,...,1619.400,1,0,0,0,0,0,1,0,0
1,PRM/KA/RERA/1251/446/PR/300924/007105,0.5,95.0,24.596526,2.247,8.0,698,1098268515,Bengaluru Urban,12.889460,...,2336.670,1,0,0,1,0,0,0,0,0
2,PRM/KA/RERA/1251/309/PR/060324/006692,0.5,72.0,16.160000,3.240,2.0,358,377577120,Bengaluru Urban,13.147780,...,1163.520,1,0,0,0,1,0,0,0,0
3,PRM/KA/RERA/1251/472/PR/080525/007728,0.5,44.0,24.800000,3.000,4.0,456,3543753837,Bengaluru Urban,13.122500,...,1091.200,1,0,0,0,0,0,0,0,1
4,PRM/KA/RERA/1250/303/PR/110522/004869,0.5,176.0,19.731136,2.361,9.0,1001,273287989,Bengaluru Rural,13.210587,...,3472.680,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
724,PRM/KA/RERA/1251/472/PR/171224/007305,5.0,32.0,259.853437,1.000,5.0,44,100731621,Bengaluru Urban,13.198990,...,8315.310,0,1,0,0,0,0,0,0,1
725,PRM/KA/RERA/1251/310/PR/220324/006728,5.0,2.0,370.260000,2.240,2.0,232,72200771,Bengaluru Urban,12.872198,...,740.520,1,0,0,0,0,1,0,0,0
726,PRM/KA/RERA/1251/446/PR/170524/006883,5.0,2.0,337.880000,2.250,1.0,176,1807284478,Bengaluru Urban,12.956625,...,675.760,1,0,0,1,0,0,0,0,0
727,PRM/KA/RERA/1251/446/PR/210325/007608,5.0,2.0,249.566000,3.250,2.0,142,474325208,Bengaluru Urban,13.034503,...,499.132,1,0,0,1,0,0,0,0,0


In [155]:
# Show the shared input columns again before selecting them from `df_bhk`.
input_features

['total_area_of_land_sqm',
 'far',
 'airport_distance_kms',
 'ksr_jn_distance_kms',
 'yeshwantpur_jn_distance_kms',
 'nearest_metro_dist_kms',
 'nearest_major_road_dist_kms',
 'product_type_Apartment',
 'product_type_Villa',
 'taluk_Anekal',
 'taluk_Bengaluru East',
 'taluk_Bengaluru North',
 'taluk_Bengaluru South',
 'taluk_Devanahalli',
 'taluk_Hosakote',
 'taluk_Yelahanka']

In [166]:
# Inputs: shared project features plus the project's total unit count.
# Target: mean carpet area of each `df_bhk` row (a single column).
X_bhk = df_bhk[input_features+[total_units_target]]
y_bhk = df_bhk[[carpet_area_target]]

# Split data: 80/20 hold-out with a fixed seed
# NOTE(review): df_bhk appears to hold several rows per project (one per bhk_type), so a
# row-wise split may place rows of one project in both train and test — consider a
# grouped split on `rera_ids`.
X_train, X_test, y_train, y_test = train_test_split(X_bhk, y_bhk, test_size=0.2, random_state=42)

# Scale features using statistics from the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base model
xgb = XGBRegressor(objective='reg:squarederror',
                            random_state=42)

# Hyperparameter space sampled by the randomized search
param_dist = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
}

# Random Search (seeded so the sampled candidates, and hence the result, are reproducible)
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=30,
    scoring='r2',         # R² of the single target
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Fit on the scaled training rows
random_search.fit(X_train_scaled, y_train)

# Best candidate, refitted on the full training split by the search
model = random_search.best_estimator_
# Predictions on the held-out rows
y_pred = model.predict(X_test_scaled)

# Evaluate
r2 = r2_score(y_test, y_pred)
print(f"R2_score {r2:.2f}")


Fitting 5 folds for each of 30 candidates, totalling 150 fits
R2_score 0.75


In [None]:
# Persist the carpet-area regressor and its scaler.
# Every training cell in this notebook rebinds the same `model` / `scaler` globals, so
# this cell pickles whatever was trained most recently. Check that the objects look like
# the carpet-area model (an XGBRegressor whose scaler was fitted on `input_features`
# plus the total unit count) before writing.
expected_n_features = len(input_features) + 1
if not isinstance(model, XGBRegressor) or scaler.n_features_in_ != expected_n_features:
    raise RuntimeError(
        f"Refusing to save carpet-area artifacts: expected an XGBRegressor with a scaler "
        f"fitted on {expected_n_features} features, got {type(model).__name__} with "
        f"{scaler.n_features_in_} features. Re-run the carpet-area training cell first."
    )

with open(f'{model_dir}/carpet_area_model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open(f'{model_dir}/carpet_area_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [170]:
# Inputs: shared project features, the project's total unit count and the row's
# observed mean carpet area. Target: unit count of each `df_bhk` row (a single column).
X_bhk = df_bhk[input_features+[total_units_target]+[carpet_area_target]]
y_bhk = df_bhk[[bhk_count_target]]

# Split data: 80/20 hold-out with a fixed seed
# NOTE(review): df_bhk appears to hold several rows per project (one per bhk_type), so a
# row-wise split may place rows of one project in both train and test — consider a
# grouped split on `rera_ids`.
X_train, X_test, y_train, y_test = train_test_split(X_bhk, y_bhk, test_size=0.2, random_state=42)

# Scale features using statistics from the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base model
xgb = XGBRegressor(objective='reg:squarederror',
                            random_state=42)

# Hyperparameter space sampled by the randomized search
param_dist = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
}

# Random Search (seeded so the sampled candidates, and hence the result, are reproducible)
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=30,
    scoring='r2',         # R² of the single target
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Fit on the scaled training rows
random_search.fit(X_train_scaled, y_train)

# Best candidate, refitted on the full training split by the search
model = random_search.best_estimator_
# Mean cross-validated R² of the best candidate
print(random_search.best_score_)
# Predictions on the held-out rows
y_pred = model.predict(X_test_scaled)

# Evaluate
r2 = r2_score(y_test, y_pred)
print(f"R2_score {r2:.2f}")


Fitting 5 folds for each of 30 candidates, totalling 150 fits
0.24993747472763062
R2_score 0.22


In [None]:
# Persist the per-BHK unit-count regressor and its scaler.
# Every training cell in this notebook rebinds the same `model` / `scaler` globals, so
# this cell pickles whatever was trained most recently. Check that the objects look like
# the per-BHK count model (an XGBRegressor whose scaler was fitted on `input_features`
# plus total unit count and mean carpet area) before writing.
expected_n_features = len(input_features) + 2
if not isinstance(model, XGBRegressor) or scaler.n_features_in_ != expected_n_features:
    raise RuntimeError(
        f"Refusing to save BHK unit-count artifacts: expected an XGBRegressor with a scaler "
        f"fitted on {expected_n_features} features, got {type(model).__name__} with "
        f"{scaler.n_features_in_} features. Re-run the BHK unit-count training cell first."
    )

with open(f'{model_dir}/bhk_unit_count_model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open(f'{model_dir}/bhk_unit_count_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [177]:
# Inputs: shared project features, the project's total unit count, and the row's
# observed mean carpet area and unit count. Target: `avg_price` (a single column).
X_bhk = df_bhk[input_features+[total_units_target,carpet_area_target,bhk_count_target]]
y_bhk = df_bhk[["avg_price"]]

# Split data: 80/20 hold-out with a fixed seed
# NOTE(review): df_bhk appears to hold several rows per project (one per bhk_type), so a
# row-wise split may place rows of one project in both train and test — consider a
# grouped split on `rera_ids`.
X_train, X_test, y_train, y_test = train_test_split(X_bhk, y_bhk, test_size=0.2, random_state=42)

# Scale features using statistics from the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base model
xgb = XGBRegressor(objective='reg:squarederror',
                            random_state=42)

# Hyperparameter space sampled by the randomized search
param_dist = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
}

# Random Search (seeded so the sampled candidates, and hence the result, are reproducible)
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=30,
    scoring='r2',         # R² of the single target
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Fit on the scaled training rows
random_search.fit(X_train_scaled, y_train)

# Best candidate, refitted on the full training split by the search
model = random_search.best_estimator_
# Mean cross-validated R² of the best candidate
print(random_search.best_score_)
# Predictions on the held-out rows
y_pred = model.predict(X_test_scaled)

# Evaluate
r2 = r2_score(y_test, y_pred)
print(f"R2_score {r2:.2f}")


Fitting 5 folds for each of 30 candidates, totalling 150 fits
0.8384384870529175
R2_score 0.90


In [None]:
# Persist the average-price regressor and its scaler.
# Every training cell in this notebook rebinds the same `model` / `scaler` globals, so
# this cell pickles whatever was trained most recently. Check that the objects look like
# the average-price model (an XGBRegressor whose scaler was fitted on `input_features`
# plus total unit count, mean carpet area and BHK unit count) before writing.
expected_n_features = len(input_features) + 3
if not isinstance(model, XGBRegressor) or scaler.n_features_in_ != expected_n_features:
    raise RuntimeError(
        f"Refusing to save average-price artifacts: expected an XGBRegressor with a scaler "
        f"fitted on {expected_n_features} features, got {type(model).__name__} with "
        f"{scaler.n_features_in_} features. Re-run the average-price training cell first."
    )

with open(f'{model_dir}/avg_price_model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open(f'{model_dir}/avg_price_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

### MODEL 4 - PROJECT METRICS PREDICTOR

In [176]:

# Inputs: base project features plus the observed per-BHK presence flags, unit counts
# and mean carpet areas, joined column-wise. The mean carpet areas keep their missing values.
X_enhanced = pd.concat([
    X.reset_index(drop=True),
    y_presence.reset_index(drop=True),
    y_counts.reset_index(drop=True),
    y_mean_carpet_areas.reset_index(drop=True)
], axis=1)

# Split data: 80/20 hold-out with a fixed seed.
# The target is the raw `total_project_cost_inr` column of `df`, not the median-filled `y_project`.
X_train, X_test, y_train, y_test = train_test_split(X_enhanced, df[["total_project_cost_inr"]], test_size=0.2, random_state=42)

# Scale features using statistics from the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

# Hyperparameter space sampled by the randomized search
param_dist = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
}

# Random Search (seeded so the sampled candidates, and hence the result, are reproducible)
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=30,
    scoring='r2',         # R² of the single target
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Fit on the scaled training rows
random_search.fit(X_train_scaled, y_train)

# Best candidate, refitted on the full training split by the search
model = random_search.best_estimator_
# Predictions on the held-out rows
y_pred = model.predict(X_test_scaled)

# Evaluate
r2 = r2_score(y_test, y_pred)
print(f"Overall R2: {r2:.2f}")

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Overall R2: 0.85


In [68]:
# Persist the project-cost regressor and its scaler.
# Every training cell in this notebook rebinds the same `model` / `scaler` globals, so
# this cell pickles whatever was trained most recently. Check that the objects look like
# MODEL 4 (an XGBRegressor whose scaler was fitted on `input_features` plus the per-BHK
# presence, count and mean-carpet-area columns) before writing.
expected_n_features = (
    len(input_features)
    + len(bhk_presence_targets)
    + len(bhk_count_targets)
    + len(mean_carpet_area_targets)
)
if not isinstance(model, XGBRegressor) or scaler.n_features_in_ != expected_n_features:
    raise RuntimeError(
        f"Refusing to save project-metrics artifacts: expected an XGBRegressor with a scaler "
        f"fitted on {expected_n_features} features, got {type(model).__name__} with "
        f"{scaler.n_features_in_} features. Re-run the MODEL 4 training cell first."
    )

with open(f'{model_dir}/project_metrics_model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open(f'{model_dir}/project_metrics_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [69]:
# Persist the shared input column names (`input_features`) next to the models.
# Several models in this notebook are trained on these columns plus extra ones, which
# are not part of this list.
with open(f'{model_dir}/feature_names.pkl', 'wb') as f:
    pickle.dump(input_features, f)

print("✓ Feature names saved!")


✓ Feature names saved!
