### Real Estate Multi-Target Prediction Pipeline
#### Sequential approach: BHK Type Presence -> Unit Counts -> Carpet Areas -> Project Metrics

In [50]:
import os
import pickle
import warnings

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_absolute_error, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

In [51]:
# Read the cleaned dataset from the processing output folder; this frame is the
# single source for every feature and target column used in the cells below.
df = pd.read_csv("processing_outputs/cleaned_data.csv")
# Bare last expression: render a preview of the loaded frame as the cell output.
df

Unnamed: 0,rera_ids,far,number_of_towers,total_unit_count,1_5BHK_count,1BHK_count,1RK_count,2_5BHK_count,2BHK_count,3_5BHK_count,...,4_5BHK,5BHK,avg_price,airport_distance_kms,ksr_jn_distance_kms,yeshwantpur_jn_distance_kms,nearest_metro_dist_kms,nearest_metro_station,nearest_major_road_dist_kms,nearest_major_road
0,PRM/KA/RERA/1251/446/PR/191022/005340,2.00000,1.0,126,0.0,0.0,0.0,0.0,61.0,0.0,...,0,0,8.500000,30.743633,18.789358,24.737635,6.197776,Nallurahalli,0.471653,Kadabeesanahalli Main Road
1,PRM/KA/RERA/1251/446/PR/150224/006619,2.25000,1.0,84,0.0,0.0,0.0,0.0,52.0,0.0,...,0,0,6.250000,17.722085,16.062128,18.537574,4.735450,Singayyanapalya,1.975241,Old Madras Road
2,PRM/KA/RERA/1251/446/PR/300924/007102,2.20000,1.0,60,0.0,0.0,0.0,0.0,28.0,0.0,...,0,0,8.020000,16.838685,11.915246,11.929444,7.787598,Benniganahalli,0.638142,Thanisandra Main Road
3,PRM/KA/RERA/1251/310/PR/051224/007268,2.49000,1.0,126,0.0,2.0,0.0,0.0,0.0,0.0,...,0,0,9.000000,37.695984,25.095496,31.528721,13.506856,Nallurahalli,7.823073,Kadabeesanahalli Main Road
4,PRM/KA/RERA/1251/308/PR/210524/006935,1.56000,1.0,166,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,13.430000,40.058996,18.659459,25.630953,12.408801,Yelachenahalli,2.580445,Hosur Road
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272,PRM/KA/RERA/1250/303/PR/300125/007458,2.08240,6.0,463,0.0,0.0,0.0,0.0,72.0,0.0,...,0,0,10.000000,4.344371,25.641484,22.806291,20.835486,Krishnarajapura,8.243584,Bagalur Road
273,PRM/KA/RERA/1251/308/PR/091224/007276,1.48000,1.0,33,0.0,0.0,0.0,0.0,33.0,0.0,...,0,0,5.080000,32.779344,24.129898,29.924016,8.981898,Hopefarm Channasandra,4.949467,Varthur Main Road
274,PRM/KA/RERA/1251/309/PR/180823/006187,2.55325,2.0,264,0.0,28.0,0.0,0.0,118.0,0.0,...,0,0,11.950000,5.368229,23.205871,21.311835,17.252628,Krishnarajapura,6.586075,Bagalur Road
275,PRM/KA/RERA/1251/308/PR/230522/004901,2.74900,6.0,1104,0.0,0.0,0.0,0.0,24.0,0.0,...,0,0,8.901250,36.016173,23.841043,30.163816,11.810772,Nallurahalli,6.142877,Kadabeesanahalli Main Road


In [52]:
# One-hot encode `product_type` into integer 0/1 indicator columns
# (e.g. product_type_Apartment, product_type_Villa), replacing the original column.
# The cell overwrites `df`, so it is guarded: once `product_type` has been
# consumed, re-running the cell is a no-op instead of raising KeyError.
if 'product_type' in df.columns:
    df = pd.get_dummies(df, columns=['product_type'], dtype=int)
df

Unnamed: 0,rera_ids,far,number_of_towers,total_unit_count,1_5BHK_count,1BHK_count,1RK_count,2_5BHK_count,2BHK_count,3_5BHK_count,...,avg_price,airport_distance_kms,ksr_jn_distance_kms,yeshwantpur_jn_distance_kms,nearest_metro_dist_kms,nearest_metro_station,nearest_major_road_dist_kms,nearest_major_road,product_type_Apartment,product_type_Villa
0,PRM/KA/RERA/1251/446/PR/191022/005340,2.00000,1.0,126,0.0,0.0,0.0,0.0,61.0,0.0,...,8.500000,30.743633,18.789358,24.737635,6.197776,Nallurahalli,0.471653,Kadabeesanahalli Main Road,1,0
1,PRM/KA/RERA/1251/446/PR/150224/006619,2.25000,1.0,84,0.0,0.0,0.0,0.0,52.0,0.0,...,6.250000,17.722085,16.062128,18.537574,4.735450,Singayyanapalya,1.975241,Old Madras Road,1,0
2,PRM/KA/RERA/1251/446/PR/300924/007102,2.20000,1.0,60,0.0,0.0,0.0,0.0,28.0,0.0,...,8.020000,16.838685,11.915246,11.929444,7.787598,Benniganahalli,0.638142,Thanisandra Main Road,1,0
3,PRM/KA/RERA/1251/310/PR/051224/007268,2.49000,1.0,126,0.0,2.0,0.0,0.0,0.0,0.0,...,9.000000,37.695984,25.095496,31.528721,13.506856,Nallurahalli,7.823073,Kadabeesanahalli Main Road,1,0
4,PRM/KA/RERA/1251/308/PR/210524/006935,1.56000,1.0,166,0.0,0.0,0.0,0.0,0.0,0.0,...,13.430000,40.058996,18.659459,25.630953,12.408801,Yelachenahalli,2.580445,Hosur Road,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272,PRM/KA/RERA/1250/303/PR/300125/007458,2.08240,6.0,463,0.0,0.0,0.0,0.0,72.0,0.0,...,10.000000,4.344371,25.641484,22.806291,20.835486,Krishnarajapura,8.243584,Bagalur Road,1,0
273,PRM/KA/RERA/1251/308/PR/091224/007276,1.48000,1.0,33,0.0,0.0,0.0,0.0,33.0,0.0,...,5.080000,32.779344,24.129898,29.924016,8.981898,Hopefarm Channasandra,4.949467,Varthur Main Road,1,0
274,PRM/KA/RERA/1251/309/PR/180823/006187,2.55325,2.0,264,0.0,28.0,0.0,0.0,118.0,0.0,...,11.950000,5.368229,23.205871,21.311835,17.252628,Krishnarajapura,6.586075,Bagalur Road,1,0
275,PRM/KA/RERA/1251/308/PR/230522/004901,2.74900,6.0,1104,0.0,0.0,0.0,0.0,24.0,0.0,...,8.901250,36.016173,23.841043,30.163816,11.810772,Nallurahalli,6.142877,Kadabeesanahalli Main Road,1,0


In [53]:
# Inputs:
#   - lat, lon -> taluk, airport distance, station distance, metro distance, major road distance
#   - BHK types: yes/no for each type
#   - product type
#   - far
#
# To predict:
#   - count of units per BHK type
#   - carpet area per BHK type
#   - total open area
#   - total project cost
#   - avg price
    


In [54]:
# One-hot encode `taluk` as integer 0/1 indicator columns. dtype=int matches the
# product_type encoding, so the feature matrix is numeric rather than a mix of
# floats and booleans (the pandas >= 2 default for get_dummies is bool).
taluk_dummies = pd.get_dummies(df['taluk'], prefix='taluk', dtype=int)

# Project-level inputs shared by every model: land area, FAR, distance features
# and the product-type indicator columns.
base_features = [
    'total_area_of_land_sqm', 'far',
    'airport_distance_kms', 'ksr_jn_distance_kms', 'yeshwantpur_jn_distance_kms',
    'nearest_metro_dist_kms', 'nearest_major_road_dist_kms',
    'product_type_Apartment', 'product_type_Villa',
]

# Combine base features with taluk dummies; the order here is the column order
# of the feature matrix built from these two pieces.
input_features = base_features + taluk_dummies.columns.tolist()
input_features

['total_area_of_land_sqm',
 'far',
 'airport_distance_kms',
 'ksr_jn_distance_kms',
 'yeshwantpur_jn_distance_kms',
 'nearest_metro_dist_kms',
 'nearest_major_road_dist_kms',
 'product_type_Apartment',
 'product_type_Villa',
 'taluk_Anekal',
 'taluk_Bengaluru East',
 'taluk_Bengaluru North',
 'taluk_Bengaluru South',
 'taluk_Devanahalli',
 'taluk_Hosakote',
 'taluk_Yelahanka']

In [55]:
# Column names of the per-type presence flags, ordered from smallest to largest unit.
# NOTE: the following cell assigns this same list again alongside the other target lists.
bhk_presence_targets = [
    '1RK',
    '1BHK', '1_5BHK',
    '2BHK', '2_5BHK',
    '3BHK', '3_5BHK',
    '4BHK', '4_5BHK',
    '5BHK',
]

In [56]:
# Every unit configuration tracked in the dataset, smallest to largest. All
# per-type target column names below are derived from this single list so the
# three families always stay aligned position-by-position.
bhk_types = ['1RK', '1BHK', '1_5BHK', '2BHK', '2_5BHK', '3BHK', '3_5BHK', '4BHK', '4_5BHK', '5BHK']

# BHK presence targets (binary): the flag column is named after the type itself.
bhk_presence_targets = list(bhk_types)

# BHK count targets: "<type>_count".
bhk_count_targets = [f'{bhk}_count' for bhk in bhk_types]

# Carpet area targets: "<type>_mean_carpet_area".
carpet_area_targets = [f'{bhk}_mean_carpet_area' for bhk in bhk_types]

# Project-level targets
project_targets = ['total_open_area_sqm', 'total_project_cost_inr', 'avg_price']


In [57]:
# Assemble the model input matrix: base features (gaps replaced with 0)
# side by side with the taluk indicator columns.
# NOTE(review): 0 is also a legitimate value for distances / land area, so
# zero-filling may be indistinguishable from real data — confirm this is intended.
X_base = df.loc[:, base_features].fillna(value=0)
X = pd.concat(objs=[X_base, taluk_dummies], axis="columns")

In [58]:
# Targets for the presence and unit-count models. A missing entry is filled
# with 0, i.e. treated the same as "type not offered" / "zero units".
y_presence = df.loc[:, bhk_presence_targets].fillna(value=0)
y_counts = df.loc[:, bhk_count_targets].fillna(value=0)

In [59]:
# Carpet-area targets are taken as-is: no imputation is applied here, unlike
# the presence, count and project-level targets.
y_carpet_areas = df.loc[:, carpet_area_targets].copy()

# Project-level targets: gaps are filled with each column's own median.
# NOTE(review): the median is computed over all rows, i.e. before any
# train/test split, so held-out rows influence the imputed values — confirm acceptable.
project_columns = df.loc[:, project_targets]
y_project = project_columns.fillna(value=project_columns.median())

### MODEL 1 - PREDICT THE PRESENCE OF INVENTORY

In [60]:
# Directory that receives every pickled model / scaler written by this notebook.
model_dir = "models"
# Create it up front: the open(..., 'wb') calls in the save cells raise
# FileNotFoundError if the directory is missing (e.g. on a fresh checkout).
os.makedirs(model_dir, exist_ok=True)

In [61]:
# Hold out 20% of the rows for evaluation (fixed seed so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X, y_presence, test_size=0.2, random_state=42
)

# Standardise the features; the scaler is fitted on the training rows only and
# then applied unchanged to the held-out rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Multi-label classifier: MultiOutputClassifier fits one clone of this random
# forest per presence column.
presence_forest = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model = MultiOutputClassifier(presence_forest)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

# With a multi-label target, accuracy_score is exact-match (subset) accuracy:
# a row only counts as correct if all presence flags are right at once. That
# is why this number is far lower than the per-type accuracies printed below.
accuracy = accuracy_score(y_test, y_pred)
print(f"Overall Accuracy: {accuracy:.3f}")

# Accuracy of each presence flag taken on its own.
for label_idx, label_name in enumerate(y_presence.columns):
    label_acc = accuracy_score(y_test.iloc[:, label_idx], y_pred[:, label_idx])
    print(f"{label_name} Accuracy: {label_acc:.3f}")

Overall Accuracy: 0.286
1RK Accuracy: 0.946
1BHK Accuracy: 0.750
1_5BHK Accuracy: 0.982
2BHK Accuracy: 0.804
2_5BHK Accuracy: 0.750
3BHK Accuracy: 0.946
3_5BHK Accuracy: 0.911
4BHK Accuracy: 0.696
4_5BHK Accuracy: 1.000
5BHK Accuracy: 0.929


In [62]:
# Persist the presence classifier together with the scaler it was trained with.
# `model` and `scaler` are rebound by every training cell in this notebook, so
# this cell must run directly after the Model 1 training cell.
with open(f'{model_dir}/bhk_presence_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open(f'{model_dir}/bhk_presence_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

### MODEL 2 - PREDICT THE UNIT COUNT OF EACH BHK TYPE

In [63]:
# Inputs for the count model: base feature matrix plus the BHK presence flags,
# aligned by position after resetting both indexes.
# NOTE(review): these are the dataset's ground-truth flags; if inference feeds
# Model 1's predictions here instead, the test MAE below is optimistic — confirm.
X_enhanced = pd.concat(
    [part.reset_index(drop=True) for part in (X, y_presence)],
    axis=1,
)

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_enhanced, y_counts, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then reuse it for the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# MultiOutputRegressor fits one clone of this random forest per count column.
count_forest = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model = MultiOutputRegressor(count_forest)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

# Headline figure: MAE averaged uniformly over all count columns.
mae = mean_absolute_error(y_test, y_pred)
print(f"Overall MAE: {mae:.2f}")

# Error broken down per BHK type.
for col_idx, col_name in enumerate(y_counts.columns):
    col_mae = mean_absolute_error(y_test.iloc[:, col_idx], y_pred[:, col_idx])
    print(f"{col_name} MAE: {col_mae:.2f}")


Overall MAE: 16.81
1RK_count MAE: 2.41
1BHK_count MAE: 13.53
1_5BHK_count MAE: 0.72
2BHK_count MAE: 60.65
2_5BHK_count MAE: 9.17
3BHK_count MAE: 63.30
3_5BHK_count MAE: 5.77
4BHK_count MAE: 11.59
4_5BHK_count MAE: 0.00
5BHK_count MAE: 0.96


In [64]:
# Persist the unit-count regressor and its scaler.
# `model` and `scaler` are rebound by every training cell in this notebook, so
# this cell must run directly after the Model 2 training cell.
with open(f'{model_dir}/unit_count_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open(f'{model_dir}/unit_count_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

### MODEL 3 - CARPET AREA PREDICTOR

In [65]:
# Inputs for the carpet-area model: base features, presence flags and unit
# counts, aligned by position after resetting each index.
# NOTE(review): flags and counts are ground-truth columns from the dataset, not
# the outputs of Models 1 and 2 — confirm this matches how inference is chained.
X_enhanced = pd.concat(
    [part.reset_index(drop=True) for part in (X, y_presence, y_counts)],
    axis=1,
)

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_enhanced, y_carpet_areas, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then reuse it for the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# MultiOutputRegressor fits one clone of this random forest per carpet-area column.
area_forest = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model = MultiOutputRegressor(area_forest)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

# Headline figure: MAE averaged uniformly over all carpet-area columns.
mae = mean_absolute_error(y_test, y_pred)
print(f"Overall MAE: {mae:.2f}")

# Error broken down per BHK type.
for col_idx, col_name in enumerate(y_carpet_areas.columns):
    col_mae = mean_absolute_error(y_test.iloc[:, col_idx], y_pred[:, col_idx])
    print(f"{col_name} MAE: {col_mae:.2f}")


Overall MAE: 16.36
1RK_mean_carpet_area MAE: 0.77
1BHK_mean_carpet_area MAE: 8.42
1_5BHK_mean_carpet_area MAE: 0.31
2BHK_mean_carpet_area MAE: 40.72
2_5BHK_mean_carpet_area MAE: 41.63
3BHK_mean_carpet_area MAE: 59.01
3_5BHK_mean_carpet_area MAE: 1.36
4BHK_mean_carpet_area MAE: 9.48
4_5BHK_mean_carpet_area MAE: 0.05
5BHK_mean_carpet_area MAE: 1.85


In [66]:
# Persist the carpet-area regressor and its scaler.
# `model` and `scaler` are rebound by every training cell in this notebook, so
# this cell must run directly after the Model 3 training cell.
with open(f'{model_dir}/carpet_area_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open(f'{model_dir}/carpet_area_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

### MODEL 4 - PROJECT METRICS PREDICTOR

In [67]:

# Inputs for the project-metrics model: base features plus all three upstream
# target families (presence flags, unit counts, carpet areas), aligned by position.
# NOTE(review): the upstream columns are ground-truth values from the dataset,
# not predictions of Models 1-3 — confirm this matches how inference is chained.
X_enhanced = pd.concat(
    [part.reset_index(drop=True) for part in (X, y_presence, y_counts, y_carpet_areas)],
    axis=1,
)

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_enhanced, y_project, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then reuse it for the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# MultiOutputRegressor fits one clone of this random forest per project-level target.
project_forest = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model = MultiOutputRegressor(project_forest)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

# The overall figure is a plain average over targets measured in different
# units (area, INR, price), so it is dominated by total_project_cost_inr;
# read the per-target lines below instead.
mae = mean_absolute_error(y_test, y_pred)
print(f"Overall MAE: {mae:.2f}")

# Per-target MAE
for target_idx, target_name in enumerate(y_project.columns):
    target_mae = mean_absolute_error(y_test.iloc[:, target_idx], y_pred[:, target_idx])
    print(f"{target_name} MAE: {target_mae:.2f}")



Overall MAE: 316835607.02
total_open_area_sqm MAE: 2536.62
total_project_cost_inr MAE: 950504282.49
avg_price MAE: 1.96


In [68]:
# Persist the project-metrics regressor and its scaler.
# `model` and `scaler` are rebound by every training cell in this notebook, so
# this cell must run directly after the Model 4 training cell.
with open(f'{model_dir}/project_metrics_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open(f'{model_dir}/project_metrics_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [69]:
# Persist the ordered list of base input feature names (base features + taluk
# indicator columns).
# NOTE(review): Models 2-4 are trained on X_enhanced, which carries extra
# columns beyond this list — confirm the consumer rebuilds those itself.
with open(f'{model_dir}/feature_names.pkl', 'wb') as names_file:
    pickle.dump(input_features, names_file)

print("✓ Feature names saved!")


✓ Feature names saved!
