In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings

# --- Standard CPU Libraries (no change) ---
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict, train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, PowerTransformer, TargetEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor # This can be GPU-accelerated with a parameter
# from lightgbm import LGBMRegressor # This can be GPU-accelerated with a parameter
# from catboost import CatBoostRegressor # This can be GPU-accelerated with a parameter
import category_encoders as ce


# # --- GPU-Accelerated Replacements from RAPIDS cuML ---
# from cuml.linear_model import LinearRegression as cuMLLinearRegression
# from cuml.linear_model import Ridge as cuMLRidge
# from cuml.linear_model import Lasso as cuMLLasso
# from cuml.ensemble import RandomForestRegressor as cuMLRandomForest
# from cuml.svm import SVR as cuMLSVR
# from cuml.preprocessing import StandardScaler as cuMLStandardScaler
# from cuml.decomposition import PCA as cuMLPCA

In [2]:
# This will ignore all UserWarnings coming from the cuML library
warnings.filterwarnings("ignore", category=UserWarning, module="cuml")

In [2]:
df = pd.read_parquet(r'C:\Users\aryan\Desktop\Capstone Project\Data Preprocessing New\gurgaon_properties_final_df.parquet')

In [None]:
# df = df[~(df['Property Age'] == 'Under Construction')]

In [5]:
df.to_parquet(r'C:\Users\aryan\Desktop\Capstone Project\Data Preprocessing New\gurgaon_properties_final_df.parquet')

In [3]:
df.shape

(9588, 20)

In [5]:
df.sample(1)

Unnamed: 0,Sector,Built Up Area,Bedroom,Bathroom,Balcony,Servant Room,Store Room,Study Room,Floor Num,Total Floor,Property Age,Furnishing,Power Backup,Covered_Parking,Open_Parking,Total Parking,Rating,Nearby,Overlooking,Price
7710,Sector 67,2300,4,4,3,0,0,0,9,14,1 to 5 Year Old,Semi Furnished,Full,1,1,2,3.8,Education,Main Road,3.55


In [4]:
X = df.iloc[:, :-1]
y = df['Price']

In [5]:
ordinal_features = ['Property Age', 'Power Backup', 'Furnishing']

property_age_categories = ['10+ Year Old', '5 to 10 Year Old', '1 to 5 Year Old', '0 to 1 Year Old']
power_backup_categories = ['None', 'Partial', 'Full']
furnishing_categories = ['Unfurnished', 'Semi Furnished', 'Furnished']

onehot_features = ['Nearby', 'Overlooking', 'Sector']
numeric_features = ['Built Up Area', 'Bedroom', 'Bathroom', 'Balcony', 'Floor Num', 'Total Floor', 'Rating']

**OneHot Encoding with Ordinal Encoding**

In [9]:
preprocessor = ColumnTransformer(
    transformers= [
        ('ordinal', OrdinalEncoder(categories= [property_age_categories, power_backup_categories, furnishing_categories]), ordinal_features),
        ('onehot', OneHotEncoder(handle_unknown= 'ignore', drop= 'first', sparse_output= False), onehot_features),
        ('numerical', cuMLStandardScaler(), numeric_features)
    ],
    remainder= 'passthrough'
)

In [10]:
model_dict = {
    # === GPU Accelerated Models ===
    'linear_reg_gpu': cuMLLinearRegression(),
    'svr_gpu': cuMLSVR(),
    'ridge_gpu': cuMLRidge(),
    'lasso_gpu': cuMLLasso(),
    'random_forest_gpu': cuMLRandomForest(),
    'xgboost': XGBRegressor(tree_method='hist', device='cuda'),
    # 'lightgbm_gpu': LGBMRegressor(device='gpu'),
    'catboost_gpu': CatBoostRegressor(task_type='GPU', verbose=0),

    # === CPU Models (for comparison) ===
    'decision_tree_cpu': DecisionTreeRegressor(),
    'extra_trees_cpu': ExtraTreesRegressor(n_jobs= -1),
    'adaboost_cpu': AdaBoostRegressor()
}

In [11]:
def scorer(model_name, model):
    results = [model_name]
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', TransformedTargetRegressor(
            regressor=model,
            transformer=PowerTransformer(method='box-cox')
        ))
    ])
    
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    
    preds = cross_val_predict(pipeline, X, y, cv=kfold)
    
    r2 = r2_score(y, preds)
    mae = mean_absolute_error(y, preds)
    
    results.extend([r2, mae])
    return results

In [None]:
model_output = []

for name, model in model_dict.items():
    model_output.append(scorer(name, model))

In [None]:
# Create results dataframe
results_df = pd.DataFrame(
    model_output,
    columns=['Model', 'R²', 'MAE']
)

# Display sorted results
print("Sorted by MAE:")
print(results_df.sort_values(by='MAE'))

Sorted by MAE:
               Model        R²       MAE
8    extra_trees_cpu  0.882726  0.225798
5            xgboost  0.888936  0.235407
6       catboost_gpu  0.879651  0.257566
1            svr_gpu  0.863794  0.259543
7  decision_tree_cpu  0.733388  0.332835
0     linear_reg_gpu  0.811157  0.339272
2          ridge_gpu  0.810898  0.339452
4  random_forest_gpu  0.772516  0.370016
9       adaboost_cpu  0.622363  0.533185
3          lasso_gpu -0.008214  0.862865


**Ordinal Encoding**

In [None]:
all_categorical_features = [
    'Property Age', 
    'Power Backup', 
    'Furnishing',
    'Nearby',
    'Overlooking',
    'Sector'
]

In [None]:
processor = ColumnTransformer(
    transformers= [
        ('ordinal', OrdinalEncoder(handle_unknown= 'use_encoded_value', unknown_value= -1), all_categorical_features),
        ('numeric', cuMLStandardScaler(), numeric_features)
    ],
    remainder= 'passthrough'
)

In [None]:
def scorer(model_name, model):
    results = [model_name]
    
    pipeline = Pipeline(
        steps= [
        ('preprocessor', preprocessor),
        ('regressor', TransformedTargetRegressor(
            regressor=model,
            transformer=PowerTransformer(method='box-cox')
        ))
    ])
    
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    
    preds = cross_val_predict(pipeline, X, y, cv=kfold)
    
    r2 = r2_score(y, preds)
    mae = mean_absolute_error(y, preds)
    
    results.extend([r2, mae])
    return results

In [None]:
model_output = []

for name, model in model_dict.items():
    model_output.append(scorer(name, model))



In [None]:
# Create results dataframe
results_df = pd.DataFrame(
    model_output,
    columns=['Model', 'R²', 'MAE']
)

# Display sorted results
print("Sorted by MAE:")
print(results_df.sort_values(by='MAE'))

Sorted by MAE:
               Model        R²       MAE
8    extra_trees_cpu  0.882122  0.226274
5            xgboost  0.888936  0.235407
6       catboost_gpu  0.879651  0.257566
1            svr_gpu  0.863794  0.259543
7  decision_tree_cpu  0.728469  0.330947
0     linear_reg_gpu  0.811157  0.339272
2          ridge_gpu  0.810898  0.339452
4  random_forest_gpu  0.772516  0.370016
9       adaboost_cpu  0.622021  0.535333
3          lasso_gpu -0.008214  0.862865


**OneHot with PCA**

In [None]:
ordinal_features = ['Property Age', 'Power Backup', 'Furnishing']

property_age_categories = ['10+ Year Old', '5 to 10 Year Old', '1 to 5 Year Old', '0 to 1 Year Old']
power_backup_categories = ['None', 'Partial', 'Full']
furnishing_categories = ['Unfurnished', 'Semi Furnished', 'Furnished']

onehot_features = ['Nearby', 'Overlooking', 'Sector']
numeric_features = ['Built Up Area', 'Bedroom', 'Bathroom', 'Balcony', 'Floor Num', 'Total Floor', 'Rating']

In [None]:
preprocessor = ColumnTransformer(
    transformers= [
        ('ordinal', OrdinalEncoder(categories= [property_age_categories, power_backup_categories, furnishing_categories]), ordinal_features),
        ('onehot', OneHotEncoder(handle_unknown= 'ignore', drop= 'first', sparse_output= False), onehot_features),
        ('numerical', cuMLStandardScaler(), numeric_features)
    ],
    remainder= 'passthrough'
)

*calculating cuMLPCA value*

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_train_processed = processor.fit_transform(X_train)
pca_full = cuMLPCA().fit(X_train_processed)
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
n_components_95 = np.argmax(cumulative_variance >= 0.95) + 1



In [None]:
def scorer(model_name, model):
    results = [model_name]
    
    pipeline = Pipeline(
        steps= [
        ('preprocessor', preprocessor),
        ('pca', cuMLPCA(n_components= n_components_95, svd_solver= 'full')),
        ('regressor', TransformedTargetRegressor(
            regressor=model,
            transformer=PowerTransformer(method='box-cox')
        ))
    ])
    
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    
    preds = cross_val_predict(pipeline, X, y, cv=kfold)
    
    r2 = r2_score(y, preds)
    mae = mean_absolute_error(y, preds)
    
    results.extend([r2, mae])
    return results

In [None]:
model_output = []

for name, model in model_dict.items():
    model_output.append(scorer(name, model))



In [None]:
# Create results dataframe
results_df = pd.DataFrame(
    model_output,
    columns=['Model', 'R²', 'MAE']
)

# Display sorted results
print("Sorted by MAE:")
print(results_df.sort_values(by='MAE'))

Sorted by MAE:
               Model        R²       MAE
8    extra_trees_cpu  0.761816  0.367560
1            svr_gpu  0.749602  0.381871
5            xgboost  0.750524  0.386010
6       catboost_gpu  0.755636  0.391174
4  random_forest_gpu  0.658895  0.460164
0     linear_reg_gpu  0.639960  0.491359
2          ridge_gpu  0.639966  0.491361
7  decision_tree_cpu  0.461626  0.529407
9       adaboost_cpu  0.583507  0.564837
3          lasso_gpu  0.205025  0.759949


**Target Encoder - scikit learn**

In [None]:
ordinal_features = ['Property Age', 'Power Backup', 'Furnishing']

onehot_features = ['Nearby', 'Overlooking']
numeric_features = ['Built Up Area', 'Bedroom', 'Bathroom', 'Balcony', 'Floor Num', 'Total Floor', 'Rating']

target_encode_features = ['Sector']

In [None]:
preprocessor = ColumnTransformer(
    transformers= [
        ('ordinal', OrdinalEncoder(categories=[property_age_categories, power_backup_categories, furnishing_categories]), ordinal_features),
        ('onehot', OneHotEncoder(handle_unknown= 'ignore', drop= 'first', sparse_output= False), onehot_features),
        ('target_encode', TargetEncoder(target_type= 'continuous'), target_encode_features),
        ('numerial', cuMLStandardScaler(), numeric_features)
    ],
    remainder= 'passthrough'
)

In [None]:
def scorer(model_name, model):
    results = [model_name]
    
    pipeline = Pipeline(
        steps= [
        ('preprocessor', preprocessor),
        ('regressor', TransformedTargetRegressor(
            regressor=model,
            transformer=PowerTransformer(method='box-cox')
        ))
    ])
    
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    
    preds = cross_val_predict(pipeline, X, y, cv=kfold)
    
    r2 = r2_score(y, preds)
    mae = mean_absolute_error(y, preds)
    
    results.extend([r2, mae])
    return results

In [None]:
model_output = []

for name, model in model_dict.items():
    model_output.append(scorer(name, model))

In [None]:
# Create results dataframe
results_df = pd.DataFrame(
    model_output,
    columns=['Model', 'R²', 'MAE']
)

# Display sorted results
print("Sorted by MAE:")
print(results_df.sort_values(by='MAE'))

Sorted by MAE:
               Model        R²       MAE
8    extra_trees_cpu  0.892847  0.224438
5            xgboost  0.884751  0.240656
4  random_forest_gpu  0.868537  0.261415
6       catboost_gpu  0.876702  0.261788
7  decision_tree_cpu  0.794364  0.301434
1            svr_gpu  0.832724  0.305672
0     linear_reg_gpu  0.791406  0.364515
2          ridge_gpu  0.791327  0.364614
9       adaboost_cpu  0.737645  0.431698
3          lasso_gpu -0.008214  0.862865


**Category Encoder**

In [None]:
ordinal_features = ['Property Age', 'Power Backup', 'Furnishing']

onehot_features = ['Nearby', 'Overlooking']
numeric_features = ['Built Up Area', 'Bedroom', 'Bathroom', 'Balcony', 'Floor Num', 'Total Floor', 'Rating']

category_target_encode_features = ['Sector']

In [None]:
preprocessor = ColumnTransformer(
    transformers= [
        ('ordinal', OrdinalEncoder(categories=[property_age_categories, power_backup_categories, furnishing_categories]), ordinal_features),
        ('onehot', OneHotEncoder(handle_unknown= 'ignore', drop= 'first', sparse_output= False), onehot_features),
        ('target_category_encode', ce.TargetEncoder(), category_target_encode_features),
        ('numerical', cuMLStandardScaler(), numeric_features)
    ],
    remainder= 'passthrough'
)

In [None]:
def scorer(model_name, model):
    results = [model_name]
    
    pipeline = Pipeline(
        steps= [
        ('preprocessor', preprocessor),
        ('pca', cuMLPCA(n_components= n_components_95, svd_solver= 'full')),
        ('regressor', TransformedTargetRegressor(
            regressor=model,
            transformer=PowerTransformer(method='box-cox')
        ))
    ])
    
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    
    preds = cross_val_predict(pipeline, X, y, cv=kfold)
    
    r2 = r2_score(y, preds)
    mae = mean_absolute_error(y, preds)
    
    results.extend([r2, mae])
    return results

In [None]:
model_output = []

for name, model in model_dict.items():
    model_output.append(scorer(name, model))

  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)
  ret = func(*args, **kwargs)


In [None]:
# Create results dataframe
results_df = pd.DataFrame(
    model_output,
    columns=['Model', 'R²', 'MAE']
)

# Display sorted results
print("Sorted by MAE:")
print(results_df.sort_values(by='MAE'))

Sorted by MAE:
               Model        R²       MAE
1            svr_gpu  0.547423  0.540867
6       catboost_gpu  0.558553  0.543986
4  random_forest_gpu  0.553414  0.546974
5            xgboost  0.549606  0.549083
2          ridge_gpu  0.538098  0.562423
0     linear_reg_gpu  0.538096  0.562424
9       adaboost_cpu  0.513848  0.604804
8    extra_trees_cpu  0.287786  0.678463
3          lasso_gpu  0.251174  0.735453
7  decision_tree_cpu  0.148181  0.739804


**HyperParameter Tuning**

In [16]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

*extra trees*

In [8]:
ordinal_features = ['Property Age', 'Power Backup', 'Furnishing']

property_age_categories = ['10+ Year Old', '5 to 10 Year Old', '1 to 5 Year Old', '0 to 1 Year Old']
power_backup_categories = ['None', 'Partial', 'Full']
furnishing_categories = ['Unfurnished', 'Semi Furnished', 'Furnished']

onehot_features = ['Nearby', 'Overlooking', 'Sector']
numeric_features = ['Built Up Area', 'Bedroom', 'Bathroom', 'Balcony', 'Floor Num', 'Total Floor', 'Rating']

In [9]:
preprocessor = ColumnTransformer(
    transformers= [
        ('ordinal', OrdinalEncoder(categories= [property_age_categories, power_backup_categories, furnishing_categories]), ordinal_features),
        ('onehot', OneHotEncoder(handle_unknown= 'ignore', drop= 'first', sparse_output= False), onehot_features),
        ('numerical', cuMLStandardScaler(), numeric_features)
    ],
    remainder= 'passthrough'
)

In [10]:
pipeline = Pipeline(
    steps= [
        ('preprocessor', preprocessor),
        ('regressor', TransformedTargetRegressor(
            regressor= ExtraTreesRegressor(),
            transformer= PowerTransformer(method= 'box-cox')
        ))
    ]
)

In [16]:
# param_grid_et_grid = {
#     'regressor__regressor__n_estimators': [100, 200],
#     'regressor__regressor__max_depth': [10, 20, None],
#     'regressor__regressor__max_features': ['sqrt', 0.5]
# }

param_grid_et_random = {
    'regressor__regressor__n_estimators': [100, 200, 300, 500],
    'regressor__regressor__max_depth': [10, 20, 30, None],
    'regressor__regressor__max_features': ['sqrt', 'log2', 0.5, 0.7],
    'regressor__regressor__min_samples_split': [2, 5, 10],
    'regressor__regressor__min_samples_leaf': [1, 2, 4],
    'regressor__regressor__bootstrap': [True, False]
}

In [21]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# grid_search = GridSearchCV(
#     estimator=pipeline,
#     param_grid=param_grid_et_grid,
#     cv=5,
#     scoring={'r2': 'r2','mae': 'neg_mean_absolute_error'},
#     refit= 'mae',
#     n_jobs=-1,
#     verbose=1
# )

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid_et_random,
    n_iter=50,       # <-- You control how many combinations to try (e.g., 50)
    cv=5,
    scoring={'r2': 'r2','mae': 'neg_mean_absolute_error'},
    refit= 'mae',
    n_jobs=1,       # Safe to use for a CPU model like ExtraTrees
    random_state=42
)

In [None]:
random_search.fit(X, y)

In [None]:
# Best R² score and parameters
print("Best Params:", random_search.best_params_)

# MAE for the best R² model
best_mae = -random_search.cv_results_['mean_test_mae'][random_search.best_index_]
print("Best MAE:", best_mae)

# All results as DataFrame
grid_results = pd.DataFrame(random_search.cv_results_)
grid_results[['params', 'mean_test_r2', 'mean_test_mae']].sort_values('mean_test_mae', ascending= False).head(10)

Best Params: {'regressor__regressor__max_depth': None, 'regressor__regressor__max_features': 0.5, 'regressor__regressor__n_estimators': 200}
Best MAE: 0.2531497344921338


Unnamed: 0,params,mean_test_r2,mean_test_mae
11,"{'regressor__regressor__max_depth': None, 'reg...",0.848483,-0.25315
10,"{'regressor__regressor__max_depth': None, 'reg...",0.84651,-0.254876
9,"{'regressor__regressor__max_depth': None, 'reg...",0.849781,-0.260633
8,"{'regressor__regressor__max_depth': None, 'reg...",0.848857,-0.262034
6,"{'regressor__regressor__max_depth': 20, 'regre...",0.822069,-0.297202
7,"{'regressor__regressor__max_depth': 20, 'regre...",0.820927,-0.297372
5,"{'regressor__regressor__max_depth': 20, 'regre...",0.762664,-0.37295
4,"{'regressor__regressor__max_depth': 20, 'regre...",0.76207,-0.374156
2,"{'regressor__regressor__max_depth': 10, 'regre...",0.741584,-0.390597
3,"{'regressor__regressor__max_depth': 10, 'regre...",0.738912,-0.392166


*xgboost*

In [None]:
# # Parameter grid specifically for XGBoost
# param_grid_xgb = {
#     'regressor__regressor__n_estimators': [100, 200, 300, 500],
#     'regressor__regressor__max_depth': [3, 5, 7, 9],
#     'regressor__regressor__learning_rate': [0.05, 0.1, 0.2],
#     'regressor__regressor__subsample': [0.7, 0.8, 0.9, 1.0],
#     'regressor__regressor__colsample_bytree': [0.7, 0.8, 0.9, 1.0]
# }

param_grid_grid = {
    'regressor__regressor__n_estimators': [100, 200],
    'regressor__regressor__max_depth': [10, None],
    'regressor__regressor__max_features': ['sqrt', 0.5]
}

param_grid_random = {'regressor__regressor__n_estimators': [100, 200, 300],
    'regressor__regressor__max_depth': [10, 20, 30],
    'regressor__regressor__max_features': ['sqrt', 0.5],
    'regressor__regressor__min_samples_split': [2, 5]
    }

In [22]:
param_grid_random

{'regressor__regressor__n_estimators': [100, 200, 300],
 'regressor__regressor__max_depth': [10, 20, 30],
 'regressor__regressor__max_features': ['sqrt', 0.5],
 'regressor__regressor__min_samples_split': [2, 5]}

In [10]:
ordinal_features = ['Property Age', 'Power Backup', 'Furnishing']

property_age_categories = ['10+ Year Old', '5 to 10 Year Old', '1 to 5 Year Old', '0 to 1 Year Old']
power_backup_categories = ['None', 'Partial', 'Full']
furnishing_categories = ['Unfurnished', 'Semi Furnished', 'Furnished']

onehot_features = ['Nearby', 'Overlooking', 'Sector']
numeric_features = ['Built Up Area', 'Bedroom', 'Bathroom', 'Balcony', 'Floor Num', 'Total Floor', 'Rating']

In [11]:
preprocessor = ColumnTransformer(
    transformers= [
        ('ordinal', OrdinalEncoder(categories= [property_age_categories, power_backup_categories, furnishing_categories]), ordinal_features),
        ('onehot', OneHotEncoder(handle_unknown= 'ignore', drop= 'first', sparse_output= False), onehot_features),
        ('numerical', cuMLStandardScaler(), numeric_features)
    ],
    remainder= 'passthrough'
)

In [12]:
pipeline = Pipeline(
    steps= [
        ('preprocessor', preprocessor),
        ('regressor', TransformedTargetRegressor(
            regressor= XGBRegressor(device='cuda', random_state=42),
            transformer= PowerTransformer(method= 'box-cox')
        ))
    ]
)

In [13]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [20]:
random_search_xgb = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid_random,
    n_iter=50,  # Test 50 random combinations from the grid
    cv=5,
    scoring={'r2': 'r2', 'mae': 'neg_mean_absolute_error'},
    refit='mae',
    n_jobs=1,   # VERY IMPORTANT: Keep this at 1 for GPU tuning to prevent crashes
    verbose=1,
    random_state=42
)

grid_search_xgb = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid_random,
    cv=5,
    scoring={'r2': 'r2', 'mae': 'neg_mean_absolute_error'},
    refit='mae',
    n_jobs=1, # Safe for the CPU-based ExtraTreesRegressor
    verbose=1
)

In [21]:
grid_search_xgb.fit(X, y)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


Parameters: { "max_features", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_split" } are not us

0,1,2
,estimator,Pipeline(step...'box-cox')))])
,param_grid,"{'regressor__regressor__max_depth': [10, 20, ...], 'regressor__regressor__max_features': ['sqrt', 0.5], 'regressor__regressor__min_samples_split': [2, 5], 'regressor__regressor__n_estimators': [100, 200, ...]}"
,scoring,"{'mae': 'neg_mean_absolute_error', 'r2': 'r2'}"
,n_jobs,1
,refit,'mae'
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('ordinal', ...), ('onehot', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,"[['10+ Year Old', '5 to 10 Year Old', ...], ['None', 'Partial', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,regressor,"XGBRegressor(...obs=None, ...)"
,transformer,PowerTransfor...hod='box-cox')
,func,
,inverse_func,
,check_inverse,True

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,'cuda'
,early_stopping_rounds,
,enable_categorical,False

0,1,2
,method,'box-cox'
,standardize,True
,copy,True


In [23]:
# Best R² score and parameters
print("Best Params:", grid_search_xgb.best_params_)

# MAE for the best R² model
best_mae = -grid_search_xgb.cv_results_['mean_test_mae'][grid_search_xgb.best_index_]
print("Best MAE:", best_mae)

# All results as DataFrame
grid_results_xgb = pd.DataFrame(grid_search_xgb.cv_results_)
grid_results_xgb[['params', 'mean_test_r2', 'mean_test_mae']].sort_values('mean_test_mae', ascending= False).head(10)

Best Params: {'regressor__regressor__max_depth': 10, 'regressor__regressor__max_features': 'sqrt', 'regressor__regressor__min_samples_split': 2, 'regressor__regressor__n_estimators': 200}
Best MAE: 0.2523417353630066


Unnamed: 0,params,mean_test_r2,mean_test_mae
10,"{'regressor__regressor__max_depth': 10, 'regre...",0.836353,-0.252342
1,"{'regressor__regressor__max_depth': 10, 'regre...",0.836353,-0.252342
4,"{'regressor__regressor__max_depth': 10, 'regre...",0.836353,-0.252342
7,"{'regressor__regressor__max_depth': 10, 'regre...",0.836353,-0.252342
2,"{'regressor__regressor__max_depth': 10, 'regre...",0.835369,-0.25272
5,"{'regressor__regressor__max_depth': 10, 'regre...",0.835369,-0.25272
11,"{'regressor__regressor__max_depth': 10, 'regre...",0.835369,-0.25272
8,"{'regressor__regressor__max_depth': 10, 'regre...",0.835369,-0.25272
9,"{'regressor__regressor__max_depth': 10, 'regre...",0.83645,-0.255568
0,"{'regressor__regressor__max_depth': 10, 'regre...",0.83645,-0.255568


- xgboost:

RSCV - 0.86, 0.2383

**Default Extra Trees**

In [8]:
ordinal_features = ['Property Age', 'Power Backup', 'Furnishing']

property_age_categories = ['10+ Year Old', '5 to 10 Year Old', '1 to 5 Year Old', '0 to 1 Year Old']
power_backup_categories = ['None', 'Partial', 'Full']
furnishing_categories = ['Unfurnished', 'Semi Furnished', 'Furnished']

onehot_features = ['Nearby', 'Overlooking', 'Sector']
numeric_features = ['Built Up Area', 'Bedroom', 'Bathroom', 'Balcony', 'Total Floor', 'Floor Num', 'Rating']

In [9]:
preprocessor = ColumnTransformer(
    transformers= [
        ('ordinal', OrdinalEncoder(categories= [property_age_categories, power_backup_categories, furnishing_categories]), ordinal_features),
        ('onehot', OneHotEncoder(handle_unknown= 'ignore', drop= 'first', sparse_output= False), onehot_features),
        ('numerical', StandardScaler(), numeric_features)
    ],
    remainder= 'passthrough'
)

In [10]:
pipeline = Pipeline(
    steps= [
        ('preprocessor', preprocessor),
        ('regressor', TransformedTargetRegressor(
            regressor= ExtraTreesRegressor(n_jobs= -1),
            transformer= PowerTransformer(method= 'box-cox')
        ))
    ]
)

In [11]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

preds = cross_val_predict(pipeline, X, y, cv=kfold)
    
r2 = r2_score(y, preds)
mae = mean_absolute_error(y, preds)



In [12]:
print('R2', r2)
print('MAE', mae)

R2 0.8829219901510783
MAE 0.22571749865775811


In [13]:
df.sample(1)

Unnamed: 0,Sector,Built Up Area,Bedroom,Bathroom,Balcony,Servant Room,Store Room,Study Room,Floor Num,Total Floor,Property Age,Furnishing,Power Backup,Covered_Parking,Open_Parking,Total Parking,Rating,Nearby,Overlooking,Price
8147,Sector 90,490,1,1,4,0,0,0,6,14,1 to 5 Year Old,Unfurnished,Full,0,1,1,3.5,Education,Main Road,0.44


In [14]:
# np.set_printoptions(threshold=np.inf)

*without Total Floor*

In [26]:
ordinal_features = ['Property Age', 'Power Backup', 'Furnishing']

property_age_categories = ['10+ Year Old', '5 to 10 Year Old', '1 to 5 Year Old', '0 to 1 Year Old']
power_backup_categories = ['None', 'Partial', 'Full']
furnishing_categories = ['Unfurnished', 'Semi Furnished', 'Furnished']

onehot_features = ['Nearby', 'Overlooking', 'Sector']
numeric_features = ['Built Up Area', 'Bedroom', 'Bathroom', 'Balcony', 'Floor Num', 'Rating']

In [17]:
X.columns

Index(['Sector', 'Built Up Area', 'Bedroom', 'Bathroom', 'Balcony',
       'Servant Room', 'Store Room', 'Study Room', 'Floor Num', 'Total Floor',
       'Property Age', 'Furnishing', 'Power Backup', 'Covered_Parking',
       'Open_Parking', 'Total Parking', 'Rating', 'Nearby', 'Overlooking'],
      dtype='object')

In [22]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

preds = cross_val_predict(pipeline, X.drop(columns= 'Total Floor'), y, cv=kfold)
    
r2 = r2_score(y, preds)
mae = mean_absolute_error(y, preds)



In [23]:
print('R2', r2)
print('MAE', mae)

R2 0.8714188243584902
MAE 0.2378941618529339


In [24]:
df['Total Floor'].median()

np.float64(14.0)

**Predictions**

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state= 42)

In [7]:
ordinal_features = ['Property Age', 'Power Backup', 'Furnishing']

property_age_categories = ['10+ Year Old', '5 to 10 Year Old', '1 to 5 Year Old', '0 to 1 Year Old']
power_backup_categories = ['None', 'Partial', 'Full']
furnishing_categories = ['Unfurnished', 'Semi Furnished', 'Furnished']

onehot_features = ['Nearby', 'Overlooking', 'Sector']
numeric_features = ['Built Up Area', 'Bedroom', 'Bathroom', 'Balcony', 'Total Floor', 'Floor Num', 'Rating']

In [8]:
preprocessor = ColumnTransformer(
    transformers= [
        ('ordinal', OrdinalEncoder(categories= [property_age_categories, power_backup_categories, furnishing_categories]), ordinal_features),
        ('onehot', OneHotEncoder(handle_unknown= 'ignore', drop= 'first', sparse_output= False), onehot_features),
        ('numerical', StandardScaler(), numeric_features)
    ],
    remainder= 'passthrough'
)

In [9]:
pipeline = Pipeline(
    steps= [
        ('preprocessor', preprocessor),
        ('regressor', TransformedTargetRegressor(
            regressor= ExtraTreesRegressor(n_jobs= -1),
            transformer= PowerTransformer(method= 'box-cox')
        ))
    ]
)

In [10]:
production_model = pipeline.fit(X, y)

In [45]:
new_property_data = {
    'Sector': ['Sector 28'],
    'Built Up Area': [1700],
    'Bedroom': [3],
    'Bathroom': [3],
    'Balcony': [2],
    'Servant Room': [1], # 0 because it's unchecked
    'Store Room': [0],  # 1 because it's checked
    'Study Room': [0],  # 1 because it's checked
    'Floor Num': [5],
    'Total Floor': [10],
    'Property Age': ['0 to 1 Year Old'],
    'Furnishing': ['Furnished'],
    'Power Backup': ['Full'],
    'Covered_Parking': [2],
    'Open_Parking': [2],
    'Total Parking': [4], # Calculated from 2 Covered + 4 Open
    'Rating': [3.8],
    'Nearby': ['Education'],
    'Overlooking': ['Club']
}

In [46]:
test_df = pd.DataFrame(new_property_data)

In [47]:
predicted_price = production_model.predict(test_df)[0]

In [48]:
# print(f"Property's actual price is: {actual_price:.2f} Crores")
print(f"The model predicted the price to be: {predicted_price:.2f} Crores")

The model predicted the price to be: 3.23 Crores


**Exporting**

In [15]:
import joblib

# --- Save the Model ---
joblib.dump(production_model, r'C:\Users\aryan\Desktop\Capstone Project\Joblib\property_price_model.pkl')
print("Model saved successfully!")

Model saved successfully!
