In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import (StandardScaler, OrdinalEncoder, FunctionTransformer, PowerTransformer, OneHotEncoder)
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import (RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor)
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, r2_score

In [2]:
# Load and prepare data
df = pd.read_csv(r'C:\Users\aryan\OneDrive\Desktop\Capstone Project\Data Preprocessing\gurgaon_properties_post_feature_selection_v3.csv')

In [3]:
df.head()

Unnamed: 0,property_type,sector,price_per_sqft,bedRoom,bathroom,balcony,floorNum,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_score,price
0,flat,36,7586.0,3,2,2,2.0,New Property,850.0,0,0,0,8,0.82
1,flat,89,8597.0,2,2,2,4.0,New Property,1226.0,1,0,0,38,0.95
2,flat,33,5470.0,2,2,1,17.0,New Property,1000.0,0,0,0,49,0.32
3,flat,92,8020.0,3,4,3+,10.0,Relatively New,1615.0,1,0,1,174,1.6
4,flat,102,9023.0,2,2,1,5.0,Relatively New,582.0,0,1,0,159,0.48


In [4]:
# Feature engineering functions
def categorize_floor(floor):
    if 0 <= floor <= 2: return "Low Floor"
    elif 3 <= floor <= 10: return "Mid Floor"
    elif 11 <= floor <= 51: return "High Floor"
    return None

def categorize_luxury(score):
    if 0 <= score < 50: return 'Low'
    elif 50 <= score < 140: return 'Medium'
    elif 140 <= score <= 175: return 'High'
    return None

In [5]:
# Apply transformations
df['floorNum'] = df['floorNum'].apply(categorize_floor)
df['luxury_score'] = df['luxury_score'].apply(categorize_luxury)
df['furnishing_type'] = df['furnishing_type'].replace(
    {0.0: 'unfurnished', 1.0: 'semifurnished', 2.0: 'furnished'}
)

In [6]:
df.rename(columns= {'floorNum': 'floor_category', 'luxury_score': 'luxury_category'}, inplace= True)

In [7]:
df.head()

Unnamed: 0,property_type,sector,price_per_sqft,bedRoom,bathroom,balcony,floor_category,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,price
0,flat,36,7586.0,3,2,2,Low Floor,New Property,850.0,0,0,unfurnished,Low,0.82
1,flat,89,8597.0,2,2,2,Mid Floor,New Property,1226.0,1,0,unfurnished,Low,0.95
2,flat,33,5470.0,2,2,1,High Floor,New Property,1000.0,0,0,unfurnished,Low,0.32
3,flat,92,8020.0,3,4,3+,Mid Floor,Relatively New,1615.0,1,0,semifurnished,High,1.6
4,flat,102,9023.0,2,2,1,Mid Floor,Relatively New,582.0,0,1,unfurnished,High,0.48


In [8]:
df['balcony'].value_counts()

balcony
3+    1107
3     1059
2      862
1      354
0      171
Name: count, dtype: int64

In [9]:
# Split data
X = df.drop(columns=['price', 'price_per_sqft'])
y = df['price']

# Preprocessing
columns_to_encode = [0, 4, 5, 6, 10, 11]  # Update with your actual column indices

In [10]:
X.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'floor_category', 'agePossession', 'built_up_area', 'servant room',
       'store room', 'furnishing_type', 'luxury_category'],
      dtype='object')

**Ordinal Encoding**

In [11]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), [1, 2, 3, 7, 8, 9]),
        ('cat', OrdinalEncoder(
            handle_unknown='use_encoded_value',
            unknown_value=-1
        ), columns_to_encode)
    ],
    remainder='passthrough'
)


In [12]:
def scorer(model_name, model):
    results = [model_name]
    
    # Log Transformation Pipeline
    log_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', TransformedTargetRegressor(
            regressor=model,
            func=np.log1p,
            inverse_func=np.expm1
        ))
    ])
    
    # box Transformation Pipeline
    box_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', TransformedTargetRegressor(
            regressor=model,
            transformer=PowerTransformer(method='box-cox')
        ))
    ])
    
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    
    # Evaluate Log Transformation
    log_r2 = cross_val_score(log_pipeline, X, y, cv=kfold, scoring='r2', n_jobs=-1).mean()
    log_preds = cross_val_predict(log_pipeline, X, y, cv=kfold, n_jobs=-1)
    log_mae = mean_absolute_error(y, log_preds)
    
    # Evaluate box Transformation
    box_r2 = cross_val_score(box_pipeline, X, y, cv=kfold, scoring='r2', n_jobs=-1).mean()
    box_preds = cross_val_predict(box_pipeline, X, y, cv=kfold, n_jobs=-1)
    box_mae = mean_absolute_error(y, box_preds)
    
    results.extend([log_r2, log_mae, box_r2, box_mae])
    return results

In [13]:
# Model dictionary
model_dict = {
    'Linear Regression': LinearRegression(),
    'SVR': SVR(),
    'Ridge': Ridge(),
    'LASSO': Lasso(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(n_jobs=-1),
    'Extra Trees': ExtraTreesRegressor(n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(),
    'AdaBoost': AdaBoostRegressor(),
    'XGBoost': XGBRegressor()
}

In [14]:
# Evaluate models
model_output = []
for name, model in model_dict.items():
    model_output.append(scorer(name, model))
    

In [15]:
# Create results dataframe
results_df = pd.DataFrame(
    model_output,
    columns=['Model', 'Log R²', 'Log MAE', 'Box-Cox R²', 'Box-Cox MAE']
)

# Display sorted results
print("Sorted by Box-Cox MAE:")
print(results_df.sort_values(by='Box-Cox MAE'))

Sorted by Box-Cox MAE:
               Model    Log R²   Log MAE  Box-Cox R²  Box-Cox MAE
9            XGBoost  0.816618  0.508136    0.811467     0.517114
5      Random Forest  0.814113  0.511713    0.808932     0.517480
6        Extra Trees  0.777220  0.570405    0.773619     0.571593
7  Gradient Boosting  0.817782  0.570342    0.811854     0.579381
4      Decision Tree  0.655050  0.680018    0.663796     0.673014
1                SVR  0.743191  0.713735    0.728411     0.724221
8           AdaBoost  0.709956  0.836611    0.587023     0.864343
2              Ridge  0.492081  0.908547   -5.528757     1.373684
0  Linear Regression  0.491769  0.908595   -5.543255     1.374280
3              LASSO -0.046248  1.558920   -0.101158     1.522746


**One Hot Encoding**

In [16]:
columns_to_encode

[0, 4, 5, 6, 10, 11]

In [17]:
X.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'floor_category', 'agePossession', 'built_up_area', 'servant room',
       'store room', 'furnishing_type', 'luxury_category'],
      dtype='object')

In [18]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), [2, 3, 7, 8, 9]),
        ('cat_ordinal', OrdinalEncoder(
            handle_unknown='use_encoded_value',
            unknown_value=-1
        ), [4, 10, 11]),
        ('cat_hot', OneHotEncoder(drop= 'first'), [0, 1, 5, 6])  # Fixed line
    ],
    remainder='passthrough'
)

In [19]:
def scorer(model_name, model):
    results = [model_name]
    
    # Log Transformation Pipeline
    log_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', TransformedTargetRegressor(
            regressor=model,
            func=np.log1p,
            inverse_func=np.expm1
        ))
    ])
    
    # box Transformation Pipeline
    box_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', TransformedTargetRegressor(
            regressor=model,
            transformer=PowerTransformer(method='box-cox')
        ))
    ])
    
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    
    # Evaluate Log Transformation
    log_r2 = cross_val_score(log_pipeline, X, y, cv=kfold, scoring='r2', n_jobs=-1).mean()
    log_preds = cross_val_predict(log_pipeline, X, y, cv=kfold, n_jobs=-1)
    log_mae = mean_absolute_error(y, log_preds)
    
    # Evaluate box Transformation
    box_r2 = cross_val_score(box_pipeline, X, y, cv=kfold, scoring='r2', n_jobs=-1).mean()
    box_preds = cross_val_predict(box_pipeline, X, y, cv=kfold, n_jobs=-1)
    box_mae = mean_absolute_error(y, box_preds)
    
    results.extend([log_r2, log_mae, box_r2, box_mae])
    return results

In [20]:
# Model dictionary
model_dict = {
    'Linear Regression': LinearRegression(),
    'SVR': SVR(),
    'Ridge': Ridge(),
    'LASSO': Lasso(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(n_jobs=-1),
    'Extra Trees': ExtraTreesRegressor(n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(),
    'AdaBoost': AdaBoostRegressor(),
    'XGBoost': XGBRegressor()
}

In [21]:
# Evaluate models
model_output = []
for name, model in model_dict.items():
    model_output.append(scorer(name, model))

In [22]:
# Create results dataframe
results_df = pd.DataFrame(
    model_output,
    columns=['Model', 'Log R²', 'Log MAE', 'Box-Cox R²', 'Box-Cox MAE']
)

# Display sorted results
print("Sorted by Box-Cox MAE:")
print(results_df.sort_values(by='Log MAE'))

Sorted by Box-Cox MAE:
               Model    Log R²   Log MAE  Box-Cox R²  Box-Cox MAE
6        Extra Trees  0.817374  0.500648    0.808734     0.510696
9            XGBoost  0.825566  0.519219    0.821836     0.528240
5      Random Forest  0.801713  0.548758    0.798783     0.552843
1                SVR  0.807079  0.554705    0.800201     0.559750
7  Gradient Boosting  0.807915  0.614966    0.789280     0.630224
0  Linear Regression  0.672739  0.667792   -1.698908     0.972289
2              Ridge  0.670160  0.673511   -1.786241     0.979523
4      Decision Tree  0.664633  0.702863    0.635239     0.709655
8           AdaBoost  0.669596  0.894102    0.569595     0.908713
3              LASSO -0.046248  1.558920   -0.101158     1.522746


**One Hot Encoding with PCA**

In [23]:
X.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'floor_category', 'agePossession', 'built_up_area', 'servant room',
       'store room', 'furnishing_type', 'luxury_category'],
      dtype='object')

In [24]:
preprocessor = ColumnTransformer(
    transformers= [
        ('num', StandardScaler(), [2, 3, 7, 8, 9]),
        ('cat_ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), [4, 10, 11]),
        ('cat_hot', OneHotEncoder(drop= 'first', sparse_output= False), [0, 1, 5, 6])
], remainder= 'passthrough')

In [25]:
def scorer(model_name, model):
    results = [model_name]
    
    # log pipeline
    log_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components= 0.95, svd_solver= 'full')),
        ('regressor', TransformedTargetRegressor(regressor= model, func= np.log1p, inverse_func= np.expm1))
    ])
    
    # box pipeline
    box_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components= 0.95, svd_solver= 'full')),
        ('regressor', TransformedTargetRegressor(regressor= model, transformer= PowerTransformer(method= 'box-cox')))
    ])
    
    kfold = KFold(n_splits= 10, shuffle= True, random_state= 42)
    
    # Evaluate Log Transformation
    log_r2 = cross_val_score(log_pipeline, X, y, cv= kfold, scoring= 'r2', n_jobs= -1).mean()
    log_preds = cross_val_predict(log_pipeline, X, y, cv= kfold, n_jobs= -1)
    log_mae = mean_absolute_error(y, log_preds)
    
    # Evaluate Box Transformation
    box_r2 = cross_val_score(box_pipeline, X, y, cv= kfold, scoring= 'r2', n_jobs= -1).mean()
    box_preds = cross_val_predict(box_pipeline, X, y, cv= kfold, n_jobs= -1)
    box_mae = mean_absolute_error(y, box_preds)
    
    results.extend([log_r2, log_mae, box_r2, box_mae])
    return results
    

In [26]:
# Model dictionary
model_dict = {
    'Linear Regression': LinearRegression(),
    'SVR': SVR(),
    'Ridge': Ridge(),
    'LASSO': Lasso(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(n_jobs=-1),
    'Extra Trees': ExtraTreesRegressor(n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(),
    'AdaBoost': AdaBoostRegressor(),
    'XGBoost': XGBRegressor()
}

In [27]:
# Evaluate models
model_output = []
for name, model in model_dict.items():
    model_output.append(scorer(name, model))

In [28]:
# Create results dataframe
results_df = pd.DataFrame(
    model_output,
    columns=['Model', 'Log R²', 'Log MAE', 'Box-Cox R²', 'Box-Cox MAE']
)

# Display sorted results
print("Sorted by Box-Cox MAE:")
print(results_df.sort_values(by='Log MAE'))

Sorted by Box-Cox MAE:
               Model    Log R²   Log MAE  Box-Cox R²  Box-Cox MAE
6        Extra Trees  0.762356  0.640975    0.753739     0.640064
9            XGBoost  0.757902  0.656068    0.735853     0.665611
5      Random Forest  0.745045  0.674316    0.741279     0.669038
1                SVR  0.725634  0.678098    0.718076     0.677945
7  Gradient Boosting  0.739200  0.705166    0.725264     0.709408
2              Ridge  0.519527  0.857395   -5.157967     1.293328
0  Linear Regression  0.519204  0.857490   -5.188929     1.294076
8           AdaBoost  0.665109  0.941022    0.569703     0.919490
4      Decision Tree  0.470719  0.948404    0.548688     0.921505
3              LASSO -0.046248  1.558920    0.080228     1.331472


**Target Encoder**

In [29]:
X.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'floor_category', 'agePossession', 'built_up_area', 'servant room',
       'store room', 'furnishing_type', 'luxury_category'],
      dtype='object')

In [30]:
import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

preprocessor = ColumnTransformer(
    transformers= [
        ('num', StandardScaler(), [2, 3, 7, 8, 9]),
        ('cat_ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), [4, 10, 11]),
        ('cat_hot', OneHotEncoder(drop= 'first', sparse_output= False), [0, 5, 6]),
        ('target_enc', ce.TargetEncoder(), [1])
], remainder= 'passthrough')

In [31]:
def scorer(model_name, model):
    results = [model_name]
    
    # log pipeline
    log_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', TransformedTargetRegressor(regressor= model, func= np.log1p, inverse_func= np.expm1))
    ])
    
    # box pipeline
    box_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', TransformedTargetRegressor(regressor= model, transformer= PowerTransformer(method= 'box-cox')))
    ])
    
    kfold = KFold(n_splits= 10, shuffle= True, random_state= 42)
    
    # Evaluate Log Transformation
    log_r2 = cross_val_score(log_pipeline, X, y, cv= kfold, scoring= 'r2', n_jobs= -1).mean()
    log_preds = cross_val_predict(log_pipeline, X, y, cv= kfold, n_jobs= -1)
    log_mae = mean_absolute_error(y, log_preds)
    
    # Evaluate Box Transformation
    box_r2 = cross_val_score(box_pipeline, X, y, cv= kfold, scoring= 'r2', n_jobs= -1).mean()
    box_preds = cross_val_predict(box_pipeline, X, y, cv= kfold, n_jobs= -1)
    box_mae = mean_absolute_error(y, box_preds)
    
    results.extend([log_r2, log_mae, box_r2, box_mae])
    return results

In [32]:
# Model dictionary
model_dict = {
    'Linear Regression': LinearRegression(),
    'SVR': SVR(),
    'Ridge': Ridge(),
    'LASSO': Lasso(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(n_jobs=-1),
    'Extra Trees': ExtraTreesRegressor(n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(),
    'AdaBoost': AdaBoostRegressor(),
    'XGBoost': XGBRegressor()
}

In [33]:
# Evaluate models
model_output = []
for name, model in model_dict.items():
    model_output.append(scorer(name, model))

In [34]:
# Create results dataframe
results_df = pd.DataFrame(
    model_output,
    columns=['Model', 'Log R²', 'Log MAE', 'Box-Cox R²', 'Box-Cox MAE']
)

# Display sorted results
print("Sorted by Box-Cox MAE:")
print(results_df.sort_values(by='Log MAE'))

Sorted by Box-Cox MAE:
               Model    Log R²   Log MAE  Box-Cox R²  Box-Cox MAE
9            XGBoost  0.816183  0.512875    0.813025     0.516200
5      Random Forest  0.814281  0.513404    0.807629     0.517601
6        Extra Trees  0.774606  0.569869    0.771414     0.570376
7  Gradient Boosting  0.816632  0.573239    0.806843     0.582889
4      Decision Tree  0.637792  0.671822    0.641133     0.669910
8           AdaBoost  0.710712  0.830623    0.601284     0.869080
1                SVR  0.588056  0.867353    0.354047     0.964938
2              Ridge  0.500877  0.898761   -4.598669     1.336529
0  Linear Regression  0.500489  0.898829   -4.614059     1.337179
3              LASSO -0.039557  1.555654   -0.099753     1.522946


**Hyperparameter Tuning**

In [35]:
from sklearn.model_selection import GridSearchCV

**Random Forest**

In [36]:
X.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'floor_category', 'agePossession', 'built_up_area', 'servant room',
       'store room', 'furnishing_type', 'luxury_category'],
      dtype='object')

In [72]:
columns_to_encode = [0, 4, 5, 6, 10, 11]

In [65]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), [1, 2, 3, 7, 8, 9]),
        ('cat', OrdinalEncoder(
            handle_unknown='use_encoded_value',
            unknown_value=-1
        ), columns_to_encode)
    ],
    remainder='passthrough'
)

In [77]:
# Define pipelines
log_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', TransformedTargetRegressor(
        regressor=RandomForestRegressor(random_state=42),
        func=np.log1p,
        inverse_func=np.expm1
    ))
])

box_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', TransformedTargetRegressor(
        regressor=RandomForestRegressor(random_state=42),
        transformer=PowerTransformer(method='box-cox')
    ))
])

param_grid = {
    'regressor__regressor__n_estimators': [100, 300, 500],
    'regressor__regressor__max_depth': [None, 20],
    'regressor__regressor__max_features': ['sqrt', 0.3],
    'regressor__regressor__min_samples_leaf': [1, 2],
    'regressor__regressor__bootstrap': [True, False]
}

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Log transformation search
log_search = GridSearchCV(
    log_pipeline,
    param_grid=param_grid,
    cv=kfold,
    scoring={'r2': 'r2', 'mae': 'neg_mean_absolute_error'},
    refit='mae',
    n_jobs=-1,
    verbose=2
)
log_search.fit(X, y)

# Box-Cox search (ensure y > 0 for Box-Cox)
if (y > 0).all():
    box_search = GridSearchCV(
        box_pipeline,
        param_grid=param_grid,
        cv=kfold,
        scoring={'r2': 'r2', 'mae': 'neg_mean_absolute_error'},
        refit='mae',
        n_jobs=-1,
        verbose=2
    )
    box_search.fit(X, y)
else:
    print("Box-Cox requires positive y values. Skipping...")


Fitting 10 folds for each of 48 candidates, totalling 480 fits
Fitting 10 folds for each of 48 candidates, totalling 480 fits


In [80]:
# Best R² score and parameters
print("Best R² (Log):", log_search.best_score_)
print("Best Params (Log):", log_search.best_params_)

# MAE for the best R² model
best_log_mae = -log_search.cv_results_['mean_test_mae'][log_search.best_index_]
print("Best MAE (Log):", best_log_mae)

# All results as DataFrame
log_results = pd.DataFrame(log_search.cv_results_)
log_results[['params', 'mean_test_r2', 'mean_test_mae']].sort_values('mean_test_r2', ascending=False).head(10)


Best R² (Log): -0.5431106077240935
Best Params (Log): {'regressor__regressor__bootstrap': False, 'regressor__regressor__max_depth': None, 'regressor__regressor__max_features': 'sqrt', 'regressor__regressor__min_samples_leaf': 1, 'regressor__regressor__n_estimators': 100}
Best MAE (Log): 0.5431106077240935


Unnamed: 0,params,mean_test_r2,mean_test_mae
24,"{'regressor__regressor__bootstrap': False, 're...",0.803431,-0.543111
30,"{'regressor__regressor__bootstrap': False, 're...",0.803431,-0.543111
36,"{'regressor__regressor__bootstrap': False, 're...",0.803184,-0.544593
42,"{'regressor__regressor__bootstrap': False, 're...",0.803184,-0.544593
43,"{'regressor__regressor__bootstrap': False, 're...",0.802551,-0.544487
37,"{'regressor__regressor__bootstrap': False, 're...",0.802551,-0.544487
38,"{'regressor__regressor__bootstrap': False, 're...",0.801955,-0.545881
44,"{'regressor__regressor__bootstrap': False, 're...",0.801955,-0.545881
32,"{'regressor__regressor__bootstrap': False, 're...",0.801205,-0.544929
26,"{'regressor__regressor__bootstrap': False, 're...",0.801205,-0.544929


In [81]:
# Best R² score and parameters
print("Best R² (Box):", box_search.best_score_)
print("Best Params (Box):", box_search.best_params_)

# MAE for the best R² model
best_box_mae = -box_search.cv_results_['mean_test_mae'][box_search.best_index_]
print("Best MAE (Box):", best_box_mae)

# All results as DataFrame
box_results = pd.DataFrame(box_search.cv_results_)
box_results[['params', 'mean_test_r2', 'mean_test_mae']].sort_values('mean_test_r2', ascending=False).head(10)


Best R² (Box): -0.5500086985438588
Best Params (Box): {'regressor__regressor__bootstrap': False, 'regressor__regressor__max_depth': 20, 'regressor__regressor__max_features': 'sqrt', 'regressor__regressor__min_samples_leaf': 1, 'regressor__regressor__n_estimators': 500}
Best MAE (Box): 0.5500086985438588


Unnamed: 0,params,mean_test_r2,mean_test_mae
38,"{'regressor__regressor__bootstrap': False, 're...",0.794926,-0.550009
44,"{'regressor__regressor__bootstrap': False, 're...",0.794926,-0.550009
25,"{'regressor__regressor__bootstrap': False, 're...",0.794627,-0.550328
31,"{'regressor__regressor__bootstrap': False, 're...",0.794627,-0.550328
32,"{'regressor__regressor__bootstrap': False, 're...",0.794126,-0.550283
26,"{'regressor__regressor__bootstrap': False, 're...",0.794126,-0.550283
43,"{'regressor__regressor__bootstrap': False, 're...",0.793506,-0.551325
37,"{'regressor__regressor__bootstrap': False, 're...",0.793506,-0.551325
24,"{'regressor__regressor__bootstrap': False, 're...",0.792747,-0.553932
30,"{'regressor__regressor__bootstrap': False, 're...",0.792747,-0.553932


**xgboost**

In [85]:
X.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'floor_category', 'agePossession', 'built_up_area', 'servant room',
       'store room', 'furnishing_type', 'luxury_category'],
      dtype='object')

In [94]:
columns_to_encode = [0, 1, 2, 3, 4, 5, 6, 10, 11]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), [7, 8, 9]),
        ('cat', OrdinalEncoder(
            handle_unknown='use_encoded_value',
            unknown_value=-1
        ), columns_to_encode)
    ],
    remainder='passthrough'
)

In [95]:
# Define pipelines
log_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', TransformedTargetRegressor(
        regressor= XGBRegressor(
    n_estimators=2000,          # Set high initial value
    # early_stopping_rounds=50,   # Stop if no improvement for 50 rounds
    eval_metric='mae',          # Metric to monitor
    learning_rate=0.1
),
        func=np.log1p,
        inverse_func=np.expm1
    ))
])

box_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', TransformedTargetRegressor(
        regressor= XGBRegressor(
    n_estimators=2000,          # Set high initial value
    # early_stopping_rounds=50,   # Stop if no improvement for 50 rounds
    eval_metric='mae',          # Metric to monitor
    learning_rate=0.1
),
        transformer=PowerTransformer(method='box-cox')
    ))
])

param_grid = {
    'regressor__regressor__n_estimators': [100, 200, 300],  # 3 options
    'regressor__regressor__max_depth': [3, 6, 10],               # 2 options
    'regressor__regressor__learning_rate': [0.05, 0.1],      # 2 options
    'regressor__regressor__subsample': [0.8, 1.0],           # 2 options
    'regressor__regressor__colsample_bytree': [0.8, 1.0],    # 2 options
    'regressor__regressor__gamma': [0, 0.1],                 # 2 options
}

kfold = KFold(n_splits=6, shuffle=True, random_state=42)

# Log transformation search
log_search = GridSearchCV(
    log_pipeline,
    param_grid=param_grid,
    cv=kfold,
    scoring={'r2': 'r2', 'mae': 'neg_mean_absolute_error'},
    refit='mae',
    n_jobs=-1,
    verbose=2
)
log_search.fit(X, y)

# Box-Cox search (ensure y > 0 for Box-Cox)
if (y > 0).all():
    box_search = GridSearchCV(
        box_pipeline,
        param_grid=param_grid,
        cv=kfold,
        scoring={'r2': 'r2', 'mae': 'neg_mean_absolute_error'},
        refit='mae',
        n_jobs=-1,
        verbose=2
    )
    box_search.fit(X, y)
else:
    print("Box-Cox requires positive y values. Skipping...")


Fitting 6 folds for each of 144 candidates, totalling 864 fits
Fitting 6 folds for each of 144 candidates, totalling 864 fits


In [96]:
# Best R² score and parameters
print("Best R² (Log):", log_search.best_score_)
print("Best Params (Log):", log_search.best_params_)

# MAE for the best R² model
best_log_mae = -log_search.cv_results_['mean_test_mae'][log_search.best_index_]
print("Best MAE (Log):", best_log_mae)

# All results as DataFrame
log_results = pd.DataFrame(log_search.cv_results_)
log_results[['params', 'mean_test_r2', 'mean_test_mae']].sort_values('mean_test_mae', ascending=False).head(10)

Best R² (Log): -0.48946882746447656
Best Params (Log): {'regressor__regressor__colsample_bytree': 0.8, 'regressor__regressor__gamma': 0, 'regressor__regressor__learning_rate': 0.1, 'regressor__regressor__max_depth': 6, 'regressor__regressor__n_estimators': 300, 'regressor__regressor__subsample': 1.0}
Best MAE (Log): 0.48946882746447656


Unnamed: 0,params,mean_test_r2,mean_test_mae
29,{'regressor__regressor__colsample_bytree': 0.8...,0.831391,-0.489469
27,{'regressor__regressor__colsample_bytree': 0.8...,0.830359,-0.493967
16,{'regressor__regressor__colsample_bytree': 0.8...,0.818584,-0.495839
11,{'regressor__regressor__colsample_bytree': 0.8...,0.836003,-0.495962
100,{'regressor__regressor__colsample_bytree': 1.0...,0.819629,-0.496323
98,{'regressor__regressor__colsample_bytree': 1.0...,0.821096,-0.49736
14,{'regressor__regressor__colsample_bytree': 0.8...,0.819014,-0.497458
28,{'regressor__regressor__colsample_bytree': 0.8...,0.820181,-0.498485
17,{'regressor__regressor__colsample_bytree': 0.8...,0.818007,-0.49853
15,{'regressor__regressor__colsample_bytree': 0.8...,0.817903,-0.500219


In [97]:
# Best R² score and parameters
print("Best R² (Box):", box_search.best_score_)
print("Best Params (Box):", box_search.best_params_)

# MAE for the best R² model
best_box_mae = -box_search.cv_results_['mean_test_mae'][box_search.best_index_]
print("Best MAE (Box):", best_box_mae)

# All results as DataFrame
box_results = pd.DataFrame(box_search.cv_results_)
box_results[['params', 'mean_test_r2', 'mean_test_mae']].sort_values('mean_test_mae', ascending= False).head(10)

Best R² (Box): -0.4935876053990851
Best Params (Box): {'regressor__regressor__colsample_bytree': 1.0, 'regressor__regressor__gamma': 0, 'regressor__regressor__learning_rate': 0.1, 'regressor__regressor__max_depth': 6, 'regressor__regressor__n_estimators': 300, 'regressor__regressor__subsample': 1.0}
Best MAE (Box): 0.4935876053990851


Unnamed: 0,params,mean_test_r2,mean_test_mae
101,{'regressor__regressor__colsample_bytree': 1.0...,0.826746,-0.493588
28,{'regressor__regressor__colsample_bytree': 0.8...,0.823202,-0.495185
17,{'regressor__regressor__colsample_bytree': 0.8...,0.816466,-0.495599
16,{'regressor__regressor__colsample_bytree': 0.8...,0.814998,-0.495845
29,{'regressor__regressor__colsample_bytree': 0.8...,0.821507,-0.496941
14,{'regressor__regressor__colsample_bytree': 0.8...,0.815293,-0.497448
99,{'regressor__regressor__colsample_bytree': 1.0...,0.826412,-0.497489
15,{'regressor__regressor__colsample_bytree': 0.8...,0.816251,-0.497872
10,{'regressor__regressor__colsample_bytree': 0.8...,0.826863,-0.499432
26,{'regressor__regressor__colsample_bytree': 0.8...,0.825111,-0.499499


**Predictions**

In [48]:
X.iloc[0].values

array(['flat', np.int64(36), np.int64(3), np.int64(2), '2', 'Low Floor',
       'New Property', np.float64(850.0), np.int64(0), np.int64(0),
       'unfurnished', 'Low'], dtype=object)

In [105]:
best_model = log_search.best_estimator_
final_pipe = log_search.best_params_

In [102]:
best_model

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,regressor,"XGBRegressor(...ree=None, ...)"
,transformer,PowerTransfor...hod='box-cox')
,func,
,inverse_func,
,check_inverse,True

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,1.0
,device,
,early_stopping_rounds,
,enable_categorical,False

0,1,2
,method,'box-cox'
,standardize,True
,copy,True


In [107]:
log_search.best_params_

{'regressor__regressor__colsample_bytree': 0.8,
 'regressor__regressor__gamma': 0,
 'regressor__regressor__learning_rate': 0.1,
 'regressor__regressor__max_depth': 6,
 'regressor__regressor__n_estimators': 300,
 'regressor__regressor__subsample': 1.0}

In [100]:
data = [['house', 102, 4, 3, '3+', 'New Property', 2750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,house,102,4,3,3+,New Property,2750,0,0,unfurnished,Low,Low Floor


In [101]:
best_model.predict(one_df)

array([2.969334], dtype=float32)

**Exporting the model**

In [106]:
columns_to_encode = [0, 1, 2, 3, 4, 5, 6, 10, 11]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), [7, 8, 9]),
        ('cat', OrdinalEncoder(
            handle_unknown='use_encoded_value',
            unknown_value=-1
        ), columns_to_encode)
    ],
    remainder='passthrough'
)

In [108]:
final_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', TransformedTargetRegressor(
        regressor=XGBRegressor(
            n_estimators=300,          # From best_params
            max_depth=6,               # From best_params
            learning_rate=0.1,         # From best_params
            subsample=1.0,             # From best_params
            colsample_bytree=0.8,      # From best_params
            gamma=0,                   # From best_params
            eval_metric='mae',
            random_state=42
        ),
        func=np.log1p,
        inverse_func=np.expm1
    ))
])

In [109]:
final_pipeline.fit(X, y)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,regressor,"XGBRegressor(...ree=None, ...)"
,transformer,
,func,<ufunc 'log1p'>
,inverse_func,<ufunc 'expm1'>
,check_inverse,True

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [110]:
import pickle

with open(r'C:\Users\aryan\OneDrive\Desktop\Capstone Project\Pickle\pipeline.pkl', 'wb') as file:
    pickle.dump(final_pipeline, file)

In [111]:
with open(r'C:\Users\aryan\OneDrive\Desktop\Capstone Project\Pickle\df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [118]:
df['luxury_category'].value_counts()

luxury_category
Low       1593
Medium    1380
High       580
Name: count, dtype: int64

In [119]:
print(df.dtypes)

property_type       object
sector               int64
price_per_sqft     float64
bedRoom              int64
bathroom             int64
balcony             object
floor_category      object
agePossession       object
built_up_area      float64
servant room         int64
store room           int64
furnishing_type     object
luxury_category     object
price              float64
dtype: object


In [120]:
X.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'floor_category', 'agePossession', 'built_up_area', 'servant room',
       'store room', 'furnishing_type', 'luxury_category'],
      dtype='object')

In [127]:
final_pipeline.predict([['flat', 37, 2, 2, '2', 'Low Floor', 'New Property', 1068, 0, 0, 'unfurnished', 'Low']])



array([0.75758743], dtype=float32)