# Imports

In [1]:
from IPython.display import display, HTML

def displayHorizontalDataframes(dfs, titles):
    """
    Display a list of pandas dataframes horizontally with titles on top.
    
    Parameters:
    dfs (list): List of pandas DataFrames.
    titles (list): List of titles (strings) corresponding to each DataFrame.
    """
    if len(dfs) != len(titles):
        raise ValueError("The number of dataframes and titles must be the same.")

    # Start a flex container to align content horizontally and center them.
    html = (
        "<div style='display: flex; justify-content: center; "
        "align-items: flex-start;'>"
    )
    
    # Iterate over the dataframes and their corresponding titles.
    for df, title in zip(dfs, titles):
        title = " " if not title else title
        html += (
            "<div style='margin-left: 20px; margin-right: 20px; text-align: center;'>"
            f"<h3>{title}</h3>"
            f"{df.to_html(classes='dataframe', border=1)}"
            "</div>"
        )
    
    html += "</div>"
    display(HTML(html))

In [2]:
import pandas as pd
# import plotly.express as px
# import plotly.graph_objects as go
# from ydata_profiling import ProfileReport
import numpy as np

dtype_mapping = {
    'propertyId': pd.StringDtype(),
    'localityName': 'category',
    'landMarks': pd.StringDtype(),
    'locality': pd.StringDtype(),
    'price': pd.Int64Dtype(),
    'nameOfSociety': pd.StringDtype(),
    'projectName': pd.StringDtype(),
    'carpetArea': pd.Int64Dtype(),
    'coveredArea': pd.Int64Dtype(),
    'carpetAreaSqft': pd.Int64Dtype(),
    'possessionStatus': pd.StringDtype(),
    'developerName': pd.StringDtype(),
    'flooringType': pd.StringDtype(),
    'floorNumber': pd.Int64Dtype(),
    'unitCountonFloor': pd.Int64Dtype(),
    'totalFloorNumber': pd.Int64Dtype(),
    'electricityStatus': pd.StringDtype(),
    'waterStatus': pd.StringDtype(),
    'longitude': pd.Float64Dtype(),
    'latitude': pd.Float64Dtype(),
    'transactionType': 'category',
    'facing': pd.StringDtype(),
    'ownershipType': pd.StringDtype(),
    'carParking': pd.StringDtype(),
    'furnished': 'category',
    'bedrooms': pd.Int64Dtype(),
    'bathrooms': pd.Int64Dtype(),
    'numberOfBalconied': pd.Int64Dtype(),
    'propertyType': 'category',
    'additionalRooms': pd.StringDtype(),
    'bookingAmountExact': pd.Int64Dtype(),
    'maintenanceChargesFrequency': 'category',
    'maintenanceCharges': pd.Int64Dtype(),
    'ageofcons': 'category',
    'isVerified': 'category',
    'listingTypeDesc': 'category',
    'premiumProperty': pd.BooleanDtype(),
    'noOfLifts': pd.Int64Dtype(),
    'propertyAmenities': pd.StringDtype(),
    'facilitiesDesc': pd.StringDtype(),
    'uuid': pd.StringDtype(),
    'flooringType_Vitrified': pd.BooleanDtype(),
    'flooringType_CeramicTiles': pd.BooleanDtype(),
    'flooringType_Marble': pd.BooleanDtype(),
    'flooringType_NormalTilesKotahStone': pd.BooleanDtype(),
    'flooringType_Granite': pd.BooleanDtype(),
    'flooringType_Wooden': pd.BooleanDtype(),
    'flooringType_Mosaic': pd.BooleanDtype(),
    'flooringType_Marbonite': pd.BooleanDtype(),
    'additionalRoom_PujaRoom': pd.BooleanDtype(),
    'additionalRoom_Study': pd.BooleanDtype(),
    'additionalRoom_Store': pd.BooleanDtype(),
    'additionalRoom_ServantRoom': pd.BooleanDtype(),
    'carParking_Open': pd.Int64Dtype(),
    'carParking_Covered': pd.Int64Dtype(),
    'ReservedParking': pd.BooleanDtype(),
}

COLUMNS_TO_DROP = [
    'coveredArea',
    'ReservedParking',
] + [
        'unitCountonFloor',
        'electricityStatus',
        'waterStatus',
        'facing',
        'bookingAmountExact',
        'isVerified',
        'listingTypeDesc',
        'maintenanceCharges',
        'maintenanceChargesFrequency',
        'latitude',
        'longitude',
        'carParking_Open',
        'carParking_Covered',
        'numberOfBalconied',
        'premiumProperty',
        'projectName',
        'nameOfSociety',
        'url',
        # 'uuid',
        'carpetAreaSqft',
        'noOfLifts',
        'ownershipType',
        'possessionStatus',
        'propertyType',

        'flooringType_Vitrified',
        'flooringType_CeramicTiles',
        'flooringType_Marble',
        'flooringType_NormalTilesKotahStone',
        'flooringType_Granite',
        'flooringType_Wooden',
        'flooringType_Mosaic',
        'flooringType_Marbonite',

        'additionalRoom_PujaRoom',
        'additionalRoom_Study',
        'additionalRoom_Store',
        'additionalRoom_ServantRoom',
        
        'landMarks', 
        'locality', 
        'developerName',]

################################################################################
# ONLY USING THE RAW SETs, NOT IMPUTED SET
################################################################################
df_train = pd.read_csv(
    'Data/train.csv',
    dtype = dtype_mapping,
    index_col=0
)
df_train.drop(COLUMNS_TO_DROP, axis=1, inplace=True)

df_test = pd.read_csv(
    'Data/test.csv',
    dtype = dtype_mapping,
    index_col=0
)
df_test.drop(COLUMNS_TO_DROP, axis=1, inplace=True)

################################################################################
# DROPPING ALL ROWS WITH MISSING VALUES
################################################################################

print("Train Set Null values: ", df_train.isna().sum(), '\n')
print("Test Set Null values: ", df_test.isna().sum(), '\n')

df_train.dropna(axis=0, inplace=True)
df_test.dropna(axis=0, inplace=True)

Train Set Null values:  localityName           0
price                  0
carpetArea          3764
floorNumber            0
totalFloorNumber       0
transactionType        0
furnished             37
bedrooms               0
bathrooms              0
ageofcons           2571
dtype: int64 

Test Set Null values:  localityName          0
price                 0
carpetArea          977
floorNumber           0
totalFloorNumber      0
transactionType       0
furnished            14
bedrooms              0
bathrooms             0
ageofcons           693
dtype: int64 



In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13983 entries, 65067453 to 76736011
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   localityName      13983 non-null  category
 1   price             13983 non-null  Int64   
 2   carpetArea        13983 non-null  Int64   
 3   floorNumber       13983 non-null  Int64   
 4   totalFloorNumber  13983 non-null  Int64   
 5   transactionType   13983 non-null  category
 6   furnished         13983 non-null  category
 7   bedrooms          13983 non-null  Int64   
 8   bathrooms         13983 non-null  Int64   
 9   ageofcons         13983 non-null  category
dtypes: Int64(6), category(4)
memory usage: 925.2 KB


In [4]:
df_train['localityName'].nunique()

224

In [5]:
df_test['localityName'].nunique()

215

In [6]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3469 entries, 75987143 to 77784815
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   localityName      3469 non-null   category
 1   price             3469 non-null   Int64   
 2   carpetArea        3469 non-null   Int64   
 3   floorNumber       3469 non-null   Int64   
 4   totalFloorNumber  3469 non-null   Int64   
 5   transactionType   3469 non-null   category
 6   furnished         3469 non-null   category
 7   bedrooms          3469 non-null   Int64   
 8   bathrooms         3469 non-null   Int64   
 9   ageofcons         3469 non-null   category
dtypes: Int64(6), category(4)
memory usage: 237.4 KB


# Feature Encoding

In [7]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, TargetEncoder

# Assume that df_train and df_test are your already cleaned and imputed datasets.
X_train = df_train.drop("price", axis=1)
y_train = df_train["price"]

X_test = df_test.drop("price", axis=1)
y_test = df_test["price"]

# List of numeric features
numeric_cols = [
    "carpetArea",
    "floorNumber",
    "totalFloorNumber",
    "bedrooms",
    "bathrooms",
]

# For the two features that will be encoded differently:
cat_diff_cols = ["localityName", "transactionType"]

# The remaining categorical columns that are on an inherently ordinal scale.
# Note: even though localityName might appear here, you may choose a different
# encoding strategy depending on its cardinality. In this code, we are treating
# it specially in the one-hot transformation.
ordinal_cols = ["furnished", "ageofcons"]


# You can now proceed to train your models using X_train_linear / X_train_tree and 
# evaluate using X_test_linear / X_test_tree.


## Linear

In [8]:
# =============================================================================
# Pipeline for linear models
#   - Numerical features: standard scaled.
#   - For transactionType, furnished: one-hot encoded.
#   - For ordinal_cols (localityName, ageofcons): ordinal-encoded
#     and then scaled (so that all features are on a similar scale).
# =============================================================================

# Define transformers
numeric_transformer = StandardScaler()

onehot_transformer = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# Here we create a pipeline that first ordinal-encodes then scales the result.
furnished_order = ['Unfurnished', 'Semi-Furnished', 'Furnished']
ordinal_transformer_furnished = Pipeline(
    steps=[
        ("ordinal", OrdinalEncoder(categories=[furnished_order])),
        ("scaler", StandardScaler()),
    ]
)

# # Here we create a pipeline that first ordinal-encodes then scales the result.
# ordinal_transformer_rs = Pipeline(
#     steps=[
#         ("ordinal", OrdinalEncoder()),
#         ("scaler", StandardScaler()),
#     ]
# )

# Here we create a pipeline that first ordinal-encodes then scales the result.
age_order = [
    'Under Construction',  # first: youngest / newest state
    'New Construction',
    'Less than 5 years',
    '5 to 10 years',
    '10 to 15 years',
    '15 to 20 years',
    'Above 20 years'       # last: oldest
]
ordinal_transformer_ageofcons = Pipeline(
    steps=[
        ("ordinal", OrdinalEncoder(categories=[age_order])),
        ("scaler", StandardScaler()),
    ]
)

# Create the ColumnTransformer for the linear pipeline.
lin_preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("onehot", onehot_transformer, cat_diff_cols),
        ("ord-furnished", ordinal_transformer_furnished, ["furnished"]),
        # ("ord-reservedparking", ordinal_transformer_rs, ["ReservedParking"]),
        ("ord-ageofcons", ordinal_transformer_ageofcons, ["ageofcons"]),
    ]
)

# The overall pipeline (here you could add a linear model as the final estimator)
lin_pipeline = Pipeline(steps=[("preprocessor", lin_preprocessor)])

# Now transform the training features for the linear model:
X_train_linear = lin_pipeline.fit_transform(X_train)
X_test_linear = lin_pipeline.transform(X_test)

## Tree (numeric only)

In [9]:
# =============================================================================
# Pipeline for tree-based models
#   - Numerical features: standard scaled.
#   - For all categorical features (transactionType, furnished, ReservedParking,
#     localityName, ageofcons): ordinal-encoded and then scaled.
# =============================================================================

# Create a pipeline for encoding the categorical features as ordinal then scaling.
tree_cat_transformer = Pipeline(
    steps=[
        ("ordinal", OrdinalEncoder()),
        ("scaler", StandardScaler()),
    ]
)

tree_preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("ord", tree_cat_transformer, cat_diff_cols),
        ("ord-furnished", ordinal_transformer_furnished, ["furnished"]),
        # ("ord-reservedparking", ordinal_transformer_rs, ["ReservedParking"]),
        ("ord-ageofcons", ordinal_transformer_ageofcons, ["ageofcons"]),
    ]
)

tree_pipeline = Pipeline(steps=[("preprocessor", tree_preprocessor)])
# Now transform the training features for the tree models:

X_train_tree = tree_pipeline.fit_transform(X_train)
X_test_tree = tree_pipeline.transform(X_test)

## Tree (numeric and categorical)

In [10]:
# Here we create a pipeline that first ordinal-encodes then scales the result.
furnished_order = ['Unfurnished', 'Semi-Furnished', 'Furnished']
ordinal_transformer_furnished = Pipeline(
    steps=[
        ("ordinal", OrdinalEncoder(categories=[furnished_order])),
        ("scaler", StandardScaler()),
    ]
)

# Here we create a pipeline that first ordinal-encodes then scales the result.
age_order = [
    'Under Construction',  # first: youngest / newest state
    'New Construction',
    'Less than 5 years',
    '5 to 10 years',
    '10 to 15 years',
    '15 to 20 years',
    'Above 20 years'       # last: oldest
]
ordinal_transformer_ageofcons = Pipeline(
    steps=[
        ("ordinal", OrdinalEncoder(categories=[age_order])),
        ("scaler", StandardScaler()),
    ]
)

tree_preprocessor_gb = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("passthrough", "passthrough", cat_diff_cols),
        ("ord-furnished", ordinal_transformer_furnished, ["furnished"]),
        # ("ord-reservedparking", ordinal_transformer_rs, ["ReservedParking"]),
        ("ord-ageofcons", ordinal_transformer_ageofcons, ["ageofcons"]),
    ]
)

tree_pipeline_gb = Pipeline(steps=[("preprocessor", tree_preprocessor_gb)])
# Now transform the training features for the tree models:
X_train_gb = tree_pipeline_gb.fit_transform(X_train)
X_test_gb = tree_pipeline_gb.transform(X_test)

## Tree + Target Encoding

In [11]:
# Here we create a pipeline that first ordinal-encodes then scales the result.
furnished_order = ['Unfurnished', 'Semi-Furnished', 'Furnished']
ordinal_transformer_furnished = Pipeline(
    steps=[
        ("ordinal", OrdinalEncoder(categories=[furnished_order])),
        ("scaler", StandardScaler()),
    ]
)

target_encoder_cats = Pipeline(
    steps=[
        (
            "target_encoder",
            TargetEncoder(
                categories='auto',         # Auto-detect categories
                target_type='continuous',  # Specify regression target
                smooth='auto',             # Use empirical Bayes smoothing
                cv=5,                      # Use 5-fold internal cross-fitting
                shuffle=True,              # Shuffle data before folding
                random_state=42            # For reproducible fold splits
            )
        ),
        ("scaler", StandardScaler()),
    ]
)

# Here we create a pipeline that first ordinal-encodes then scales the result.
age_order = [
    'Under Construction',  # first: youngest / newest state
    'New Construction',
    'Less than 5 years',
    '5 to 10 years',
    '10 to 15 years',
    '15 to 20 years',
    'Above 20 years'       # last: oldest
]
ordinal_transformer_ageofcons = Pipeline(
    steps=[
        ("ordinal", OrdinalEncoder(categories=[age_order])),
        ("scaler", StandardScaler()),
    ]
)

tree_preprocessor_target = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat_target", target_encoder_cats, cat_diff_cols),
        ("ord-furnished", ordinal_transformer_furnished, ["furnished"]),
        ("ord-ageofcons", ordinal_transformer_ageofcons, ["ageofcons"]),
    ]
)

tree_pipeline_target = Pipeline(steps=[("preprocessor", tree_preprocessor_target)])
# Now transform the training features for the tree models:
X_train_target = tree_pipeline_target.fit_transform(X_train, y_train)
X_test_target = tree_pipeline_target.transform(X_test)

In [12]:
print("Linear model feature shape: train:", X_train_linear.shape)
print("Tree model feature shape: train:", X_train_tree.shape)
print("Tree model for GB feature shape: train:", X_train_gb.shape)

Linear model feature shape: train: (13983, 233)
Tree model feature shape: train: (13983, 9)
Tree model for GB feature shape: train: (13983, 9)


# Model Training

## Defining Metrics and Flow

In [13]:
import numpy as np
import pandas as pd
import warnings

from sklearn.metrics import (mean_absolute_error,
                             mean_absolute_percentage_error,
                             r2_score,
                             make_scorer)
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import clone

import xgboost as xgb
from catboost import CatBoostRegressor
import lightgbm as lgb

# Ignore warnings for cleaner output
warnings.filterwarnings("ignore")

# ---------------------------------------------
# Custom adjusted R2 scorer
# ---------------------------------------------
def adjusted_r2_scoring(estimator, X, y):
    """
    Scoring function that computes adjusted R2.
    """
    y_pred = estimator.predict(X)
    base_r2 = r2_score(y, y_pred)
    n = len(y)
    p = X.shape[1]
    if (n - p - 1) == 0:
        return base_r2
    adj_r2 = 1 - (1 - base_r2) * ((n - 1) / (n - p - 1))
    print(f"Adjusted R2 score: {adj_r2:.4f}")
    return adj_r2


adj_r2_scorer = make_scorer(adjusted_r2_scoring, greater_is_better=True)

# ---------------------------------------------
# Define scoring dictionary for GridSearchCV.
# Note: For MAE and MAPE, we use the negative built-in scorers
# so that “higher” is better.
# ---------------------------------------------
scoring = {
    "MAE": "neg_mean_absolute_error",
    "MAPE": "neg_mean_absolute_percentage_error",
    "R2": "r2",
    # "adj_R2": adj_r2_scorer,
}

# Global dictionary to store results
results_dict = {}

# ---------------------------------------------
# Function to run GridSearchCV and store metrics
# ---------------------------------------------
def run_grid_search(model, param_grid, X_train, y_train, X_test, y_test, 
                    model_name):
    """
    Performs grid search with the given estimator and parameter grid, 
    computes cross validation and test metrics, then stores the results 
    in the global dictionary.
    """
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=scoring,
        cv=5,
        refit="MAE",  # refit based on neg MAE (i.e. lower MAE is higher -MAE)
        return_train_score=True,
        n_jobs=-1,
    )
    if model not in ["LightGBM"]: 
        grid_search.fit(X_train, y_train)
    else:
        grid_search.fit(X_train, y_train, categorical_feature = ['localityName', 'transactionType'])
    best_index = grid_search.best_index_
    # Note: The built-in scorers return negative error values.
    cv_scores = {
        "MAE": -grid_search.cv_results_["mean_test_MAE"][best_index],
        "MAPE": -grid_search.cv_results_["mean_test_MAPE"][best_index],
        "R2": grid_search.cv_results_["mean_test_R2"][best_index],
        # "adj_R2": grid_search.cv_results_["mean_test_adj_R2"][best_index],
    }

    # Evaluate on test data
    y_pred = grid_search.best_estimator_.predict(X_test)
    test_mae = np.abs(y_test - y_pred)
    test_mape = (np.abs(y_test - y_pred) / (np.abs(y_test) + 1e-10))*100
    test_r2 = r2_score(y_test, y_pred)
    n_test = len(y_test)
    p_test = X_test.shape[1]
    test_adj_r2 = (
        1 - (1 - test_r2) * ((n_test - 1) / (n_test - p_test - 1))
        if (n_test - p_test - 1) != 0
        else test_r2
    )

    results_dict[model_name] = {
        "best_params": grid_search.best_params_,
        "cv_scores": cv_scores,
        "test_scores": {
            "MAE": test_mae,
            "MAPE": test_mape,
            "R2": test_r2,
            "adj_R2": test_adj_r2,
        },
    }

    return grid_search

# ---------------------------------------------
# Function to run the passed model with default settings.
# ---------------------------------------------
def run_default_model(model, X_train, y_train, X_test, y_test, model_name):
    """
    Trains the provided model with default parameters on the training set
    and evaluates it on the test set. The evaluation metrics are then stored
    in the global results_dict under the key '<model_name>_default'.
    """
    # Create a fresh copy of the model using clone()
    default_model = clone(model)

    # If the model is CatBoost or LightGBM, handle categorical features.
    if model_name == 'LightGBM':
        default_model.fit(X_train, y_train, categorical_feature=["localityName", 'transactionType'])
    else:
        default_model.fit(X_train, y_train)

    y_pred = default_model.predict(X_test)
    test_mae = np.abs(y_test - y_pred)
    test_mape = (np.abs(y_test - y_pred) / (np.abs(y_test) + 1e-10)) * 100
    test_r2 = r2_score(y_test, y_pred)

    n_test = len(y_test)
    p_test = X_test.shape[1]
    test_adj_r2 = (
        1 - (1 - test_r2) * ((n_test - 1) / (n_test - p_test - 1))
        if (n_test - p_test - 1) != 0
        else test_r2
    )

    if results_dict.get(model_name) is None:
        results_dict[model_name] = {'default':{}}
    results_dict[model_name]['default'] = {
        "test_scores": {
            "MAE": test_mae,
            "MAPE": test_mape,
            "R2": test_r2,
            "adj_R2": test_adj_r2,
        }
    }
# ===============================================================
# Use the appropriate training/testing sets according to the model:
# 1. Linear, ElasticNet, and KNN will use the "linear" set.
# 2. Decision Tree, Random Forest, XGBoost, and XGB RF will use the
#    "tree" set.
# 3. CatBoost and LightGBM will use the "GB" set.
# ===============================================================

## Models

### Linear Regression

In [27]:
# 1. Linear Regression (no hyperparameter tuning required)
lin_reg = LinearRegression()
# No hyperparameters to tune; we use an empty grid
run_grid_search(
    lin_reg, {}, X_train_linear, y_train, X_test_linear, y_test,
    "LinearRegression"
)
run_default_model(
    lin_reg, X_train_linear, y_train, X_test_linear, y_test, "LinearRegression"
)

### Elastic Net

In [28]:
# 2. ElasticNet
elastic_net = ElasticNet(random_state=42, max_iter=10000)
param_grid_en = {
    "alpha": [0.1, 0.5, 0.75, 1.0, 2.0, 2.5, 5.0, 10.0],
    "l1_ratio": [0.1, 0.2, 0.4, 0.6, 0.8, 0.9],
}
grid_search = run_grid_search(
    elastic_net, param_grid_en, X_train_linear, y_train, X_test_linear, y_test,
    "ElasticNet"
)
run_default_model(
    elastic_net, X_train_linear, y_train, X_test_linear, y_test, "ElasticNet"
)

### Decision Tree 

In [29]:
# 4. Decision Tree Regressor
dt = DecisionTreeRegressor(random_state=42)
param_grid_dt = {
    "max_depth": [None, 5, 10, 20, 25, 35, 50],
    "min_samples_split": [2, 5, 10, 15],
    "min_samples_leaf": [1, 2, 4, 6],
}
run_grid_search(
    dt, param_grid_dt, X_train_tree, y_train, X_test_tree, y_test,
    "DecisionTree"
)
run_default_model(
    dt, X_train_tree, y_train, X_test_tree, y_test, "DecisionTree"
)

### XGBoost Regressor

#### Linear Data

In [30]:
# # 6. XGBoost Regressor
# xgb_reg = xgb.XGBRegressor(
#     objective="reg:squarederror",
#     random_state=42,
#     verbosity=0,
# )
# param_grid_xgb = {
#     "n_estimators": [100, 150, 200, 250, 300, 500],
#     "max_depth": [3, 5, 7],
#     "max_leaves": [0, 10, 25, 50],
#     "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3, 0.5],
#     "subsample": [0.8, 1.0],
# }
# grid_search = run_grid_search(
#     xgb_reg, param_grid_xgb, X_train_linear, y_train, X_test_linear, y_test,
#     "XGBoostRegressor_Linear"
# )
# run_default_model(
#     xgb_reg, X_train_linear, y_train, X_test_linear, y_test, "XGBoostRegressor_Linear"
# )

#### Tree Data

In [14]:
# 6. XGBoost Regressor
xgb_reg = xgb.XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
    verbosity=0,
)
param_grid_xgb = {
    "n_estimators": [100, 150, 200, 250, 300, 500], # 
    "max_depth": [3, 5, 7],
    "max_leaves": [0, 10, 25, 50],
    "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3, 0.5],
    "subsample": [0.8, 1.0],
}
grid_search = run_grid_search(
    xgb_reg, param_grid_xgb, X_train_tree, y_train, X_test_tree, y_test,
    "XGBoostRegressor"
)
run_default_model(
    xgb_reg, X_train_tree, y_train, X_test_tree, y_test, "XGBoostRegressor"
)

#### Tree + Target Encoded

In [32]:
# 6. XGBoost Regressor
xgb_reg = xgb.XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
    verbosity=0,
)
param_grid_xgb = {
    "n_estimators": [100, 150, 200, 250, 300, 500],
    "max_depth": [3, 5, 7],
    "max_leaves": [0, 10, 25, 50],
    "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3, 0.5],
    "subsample": [0.8, 1.0],
}
grid_search = run_grid_search(
    xgb_reg, param_grid_xgb, X_train_target, y_train, X_test_target, y_test,
    "XGBoostRegressor_Target"
)
run_default_model(
    xgb_reg, X_train_target, y_train, X_test_target, y_test, "XGBoostRegressor_Target"
)

In [32]:
pd.DataFrame(X_train_linear).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13983 entries, 0 to 13982
Columns: 233 entries, 0 to 232
dtypes: float64(233)
memory usage: 24.9 MB


### Random Forest

In [33]:
# 5. Random Forest Regressor
rf = RandomForestRegressor(random_state=42)
param_grid_rf = {
    "n_estimators": [100, 150, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
}
run_grid_search(
    rf, param_grid_rf, X_train_tree, y_train, X_test_tree, y_test,
    "RandomForest"
)
run_default_model(
    rf, X_train_tree, y_train, X_test_tree, y_test, "RandomForest"
)

### KNN

In [34]:
# 3. K-Nearest Neighbors (KNN)
knn = KNeighborsRegressor()
param_grid_knn = {
    "n_neighbors": [10, 20, 30, 40, 50],
    "weights": ["uniform", "distance"],
    "algorithm": ["ball_tree", "kd_tree", "brute"],
    "p": [1, 2],
}
#run_grid_search(
#    knn, param_grid_knn, X_train_linear, y_train, X_test_linear, y_test, "KNN"
#)
run_default_model(
    knn, X_train_linear, y_train, X_test_linear, y_test, "KNN"
)

### XGBoost Random Forest

In [35]:
# 7. XGBoost Random Forest Regressor
xgb_rf = xgb.XGBRFRegressor(
    random_state=42,
    verbosity=0,
)
param_grid_xgb_rf = {
    "n_estimators": [100, 150, 200],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.2],
    "subsample": [0.8, 1.0],
}
run_grid_search(
    xgb_rf, param_grid_xgb_rf, X_train_tree, y_train, X_test_tree, y_test,
    "XGBoostRFRegressor"
)
run_default_model(
    xgb_rf, X_train_tree, y_train, X_test_tree, y_test, "XGBoostRFRegressor"
)

### LightGBM

#### With Tree dataset

In [15]:
# 9. LightGBM Regressor
lgbm_reg = lgb.LGBMRegressor(random_state=42)
param_grid_lgb = {
    "n_estimators": [100, 150, 200, 300, 500],
    "max_depth": [-1, 3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.2],
}

run_grid_search(
    lgbm_reg, param_grid_lgb, X_train_tree, y_train, X_test_tree, y_test, 
    "LightGBM_encoded"
)
run_default_model(
    lgbm_reg, X_train_tree, y_train, X_test_tree, y_test, "LightGBM_encoded"
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000820 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 588
[LightGBM] [Info] Number of data points in the train set: 13983, number of used features: 9
[LightGBM] [Info] Start training from score 11030358.469284
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000607 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 588
[LightGBM] [Info] Number of data points in the train set: 13983, number of used features: 9
[LightGBM] [Info] Start training from score 11030358.469284


#### Tree + Target

In [33]:
# 9. LightGBM Regressor
lgbm_reg = lgb.LGBMRegressor(random_state=42)
param_grid_lgb = {
    "n_estimators": [100, 150, 200, 300, 500],
    "max_depth": [-1, 3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.2],
}

run_grid_search(
    lgbm_reg, param_grid_lgb, X_train_target, y_train, X_test_target, y_test,
    "LightGBM_target"
)
run_default_model(
    lgbm_reg, X_train_target, y_train, X_test_target, y_test, "LightGBM_target"
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001420 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 628
[LightGBM] [Info] Number of data points in the train set: 13983, number of used features: 9
[LightGBM] [Info] Start training from score 11030358.469284
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000265 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 628
[LightGBM] [Info] Number of data points in the train set: 13983, number of used features: 9
[LightGBM] [Info] Start training from score 11030358.469284


#### With unencoded data

In [16]:
# 9. LightGBM Regressor
lgbm_reg = lgb.LGBMRegressor(random_state=42)
param_grid_lgb = {
    "n_estimators": [100, 150, 200, 300, 500],
    "max_depth": [-1, 3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.2],
}

mapping = {
    "carpetArea": pd.Float64Dtype(),
    "floorNumber": pd.Float64Dtype(),
    "totalFloorNumber": pd.Float64Dtype(),
    "bedrooms": pd.Float64Dtype(),
    "bathrooms": pd.Float64Dtype(),
    "localityName": 'category',
    "transactionType": 'category',
    "furnished": pd.Float64Dtype(),
    "ageofcons": pd.Float64Dtype(),
}
X_train_gb_df = pd.DataFrame(X_train_gb, columns = numeric_cols + cat_diff_cols + ordinal_cols).astype(mapping)
X_test_gb_df = pd.DataFrame(X_test_gb,  columns = numeric_cols + cat_diff_cols + ordinal_cols).astype(mapping)
y_train_df = pd.Series(y_train)
y_test_df = pd.Series(y_test)

run_grid_search(
    lgbm_reg, param_grid_lgb, X_train_gb_df, y_train_df, X_test_gb_df, y_test_df,
    "LightGBM"
)
run_default_model(
    lgbm_reg, X_train_gb_df, y_train_df, X_test_gb_df, y_test_df, "LightGBM"
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001170 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 588
[LightGBM] [Info] Number of data points in the train set: 13983, number of used features: 9
[LightGBM] [Info] Start training from score 11030358.469284
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000252 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 588
[LightGBM] [Info] Number of data points in the train set: 13983, number of used features: 9
[LightGBM] [Info] Start training from score 11030358.469284


### CatBoost

#### With Tree data

In [38]:
# # 8. CatBoost Regressor
# catboost_reg = CatBoostRegressor(random_state=42, verbose=0)
# param_grid_cat = {
#     "iterations": [250, 500],
#     "depth": [3, 5, 7],
#     "learning_rate": [0.01, 0.1, 0.2],
# }

# run_grid_search(
#     catboost_reg, param_grid_cat, X_train_tree, y_train, X_test_tree, y_test,
#     "CatBoost_encoded"
# )
# run_default_model(
#     catboost_reg, X_train_tree, y_train, X_test_tree, y_test, "CatBoost_encoded"
# )

#### With unencoded data

In [39]:
# 8. CatBoost Regressor
catboost_reg = CatBoostRegressor(random_state=42, verbose=0, cat_features = ['localityName', 'transactionType'])
param_grid_cat = {
    "iterations": [250, 500],
    "depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.2],
}
mapping = {
    "carpetArea": pd.Float64Dtype(),
    "floorNumber": pd.Float64Dtype(),
    "totalFloorNumber": pd.Float64Dtype(),
    "bedrooms": pd.Float64Dtype(),
    "bathrooms": pd.Float64Dtype(),
    "localityName": 'category',
    "transactionType": 'category',
    "furnished": pd.Float64Dtype(),
    "ageofcons": pd.Float64Dtype(),
}
X_train_gb_df = pd.DataFrame(X_train_gb, columns = numeric_cols + cat_diff_cols + ordinal_cols).astype(mapping)
X_test_gb_df = pd.DataFrame(X_test_gb,  columns = numeric_cols + cat_diff_cols + ordinal_cols).astype(mapping)
y_train_df = pd.Series(y_train)
y_test_df = pd.Series(y_test)
run_grid_search(
    catboost_reg, param_grid_cat, X_train_gb_df, y_train_df, X_test_gb_df, y_test_df,
    "CatBoost"
)
run_default_model(
    catboost_reg, X_train_gb_df, y_train_df, X_test_gb_df, y_test_df, "CatBoost"
)

## Aggregating Results

### Best Hyperparameter Configuration

In [20]:
print("Best hyperparameter config for all models:\n")
for model, values in results_dict.items():
    print(f"{model}: {values['best_params'] if values.get('best_params') else 'N/A'}")

with open("Results/BestHyperparameters.txt", "w") as f:
    for model, values in results_dict.items():
        f.write(f"{model}: {values['best_params'] if values.get('best_params') else 'N/A'}\n")

Best hyperparameter config for all models:

XGBoostRegressor: {'learning_rate': 0.2, 'max_depth': 5, 'max_leaves': 0, 'n_estimators': 500, 'subsample': 1.0}
LightGBM_encoded: {'learning_rate': 0.2, 'max_depth': -1, 'n_estimators': 500}
LightGBM: {'learning_rate': 0.1, 'max_depth': -1, 'n_estimators': 500}


### CV scores for best hyperparameters

In [21]:
temp = {
    "index":[],
    'MAE':[],
    'MAPE':[],
    'R2':[],
}
for model, values in results_dict.items():
    if values.get('cv_scores') is None:
        continue
    temp['index'].append(model)
    temp['MAE'].append(values['cv_scores']['MAE'])
    temp['MAPE'].append(values['cv_scores']['MAPE'])
    temp['R2'].append(values['cv_scores']['R2'])

cv_scores = pd.DataFrame(temp).set_index('index')
# cv_scores.sort_values(by='MAPE', ascending=True).to_csv("Results/cv_scores.csv", index=True)
cv_scores.sort_values(by='MAPE', ascending=True)

Unnamed: 0_level_0,MAE,MAPE,R2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LightGBM,1573628.0,0.150681,0.91792
XGBoostRegressor,1604337.0,0.154164,0.913222
LightGBM_encoded,1669862.0,0.158222,0.907645


### Test scores for Best Hyperparameter Config

In [22]:
custom_percentiles = sorted({0.05, 0.1, 0.2, 0.25, 0.5, 0.75, 0.8, 0.9, 0.95})

# Initialize dictionaries to store the summaries for each metric.
mae_data = {}
mape_data = {}
r2_data = {}
adj_r2_data = {}

# Loop over each model in your results_dict.
for model_name, model_results in results_dict.items():

    if model_results.get('test_scores') is None:
        continue

    # Get the dictionary that contains the test scores.
    test_scores = model_results['test_scores']
    
    # For each metric, compute the descriptive statistics using .describe.
    # pd.Series.describe returns a Series with index: count, mean, std, min,
    # the provided percentiles, and max.
    mae_data[model_name] = pd.Series(test_scores['MAE']).describe(
        percentiles=custom_percentiles
    )
    mape_data[model_name] = pd.Series(test_scores['MAPE']).describe(
        percentiles=custom_percentiles
    )
    r2_data[model_name] = pd.Series(test_scores['R2']).describe(
        percentiles=custom_percentiles
    )
    adj_r2_data[model_name] = pd.Series(test_scores['adj_R2']).describe(
        percentiles=custom_percentiles
    )

# Now, create dataframes for each metric with the model names as the index.
mae_df = pd.DataFrame.from_dict(mae_data, orient="index")
mape_df = pd.DataFrame.from_dict(mape_data, orient="index")
r2_df = pd.DataFrame.from_dict(r2_data, orient="index")
adj_r2_df = pd.DataFrame.from_dict(adj_r2_data, orient="index")

In [23]:
df_test['price'].describe()

count             3469.0
mean       11362982.3661
std      12005742.076629
min             900000.0
25%            5000000.0
50%            8000000.0
75%           13000000.0
max          150000000.0
Name: price, dtype: Float64

In [24]:
(mae_df/1000).sort_values(by="50%", ascending=True)

Unnamed: 0,count,mean,std,min,5%,10%,20%,25%,50%,75%,80%,90%,95%,max
LightGBM,3.469,1628.232156,2857.11233,0.378945,67.089893,128.568938,277.166458,354.043502,815.833692,1757.115044,2136.204932,3608.283156,5767.345292,51201.84233
LightGBM_encoded,3.469,1692.60327,3047.074635,1.651833,74.294989,145.276532,293.222155,366.032721,819.579929,1818.752514,2290.261989,3650.734544,5875.858319,54205.505606
XGBoostRegressor,3.469,1661.999798,3076.256228,0.118,70.56315,148.3034,289.48065,366.231,826.701,1741.529,2125.3636,3603.7492,6021.3564,76129.312


In [25]:
mape_df.sort_values(by="90%", ascending=True)

Unnamed: 0,count,mean,std,min,5%,10%,20%,25%,50%,75%,80%,90%,95%,max
LightGBM,3469.0,14.86841,15.453179,0.010827,0.941432,1.906193,3.716854,4.76563,10.458317,19.90411,22.589038,32.567333,42.610134,221.775446
XGBoostRegressor,3469.0,14.954389,14.688738,0.001124,1.016924,1.99436,4.112759,5.087438,10.823944,20.093266,22.752502,32.940537,41.498062,149.51059
LightGBM_encoded,3469.0,15.222631,14.939056,0.012609,1.042912,2.070999,4.105897,5.127227,10.929158,20.24935,23.341647,33.790781,42.922273,137.162541


In [28]:
r2_df.sort_values(by="50%", ascending=False)

Unnamed: 0,count,mean,std,min,5%,10%,20%,25%,50%,75%,80%,90%,95%,max
LightGBM,1.0,0.924968,,0.924968,0.924968,0.924968,0.924968,0.924968,0.924968,0.924968,0.924968,0.924968,0.924968,0.924968
LightGBM_encoded,1.0,0.915703,,0.915703,0.915703,0.915703,0.915703,0.915703,0.915703,0.915703,0.915703,0.915703,0.915703,0.915703
XGBoostRegressor,1.0,0.915176,,0.915176,0.915176,0.915176,0.915176,0.915176,0.915176,0.915176,0.915176,0.915176,0.915176,0.915176


### Test scores for models with default settings

In [25]:
custom_percentiles = sorted({0.05, 0.1, 0.2, 0.25, 0.5, 0.75, 0.8, 0.9, 0.95})

# Initialize dictionaries to store the summaries for each metric.
mae_data = {}
mape_data = {}
r2_data = {}
adj_r2_data = {}

# Loop over each model in your results_dict.
for model_name, model_results in results_dict.items():

    if model_results.get('default') is None:
        continue

    # Get the dictionary that contains the test scores.
    test_scores = model_results['default']['test_scores']
    
    # For each metric, compute the descriptive statistics using .describe.
    # pd.Series.describe returns a Series with index: count, mean, std, min,
    # the provided percentiles, and max.
    mae_data[model_name] = pd.Series(test_scores['MAE']).describe(
        percentiles=custom_percentiles
    )
    mape_data[model_name] = pd.Series(test_scores['MAPE']).describe(
        percentiles=custom_percentiles
    )
    r2_data[model_name] = pd.Series(test_scores['R2']).describe(
        percentiles=custom_percentiles
    )
    adj_r2_data[model_name] = pd.Series(test_scores['adj_R2']).describe(
        percentiles=custom_percentiles
    )

# Now, create dataframes for each metric with the model names as the index.
mae_df = pd.DataFrame.from_dict(mae_data, orient="index")
mape_df = pd.DataFrame.from_dict(mape_data, orient="index")
r2_df = pd.DataFrame.from_dict(r2_data, orient="index")
adj_r2_df = pd.DataFrame.from_dict(adj_r2_data, orient="index")

In [26]:
mape_df.T

Unnamed: 0,LightGBM_encoded,LightGBM,XGBoostRegressor
count,3469.0,3469.0,3469.0
mean,17.182531,15.412704,15.54568
std,17.035502,15.923893,15.204562
min,0.002895,0.004263,0.000357
5%,1.009988,0.888883,0.952234
10%,2.09049,1.946408,1.909942
20%,4.419424,4.040612,4.032719
25%,5.660052,5.233697,5.1875
50%,12.441651,10.938554,11.374674
75%,22.868418,20.428784,20.962187


In [49]:
mape_df.T

Unnamed: 0,LinearRegression,ElasticNet,DecisionTree,XGBoostRegressor,RandomForest,KNN,XGBoostRFRegressor,LightGBM,CatBoost
count,3469.0,3469.0,3469.0,3469.0,3469.0,3469.0,3469.0,3469.0,3469.0
mean,25.11517,34.262142,23.104223,15.54568,17.568837,19.408405,23.212094,15.412704,15.746216
std,28.513756,29.163628,27.685499,15.204562,18.203948,18.860335,21.429049,15.923893,14.928601
min,0.006817,0.032421,0.0,0.000357,0.00886,0.0,0.012537,0.004263,0.002835
5%,1.4909,2.317638,0.0,0.952234,0.9972,1.359868,1.689819,0.888883,1.157125
10%,2.963585,4.725553,1.836029,1.909942,1.943906,2.738911,3.28104,1.946408,2.228315
20%,5.791901,10.191055,4.651163,4.032719,4.119736,5.138404,6.664041,4.040612,4.164368
25%,7.421536,12.763087,6.060606,5.1875,5.347222,6.666667,8.331586,5.233697,5.268167
50%,17.233439,27.144656,15.151515,11.374674,12.071246,14.339623,17.548833,10.938554,11.756473
75%,33.038559,48.309059,30.666667,20.962187,23.39,26.575,31.234879,20.428784,21.70927


In [50]:
mae_df.T/1000

Unnamed: 0,LinearRegression,ElasticNet,DecisionTree,XGBoostRegressor,RandomForest,KNN,XGBoostRFRegressor,LightGBM,CatBoost
count,3.469,3.469,3.469,3.469,3.469,3.469,3.469,3.469,3.469
mean,2334.079651,3288.336918,2398.403256,1722.110519,1907.348031,2269.388371,2479.399542,1698.402172,1764.820522
std,3648.16913,5625.712411,4497.824488,3123.305261,3563.434435,4258.902518,4140.272174,2923.020635,3300.691407
min,0.647652,4.510008,0.0,0.05,0.3,0.0,1.424,0.490238,0.099219
5%,120.338622,210.251651,0.0,68.8882,73.28954,93.88,125.1402,63.392573,78.450068
10%,230.957491,433.048334,100.0,137.3796,155.758748,180.0,242.25825,133.437805,159.241238
20%,478.317498,842.916565,300.0,296.1532,308.8,355.312,465.2493,281.56692,303.058903
25%,622.376602,1044.965965,500.0,382.08525,387.55,449.999,612.48825,367.691271,384.221128
50%,1416.98713,2108.218918,1149.0,852.3895,926.5,1100.0,1358.7875,856.608597,886.741913
75%,2677.085296,3588.235378,2500.0,1778.85325,1978.0,2327.2,2741.653,1835.215068,1871.098977


In [51]:
mae_df.T/1000

Unnamed: 0,LinearRegression,ElasticNet,DecisionTree,XGBoostRegressor,RandomForest,KNN,XGBoostRFRegressor,LightGBM,CatBoost
count,3.469,3.469,3.469,3.469,3.469,3.469,3.469,3.469,3.469
mean,2334.079651,3288.336918,2398.403256,1722.110519,1907.348031,2269.388371,2479.399542,1698.402172,1764.820522
std,3648.16913,5625.712411,4497.824488,3123.305261,3563.434435,4258.902518,4140.272174,2923.020635,3300.691407
min,0.647652,4.510008,0.0,0.05,0.3,0.0,1.424,0.490238,0.099219
5%,120.338622,210.251651,0.0,68.8882,73.28954,93.88,125.1402,63.392573,78.450068
10%,230.957491,433.048334,100.0,137.3796,155.758748,180.0,242.25825,133.437805,159.241238
20%,478.317498,842.916565,300.0,296.1532,308.8,355.312,465.2493,281.56692,303.058903
25%,622.376602,1044.965965,500.0,382.08525,387.55,449.999,612.48825,367.691271,384.221128
50%,1416.98713,2108.218918,1149.0,852.3895,926.5,1100.0,1358.7875,856.608597,886.741913
75%,2677.085296,3588.235378,2500.0,1778.85325,1978.0,2327.2,2741.653,1835.215068,1871.098977


In [52]:
pd.Series(y_test).describe()

count             3469.0
mean       11362982.3661
std      12005742.076629
min             900000.0
25%            5000000.0
50%            8000000.0
75%           13000000.0
max          150000000.0
Name: price, dtype: Float64