# Imports

In [1]:
from IPython.display import display, HTML

def displayHorizontalDataframes(dfs, titles):
    """
    Display a list of pandas dataframes horizontally with titles on top.
    
    Parameters:
    dfs (list): List of pandas DataFrames.
    titles (list): List of titles (strings) corresponding to each DataFrame.
    """
    if len(dfs) != len(titles):
        raise ValueError("The number of dataframes and titles must be the same.")

    # Start a flex container to align content horizontally and center them.
    html = (
        "<div style='display: flex; justify-content: center; "
        "align-items: flex-start;'>"
    )
    
    # Iterate over the dataframes and their corresponding titles.
    for df, title in zip(dfs, titles):
        title = " " if not title else title
        html += (
            "<div style='margin-left: 20px; margin-right: 20px; text-align: center;'>"
            f"<h3>{title}</h3>"
            f"{df.to_html(classes='dataframe', border=1)}"
            "</div>"
        )
    
    html += "</div>"
    display(HTML(html))

In [2]:
import pandas as pd
# import plotly.express as px
# import plotly.graph_objects as go
# from ydata_profiling import ProfileReport
import numpy as np

dtype_mapping = {
    'propertyId': pd.StringDtype(),
    'localityName': 'category',
    'landMarks': pd.StringDtype(),
    'locality': pd.StringDtype(),
    'price': pd.Int64Dtype(),
    'nameOfSociety': pd.StringDtype(),
    'projectName': pd.StringDtype(),
    'carpetArea': pd.Int64Dtype(),
    'coveredArea': pd.Int64Dtype(),
    'carpetAreaSqft': pd.Int64Dtype(),
    'possessionStatus': pd.StringDtype(),
    'developerName': pd.StringDtype(),
    'flooringType': pd.StringDtype(),
    'floorNumber': pd.Int64Dtype(),
    'unitCountonFloor': pd.Int64Dtype(),
    'totalFloorNumber': pd.Int64Dtype(),
    'electricityStatus': pd.StringDtype(),
    'waterStatus': pd.StringDtype(),
    'longitude': pd.Float64Dtype(),
    'latitude': pd.Float64Dtype(),
    'transactionType': 'category',
    'facing': pd.StringDtype(),
    'ownershipType': pd.StringDtype(),
    'carParking': pd.StringDtype(),
    'furnished': 'category',
    'bedrooms': pd.Int64Dtype(),
    'bathrooms': pd.Int64Dtype(),
    'numberOfBalconied': pd.Int64Dtype(),
    'propertyType': 'category',
    'additionalRooms': pd.StringDtype(),
    'bookingAmountExact': pd.Int64Dtype(),
    'maintenanceChargesFrequency': 'category',
    'maintenanceCharges': pd.Int64Dtype(),
    'ageofcons': 'category',
    'isVerified': 'category',
    'listingTypeDesc': 'category',
    'premiumProperty': pd.BooleanDtype(),
    'noOfLifts': pd.Int64Dtype(),
    'propertyAmenities': pd.StringDtype(),
    'facilitiesDesc': pd.StringDtype(),
    'uuid': pd.StringDtype(),
    'flooringType_Vitrified': pd.BooleanDtype(),
    'flooringType_CeramicTiles': pd.BooleanDtype(),
    'flooringType_Marble': pd.BooleanDtype(),
    'flooringType_NormalTilesKotahStone': pd.BooleanDtype(),
    'flooringType_Granite': pd.BooleanDtype(),
    'flooringType_Wooden': pd.BooleanDtype(),
    'flooringType_Mosaic': pd.BooleanDtype(),
    'flooringType_Marbonite': pd.BooleanDtype(),
    'additionalRoom_PujaRoom': pd.BooleanDtype(),
    'additionalRoom_Study': pd.BooleanDtype(),
    'additionalRoom_Store': pd.BooleanDtype(),
    'additionalRoom_ServantRoom': pd.BooleanDtype(),
    'carParking_Open': pd.Int64Dtype(),
    'carParking_Covered': pd.Int64Dtype(),
    'ReservedParking': pd.BooleanDtype(),
}

COLUMNS_TO_DROP = [
    'coveredArea',
    # 'ReservedParking',
] + [
        'unitCountonFloor',
        'electricityStatus',
        'waterStatus',
        'facing',
        'bookingAmountExact',
        'isVerified',
        'listingTypeDesc',
        'maintenanceCharges',
        'maintenanceChargesFrequency',
        'latitude',
        'longitude',
        'carParking_Open',
        'carParking_Covered',
        'numberOfBalconied',
        'premiumProperty',
        'projectName',
        'nameOfSociety',
        'url',
        'uuid',
        'carpetAreaSqft',
        'noOfLifts',
        'ownershipType',
        'possessionStatus',
        'propertyType',

        'flooringType_Vitrified',
        'flooringType_CeramicTiles',
        'flooringType_Marble',
        'flooringType_NormalTilesKotahStone',
        'flooringType_Granite',
        'flooringType_Wooden',
        'flooringType_Mosaic',
        'flooringType_Marbonite',

        'additionalRoom_PujaRoom',
        'additionalRoom_Study',
        'additionalRoom_Store',
        'additionalRoom_ServantRoom',
        
        'landMarks', 
        'locality', 
        'developerName']

################################################################################
# ONLY USING THE RAW SETs, NOT IMPUTED SET
################################################################################
df_train = pd.read_csv(
    'Data/train.csv',
    dtype = dtype_mapping,
    index_col=0
)
df_train.drop(COLUMNS_TO_DROP, axis=1, inplace=True)

df_test = pd.read_csv(
    'Data/test.csv',
    dtype = dtype_mapping,
    index_col=0
)
df_test.drop(COLUMNS_TO_DROP, axis=1, inplace=True)

################################################################################
# DROPPING ALL ROWS WITH MISSING VALUES
################################################################################

df_train.dropna(axis=0, inplace=True)
df_test.dropna(axis=0, inplace=True)

ValueError: Unable to parse string "More than 10" at position 3698

In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17120 entries, 74208793 to 75703109
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   localityName      17120 non-null  category
 1   price             17120 non-null  Int64   
 2   carpetArea        17120 non-null  Int64   
 3   floorNumber       17120 non-null  Int64   
 4   totalFloorNumber  17120 non-null  Int64   
 5   transactionType   17120 non-null  category
 6   furnished         17120 non-null  category
 7   bedrooms          17120 non-null  Int64   
 8   bathrooms         17120 non-null  Int64   
 9   ageofcons         17120 non-null  category
dtypes: Int64(6), category(4)
memory usage: 1.1 MB


In [35]:
df_train['localityName'].nunique()

154

In [36]:
df_test['localityName'].nunique()

147

In [9]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4272 entries, 75658123 to 75682303
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   localityName      4272 non-null   category
 1   price             4272 non-null   Int64   
 2   carpetArea        4272 non-null   Int64   
 3   floorNumber       4272 non-null   Int64   
 4   totalFloorNumber  4272 non-null   Int64   
 5   transactionType   4272 non-null   category
 6   furnished         4272 non-null   category
 7   bedrooms          4272 non-null   Int64   
 8   bathrooms         4272 non-null   Int64   
 9   ageofcons         4272 non-null   category
dtypes: Int64(6), category(4)
memory usage: 414.3 KB


# Feature Encoding

In [10]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

# Assume that df_train and df_test are your already cleaned and imputed datasets.
X_train = df_train.drop("price", axis=1)
y_train = df_train["price"]

X_test = df_test.drop("price", axis=1)
y_test = df_test["price"]

# List of numeric features
numeric_cols = [
    "carpetArea",
    "floorNumber",
    "totalFloorNumber",
    "bedrooms",
    "bathrooms",
]

# For the two features that will be encoded differently:
cat_diff_cols = ["localityName", "transactionType"]

# The remaining categorical columns that are on an inherently ordinal scale.
# Note: even though localityName might appear here, you may choose a different
# encoding strategy depending on its cardinality. In this code, we are treating
# it specially in the one-hot transformation.
ordinal_cols = ["furnished", "ageofcons"]


# You can now proceed to train your models using X_train_linear / X_train_tree and 
# evaluate using X_test_linear / X_test_tree.


## Linear

In [11]:
# =============================================================================
# Pipeline for linear models
#   - Numerical features: standard scaled.
#   - For transactionType, furnished: one-hot encoded.
#   - For ordinal_cols (localityName, ageofcons): ordinal-encoded
#     and then scaled (so that all features are on a similar scale).
# =============================================================================

# Define transformers
numeric_transformer = StandardScaler()

onehot_transformer = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# Here we create a pipeline that first ordinal-encodes then scales the result.
furnished_order = ['Unfurnished', 'Semi-Furnished', 'Furnished']
ordinal_transformer_furnished = Pipeline(
    steps=[
        ("ordinal", OrdinalEncoder(categories=[furnished_order])),
        ("scaler", StandardScaler()),
    ]
)

# # Here we create a pipeline that first ordinal-encodes then scales the result.
# ordinal_transformer_rs = Pipeline(
#     steps=[
#         ("ordinal", OrdinalEncoder()),
#         ("scaler", StandardScaler()),
#     ]
# )

# Here we create a pipeline that first ordinal-encodes then scales the result.
age_order = [
    'Under Construction',  # first: youngest / newest state
    'New Construction',
    'Less than 5 years',
    '5 to 10 years',
    '10 to 15 years',
    '15 to 20 years',
    'Above 20 years'       # last: oldest
]
ordinal_transformer_ageofcons = Pipeline(
    steps=[
        ("ordinal", OrdinalEncoder(categories=[age_order])),
        ("scaler", StandardScaler()),
    ]
)

# Create the ColumnTransformer for the linear pipeline.
lin_preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("onehot", onehot_transformer, cat_diff_cols),
        ("ord-furnished", ordinal_transformer_furnished, ["furnished"]),
        # ("ord-reservedparking", ordinal_transformer_rs, ["ReservedParking"]),
        ("ord-ageofcons", ordinal_transformer_ageofcons, ["ageofcons"]),
    ]
)

# The overall pipeline (here you could add a linear model as the final estimator)
lin_pipeline = Pipeline(steps=[("preprocessor", lin_preprocessor)])

# Now transform the training features for the linear model:
X_train_linear = lin_pipeline.fit_transform(X_train)
X_test_linear = lin_pipeline.transform(X_test)

## Tree (numeric only)

In [12]:
# =============================================================================
# Pipeline for tree-based models
#   - Numerical features: standard scaled.
#   - For all categorical features (transactionType, furnished, ReservedParking,
#     localityName, ageofcons): ordinal-encoded and then scaled.
# =============================================================================

# Create a pipeline for encoding the categorical features as ordinal then scaling.
tree_cat_transformer = Pipeline(
    steps=[
        ("ordinal", OrdinalEncoder()),
        ("scaler", StandardScaler()),
    ]
)

tree_preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("ord", tree_cat_transformer, cat_diff_cols),
        ("ord-furnished", ordinal_transformer_furnished, ["furnished"]),
        # ("ord-reservedparking", ordinal_transformer_rs, ["ReservedParking"]),
        ("ord-ageofcons", ordinal_transformer_ageofcons, ["ageofcons"]),
    ]
)

tree_pipeline = Pipeline(steps=[("preprocessor", tree_preprocessor)])
# Now transform the training features for the tree models:

X_train_tree = tree_pipeline.fit_transform(X_train)
X_test_tree = tree_pipeline.transform(X_test)

## Tree (numeric and categorical)

In [13]:
# =============================================================================
# Pipeline for models which handle categorical features:
#   - Numerical features: standard scaled.
#   - For categorical features (furnished, ReservedParking,
#     , ageofcons): ordinal-encoded and then scaled.
#   - For categorical features (transactionType, localityName): Kept as is.
# =============================================================================

# Create a pipeline for encoding the categorical features as ordinal then scaling.

tree_preprocessor_gb = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("passthrough", "passthrough", cat_diff_cols),
        ("ord-furnished", ordinal_transformer_furnished, ["furnished"]),
        # ("ord-reservedparking", ordinal_transformer_rs, ["ReservedParking"]),
        ("ord-ageofcons", ordinal_transformer_ageofcons, ["ageofcons"]),
    ]
)

tree_pipeline_gb = Pipeline(steps=[("preprocessor", tree_preprocessor_gb)])
# Now transform the training features for the tree models:
X_train_gb = tree_pipeline_gb.fit_transform(X_train)
X_test_gb = tree_pipeline_gb.transform(X_test)

In [14]:
print("Linear model feature shape: train:", X_train_linear.shape)
print("Tree model feature shape: train:", X_train_tree.shape)
print("Tree model for GB feature shape: train:", X_train_gb.shape)

Linear model feature shape: train: (17120, 163)
Tree model feature shape: train: (17120, 9)
Tree model for GB feature shape: train: (17120, 9)


# Model Training

## Defining Metrics and Flow

In [16]:
import numpy as np
import pandas as pd
import warnings

from sklearn.metrics import (mean_absolute_error,
                             mean_absolute_percentage_error,
                             r2_score,
                             make_scorer)
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import clone

import xgboost as xgb
from catboost import CatBoostRegressor
import lightgbm as lgb

# Ignore warnings for cleaner output
warnings.filterwarnings("ignore")

# ---------------------------------------------
# Custom adjusted R2 scorer
# ---------------------------------------------
def adjusted_r2_scoring(estimator, X, y):
    """
    Scoring function that computes adjusted R2.
    """
    y_pred = estimator.predict(X)
    base_r2 = r2_score(y, y_pred)
    n = len(y)
    p = X.shape[1]
    if (n - p - 1) == 0:
        return base_r2
    adj_r2 = 1 - (1 - base_r2) * ((n - 1) / (n - p - 1))
    print(f"Adjusted R2 score: {adj_r2:.4f}")
    return adj_r2


adj_r2_scorer = make_scorer(adjusted_r2_scoring, greater_is_better=True)

# ---------------------------------------------
# Define scoring dictionary for GridSearchCV.
# Note: For MAE and MAPE, we use the negative built-in scorers
# so that “higher” is better.
# ---------------------------------------------
scoring = {
    "MAE": "neg_mean_absolute_error",
    "MAPE": "neg_mean_absolute_percentage_error",
    "R2": "r2",
    # "adj_R2": adj_r2_scorer,
}

# Global dictionary to store results
results_dict = {}

# ---------------------------------------------
# Function to run GridSearchCV and store metrics
# ---------------------------------------------
def run_grid_search(model, param_grid, X_train, y_train, X_test, y_test, 
                    model_name):
    """
    Performs grid search with the given estimator and parameter grid, 
    computes cross validation and test metrics, then stores the results 
    in the global dictionary.
    """
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=scoring,
        cv=5,
        refit="MAE",  # refit based on neg MAE (i.e. lower MAE is higher -MAE)
        return_train_score=True,
        n_jobs=-1,
    )
    if model not in ["LightGBM"]: 
        grid_search.fit(X_train, y_train)
    else:
        grid_search.fit(X_train, y_train, categorical_feature = ['localityName', 'transactionType'])
    best_index = grid_search.best_index_
    # Note: The built-in scorers return negative error values.
    cv_scores = {
        "MAE": -grid_search.cv_results_["mean_test_MAE"][best_index],
        "MAPE": -grid_search.cv_results_["mean_test_MAPE"][best_index],
        "R2": grid_search.cv_results_["mean_test_R2"][best_index],
        # "adj_R2": grid_search.cv_results_["mean_test_adj_R2"][best_index],
    }

    # Evaluate on test data
    y_pred = grid_search.best_estimator_.predict(X_test)
    test_mae = np.abs(y_test - y_pred)
    test_mape = (np.abs(y_test - y_pred) / (np.abs(y_test) + 1e-10))*100
    test_r2 = r2_score(y_test, y_pred)
    n_test = len(y_test)
    p_test = X_test.shape[1]
    test_adj_r2 = (
        1 - (1 - test_r2) * ((n_test - 1) / (n_test - p_test - 1))
        if (n_test - p_test - 1) != 0
        else test_r2
    )

    results_dict[model_name] = {
        "best_params": grid_search.best_params_,
        "cv_scores": cv_scores,
        "test_scores": {
            "MAE": test_mae,
            "MAPE": test_mape,
            "R2": test_r2,
            "adj_R2": test_adj_r2,
        },
    }

    return grid_search

# ---------------------------------------------
# Function to run the passed model with default settings.
# ---------------------------------------------
def run_default_model(model, X_train, y_train, X_test, y_test, model_name):
    """
    Trains the provided model with default parameters on the training set
    and evaluates it on the test set. The evaluation metrics are then stored
    in the global results_dict under the key '<model_name>_default'.
    """
    # Create a fresh copy of the model using clone()
    default_model = clone(model)

    # If the model is CatBoost or LightGBM, handle categorical features.
    if model_name == 'LightGBM':
        default_model.fit(X_train, y_train, categorical_feature=["localityName", 'transactionType'])
    else:
        default_model.fit(X_train, y_train)

    y_pred = default_model.predict(X_test)
    test_mae = np.abs(y_test - y_pred)
    test_mape = (np.abs(y_test - y_pred) / (np.abs(y_test) + 1e-10)) * 100
    test_r2 = r2_score(y_test, y_pred)

    n_test = len(y_test)
    p_test = X_test.shape[1]
    test_adj_r2 = (
        1 - (1 - test_r2) * ((n_test - 1) / (n_test - p_test - 1))
        if (n_test - p_test - 1) != 0
        else test_r2
    )

    if results_dict.get(model_name) is None:
        results_dict[model_name] = {'default':{}}
    results_dict[model_name]['default'] = {
        "test_scores": {
            "MAE": test_mae,
            "MAPE": test_mape,
            "R2": test_r2,
            "adj_R2": test_adj_r2,
        }
    }
# ===============================================================
# Use the appropriate training/testing sets according to the model:
# 1. Linear, ElasticNet, and KNN will use the "linear" set.
# 2. Decision Tree, Random Forest, XGBoost, and XGB RF will use the
#    "tree" set.
# 3. CatBoost and LightGBM will use the "GB" set.
# ===============================================================

## Models

### Linear Regression

In [17]:
# 1. Linear Regression (no hyperparameter tuning required)
lin_reg = LinearRegression()
# No hyperparameters to tune; we use an empty grid
run_grid_search(
    lin_reg, {}, X_train_linear, y_train, X_test_linear, y_test,
    "LinearRegression"
)
run_default_model(
    lin_reg, X_train_linear, y_train, X_test_linear, y_test, "LinearRegression"
)

### Elastic Net

In [18]:
# 2. ElasticNet
elastic_net = ElasticNet(random_state=42, max_iter=10000)
param_grid_en = {
    "alpha": [0.1, 0.5, 0.75, 1.0, 2.0, 2.5, 5.0, 10.0],
    "l1_ratio": [0.1, 0.2, 0.4, 0.6, 0.8, 0.9],
}
grid_search = run_grid_search(
    elastic_net, param_grid_en, X_train_linear, y_train, X_test_linear, y_test,
    "ElasticNet"
)
run_default_model(
    elastic_net, X_train_linear, y_train, X_test_linear, y_test, "ElasticNet"
)

### Decision Tree 

In [19]:
# 4. Decision Tree Regressor
dt = DecisionTreeRegressor(random_state=42)
param_grid_dt = {
    "max_depth": [None, 5, 10, 20, 25, 35, 50],
    "min_samples_split": [2, 5, 10, 15],
    "min_samples_leaf": [1, 2, 4, 6],
}
run_grid_search(
    dt, param_grid_dt, X_train_tree, y_train, X_test_tree, y_test,
    "DecisionTree"
)
run_default_model(
    dt, X_train_tree, y_train, X_test_tree, y_test, "DecisionTree"
)

### XGBoost Regressor

#### Linear Data

In [20]:
# # 6. XGBoost Regressor
# xgb_reg = xgb.XGBRegressor(
#     objective="reg:squarederror",
#     random_state=42,
#     verbosity=0,
# )
# param_grid_xgb = {
#     "n_estimators": [100, 150, 200, 250, 300, 500],
#     "max_depth": [3, 5, 7],
#     "max_leaves": [0, 10, 25, 50],
#     "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3, 0.5],
#     "subsample": [0.8, 1.0],
# }
# grid_search = run_grid_search(
#     xgb_reg, param_grid_xgb, X_train_linear, y_train, X_test_linear, y_test,
#     "XGBoostRegressor_Linear"
# )
# run_default_model(
#     xgb_reg, X_train_linear, y_train, X_test_linear, y_test, "XGBoostRegressor_Linear"
# )

#### Tree Data

In [21]:
# 6. XGBoost Regressor
xgb_reg = xgb.XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
    verbosity=0,
)
param_grid_xgb = {
    "n_estimators": [100, 150, 200, 250, 300, 500],
    "max_depth": [3, 5, 7],
    "max_leaves": [0, 10, 25, 50],
    "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3, 0.5],
    "subsample": [0.8, 1.0],
}
grid_search = run_grid_search(
    xgb_reg, param_grid_xgb, X_train_tree, y_train, X_test_tree, y_test,
    "XGBoostRegressor"
)
run_default_model(
    xgb_reg, X_train_tree, y_train, X_test_tree, y_test, "XGBoostRegressor"
)

In [42]:
pd.DataFrame(X_train_linear).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17120 entries, 0 to 17119
Columns: 163 entries, 0 to 162
dtypes: float64(163)
memory usage: 21.3 MB


### Random Forest

In [22]:
# 5. Random Forest Regressor
rf = RandomForestRegressor(random_state=42)
param_grid_rf = {
    "n_estimators": [100, 150, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
}
run_grid_search(
    rf, param_grid_rf, X_train_tree, y_train, X_test_tree, y_test,
    "RandomForest"
)
run_default_model(
    rf, X_train_tree, y_train, X_test_tree, y_test, "RandomForest"
)

### KNN

In [23]:
# 3. K-Nearest Neighbors (KNN)
knn = KNeighborsRegressor()
param_grid_knn = {
    "n_neighbors": [10, 20, 30, 40, 50],
    "weights": ["uniform", "distance"],
    "algorithm": ["ball_tree", "kd_tree", "brute"],
    "p": [1, 2],
}
#run_grid_search(
#    knn, param_grid_knn, X_train_linear, y_train, X_test_linear, y_test, "KNN"
#)
run_default_model(
    knn, X_train_linear, y_train, X_test_linear, y_test, "KNN"
)

### XGBoost Random Forest

In [24]:
# 7. XGBoost Random Forest Regressor
xgb_rf = xgb.XGBRFRegressor(
    random_state=42,
    verbosity=0,
)
param_grid_xgb_rf = {
    "n_estimators": [100, 150, 200],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.2],
    "subsample": [0.8, 1.0],
}
run_grid_search(
    xgb_rf, param_grid_xgb_rf, X_train_tree, y_train, X_test_tree, y_test,
    "XGBoostRFRegressor"
)
run_default_model(
    xgb_rf, X_train_tree, y_train, X_test_tree, y_test, "XGBoostRFRegressor"
)

### LightGBM

#### With Tree dataset

In [25]:
# # 9. LightGBM Regressor
# lgbm_reg = lgb.LGBMRegressor(random_state=42)
# param_grid_lgb = {
#     "n_estimators": [100, 150, 200, 300, 500],
#     "max_depth": [-1, 3, 5, 7],
#     "learning_rate": [0.01, 0.1, 0.2],
# }

# run_grid_search(
#     lgbm_reg, param_grid_lgb, X_train_tree, y_train, X_test_tree, y_test, 
#     "LightGBM_encoded"
# )
# run_default_model(
#     lgbm_reg, X_train_tree, y_train, X_test_tree, y_test, "LightGBM_encoded"
# )

#### With unencoded data

In [26]:
# 9. LightGBM Regressor
lgbm_reg = lgb.LGBMRegressor(random_state=42)
param_grid_lgb = {
    "n_estimators": [100, 150, 200, 300, 500],
    "max_depth": [-1, 3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.2],
}

mapping = {
    "carpetArea": pd.Float64Dtype(),
    "floorNumber": pd.Float64Dtype(),
    "totalFloorNumber": pd.Float64Dtype(),
    "bedrooms": pd.Float64Dtype(),
    "bathrooms": pd.Float64Dtype(),
    "localityName": 'category',
    "transactionType": 'category',
    "furnished": pd.Float64Dtype(),
    "ageofcons": pd.Float64Dtype(),
}
X_train_gb_df = pd.DataFrame(X_train_gb, columns = numeric_cols + cat_diff_cols + ordinal_cols).astype(mapping)
X_test_gb_df = pd.DataFrame(X_test_gb,  columns = numeric_cols + cat_diff_cols + ordinal_cols).astype(mapping)
y_train_df = pd.Series(y_train)
y_test_df = pd.Series(y_test)

run_grid_search(
    lgbm_reg, param_grid_lgb, X_train_gb_df, y_train_df, X_test_gb_df, y_test_df,
    "LightGBM"
)
run_default_model(
    lgbm_reg, X_train_gb_df, y_train_df, X_test_gb_df, y_test_df, "LightGBM"
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000313 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 525
[LightGBM] [Info] Number of data points in the train set: 17120, number of used features: 9
[LightGBM] [Info] Start training from score 11499264.564953
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000240 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 525
[LightGBM] [Info] Number of data points in the train set: 17120, number of used features: 9
[LightGBM] [Info] Start training from score 11499264.564953


### CatBoost

#### With Tree data

In [27]:
# # 8. CatBoost Regressor
# catboost_reg = CatBoostRegressor(random_state=42, verbose=0)
# param_grid_cat = {
#     "iterations": [250, 500],
#     "depth": [3, 5, 7],
#     "learning_rate": [0.01, 0.1, 0.2],
# }

# run_grid_search(
#     catboost_reg, param_grid_cat, X_train_tree, y_train, X_test_tree, y_test,
#     "CatBoost_encoded"
# )
# run_default_model(
#     catboost_reg, X_train_tree, y_train, X_test_tree, y_test, "CatBoost_encoded"
# )

#### With unencoded data

In [28]:
# 8. CatBoost Regressor
catboost_reg = CatBoostRegressor(random_state=42, verbose=0, cat_features = ['localityName', 'transactionType'])
param_grid_cat = {
    "iterations": [250, 500],
    "depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.2],
}
mapping = {
    "carpetArea": pd.Float64Dtype(),
    "floorNumber": pd.Float64Dtype(),
    "totalFloorNumber": pd.Float64Dtype(),
    "bedrooms": pd.Float64Dtype(),
    "bathrooms": pd.Float64Dtype(),
    "localityName": 'category',
    "transactionType": 'category',
    "furnished": pd.Float64Dtype(),
    "ageofcons": pd.Float64Dtype(),
}
X_train_gb_df = pd.DataFrame(X_train_gb, columns = numeric_cols + cat_diff_cols + ordinal_cols).astype(mapping)
X_test_gb_df = pd.DataFrame(X_test_gb,  columns = numeric_cols + cat_diff_cols + ordinal_cols).astype(mapping)
y_train_df = pd.Series(y_train)
y_test_df = pd.Series(y_test)
run_grid_search(
    catboost_reg, param_grid_cat, X_train_gb_df, y_train_df, X_test_gb_df, y_test_df,
    "CatBoost"
)
run_default_model(
    catboost_reg, X_train_gb_df, y_train_df, X_test_gb_df, y_test_df, "CatBoost"
)

## Aggregating Results

### Best Hyperparameter Configuration

In [30]:
print("Best hyperparameter config for all models:\n")
for model, values in results_dict.items():
    print(f"{model}: {values['best_params'] if values.get('best_params') else 'N/A'}")

with open("Results/BestHyperparameters.txt", "w") as f:
    for model, values in results_dict.items():
        f.write(f"{model}: {values['best_params'] if values.get('best_params') else 'N/A'}\n")

Best hyperparameter config for all models:

LinearRegression: N/A
ElasticNet: {'alpha': 0.1, 'l1_ratio': 0.9}
DecisionTree: {'max_depth': 25, 'min_samples_leaf': 6, 'min_samples_split': 15}
XGBoostRegressor: {'learning_rate': 0.2, 'max_depth': 5, 'max_leaves': 25, 'n_estimators': 500, 'subsample': 1.0}
RandomForest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
KNN: N/A
XGBoostRFRegressor: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 150, 'subsample': 0.8}
LightGBM: {'learning_rate': 0.1, 'max_depth': -1, 'n_estimators': 500}
CatBoost: {'depth': 7, 'iterations': 500, 'learning_rate': 0.2}


### CV scores for best hyperparameters

In [31]:
temp = {
    "index":[],
    'MAE':[],
    'MAPE':[],
    'R2':[],
}
for model, values in results_dict.items():
    if values.get('cv_scores') is None:
        continue
    temp['index'].append(model)
    temp['MAE'].append(values['cv_scores']['MAE'])
    temp['MAPE'].append(values['cv_scores']['MAPE'])
    temp['R2'].append(values['cv_scores']['R2'])

cv_scores = pd.DataFrame(temp).set_index('index')
cv_scores.sort_values(by='MAPE', ascending=True).to_csv("Results/cv_scores.csv", index=True)
cv_scores.sort_values(by='MAPE', ascending=True)

Unnamed: 0_level_0,MAE,MAPE,R2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LightGBM,1578611.0,0.143503,0.930773
XGBoostRegressor,1624097.0,0.147341,0.920523
CatBoost,1671615.0,0.152017,0.918016
RandomForest,1868819.0,0.169597,0.883445
DecisionTree,2272901.0,0.200524,0.831574
LinearRegression,2406075.0,0.253324,0.859641
ElasticNet,2515326.0,0.257833,0.839029
XGBoostRFRegressor,5922518.0,0.795244,0.31617


### Test scores for Best Hyperparameter Config

In [32]:
custom_percentiles = sorted({0.05, 0.1, 0.2, 0.25, 0.5, 0.75, 0.8, 0.9, 0.95})

# Initialize dictionaries to store the summaries for each metric.
mae_data = {}
mape_data = {}
r2_data = {}
adj_r2_data = {}

# Loop over each model in your results_dict.
for model_name, model_results in results_dict.items():

    if model_results.get('test_scores') is None:
        continue

    # Get the dictionary that contains the test scores.
    test_scores = model_results['test_scores']
    
    # For each metric, compute the descriptive statistics using .describe.
    # pd.Series.describe returns a Series with index: count, mean, std, min,
    # the provided percentiles, and max.
    mae_data[model_name] = pd.Series(test_scores['MAE']).describe(
        percentiles=custom_percentiles
    )
    mape_data[model_name] = pd.Series(test_scores['MAPE']).describe(
        percentiles=custom_percentiles
    )
    r2_data[model_name] = pd.Series(test_scores['R2']).describe(
        percentiles=custom_percentiles
    )
    adj_r2_data[model_name] = pd.Series(test_scores['adj_R2']).describe(
        percentiles=custom_percentiles
    )

# Now, create dataframes for each metric with the model names as the index.
mae_df = pd.DataFrame.from_dict(mae_data, orient="index")
mape_df = pd.DataFrame.from_dict(mape_data, orient="index")
r2_df = pd.DataFrame.from_dict(r2_data, orient="index")
adj_r2_df = pd.DataFrame.from_dict(adj_r2_data, orient="index")

In [33]:
(mae_df/1000).sort_values(by="50%", ascending=True)

Unnamed: 0,count,mean,std,min,5%,10%,20%,25%,50%,75%,80%,90%,95%,max
LightGBM,4.272,1574.008483,3338.006024,0.029688,66.505983,141.066352,266.383619,338.872324,801.173435,1688.631404,2032.66126,3391.711981,4982.02829,113630.084377
XGBoostRegressor,4.272,1676.118141,3344.354654,0.30275,75.071213,153.625225,289.0982,370.01525,822.83625,1749.7975,2097.4022,3612.42115,5627.7236,99327.826
CatBoost,4.272,1672.282035,3416.391907,0.053215,79.274096,149.044362,297.972813,380.812737,875.01766,1763.108156,2104.252323,3475.301146,5484.714475,117991.501401
RandomForest,4.272,1838.147277,4016.993855,0.026,68.8875,136.964018,295.78912,370.29575,889.088033,1911.193261,2327.730429,3880.759092,6083.56849,165926.177993
DecisionTree,4.272,2187.774155,4860.684564,0.0,82.603714,166.354667,355.687359,441.331393,1015.48125,2279.761905,2737.34,4624.476838,7522.25,187589.022429
LinearRegression,4.272,2431.251052,4299.962121,0.269386,130.511167,259.049874,506.038882,650.251679,1440.57427,2766.408394,3168.48811,4760.91766,7087.256089,141402.180621
ElasticNet,4.272,2540.658396,4720.563082,0.32284,138.760373,267.213464,555.95033,692.861302,1535.210881,2875.099806,3295.796676,4919.850563,7427.193884,165638.594121
XGBoostRFRegressor,4.272,5820.257101,9609.95734,0.822,546.3539,1050.3094,1949.7774,2329.08275,4238.625,6354.7135,6835.4248,8317.2826,16197.35345,305960.798


In [None]:
(mae_df/1000).sort_values(by="50%", ascending=True)

Unnamed: 0,count,mean,std,min,5%,10%,20%,25%,50%,75%,80%,90%,95%,max
LightGBM,5.501,1638.58555,3406.170456,0.595492,69.57812,135.964861,286.008493,367.952189,831.600055,1749.379334,2112.113695,3543.39626,5472.867577,147878.519643
XGBoostRegressor,5.501,1702.995778,3672.439902,0.046,67.073,142.382,295.3,370.72325,851.494,1752.136,2130.987,3532.376,5591.072,112138.418
XGBoostRegressor_Linear,5.501,1672.84175,3217.341838,0.0725,86.19525,169.542,326.588,400.267,868.014,1781.933,2146.717,3521.135,5359.1,98355.394
CatBoost,5.501,1694.384753,3490.148321,0.615414,72.825975,146.233573,295.172568,379.616675,871.089458,1781.876599,2137.28071,3625.283421,5574.98079,149362.243655
RandomForest,5.501,1781.514314,3941.731286,0.008,66.55124,134.701807,284.0,368.6,878.9,1825.1,2225.0,3740.636109,6152.126,173986.386723
CatBoost_encoded,5.501,1750.034408,3495.657094,0.239067,77.013235,156.715853,325.354194,404.956886,921.998191,1880.559078,2240.648891,3675.734837,5792.138331,135678.70645
LightGBM_encoded,5.501,1866.861907,4210.836377,0.318429,86.458108,170.689873,354.004812,449.255279,981.423519,1967.443248,2347.042151,3877.175186,5864.870082,206223.152951
KNN,5.501,2157.906916,5123.968575,0.0,80.0,160.0,320.0,412.0,990.0,2220.0,2679.4,4640.0,7220.0,231386.314
DecisionTree,5.501,2437.727014,6205.399228,0.0,0.0,100.0,300.0,400.0,1079.0,2500.0,3000.0,5000.0,8100.0,230581.554
XGBoostRFRegressor,5.501,2414.866076,5058.533852,0.0795,125.68325,242.245,484.6685,609.0815,1282.688,2574.338,3047.334,4915.032,7674.728,217693.754


In [34]:
mape_df.sort_values(by="90%", ascending=True)

Unnamed: 0,count,mean,std,min,5%,10%,20%,25%,50%,75%,80%,90%,95%,max
LightGBM,4272.0,14.400962,15.665396,0.00033,0.936121,1.884607,3.682735,4.683804,10.354222,18.979931,21.695472,30.424372,40.394107,200.592519
XGBoostRegressor,4272.0,15.047714,16.605902,0.004286,1.046557,2.056285,3.885576,4.935514,10.859391,19.426715,22.426218,31.74694,41.483635,252.785771
CatBoost,4272.0,15.254072,15.742605,0.001663,1.074401,2.012269,4.078015,5.168146,10.957685,20.237703,23.205524,32.56112,42.781838,216.881746
RandomForest,4272.0,16.853109,19.136637,0.000629,0.905046,1.866116,3.82614,4.816792,11.36723,22.111338,25.606999,37.132423,49.204294,251.207148
DecisionTree,4272.0,19.543139,21.581127,0.0,1.143009,2.177071,4.69718,5.955106,13.537457,25.714286,29.964111,42.224043,56.322142,305.0
LinearRegression,4272.0,25.617167,28.584164,0.004144,1.566117,3.263086,6.292223,8.036669,17.416124,32.669001,37.519488,57.555649,77.23989,361.041722
ElasticNet,4272.0,26.314743,28.028332,0.005832,1.857919,3.500578,6.723216,8.528815,18.463576,33.998698,38.523066,57.991288,76.938819,326.879548
XGBoostRFRegressor,4272.0,78.259889,89.423154,0.007473,4.844615,9.253914,18.091711,22.169798,46.459064,98.91771,123.75314,196.98474,263.244972,706.986477


In [None]:
mape_df.sort_values(by="90%", ascending=True)

Unnamed: 0,count,mean,std,min,5%,10%,20%,25%,50%,75%,80%,90%,95%,max
LightGBM,5501.0,15.620376,16.24988,0.015293,1.066646,2.051363,4.22725,5.316535,11.296983,20.671952,23.57809,32.383664,43.423478,221.321531
XGBoostRegressor,5501.0,16.03798,16.83242,0.000575,0.972188,2.03902,4.267082,5.363917,11.73015,21.176256,24.44086,33.609529,45.656179,229.7709
CatBoost,5501.0,16.070057,15.942638,0.009185,1.021823,2.134104,4.410455,5.483198,11.834324,21.726177,24.889734,33.817482,43.993006,216.106006
XGBoostRegressor_Linear,5501.0,16.392115,16.683442,0.001021,1.198667,2.484891,4.760858,5.755719,12.021877,21.385234,24.525375,34.666025,45.827657,232.4769
CatBoost_encoded,5501.0,17.290265,18.288671,0.002921,1.118698,2.230776,4.720769,5.860361,12.520894,22.429405,25.793331,36.096027,49.390002,224.376187
RandomForest,5501.0,17.390198,19.451944,0.000143,0.874718,1.923333,3.962245,5.122667,11.989806,23.222222,26.613333,37.893074,51.391667,296.29856
LightGBM_encoded,5501.0,18.30496,18.919188,0.003883,1.265823,2.535291,5.035064,6.449898,13.313519,23.906537,27.057949,38.219856,51.729316,246.937138
KNN,5501.0,19.4676,21.382597,0.0,1.176471,2.36227,4.888889,6.203035,13.809524,25.555556,29.345794,40.454545,53.9,269.7
DecisionTree,5501.0,23.234797,28.907819,0.0,0.0,1.515152,4.347826,5.882353,15.384615,30.177515,35.294118,50.454545,69.53125,373.684211
XGBoostRFRegressor,5501.0,24.140559,24.376711,0.001395,1.732507,3.406333,6.660558,8.216387,17.945477,31.342673,36.088938,51.101262,69.578545,309.78336


### Test scores for models with default settings

In [37]:
custom_percentiles = sorted({0.05, 0.1, 0.2, 0.25, 0.5, 0.75, 0.8, 0.9, 0.95})

# Initialize dictionaries to store the summaries for each metric.
mae_data = {}
mape_data = {}
r2_data = {}
adj_r2_data = {}

# Loop over each model in your results_dict.
for model_name, model_results in results_dict.items():

    if model_results.get('default') is None:
        continue

    # Get the dictionary that contains the test scores.
    test_scores = model_results['default']['test_scores']
    
    # For each metric, compute the descriptive statistics using .describe.
    # pd.Series.describe returns a Series with index: count, mean, std, min,
    # the provided percentiles, and max.
    mae_data[model_name] = pd.Series(test_scores['MAE']).describe(
        percentiles=custom_percentiles
    )
    mape_data[model_name] = pd.Series(test_scores['MAPE']).describe(
        percentiles=custom_percentiles
    )
    r2_data[model_name] = pd.Series(test_scores['R2']).describe(
        percentiles=custom_percentiles
    )
    adj_r2_data[model_name] = pd.Series(test_scores['adj_R2']).describe(
        percentiles=custom_percentiles
    )

# Now, create dataframes for each metric with the model names as the index.
mae_df = pd.DataFrame.from_dict(mae_data, orient="index")
mape_df = pd.DataFrame.from_dict(mape_data, orient="index")
r2_df = pd.DataFrame.from_dict(r2_data, orient="index")
adj_r2_df = pd.DataFrame.from_dict(adj_r2_data, orient="index")

In [38]:
mape_df.T

Unnamed: 0,LinearRegression,ElasticNet,DecisionTree,XGBoostRegressor,RandomForest,KNN,XGBoostRFRegressor,LightGBM,CatBoost
count,4272.0,4272.0,4272.0,4272.0,4272.0,4272.0,4272.0,4272.0,4272.0
mean,25.617167,35.574541,21.945425,15.779183,16.855739,18.530954,23.032103,15.080644,15.692694
std,28.584164,31.690887,28.079376,17.137645,19.226095,19.74919,23.323207,16.39441,15.828731
min,0.004144,0.020225,0.0,0.001674,0.0,0.0,0.002902,0.001938,0.010968
5%,1.566117,2.27209,0.0,0.977077,0.873246,1.272,1.57579,1.003926,1.117463
10%,3.263086,4.422013,1.474355,1.958236,1.972709,2.381301,3.175798,1.959856,2.277258
20%,6.292223,9.127842,4.166667,4.17032,3.817931,4.615385,6.335716,4.000192,4.314382
25%,8.036669,11.664331,5.555556,5.14686,5.021304,6.0,7.928746,5.141632,5.408931
50%,17.416124,27.087505,14.222895,11.209989,11.379389,13.35372,17.023456,10.813767,11.494931
75%,32.669001,51.682947,28.903509,20.525547,22.106262,24.379473,29.669156,19.550434,21.056665


In [None]:
mape_df.T

Unnamed: 0,LinearRegression,ElasticNet,DecisionTree,XGBoostRegressor,RandomForest,KNN,XGBoostRFRegressor,LightGBM,CatBoost,LightGBM_encoded,CatBoost_encoded,XGBoostRegressor_Linear
count,5501.0,5501.0,5501.0,5501.0,5501.0,5501.0,5501.0,5501.0,5501.0,5501.0,5501.0,5501.0
mean,28.549827,40.653924,23.234797,16.03798,17.390198,19.4676,24.140559,15.620376,16.070057,18.30496,17.290265,16.392115
std,32.852784,35.485908,28.907819,16.83242,19.451944,21.382597,24.376711,16.24988,15.942638,18.919188,18.288671,16.683442
min,0.004091,0.011594,0.0,0.000575,0.000143,0.0,0.001395,0.015293,0.009185,0.003883,0.002921,0.001021
5%,1.705655,2.942677,0.0,0.972188,0.874718,1.176471,1.732507,1.066646,1.021823,1.265823,1.118698,1.198667
10%,3.414394,5.39532,1.515152,2.03902,1.923333,2.36227,3.406333,2.051363,2.134104,2.535291,2.230776,2.484891
20%,7.076371,10.732236,4.347826,4.267082,3.962245,4.888889,6.660558,4.22725,4.410455,5.035064,4.720769,4.760858
25%,8.837488,13.742158,5.882353,5.363917,5.122667,6.203035,8.216387,5.316535,5.483198,6.449898,5.860361,5.755719
50%,19.39925,31.22984,15.384615,11.73015,11.989806,13.809524,17.945477,11.296983,11.834324,13.313519,12.520894,12.021877
75%,36.690748,59.333696,30.177515,21.176256,23.222222,25.555556,31.342673,20.671952,21.726177,23.906537,22.429405,21.385234


In [39]:
mae_df.T/1000

Unnamed: 0,LinearRegression,ElasticNet,DecisionTree,XGBoostRegressor,RandomForest,KNN,XGBoostRFRegressor,LightGBM,CatBoost
count,4.272,4.272,4.272,4.272,4.272,4.272,4.272,4.272,4.272
mean,2431.251052,3299.321759,2358.563435,1771.894563,1838.537547,2205.4046,2395.403619,1633.590812,1716.502192
std,4299.962121,6506.898831,4741.354046,3769.044474,3978.971668,5523.25753,4718.746618,3593.273544,3391.003525
min,0.269386,3.284262,0.0,0.144,0.0,0.0,0.208,0.112423,0.77575
5%,130.511167,195.894236,0.0,68.456906,67.267649,80.0,103.00645,75.729955,83.492867
10%,259.049874,421.674911,100.0,142.74635,142.038092,160.0,234.37,138.021702,160.2743
20%,506.038882,847.213633,300.0,301.7269,294.015388,330.0,497.797,293.445697,308.646385
25%,650.251679,1042.118969,400.0,381.149938,375.403147,420.0,619.09875,371.003758,392.107211
50%,1440.57427,2124.043838,1050.0,867.78225,882.547935,1000.0,1300.704,833.208749,915.914346
75%,2766.408394,3761.804897,2500.0,1820.597125,1919.2205,2281.4024,2614.470625,1787.078287,1871.358366


In [None]:
mae_df.T/1000

Unnamed: 0,LinearRegression,ElasticNet,DecisionTree,XGBoostRegressor,RandomForest,KNN,XGBoostRFRegressor,LightGBM,CatBoost,LightGBM_encoded,CatBoost_encoded,XGBoostRegressor_Linear
count,5.501,5.501,5.501,5.501,5.501,5.501,5.501,5.501,5.501,5.501,5.501,5.501
mean,2555.386042,3509.283517,2437.727014,1702.995778,1781.514314,2157.906916,2414.866076,1638.58555,1694.384753,1866.861907,1750.034408,1672.84175
std,4500.903873,6566.92339,6205.399228,3672.439902,3941.731286,5123.968575,5058.533852,3406.170456,3490.148321,4210.836377,3495.657094,3217.341838
min,0.169767,0.950378,0.0,0.046,0.008,0.0,0.0795,0.595492,0.615414,0.318429,0.239067,0.0725
5%,125.011055,267.245143,0.0,67.073,66.55124,80.0,125.68325,69.57812,72.825975,86.458108,77.013235,86.19525
10%,255.379306,504.242998,100.0,142.382,134.701807,160.0,242.245,135.964861,146.233573,170.689873,156.715853,169.542
20%,536.176441,978.37151,300.0,295.3,284.0,320.0,484.6685,286.008493,295.172568,354.004812,325.354194,326.588
25%,678.093566,1202.875811,400.0,370.72325,368.6,412.0,609.0815,367.952189,379.616675,449.255279,404.956886,400.267
50%,1525.963415,2292.570738,1079.0,851.494,878.9,990.0,1282.688,831.600055,871.089458,981.423519,921.998191,868.014
75%,2883.260237,3862.62483,2500.0,1752.136,1825.1,2220.0,2574.338,1749.379334,1781.876599,1967.443248,1880.559078,1781.933


In [None]:
pd.Series(y_test).describe()

count             5501.0
mean     11188191.468642
std      13947808.812676
min             800000.0
25%            4700000.0
50%            7500000.0
75%           12000000.0
max          350581554.0
Name: price, dtype: Float64