# Imports

In [41]:
import pandas as pd
# import plotly.express as px
# import plotly.graph_objects as go
# from ydata_profiling import ProfileReport
import numpy as np

# Column -> dtype schema passed to `pd.read_csv` when loading the CSV files.
# Columns are grouped by target dtype; the numeric, boolean and string groups
# use pandas' nullable extension dtypes, the rest use the 'category' dtype.
dtype_mapping = {
    column: dtype
    for dtype, columns in (
        (
            pd.StringDtype(),
            (
                'propertyId',
                'landMarks',
                'locality',
                'nameOfSociety',
                'projectName',
                'possessionStatus',
                'developerName',
                'flooringType',
                'electricityStatus',
                'waterStatus',
                'facing',
                'ownershipType',
                'carParking',
                'additionalRooms',
                'propertyAmenities',
                'facilitiesDesc',
                'uuid',
            ),
        ),
        (
            'category',
            (
                'localityName',
                'transactionType',
                'furnished',
                'propertyType',
                'maintenanceChargesFrequency',
                'ageofcons',
                'isVerified',
                'listingTypeDesc',
            ),
        ),
        (
            pd.Int64Dtype(),
            (
                'price',
                'carpetArea',
                'coveredArea',
                'carpetAreaSqft',
                'floorNumber',
                'unitCountonFloor',
                'totalFloorNumber',
                'bedrooms',
                'bathrooms',
                'numberOfBalconied',
                'bookingAmountExact',
                'maintenanceCharges',
                'noOfLifts',
                'carParking_Open',
                'carParking_Covered',
            ),
        ),
        (
            pd.Float64Dtype(),
            (
                'longitude',
                'latitude',
            ),
        ),
        (
            pd.BooleanDtype(),
            (
                'premiumProperty',
                'flooringType_Vitrified',
                'flooringType_CeramicTiles',
                'flooringType_Marble',
                'flooringType_NormalTilesKotahStone',
                'flooringType_Granite',
                'flooringType_Wooden',
                'flooringType_Mosaic',
                'flooringType_Marbonite',
                'additionalRoom_PujaRoom',
                'additionalRoom_Study',
                'additionalRoom_Store',
                'additionalRoom_ServantRoom',
                'ReservedParking',
            ),
        ),
    )
    for column in columns
}

# Columns removed from both the train and the test frame right after loading.
COLUMNS_TO_DROP = [
    'coveredArea',
    'ReservedParking',
    # Listing / utility / location / parking fields
    'unitCountonFloor',
    'electricityStatus',
    'waterStatus',
    'facing',
    'bookingAmountExact',
    'isVerified',
    'listingTypeDesc',
    'maintenanceCharges',
    'maintenanceChargesFrequency',
    'latitude',
    'longitude',
    'carParking_Open',
    'carParking_Covered',
    'numberOfBalconied',
    'premiumProperty',
    'projectName',
    'nameOfSociety',
    'url',
    # 'uuid',
    'carpetAreaSqft',
    'noOfLifts',
    'ownershipType',
    'possessionStatus',
    'propertyType',
    # flooringType_* indicator columns
    'flooringType_Vitrified',
    'flooringType_CeramicTiles',
    'flooringType_Marble',
    'flooringType_NormalTilesKotahStone',
    'flooringType_Granite',
    'flooringType_Wooden',
    'flooringType_Mosaic',
    'flooringType_Marbonite',
    # additionalRoom_* indicator columns
    'additionalRoom_PujaRoom',
    'additionalRoom_Study',
    'additionalRoom_Store',
    'additionalRoom_ServantRoom',
    # Free-text location / developer fields
    'landMarks',
    'locality',
    'developerName',
]

################################################################################
# LOAD THE RAW TRAIN / TEST SETS (NOT THE IMPUTED SET) AND DROP UNUSED COLUMNS
################################################################################
# The first CSV column becomes the index; dtypes come from `dtype_mapping`.
df_train = pd.read_csv(
    'Data/train.csv', dtype=dtype_mapping, index_col=0
).drop(columns=COLUMNS_TO_DROP)

df_test = pd.read_csv(
    'Data/test.csv', dtype=dtype_mapping, index_col=0
).drop(columns=COLUMNS_TO_DROP)

################################################################################
# REPORT MISSING VALUES PER COLUMN, THEN DROP EVERY ROW THAT HAS ANY
################################################################################

print("Train Set Null values: ", df_train.isna().sum(), '\n')
print("Test Set Null values: ", df_test.isna().sum(), '\n')

# Rebind instead of mutating in place; the resulting frames are the same.
df_train = df_train.dropna(axis=0)
df_test = df_test.dropna(axis=0)

Train Set Null values:  localityName           0
price                  0
carpetArea          3764
floorNumber            0
totalFloorNumber       0
transactionType        0
furnished             37
bedrooms               0
bathrooms              0
ageofcons           2571
dtype: int64 

Test Set Null values:  localityName          0
price                 0
carpetArea          977
floorNumber           0
totalFloorNumber      0
transactionType       0
furnished            14
bedrooms              0
bathrooms             0
ageofcons           693
dtype: int64 



In [42]:
# Summarise the cleaned training frame: index range, per-column non-null counts
# and dtypes, and memory usage.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13983 entries, 65067453 to 76736011
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   localityName      13983 non-null  category
 1   price             13983 non-null  Int64   
 2   carpetArea        13983 non-null  Int64   
 3   floorNumber       13983 non-null  Int64   
 4   totalFloorNumber  13983 non-null  Int64   
 5   transactionType   13983 non-null  category
 6   furnished         13983 non-null  category
 7   bedrooms          13983 non-null  Int64   
 8   bathrooms         13983 non-null  Int64   
 9   ageofcons         13983 non-null  category
dtypes: Int64(6), category(4)
memory usage: 925.2 KB


# Feature Encoding

## Basic Definitions

In [43]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

# df_train is the raw training frame after rows with missing values were
# dropped. Split it into the target (price) and the predictor columns.
y_train = df_train["price"]
X_train = df_train.drop(columns=["price"])

# Predictors that are already numeric.
numeric_cols = ["carpetArea", "floorNumber", "totalFloorNumber", "bedrooms", "bathrooms"]

# Categorical predictors kept in their own group, so that each preprocessing
# pipeline below can choose how to encode them.
cat_diff_cols = ["localityName", "transactionType"]

# Categorical predictors on an inherently ordinal scale; the preprocessing
# cells encode them with an explicit category order.
ordinal_cols = ["furnished", "ageofcons"]


# The preprocessing cells below turn X_train into model-ready feature matrices
# (for example X_train_tree).


## Linear

In [44]:
# # =============================================================================
# # Pipeline for linear models
# #   - Numerical features: standard scaled.
# #   - For cat_diff_cols (localityName, transactionType): one-hot encoded.
# #   - For ordinal_cols (furnished, ageofcons): ordinal-encoded
# #     and then scaled (so that all features are on a similar scale).
# # =============================================================================

# # Define transformers
# numeric_transformer = StandardScaler()

# onehot_transformer = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# # Here we create a pipeline that first ordinal-encodes then scales the result.
# furnished_order = ['Unfurnished', 'Semi-Furnished', 'Furnished']
# ordinal_transformer_furnished = Pipeline(
#     steps=[
#         ("ordinal", OrdinalEncoder(categories=[furnished_order])),
#         ("scaler", StandardScaler()),
#     ]
# )

# # # Here we create a pipeline that first ordinal-encodes then scales the result.
# # ordinal_transformer_rs = Pipeline(
# #     steps=[
# #         ("ordinal", OrdinalEncoder()),
# #         ("scaler", StandardScaler()),
# #     ]
# # )

# # Here we create a pipeline that first ordinal-encodes then scales the result.
# age_order = [
#     'Under Construction',  # first: youngest / newest state
#     'New Construction',
#     'Less than 5 years',
#     '5 to 10 years',
#     '10 to 15 years',
#     '15 to 20 years',
#     'Above 20 years'       # last: oldest
# ]
# ordinal_transformer_ageofcons = Pipeline(
#     steps=[
#         ("ordinal", OrdinalEncoder(categories=[age_order])),
#         ("scaler", StandardScaler()),
#     ]
# )

# # Create the ColumnTransformer for the linear pipeline.
# lin_preprocessor = ColumnTransformer(
#     transformers=[
#         ("num", numeric_transformer, numeric_cols),
#         ("onehot", onehot_transformer, cat_diff_cols),
#         ("ord-furnished", ordinal_transformer_furnished, ["furnished"]),
#         # ("ord-reservedparking", ordinal_transformer_rs, ["ReservedParking"]),
#         ("ord-ageofcons", ordinal_transformer_ageofcons, ["ageofcons"]),
#     ]
# )

# # The overall pipeline (here you could add a linear model as the final estimator)
# lin_pipeline = Pipeline(steps=[("preprocessor", lin_preprocessor)])

# # Now transform the training features for the linear model:
# X_train_linear = lin_pipeline.fit_transform(X_train)

## Tree (numeric only)

In [45]:
# =============================================================================
# Pipeline for tree-based models (every output feature is numeric)
#   - Numerical features: standard scaled.
#   - cat_diff_cols (localityName, transactionType): ordinal-encoded with the
#     categories inferred from the training data, then scaled.
#   - furnished, ageofcons: ordinal-encoded with the explicit orders defined
#     below, then scaled.
# =============================================================================


def make_ordinal_scaler(categories="auto"):
    """Build a pipeline that ordinal-encodes its input and then standard-scales it.

    Parameters
    ----------
    categories : "auto" or list of lists
        Forwarded unchanged to ``OrdinalEncoder``. "auto" (the encoder's
        default) infers the categories during fit; a list of lists pins an
        explicit category order per input column.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps "ordinal" and "scaler".
    """
    return Pipeline(
        steps=[
            ("ordinal", OrdinalEncoder(categories=categories)),
            ("scaler", StandardScaler()),
        ]
    )


# Scaler for the numeric columns.
numeric_transformer = StandardScaler()

# Explicit orders; these two lists are reused later for the frontend options.
furnished_order = ['Unfurnished', 'Semi-Furnished', 'Furnished']
ordinal_transformer_furnished = make_ordinal_scaler([furnished_order])

age_order = [
    'Under Construction',  # first: youngest / newest state
    'New Construction',
    'Less than 5 years',
    '5 to 10 years',
    '10 to 15 years',
    '15 to 20 years',
    'Above 20 years'       # last: oldest
]
ordinal_transformer_ageofcons = make_ordinal_scaler([age_order])

# Encoder for the remaining categorical columns (categories inferred at fit).
# NOTE(review): OrdinalEncoder is left at its default unknown-category
# handling, so a localityName / transactionType value that was not present at
# fit time is expected to raise during transform -- confirm this is acceptable
# for the test set and the frontend.
tree_cat_transformer = make_ordinal_scaler()

# The transformer order here fixes the output column order:
# numeric_cols, then cat_diff_cols, then furnished, then ageofcons.
tree_preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("ord", tree_cat_transformer, cat_diff_cols),
        ("ord-furnished", ordinal_transformer_furnished, ["furnished"]),
        ("ord-ageofcons", ordinal_transformer_ageofcons, ["ageofcons"]),
    ]
)

tree_pipeline = Pipeline(steps=[("preprocessor", tree_preprocessor)])

# Fit the preprocessor on the training predictors and transform them.
X_train_tree = tree_pipeline.fit_transform(X_train)

## Tree (numeric and categorical)

In [46]:
# # =============================================================================
# # Pipeline for models which handle categorical features:
# #   - Numerical features: standard scaled.
# #   - For categorical features (furnished, ReservedParking,
# #     , ageofcons): ordinal-encoded and then scaled.
# #   - For categorical features (transactionType, localityName): Kept as is.
# # =============================================================================

# # Create a pipeline for encoding the categorical features as ordinal then scaling.

# # Here we create a pipeline that first ordinal-encodes then scales the result.
# furnished_order = ['Unfurnished', 'Semi-Furnished', 'Furnished']
# ordinal_transformer_furnished = Pipeline(
#     steps=[
#         ("ordinal", OrdinalEncoder(categories=[furnished_order])),
#         ("scaler", StandardScaler()),
#     ]
# )

# # Here we create a pipeline that first ordinal-encodes then scales the result.
# age_order = [
#     'Under Construction',  # first: youngest / newest state
#     'New Construction',
#     'Less than 5 years',
#     '5 to 10 years',
#     '10 to 15 years',
#     '15 to 20 years',
#     'Above 20 years'       # last: oldest
# ]
# ordinal_transformer_ageofcons = Pipeline(
#     steps=[
#         ("ordinal", OrdinalEncoder(categories=[age_order])),
#         ("scaler", StandardScaler()),
#     ]
# )

# tree_preprocessor_gb = ColumnTransformer(
#     transformers=[
#         ("num", StandardScaler(), numeric_cols),
#         ("passthrough", "passthrough", cat_diff_cols),
#         ("ord-furnished", ordinal_transformer_furnished, ["furnished"]),
#         # ("ord-reservedparking", ordinal_transformer_rs, ["ReservedParking"]),
#         ("ord-ageofcons", ordinal_transformer_ageofcons, ["ageofcons"]),
#     ]
# )

# tree_pipeline_gb = Pipeline(steps=[("preprocessor", tree_preprocessor_gb)])
# # Now transform the training features for the tree models:
# X_train_gb = tree_pipeline_gb.fit_transform(X_train)

In [47]:
# print("Linear model feature shape: train:", X_train_linear.shape)
# print("Tree model feature shape: train:", X_train_tree.shape)
# print("Tree model for GB feature shape: train:", X_train_gb.shape)

In [48]:
# import pickle

# with open("PipelinesAndModels/final_preprocessor_pipeline_iteration_3.pkl", "wb") as f:
#     pickle.dump(tree_pipeline_gb, f)

# Training LightGBM model with best parameters

In [49]:
import numpy as np
import pandas as pd
import warnings

import lightgbm as lgb

# Ignore every Python warning from here on, for cleaner cell output.
# NOTE(review): this is a blanket filter, so deprecation and data-quality
# warnings raised by later cells are hidden as well.
warnings.filterwarnings("ignore")

In [50]:
# LightGBM regressor with the chosen hyper-parameters; random_state is fixed so
# that re-running the cell reproduces the same model.
lgbm_reg = lgb.LGBMRegressor(
    random_state=42,
    learning_rate=0.1,
    n_estimators=500,
    max_depth=-1,
)

# Column labels for the preprocessor output, in the same order in which
# tree_preprocessor lists its transformers.
feature_names = numeric_cols + cat_diff_cols + ordinal_cols

# Every column produced by tree_pipeline is numeric (ordinal-encoded and then
# scaled), so all of them are kept as plain floats. localityName and
# transactionType must NOT be cast to 'category' here: LightGBM would learn
# categorical splits on the category codes, while the inference pipeline feeds
# the regressor the scaled float values, so training and serving would disagree.
mapping = {name: "float64" for name in feature_names}
X_train_tree_df = pd.DataFrame(X_train_tree, columns=feature_names).astype(mapping)
y_train_df = pd.Series(y_train)

lgbm_reg.fit(X_train_tree_df, y_train_df)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000389 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 588
[LightGBM] [Info] Number of data points in the train set: 13983, number of used features: 9
[LightGBM] [Info] Start training from score 11030358.469284


In [51]:
# Display the fitted estimator (last expression of the cell).
lgbm_reg

In [52]:
# import pickle

# with open("PipelinesAndModels/lgb_model_iteration_3.pkl", "wb") as f:
#     pickle.dump(lgbm_reg, f)

# Saving entire Pipeline

In [53]:
import pickle
from sklearn.pipeline import Pipeline
import lightgbm as lgb
import pandas as pd # Make sure pandas is imported


# Ask the already-fitted preprocessor to emit pandas DataFrames from
# transform(). Any failure is reported but does not abort the cell.
try:
    tree_pipeline.set_output(transform="pandas")
except AttributeError:
    print(
        "\n".join(
            (
                "Warning: Your scikit-learn version might be older than 1.2.",
                "`set_output(transform='pandas')` is not available.",
                "The combined pipeline might fail during prediction if LightGBM relies on column names.",
                "Consider upgrading scikit-learn or using column indices for categorical_feature during training.",
            )
        )
    )
except Exception as e:
    print(f"An error occurred setting preprocessor output: {e}")
else:
    print("Preprocessor output configured to pandas DataFrame.")

# Chain the fitted preprocessor and the fitted regressor into one inference
# object, so callers can pass raw feature rows straight to predict().
final_inference_pipeline = Pipeline(
    steps=[
        ("preprocessing", tree_pipeline),
        ("regressor", lgbm_reg),
    ]
)

print("\nCombined inference pipeline created successfully.")

# Serialise the combined pipeline into a single pickle file.
pipeline_filename = "PipelinesAndModels/prediction_pipeline_iteration_3.pkl"
with open(pipeline_filename, "wb") as pickle_file:
    pickle.dump(final_inference_pipeline, pickle_file)

print(f"Combined inference pipeline saved to {pipeline_filename}")


Preprocessor output configured to pandas DataFrame.

Combined inference pipeline created successfully.
Combined inference pipeline saved to PipelinesAndModels/prediction_pipeline_iteration_3.pkl


# Categorical Options for frontend

In [36]:
# Maps each categorical input column to the list of values it can take.
options = {}

In [37]:
# Sorted values observed in the training data for localityName and
# transactionType, followed by the explicit orderings used for the two
# ordinal columns.
options.update(
    {
        column: sorted(df_train[column].unique())
        for column in ('localityName', 'transactionType')
    }
)
options.update(ageofcons=age_order, furnished=furnished_order)

In [38]:
import json
# Write the option lists to disk as JSON.
with open("PipelinesAndModels/options_iteration_3.json", "w") as options_file:
    json.dump(options, fp=options_file)

# ONNX Runtime

## Imports

## Model Training

In [54]:
import pandas as pd
import numpy as np
import pickle
import warnings

# Scikit-learn imports
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

# LightGBM
# ---> Import the specific model class <---
from lightgbm import LGBMRegressor

# ONNX imports
import skl2onnx
from skl2onnx import convert_sklearn, update_registered_converter # Import the updater
from skl2onnx.common.data_types import (
    FloatTensorType,
    Int64TensorType,
    StringTensorType,
)
# ---> Import the correct shape calculator for REGRESSION <---
from skl2onnx.common.shape_calculator import (
    calculate_linear_regressor_output_shapes,
)
# ---> Import the specific converter function from onnxmltools <---
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import (
    convert_lightgbm,
)
import onnxruntime as rt


# Ignore every Python warning from here on, for cleaner cell output.
# NOTE(review): this is a blanket filter, so warnings emitted during the ONNX
# conversion and verification cells are hidden as well.
warnings.filterwarnings("ignore")

## Conversion to ONNX

In [55]:

# Register a converter for LGBMRegressor so that skl2onnx can translate the
# final pipeline step. The arguments are: the model class, a unique operator
# alias, the output-shape calculator for regressors, and the converter function
# imported from onnxmltools.
print("Registering LightGBM Regressor converter...")
update_registered_converter(
    LGBMRegressor,
    "LightGbmLGBMRegressor",
    calculate_linear_regressor_output_shapes,
    convert_lightgbm,
)
print("Converter registered.")


# ONNX input signature: one tensor per raw feature column, each of shape
# [None, 1] (unknown batch size, one value per row). The entries are listed in
# the column order of X_train; string tensors for the categorical columns,
# int64 tensors for the integer columns.
initial_types = [
    (column, tensor_type([None, 1]))
    for column, tensor_type in (
        ("localityName", StringTensorType),
        ("carpetArea", Int64TensorType),
        ("floorNumber", Int64TensorType),
        ("totalFloorNumber", Int64TensorType),
        ("transactionType", StringTensorType),
        ("furnished", StringTensorType),
        ("bedrooms", Int64TensorType),
        ("bathrooms", Int64TensorType),
        ("ageofcons", StringTensorType),
    )
]


# Convert the combined preprocessing + regressor pipeline with pinned opsets.
print("Converting pipeline to ONNX...")
onnx_model = convert_sklearn(
    final_inference_pipeline,
    "lgbm_pune_house_price",
    initial_types=initial_types,
    target_opset={"": 12, "ai.onnx.ml": 3},
)
print("Conversion complete.")

# Serialise the ONNX graph to disk.
onnx_filename = "PipelinesAndModels/prediction_pipeline_iteration_3.onnx"
with open(onnx_filename, "wb") as onnx_file:
    onnx_file.write(onnx_model.SerializeToString())
print(f"ONNX model saved to {onnx_filename}")

Registering LightGBM Regressor converter...
Converter registered.
Converting pipeline to ONNX...
Conversion complete.
ONNX model saved to PipelinesAndModels/prediction_pipeline_iteration_3.onnx


## Testing converted model

In [None]:
# --- (Optional but Recommended) Verify the ONNX Model in Python ---
print("\nVerifying ONNX model prediction...")
sess = rt.InferenceSession(onnx_filename, providers=['CPUExecutionProvider'])

# Take one RAW row from X_train. Both the ONNX graph and the scikit-learn
# pipeline contain the preprocessing step, so they must receive un-encoded
# features (string categories, integer counts). Sampling from X_train_tree_df
# would feed already encoded-and-scaled values through the preprocessing again.
input_sample = X_train.iloc[0:1]

# Build the ONNX feed dict: one (n_rows, 1) array per declared input, with the
# numpy dtype that matches the declared tensor type.
onx_input = {}
for name, itype in initial_types:
    series = input_sample[name]

    if isinstance(itype, StringTensorType):
        # Categorical columns -> object array of Python strings.
        numpy_array = series.astype(str).to_numpy(dtype=object)
    elif isinstance(itype, Int64TensorType):
        # Nullable Int64 columns -> plain int64 array (rows with missing
        # values were dropped when the data was loaded).
        numpy_array = series.to_numpy(dtype=np.int64)
    elif isinstance(itype, FloatTensorType):
        numpy_array = series.to_numpy(dtype=np.float32)
    else:
        # Handle other types if necessary, or raise an error
        raise TypeError(f"Unhandled ONNX input type: {type(itype)}")

    onx_input[name] = numpy_array.reshape(-1, 1)


# Make prediction with ONNX Runtime
print(f"Prepared ONNX input: {onx_input}") # Debug print
onx_pred = sess.run(None, onx_input)

# Make prediction with original Scikit-learn pipeline
skl_pred = final_inference_pipeline.predict(input_sample)

print(f"\nSample Input:\n{input_sample}")
# sess.run returns a list with one array per graph output; the first output
# holds the prediction.
print(f"ONNX Prediction Output Raw: {onx_pred}") # See the raw output structure
output_key = sess.get_outputs()[0].name # Get the actual output name
print(f"ONNX Prediction ({output_key}): {onx_pred[0][0]}")
print(f"SKL Prediction: {skl_pred[0]}")
print(f"Difference: {abs(onx_pred[0][0] - skl_pred[0])}")

# Check if predictions are close (allow for tiny floating point differences)
try:
    np.testing.assert_allclose(onx_pred[0][0], skl_pred[0], rtol=1e-5)
    print("\nVerification successful: ONNX and Scikit-learn predictions match.")
except AssertionError as e:
    print(f"\nVerification failed: Predictions differ significantly.\n{e}")
except IndexError:
     print(f"\nVerification failed: Could not access prediction value. Check ONNX output structure: {onx_pred}")



In [None]:
# Show the latest ONNX opset version that the installed skl2onnx reports as tested.
skl2onnx.get_latest_tested_opset_version()

21