# Imports and Data Prep

In [2]:
import pandas as pd
# import plotly.express as px
# import plotly.graph_objects as go
# from ydata_profiling import ProfileReport
import numpy as np

# Explicit dtypes for the raw CSV columns, grouped by target dtype. Nullable
# pandas dtypes are used for the numeric/boolean columns so missing values can
# be represented without falling back to float/object.
_COLUMNS_BY_DTYPE = [
    (pd.StringDtype(), [
        'propertyId', 'landMarks', 'locality', 'nameOfSociety', 'projectName',
        'possessionStatus', 'developerName', 'flooringType',
        'electricityStatus', 'waterStatus', 'facing', 'ownershipType',
        'carParking', 'additionalRooms', 'propertyAmenities',
        'facilitiesDesc', 'uuid',
    ]),
    ('category', [
        'localityName', 'transactionType', 'furnished', 'propertyType',
        'maintenanceChargesFrequency', 'ageofcons', 'isVerified',
        'listingTypeDesc',
    ]),
    (pd.Int64Dtype(), [
        'price', 'carpetArea', 'coveredArea', 'carpetAreaSqft', 'floorNumber',
        'unitCountonFloor', 'totalFloorNumber', 'bedrooms', 'bathrooms',
        'numberOfBalconied', 'bookingAmountExact', 'maintenanceCharges',
        'noOfLifts', 'carParking_Open', 'carParking_Covered',
    ]),
    (pd.Float64Dtype(), ['longitude', 'latitude']),
    (pd.BooleanDtype(), [
        'premiumProperty',
        'flooringType_Vitrified', 'flooringType_CeramicTiles',
        'flooringType_Marble', 'flooringType_NormalTilesKotahStone',
        'flooringType_Granite', 'flooringType_Wooden', 'flooringType_Mosaic',
        'flooringType_Marbonite',
        'additionalRoom_PujaRoom', 'additionalRoom_Study',
        'additionalRoom_Store', 'additionalRoom_ServantRoom',
        'ReservedParking',
    ]),
]

# Flatten the groups into the {column: dtype} mapping handed to pd.read_csv.
dtype_mapping = {
    column: dtype
    for dtype, columns in _COLUMNS_BY_DTYPE
    for column in columns
}

# Columns removed from both splits right after loading.
# 'uuid' is deliberately absent from this list (it was left commented out).
_FLOORING_KINDS = (
    'Vitrified', 'CeramicTiles', 'Marble', 'NormalTilesKotahStone',
    'Granite', 'Wooden', 'Mosaic', 'Marbonite',
)
_ADDITIONAL_ROOM_KINDS = ('PujaRoom', 'Study', 'Store', 'ServantRoom')

COLUMNS_TO_DROP = [
    'coveredArea',
    'ReservedParking',
    # listing / utility attributes
    'unitCountonFloor',
    'electricityStatus',
    'waterStatus',
    'facing',
    'bookingAmountExact',
    'isVerified',
    'listingTypeDesc',
    'maintenanceCharges',
    'maintenanceChargesFrequency',
    'latitude',
    'longitude',
    'carParking_Open',
    'carParking_Covered',
    'numberOfBalconied',
    'premiumProperty',
    'projectName',
    'nameOfSociety',
    'url',
    'carpetAreaSqft',
    'noOfLifts',
    'ownershipType',
    'possessionStatus',
    'propertyType',
    # one-hot style indicator columns
    *(f'flooringType_{kind}' for kind in _FLOORING_KINDS),
    *(f'additionalRoom_{kind}' for kind in _ADDITIONAL_ROOM_KINDS),
    # free-text location / developer fields
    'landMarks',
    'locality',
    'developerName',
]

################################################################################
# ONLY USING THE RAW SETs, NOT IMPUTED SET
################################################################################
def _read_raw_split(csv_path):
    """Read one raw CSV split with the declared dtypes and strip COLUMNS_TO_DROP.

    The first CSV column becomes the index.
    """
    raw_frame = pd.read_csv(csv_path, dtype=dtype_mapping, index_col=0)
    return raw_frame.drop(columns=COLUMNS_TO_DROP)


df_train = _read_raw_split('../../Data/train.csv')
df_test = _read_raw_split('../../Data/test.csv')

################################################################################
# DROPPING ALL ROWS WITH MISSING VALUES
################################################################################

# Report per-column null counts first so the amount of discarded data is visible.
print("Train Set Null values: ", df_train.isna().sum(), '\n')
print("Test Set Null values: ", df_test.isna().sum(), '\n')

# Keep only fully populated rows in both splits.
df_train = df_train.dropna(axis=0)
df_test = df_test.dropna(axis=0)

Train Set Null values:  localityName           0
price                  0
carpetArea          3764
floorNumber            0
totalFloorNumber       0
transactionType        0
furnished             37
bedrooms               0
bathrooms              0
ageofcons           2571
dtype: int64 

Test Set Null values:  localityName          0
price                 0
carpetArea          977
floorNumber           0
totalFloorNumber      0
transactionType       0
furnished            14
bedrooms              0
bathrooms             0
ageofcons           693
dtype: int64 



# Python Model

## Feature Encoding

In [2]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

# df_train is the raw training split after column pruning and row-wise dropna
# (rows with missing values were removed; nothing was imputed).
y_train = df_train["price"]
X_train = df_train.drop(columns="price")

# Numeric features.
numeric_cols = ["carpetArea", "floorNumber", "totalFloorNumber", "bedrooms", "bathrooms"]

# Categorical features that are handled differently from the ordinal ones.
cat_diff_cols = ["localityName", "transactionType"]

# Categorical features with a natural order.
ordinal_cols = ["furnished", "ageofcons"]


def _scaled_ordinal_pipeline(category_order):
    """Pipeline that ordinal-encodes one column in the given order, then standardises it."""
    encoder = OrdinalEncoder(categories=[category_order])
    return Pipeline(steps=[("ordinal", encoder), ("scaler", StandardScaler())])


# Furnishing level, least to most furnished.
furnished_order = ['Unfurnished', 'Semi-Furnished', 'Furnished']
ordinal_transformer_furnished = _scaled_ordinal_pipeline(furnished_order)

# Age of construction, newest to oldest.
age_order = [
    'Under Construction',
    'New Construction',
    'Less than 5 years',
    '5 to 10 years',
    '10 to 15 years',
    '15 to 20 years',
    'Above 20 years',
]
ordinal_transformer_ageofcons = _scaled_ordinal_pipeline(age_order)

In [3]:
# Per-column preprocessing: numeric columns are standardised, the two nominal
# categoricals are passed through untouched, and each ordinal column goes
# through its own encode-then-scale pipeline.
_gb_transformers = [
    ("num", StandardScaler(), numeric_cols),
    ("passthrough", "passthrough", cat_diff_cols),
    ("ord-furnished", ordinal_transformer_furnished, ["furnished"]),
    ("ord-ageofcons", ordinal_transformer_ageofcons, ["ageofcons"]),
]
tree_preprocessor_gb = ColumnTransformer(transformers=_gb_transformers)

# Single-step pipeline wrapping the preprocessor; fit it and transform the
# training features in one go.
tree_pipeline_gb = Pipeline(steps=[("preprocessor", tree_preprocessor_gb)])
X_train_gb = tree_pipeline_gb.fit_transform(X_train)

In [7]:
# Assuming you have df_test loaded and cleaned similarly to df_train
# Make sure it has the same columns as the original X_train before preprocessing

import pandas as pd
import numpy as np
import pickle
import json

# --- Configuration ---
TEST_DATA_JSON_PATH = "test_data_x.json"

# Separate target and features; the target is cast to float for the metrics.
y_test = df_test["price"].astype(float)
X_test = df_test.drop(columns="price")

# --- Save X_test to JSON for Node.js ---
# Categorical columns are written as plain strings; all other columns keep
# their dtype. The result is an array of objects, one per row.
_category_cols = X_test.select_dtypes(include='category').columns
X_test_json = X_test.astype({col: str for col in _category_cols})
X_test_json.to_json(TEST_DATA_JSON_PATH, orient='records', indent=4)

print(f"Test features saved to {TEST_DATA_JSON_PATH}")
print(f"Test target shape: {y_test.shape}")


Test features saved to test_data_x.json
Test target shape: (3469,)


# Web ONNX

In [5]:
import json
import os
import time
import http.server
import socketserver
import threading # To run the server in the background

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options as ChromeOptions
# from webdriver_manager.chrome import ChromeDriverManager

# --- Configuration ---
# Local static server that hosts the inference page for the headless browser.
SERVER_ADDRESS = "localhost"
SERVER_PORT = 8008  # any free local port works
HTML_PAGE_FILENAME = "inference_page.html"

# Input features for the browser run and the file its predictions are saved to.
TEST_DATA_JSON_PATH = "test_data_x.json"
PREDICTIONS_BROWSER_PATH = "onnx_browser_predictions.json"

# --- Simple HTTP Server Setup ---
class Handler(http.server.SimpleHTTPRequestHandler):
    """Static-file request handler that asks clients never to cache responses."""

    # Added to every response so repeated test runs always fetch fresh files.
    _NO_CACHE_HEADERS = (
        ('Cache-Control', 'no-store, no-cache, must-revalidate'),
        ('Pragma', 'no-cache'),
        ('Expires', '0'),
    )

    def end_headers(self):
        """Append the anti-caching headers, then finish the header block as usual."""
        for header_name, header_value in self._NO_CACHE_HEADERS:
            self.send_header(header_name, header_value)
        super().end_headers()

def start_http_server(port, directory="."):
    """Start a static-file HTTP server on a daemon thread.

    The served directory is handed to the request handler instead of calling
    ``os.chdir``, so starting the server no longer changes the working
    directory of the whole process (and of every later notebook cell).

    Args:
        port: TCP port to listen on; the server binds to all interfaces ("").
        directory: Folder whose files are served. Defaults to the current
            working directory at call time.

    Returns:
        A ``(httpd, server_thread)`` tuple: the server object (call
        ``shutdown()`` and ``server_close()`` on it to stop) and the daemon
        thread running ``serve_forever``.
    """
    from functools import partial

    class _ReusableTCPServer(socketserver.TCPServer):
        # Allows an immediate re-run of the cell to bind the same port again
        # on POSIX systems. Left off on Windows, where SO_REUSEADDR has
        # different semantics (it can let a second server share the port).
        allow_reuse_address = os.name != "nt"

    # Resolve once so a later chdir elsewhere cannot move the served root.
    serve_root = os.path.abspath(directory)
    # SimpleHTTPRequestHandler accepts `directory` as a keyword (Python 3.7+).
    handler_factory = partial(Handler, directory=serve_root)

    httpd = _ReusableTCPServer(("", port), handler_factory)
    print(f"Serving HTTP on http://{SERVER_ADDRESS}:{port}/ from directory '{directory}'...")
    server_thread = threading.Thread(target=httpd.serve_forever, daemon=True)
    server_thread.start()
    return httpd, server_thread

# --- Main Selenium Logic ---
def run_selenium_prediction():
    """Run the ONNX model inside headless Chrome and save its predictions.

    Starts the local HTTP server, loads the test samples from
    TEST_DATA_JSON_PATH, opens HTML_PAGE_FILENAME in headless Chrome, calls the
    page's ``runInferenceInBrowser`` with the samples and writes the returned
    list to PREDICTIONS_BROWSER_PATH. Any exception is printed and re-raised;
    the browser and server are shut down and the working directory restored
    in all cases.

    Raises:
        FileNotFoundError: The test data JSON file does not exist.
        RuntimeError: The page returned an object containing an 'error' key.
        TypeError: The page returned something other than a list.
    """
    print("Starting Selenium ONNX Runtime Web prediction...")
    launch_dir = os.getcwd()  # restored in `finally`
    browser = None
    server = None
    server_worker = None

    try:
        # 0. Start the HTTP server. The served directory must be the one in
        #    which the relative paths used by the HTML page resolve.
        server, server_worker = start_http_server(SERVER_PORT, ".")
        time.sleep(1)  # short grace period before the browser connects

        # 1. Load the test samples.
        print(f"Loading test data from {TEST_DATA_JSON_PATH}...")
        samples_file = os.path.join(launch_dir, TEST_DATA_JSON_PATH)
        if not os.path.exists(samples_file):
            raise FileNotFoundError(f"Test data file not found: {samples_file}")
        with open(samples_file, 'r') as handle:
            samples = json.load(handle)
        print(f"Loaded {len(samples)} test samples.")

        # 2. Configure and launch headless Chrome.
        print("Setting up Chrome WebDriver...")
        chrome_options = ChromeOptions()
        for flag in ("--headless", "--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage"):
            chrome_options.add_argument(flag)
        browser = webdriver.Chrome(options=chrome_options)
        browser.set_script_timeout(120)  # seconds allowed for in-page script execution
        print("WebDriver initialized.")

        # 3. Open the page over HTTP and wait until it reports readiness.
        page_url = f"http://{SERVER_ADDRESS}:{SERVER_PORT}/{HTML_PAGE_FILENAME}"
        print(f"Navigating to {page_url}...")
        browser.get(page_url)

        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.support import expected_conditions as EC
        from selenium.webdriver.common.by import By

        page_ready = EC.text_to_be_present_in_element((By.ID, 'status'), 'Ready for data.')
        WebDriverWait(browser, 30).until(page_ready)
        print("HTML page loaded and initial JS ready.")

        # 4. Run inference in the page and time the round trip.
        print("Executing inference function in browser...")
        started_at = time.time()
        result = browser.execute_script(
            "return await runInferenceInBrowser(arguments[0]);", samples
        )
        browser_time = time.time() - started_at
        print(f"Browser inference execution time: {browser_time:.2f}s")

        # 5. The page signals failure by returning an object with an 'error' key.
        js_reported_error = isinstance(result, dict) and 'error' in result
        if js_reported_error:
            print("Error received from browser JavaScript:")
            print(f"  Message: {result.get('error')}")
            print(f"  Stack: {result.get('stack')}")
            raise RuntimeError("JavaScript execution failed in browser.")

        if not isinstance(result, list):
            raise TypeError(f"Expected a list of predictions from browser, got: {type(result)}")

        print(f"Received {len(result)} predictions back from browser.")

        # 6. Persist the predictions next to the notebook.
        output_path = os.path.join(launch_dir, PREDICTIONS_BROWSER_PATH)
        print(f"Saving browser predictions to {output_path}...")
        with open(output_path, 'w') as handle:
            json.dump(result, handle, indent=2)
        print("Browser predictions saved successfully.")

    except Exception as e:
        print(f"\nError during Selenium browser prediction: {e}")
        raise

    finally:
        # 7. Close the browser if it was started.
        if browser:
            print("Closing browser...")
            browser.quit()
            print("Browser closed.")
        # 8. Stop the HTTP server and release its port.
        if server:
            print("Shutting down HTTP server...")
            server.shutdown()
            server.server_close()
            if server_worker:
                server_worker.join(timeout=5)
            print("HTTP server stopped.")
        os.chdir(launch_dir)


# Run the headless-browser inference; on success the predictions are written
# to PREDICTIONS_BROWSER_PATH in the directory the notebook was started from.
run_selenium_prediction()


Starting Selenium ONNX Runtime Web prediction...
Serving HTTP on http://localhost:8008/ from directory '.'...
Loading test data from test_data_x.json...
Loaded 3469 test samples.
Setting up Chrome WebDriver...
WebDriver initialized.
Navigating to http://localhost:8008/inference_page.html...


127.0.0.1 - - [05/May/2025 17:18:46] "GET /inference_page.html HTTP/1.1" 200 -
127.0.0.1 - - [05/May/2025 17:18:46] "GET /node_modules/onnxruntime-web/dist/ort.min.js HTTP/1.1" 200 -


HTML page loaded and initial JS ready.
Executing inference function in browser...


127.0.0.1 - - [05/May/2025 17:18:46] code 404, message File not found
127.0.0.1 - - [05/May/2025 17:18:46] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [05/May/2025 17:18:47] "GET /node_modules/onnxruntime-web/dist/ort-wasm-simd-threaded.jsep.mjs HTTP/1.1" 200 -
127.0.0.1 - - [05/May/2025 17:18:47] "GET /node_modules/onnxruntime-web/dist/ort-wasm-simd-threaded.jsep.wasm HTTP/1.1" 200 -
127.0.0.1 - - [05/May/2025 17:18:47] "GET /prediction_pipeline_iteration_3.onnx HTTP/1.1" 200 -


Browser inference execution time: 4.41s
Received 3469 predictions back from browser.
Saving browser predictions to c:\Aditya Joshi\PuneHousePricePrediction\Stationary\Iteration_3\Tests\Prediction\onnx_browser_predictions.json...
Browser predictions saved successfully.
Closing browser...
Browser closed.
Shutting down HTTP server...
HTTP server stopped.


# Comparison: scikit-learn vs ONNX (Python, Node.js, Browser) vs Live API

In [8]:
import pandas as pd
import numpy as np
import pickle
import json
import time
import requests # Import requests library

# Scikit-learn metrics
from sklearn.metrics import r2_score

# ONNX Runtime
import onnxruntime as rt

# LightGBM and converters
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from skl2onnx import update_registered_converter
from skl2onnx.common.shape_calculator import calculate_linear_regressor_output_shapes
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import convert_lightgbm

# --- Configuration ---
# Feature file shared by every runtime, and the prediction files written by
# the Node.js and browser runs.
TEST_DATA_JSON_PATH = "test_data_x.json"
PREDICTIONS_JS_PATH = "onnx_js_predictions.json"
PREDICTIONS_BROWSER_PATH = "onnx_browser_predictions.json"

# Pickled scikit-learn pipeline and its ONNX export live side by side.
_MODEL_DIR = "../../PipelinesAndModels/Prediction"
_MODEL_STEM = "prediction_pipeline_iteration_3"
PIPELINE_PATH = f"{_MODEL_DIR}/{_MODEL_STEM}.pkl"
ONNX_MODEL_PATH = f"{_MODEL_DIR}/{_MODEL_STEM}.onnx"

# Live prediction API and the pause (in seconds) between consecutive calls.
API_ENDPOINT_URL = "https://phpp-api.adityajoshi.in/predict"
API_REQUEST_DELAY = 0.75

# --- Define initial types ---
from skl2onnx.common.data_types import (
    FloatTensorType, Int64TensorType, StringTensorType
)
# ONNX graph inputs: every raw feature is declared as its own [None, 1]
# column tensor (string or int64).
_INPUT_TENSOR_KINDS = [
    ("localityName", StringTensorType),
    ("carpetArea", Int64TensorType),
    ("floorNumber", Int64TensorType),
    ("totalFloorNumber", Int64TensorType),
    ("transactionType", StringTensorType),
    ("furnished", StringTensorType),
    ("bedrooms", Int64TensorType),
    ("bathrooms", Int64TensorType),
    ("ageofcons", StringTensorType),
]
initial_types = [
    (feature, tensor_kind([None, 1]))
    for feature, tensor_kind in _INPUT_TENSOR_KINDS
]


# --- Function to prepare ONNX input dict ---
def prepare_onnx_input(data_row_df, types):
    """Build the ONNX Runtime feed dict for the rows of a DataFrame.

    Args:
        data_row_df: DataFrame containing a column for every name in ``types``.
        types: Iterable of ``(name, tensor_type)`` pairs, where ``tensor_type``
            is a StringTensorType, Int64TensorType or FloatTensorType instance.

    Returns:
        Dict mapping each input name to a NumPy array reshaped to
        ``(n_rows, 1)``: strings for string inputs, int64 for integer inputs,
        float32 for float inputs.

    Raises:
        TypeError: A tensor type other than the three above is encountered.
    """
    feeds = {}
    for feature, tensor_type in types:
        column = data_row_df[feature]
        if isinstance(tensor_type, StringTensorType):
            feeds[feature] = column.astype(str).values.reshape(-1, 1)
        elif isinstance(tensor_type, Int64TensorType):
            feeds[feature] = column.values.reshape(-1, 1).astype(np.int64)
        elif isinstance(tensor_type, FloatTensorType):
            feeds[feature] = column.values.reshape(-1, 1).astype(np.float32)
        else:
            raise TypeError(f"Unhandled ONNX input type: {type(tensor_type)}")
    return feeds

# --- Main Comparison Logic ---
if __name__ == "__main__":
    print("Starting comparison with detailed error analysis (including API)...")

    # 1. Load Test Data (X_test, y_test)
    # Features are re-read from the JSON export; the target is NOT read from
    # disk but taken from the `y_test` variable already present in the
    # namespace, so this fails fast with a NameError when it is missing.
    # NOTE(review): X_test rows and y_test values are matched purely by
    # position - confirm both originate from the same df_test in the same order.
    print(f"Loading test data features from {TEST_DATA_JSON_PATH}...")
    X_test = pd.read_json(TEST_DATA_JSON_PATH, orient='records')
    if 'y_test' not in globals():
         raise NameError("y_test not found. Ensure it's loaded or reload it.")
    y_test_np = y_test.to_numpy()
    print(f"Loaded X_test shape: {X_test.shape}, y_test shape: {y_test_np.shape}")

    # --- Load Models/Pipelines (Sections 2 & 3) ---
    # Both loaders report the problem and then re-raise. The previous
    # `exit()` hid the traceback, and `exit`/`quit` are meant for the
    # interactive shell: under an IPython kernel the call does not abort the
    # running cell, so execution would continue with undefined names.

    # 2. Load Scikit-learn Pipeline
    print(f"Loading Scikit-learn pipeline from {PIPELINE_PATH}...")
    try:
        # Register the LightGBM regressor converter with skl2onnx before the
        # pipeline is unpickled.
        update_registered_converter(
            LGBMRegressor, "LightGbmLGBMRegressor",
            calculate_linear_regressor_output_shapes, convert_lightgbm
        )
        # NOTE(security): pickle.load can execute arbitrary code - only load
        # pipeline files that come from a trusted source.
        with open(PIPELINE_PATH, 'rb') as f:
            skl_pipeline = pickle.load(f)
        print("Pipeline loaded successfully.")
    except Exception as e:
        print(f"Error loading pickle file: {e}")
        raise

    # 3. Load ONNX Model (Python Runtime)
    print(f"Loading ONNX model from {ONNX_MODEL_PATH}...")
    try:
        onnx_session_py = rt.InferenceSession(
            ONNX_MODEL_PATH, providers=['CPUExecutionProvider']
        )
        # Name of the first graph output; used to request predictions later.
        onnx_py_output_name = onnx_session_py.get_outputs()[0].name
    except Exception as e:
        print(f"Error loading ONNX model: {e}")
        raise


    # --- Generate/Load Predictions (Sections 4, 5, 6, 7) ---
    # 4. Get Predictions: Scikit-learn (whole test set in one call)
    print("Generating predictions with Scikit-learn pipeline...")
    start_time = time.time()
    y_pred_skl = skl_pipeline.predict(X_test)
    skl_time = time.time() - start_time
    print(f"Scikit-learn prediction time: {skl_time:.2f}s")

    # 5. Get Predictions: ONNX Runtime (Python), one row per session.run call
    print("Generating predictions with ONNX Runtime (Python)...")
    n_samples = len(X_test)
    y_pred_onnx_py_list = []
    start_time = time.time()
    for row_number in range(1, n_samples + 1):
        # Slice (not scalar index) so the row stays a one-row DataFrame.
        single_row = X_test.iloc[row_number - 1:row_number]
        feeds = prepare_onnx_input(single_row, initial_types)
        outputs = onnx_session_py.run([onnx_py_output_name], feeds)
        y_pred_onnx_py_list.append(outputs[0].item())
        if row_number % 100 == 0:
            print(f"  Processed {row_number} / {n_samples} samples...")
    y_pred_onnx_py = np.array(y_pred_onnx_py_list)
    onnx_py_time = time.time() - start_time
    print(f"ONNX (Python) prediction time: {onnx_py_time:.2f}s")

    def _load_saved_predictions(path, expected_len):
        """Read a JSON list of predictions and return it as a NumPy array.

        Raises:
            ValueError: The number of predictions differs from ``expected_len``.
            OSError / json.JSONDecodeError: The file is missing or malformed.
        """
        with open(path, 'r') as f:
            loaded = np.array(json.load(f))
        if len(loaded) != expected_len:
            raise ValueError("Length mismatch")
        return loaded

    # In both steps the variable is assigned only when loading AND the length
    # check succeed. Previously a mismatched array stayed bound after the
    # ValueError was caught, so later steps treated it as valid predictions.

    # 6. Load Predictions: ONNX Runtime (Node.js)
    print(f"Loading predictions from Node.js ONNX run ({PREDICTIONS_JS_PATH})...")
    y_pred_onnx_js = None
    try:
        y_pred_onnx_js = _load_saved_predictions(PREDICTIONS_JS_PATH, len(y_test_np))
        print(f"Loaded {len(y_pred_onnx_js)} Node.js predictions.")
    except Exception as e:
        # Best effort: the comparison continues without this runtime.
        print(f"Error loading Node.js predictions: {e}")

    # 7. Load Predictions: ONNX Runtime (Browser/Web)
    print(f"Loading predictions from Browser ONNX run ({PREDICTIONS_BROWSER_PATH})...")
    y_pred_onnx_browser = None
    try:
        y_pred_onnx_browser = _load_saved_predictions(PREDICTIONS_BROWSER_PATH, len(y_test_np))
        print(f"Loaded {len(y_pred_onnx_browser)} Browser predictions.")
    except Exception as e:
        # Best effort: the comparison continues without this runtime.
        print(f"Error loading Browser predictions: {e}")


    # 8. Get Predictions: Live API (FastAPI)
    # One POST per test row, throttled by API_REQUEST_DELAY. Every failure is
    # recorded as NaN so the prediction array stays aligned with the test rows.
    print(f"Generating predictions via Live API ({API_ENDPOINT_URL})...")
    headers = {'Content-Type': 'application/json'}
    y_pred_api_list = []
    api_errors = 0
    start_time = time.time()

    for i in range(len(X_test)):
        # Build the JSON payload from the row, turning NumPy scalars into
        # plain Python numbers so they serialise cleanly.
        payload = {
            key: (
                int(value) if isinstance(value, np.integer)
                else float(value) if isinstance(value, np.floating)
                else value
            )
            for key, value in X_test.iloc[i].to_dict().items()
        }

        try:
            response = requests.post(API_ENDPOINT_URL, headers=headers, json=payload, timeout=30)
            response.raise_for_status()  # 4xx / 5xx become exceptions

            data = response.json()
            predicted_price = data.get('predictedPrice')
            if predicted_price is None:
                # Response parsed but carries no prediction: count it as an error.
                print(f"Warning: 'predictedPrice' key not found in API response for row {i}. Response: {data}")
                predicted_price = np.nan
                api_errors += 1
            y_pred_api_list.append(float(predicted_price))

        except requests.exceptions.RequestException as e:
            print(f"Error calling API for row {i}: {e}")
            y_pred_api_list.append(np.nan)
            api_errors += 1
        except Exception as e:
            # Anything else, e.g. a value that cannot be converted to float.
            print(f"Non-request error processing API response for row {i}: {e}")
            y_pred_api_list.append(np.nan)
            api_errors += 1

        rows_done = i + 1
        if rows_done % 50 == 0:
            elapsed = time.time() - start_time
            print(f"  Processed {rows_done} / {len(X_test)} samples via API... (Errors: {api_errors}, Time: {elapsed:.1f}s)")

        # Pause between consecutive requests.
        time.sleep(API_REQUEST_DELAY)

    y_pred_api = np.array(y_pred_api_list)
    api_time = time.time() - start_time
    print(f"API prediction finished. Total time: {api_time:.2f}s. Errors encountered: {api_errors}")
    # Boolean mask of the rows that have a usable API prediction.
    valid_api_preds = ~np.isnan(y_pred_api)


    # 9. Calculate Error Arrays and R2 Scores
    print("\n--- Calculating Errors and R2 Scores ---")

    def _error_metrics(y_true, y_pred):
        """Return ``(absolute errors, absolute percentage errors in %, R2)``.

        Every method uses the same percentage-error denominator: the true
        value, with zeros replaced by 1 to avoid dividing by zero. Previously
        four methods divided by ``y + 1e-10`` and the API by this rule, so
        their APE figures were not computed the same way.
        """
        residual = y_true - y_pred
        safe_true = np.where(y_true == 0, 1, y_true)
        abs_err = np.abs(residual)
        abs_pct_err = np.abs(residual / safe_true) * 100
        return abs_err, abs_pct_err, r2_score(y_true, y_pred)

    results = {}

    # Scikit-learn
    ae_skl, ape_skl, r2_skl = _error_metrics(y_test_np, y_pred_skl)
    results["SKL_Python"] = {"AE": ae_skl, "APE": ape_skl, "R2": r2_skl}
    print(f"SKL Python R2: {r2_skl:.6f}")

    # ONNX Python
    ae_onnx_py, ape_onnx_py, r2_onnx_py = _error_metrics(y_test_np, y_pred_onnx_py)
    results["ONNX_Python"] = {"AE": ae_onnx_py, "APE": ape_onnx_py, "R2": r2_onnx_py}
    print(f"ONNX Python R2: {r2_onnx_py:.6f}")

    # ONNX Node.js (only when its prediction file was loaded)
    if y_pred_onnx_js is not None:
        ae_onnx_js, ape_onnx_js, r2_onnx_js = _error_metrics(y_test_np, y_pred_onnx_js)
        results["ONNX_JS"] = {"AE": ae_onnx_js, "APE": ape_onnx_js, "R2": r2_onnx_js}
        print(f"ONNX JS R2:    {r2_onnx_js:.6f}")
    else:
        print("ONNX JS R2:    N/A")

    # ONNX Browser (only when its prediction file was loaded)
    if y_pred_onnx_browser is not None:
        ae_onnx_browser, ape_onnx_browser, r2_onnx_browser = _error_metrics(y_test_np, y_pred_onnx_browser)
        results["ONNX_Browser"] = {"AE": ae_onnx_browser, "APE": ape_onnx_browser, "R2": r2_onnx_browser}
        print(f"ONNX Browser R2:{r2_onnx_browser:.6f}")
    else:
        print("ONNX Browser R2:N/A")

    # Live API
    if api_errors < len(X_test):  # at least one row has a valid API prediction
        # Score only the rows for which the API returned a usable value.
        y_test_filt = y_test_np[valid_api_preds]
        y_pred_api_filt = y_pred_api[valid_api_preds]

        ae_api, ape_api, r2_api = _error_metrics(y_test_filt, y_pred_api_filt)
        # The stored error arrays are the *filtered* ones.
        results["API_FastAPI"] = {"AE": ae_api, "APE": ape_api, "R2": r2_api}
        print(f"API FastAPI R2: {r2_api:.6f} (calculated on {len(y_test_filt)} valid responses)")
    else:
        print(f"API FastAPI R2: N/A (No valid responses received)")
        results["API_FastAPI"] = {"AE": np.array([]), "APE": np.array([]), "R2": "N/A"}


    # 10. Analyze Error Distributions using Pandas describe()
    print("\n--- Error Distribution Analysis ---")
    pd.set_option('display.float_format', '{:,.4f}'.format)

    error_percentiles = [.25, .5, .75, .9, .95, .99]
    # (heading, results key) for the two error arrays reported per method.
    error_sections = (
        ("\nAbsolute Error (AE) Distribution:", "AE"),
        ("\nAbsolute Percentage Error (APE) Distribution (%):", "APE"),
    )

    for method, data in results.items():
        print(f"\n--- {method} ---")
        r2_val = data['R2']
        if isinstance(r2_val, float):
            print(f"R2 Score: {r2_val:.6f}")
        else:
            # Non-numeric placeholder such as "N/A".
            print(f"R2 Score: {r2_val}")

        if len(data["AE"]) == 0:
            print("\nError distributions: N/A (No valid predictions)")
            continue

        for heading, key in error_sections:
            print(heading)
            print(pd.Series(data[key]).describe(percentiles=error_percentiles))


    # 11. Analyze Prediction Differences
    print("\n--- Prediction Differences Distribution ---")

    def _report_gap(title, left, right):
        """Print ``title`` and the distribution of ``|left - right|``; return the gaps."""
        print(title)
        gap = np.abs(left - right)
        print(pd.Series(gap).describe(percentiles=[.5, .75, .95, .99, 1.0]))
        return gap

    diff_skl_onnxpy = _report_gap("\nSKL vs ONNX-Py Difference:", y_pred_skl, y_pred_onnx_py)

    if y_pred_onnx_js is not None:
        diff_skl_onnxjs = _report_gap("\nSKL vs ONNX-JS Difference:", y_pred_skl, y_pred_onnx_js)

    if y_pred_onnx_browser is not None:
        diff_skl_onnxbrowser = _report_gap(
            "\nSKL vs ONNX-Browser Difference:", y_pred_skl, y_pred_onnx_browser
        )

    # API comparisons use only the rows that have a valid API prediction.
    if api_errors < len(X_test):
        diff_skl_api = _report_gap(
            "\nSKL vs API-FastAPI Difference (on valid API responses):",
            y_pred_skl[valid_api_preds],
            y_pred_api_filt,
        )

        if y_pred_onnx_browser is not None:
            diff_browser_api = _report_gap(
                "\nONNX-Browser vs API-FastAPI Difference (on valid API responses):",
                y_pred_onnx_browser[valid_api_preds],
                y_pred_api_filt,
            )

    print("\nComparison finished.")



Starting comparison with detailed error analysis (including API)...
Loading test data features from test_data_x.json...
Loaded X_test shape: (3469, 9), y_test shape: (3469,)
Loading Scikit-learn pipeline from ../../PipelinesAndModels/Prediction/prediction_pipeline_iteration_3.pkl...
Pipeline loaded successfully.
Loading ONNX model from ../../PipelinesAndModels/Prediction/prediction_pipeline_iteration_3.onnx...
Generating predictions with Scikit-learn pipeline...




Scikit-learn prediction time: 0.39s
Generating predictions with ONNX Runtime (Python)...
  Processed 100 / 3469 samples...
  Processed 200 / 3469 samples...
  Processed 300 / 3469 samples...
  Processed 400 / 3469 samples...
  Processed 500 / 3469 samples...
  Processed 600 / 3469 samples...
  Processed 700 / 3469 samples...
  Processed 800 / 3469 samples...
  Processed 900 / 3469 samples...
  Processed 1000 / 3469 samples...
  Processed 1100 / 3469 samples...
  Processed 1200 / 3469 samples...
  Processed 1300 / 3469 samples...
  Processed 1400 / 3469 samples...
  Processed 1500 / 3469 samples...
  Processed 1600 / 3469 samples...
  Processed 1700 / 3469 samples...
  Processed 1800 / 3469 samples...
  Processed 1900 / 3469 samples...
  Processed 2000 / 3469 samples...
  Processed 2100 / 3469 samples...
  Processed 2200 / 3469 samples...
  Processed 2300 / 3469 samples...
  Processed 2400 / 3469 samples...
  Processed 2500 / 3469 samples...
  Processed 2600 / 3469 samples...
  Processe

In [9]:
# Side-by-side predictions for the first 500 test rows.
compare = pd.DataFrame(
    {
        'SKL_static': y_pred_skl[:500],
        'API': y_pred_api[:500],
        'ONNX_Browser': y_pred_onnx_browser[:500],
    }
)
# Failed API calls are stored as NaN, which makes a plain astype(int) raise.
# Truncate toward zero (what astype(int) does) and use the nullable Int64
# dtype so those rows show <NA> instead of crashing the cell. The fixed
# random_state makes the displayed sample reproducible across re-runs.
np.trunc(compare).astype('Int64').sample(10, random_state=0)

Unnamed: 0,SKL_static,API,ONNX_Browser
299,13690255,13690255,13690261
47,6785827,6785827,6785828
444,2671554,2671554,2671555
233,5171097,5171097,5171096
96,3831198,3831198,3831198
312,19916934,19916934,19916936
82,12870733,12870733,12870742
326,12583945,12583945,12583959
53,5712248,5712248,5712245
219,8265650,8265650,8265650


In [10]:
# Features, whole-number predictions and the true price for the first 500
# test rows, joined by position. Predictions are truncated and cast to the
# nullable Int64 dtype because failed API calls are stored as NaN and a plain
# astype(int) would raise on them. The fixed random_state keeps the displayed
# sample reproducible across re-runs.
pd.concat(
    [
        X_test[:500],
        np.trunc(compare).astype('Int64'),
        df_test.reset_index()[:500][['price']],
    ],
    axis=1,
).sample(10, random_state=0).rename(columns={
    'totalFloorNumber': 'Total Floors',
    'bedrooms': 'Bed',
    'bathrooms': 'Bath',
    'carpetArea': 'Carpet',
    'floorNumber': 'Floor',
    'transactionType': 'Type',
})

Unnamed: 0,localityName,Carpet,Floor,Total Floors,Type,furnished,Bed,Bath,ageofcons,SKL_static,API,ONNX_Browser,price
54,Sanaswadi,463,4,4,Resale,Unfurnished,2,2,New Construction,4622269,4622269,4622273,2000000
191,"Chinchwad, Pimpri Chinchwad",1239,12,22,Resale,Unfurnished,3,3,Less than 5 years,19075604,19075604,19075594,14000000
280,Wagholi,750,10,14,Resale,Unfurnished,2,2,Less than 5 years,5920278,5920278,5920280,7200000
160,Hinjewadi,820,8,21,Resale,Semi-Furnished,2,2,Less than 5 years,8663022,8663022,8663027,8000000
330,"Pimple Saudagar, Pimpri Chinchwad",760,7,7,Resale,Unfurnished,2,2,5 to 10 years,7875847,7875847,7875849,8500000
196,Balewadi,889,2,22,New Property,Unfurnished,2,2,New Construction,12332701,12332701,12332716,9700000
34,NIBM Road,1900,9,12,Resale,Furnished,3,4,5 to 10 years,22808589,22808589,22808598,23000000
355,Mohamadwadi Settlement,2200,22,25,New Property,Unfurnished,4,4,Less than 5 years,33829752,33829752,33829728,28060000
468,Keshav Nagar Mundhwa,1100,14,22,New Property,Semi-Furnished,3,3,New Construction,11803099,11803099,11803113,11550000
436,Hadapsar,730,12,22,Resale,Unfurnished,2,2,New Construction,9220459,9220459,9220469,8000000


In [11]:
# First 500 rows of the cleaned test split (the rows used in the comparison above).
df_test.head(500)

Unnamed: 0_level_0,localityName,price,carpetArea,floorNumber,totalFloorNumber,transactionType,furnished,bedrooms,bathrooms,ageofcons
propertyId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
75987143,Bavdhan,11800000,1266,10,12,Resale,Unfurnished,3,3,Less than 5 years
75092669,Kondhwa,9500000,1200,4,7,Resale,Semi-Furnished,3,3,5 to 10 years
74846961,"Punawale, Pimpri Chinchwad",6100000,776,9,24,New Property,Unfurnished,2,2,Under Construction
75676635,Camp,14000000,864,2,3,Resale,Semi-Furnished,3,3,Above 20 years
77813777,Wakad,12500000,1150,1,6,Resale,Unfurnished,3,3,5 to 10 years
...,...,...,...,...,...,...,...,...,...,...
75656955,Ambegaon,4500000,593,2,5,Resale,Unfurnished,2,2,Less than 5 years
75692129,Tathawade Pimpri Chinchwad,4500000,495,3,4,Resale,Semi-Furnished,1,2,Less than 5 years
75657785,Narhe,4500000,644,1,5,Resale,Semi-Furnished,2,2,5 to 10 years
72574395,Hinjewadi,6050000,657,10,22,Resale,Semi-Furnished,2,2,Less than 5 years
