In [2]:
pip install pandas numpy mgwr scikit-learn statsmodels


Collecting mgwr
  Downloading mgwr-2.2.1-py3-none-any.whl.metadata (1.5 kB)
Collecting libpysal>=4.0.0 (from mgwr)
  Downloading libpysal-4.12.1-py3-none-any.whl.metadata (4.8 kB)
Collecting spglm>=1.0.6 (from mgwr)
  Downloading spglm-1.1.0-py3-none-any.whl.metadata (3.9 kB)
Collecting spreg (from mgwr)
  Downloading spreg-1.8.2-py3-none-any.whl.metadata (1.7 kB)
Collecting geopandas>=0.10.0 (from libpysal>=4.0.0->mgwr)
  Downloading geopandas-1.0.1-py3-none-any.whl.metadata (2.2 kB)
Collecting shapely>=2.0.1 (from libpysal>=4.0.0->mgwr)
  Downloading shapely-2.0.7-cp312-cp312-win_amd64.whl.metadata (7.1 kB)
Collecting pyogrio>=0.7.2 (from geopandas>=0.10.0->libpysal>=4.0.0->mgwr)
  Downloading pyogrio-0.10.0-cp312-cp312-win_amd64.whl.metadata (5.6 kB)
Collecting pyproj>=3.3.0 (from geopandas>=0.10.0->libpysal>=4.0.0->mgwr)
  Downloading pyproj-3.7.1-cp312-cp312-win_amd64.whl.metadata (31 kB)
Downloading mgwr-2.2.1-py3-none-any.whl (47 kB)
   ---------------------------------------- 0

DEPRECATION: Loading egg at c:\users\admin\appdata\local\programs\python\python312\lib\site-packages\vboxapi-1.0-py3.12.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330

[notice] A new release of pip is available: 24.1.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import numpy as np
import pandas as pd
from numpy.linalg import cond, matrix_rank
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from mgwr.gwr import GWR
from mgwr.sel_bw import Sel_BW

# Load new datasets
print("Loading new datasets...")
train_data = pd.read_csv("new_normalized_train.csv").sample(50000, random_state=42)
test_data = pd.read_csv("new_normalized_test.csv").sample(50000, random_state=42)

# Define target variable and features
print("Defining target variable and features...")
target_variable = "resale_price"
model_columns = [
    "month", "year", "town_LE", "flat_type_LE", "storey_range_LE",  
    "price_per_sqm", "flat_model_LE", "lease_commence_date", "Latitude", "Longitude", 
    "LTAMRTStation_within_1km", "MallCoordinates_within_1km", "Hawker_within_1km", 
    "PreSchool_within_1km", "Primary_within_1km", "Secondary_within_1km", 
    "JuniorCollege_within_1km", "MixedLevel_within_1km", "NParks_within_1km", "Sports_within_1km"
]

# Convert features and target variable to numeric
print("Converting features and target variable to numeric...")
train_data[model_columns] = train_data[model_columns].apply(pd.to_numeric, errors='coerce')
test_data[model_columns] = test_data[model_columns].apply(pd.to_numeric, errors='coerce')
train_data[target_variable] = pd.to_numeric(train_data[target_variable], errors='coerce')
test_data[target_variable] = pd.to_numeric(test_data[target_variable], errors='coerce')

# Remove zero-variance columns
print("Checking for zero-variance columns...")
zero_var_cols = [col for col in model_columns if train_data[col].nunique() == 1]
if zero_var_cols:
    print(f"Dropping zero-variance columns: {zero_var_cols}")
    train_data.drop(columns=zero_var_cols, inplace=True)
    test_data.drop(columns=zero_var_cols, inplace=True)
    model_columns = [col for col in model_columns if col not in zero_var_cols]

# Check condition number before processing
cond_number = np.linalg.cond(train_data[model_columns].values)
print(f"Initial Condition number of X_train: {cond_number:.2e}")

# Remove highly collinear features using VIF (excluding Longitude and Latitude)
print("Checking for multicollinearity using VIF...")
def calculate_vif(df):
    vif_data = pd.DataFrame()
    vif_data["Feature"] = df.columns
    vif_data["VIF"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    return vif_data

vif_columns = [col for col in model_columns if col not in ["Longitude", "Latitude"]]  # Keep these in the model
while True:
    vif_df = calculate_vif(train_data[vif_columns])
    max_vif = vif_df["VIF"].max()
    if max_vif > 4:  # Lower VIF threshold to remove high collinearity
        feature_to_drop = vif_df.loc[vif_df["VIF"].idxmax(), "Feature"]
        print(f"Dropping {feature_to_drop} due to high VIF ({max_vif:.2f})")
        train_data.drop(columns=[feature_to_drop], inplace=True)
        test_data.drop(columns=[feature_to_drop], inplace=True)
        vif_columns.remove(feature_to_drop)
    else:
        break

# Check for low-variance features again after VIF filtering
low_var_cols = [col for col in vif_columns if train_data[col].std() < 1e-4]
if low_var_cols:
    print(f"Dropping low-variance columns: {low_var_cols}")
    train_data.drop(columns=low_var_cols, inplace=True)
    test_data.drop(columns=low_var_cols, inplace=True)
    vif_columns = [col for col in vif_columns if col not in low_var_cols]

# Slightly jitter geographical coordinates to ensure uniqueness
print("Applying jitter to geographical coordinates...")
train_data[["Longitude", "Latitude"]] += np.random.normal(0, 0.0001, train_data[["Longitude", "Latitude"]].shape)
test_data[["Longitude", "Latitude"]] += np.random.normal(0, 0.0001, test_data[["Longitude", "Latitude"]].shape)

# Ensure minimum spatial uniqueness threshold
unique_locations = len(train_data[["Longitude", "Latitude"]].drop_duplicates())
if unique_locations / len(train_data) < 0.95:
    print("❌ ERROR: Too many duplicate spatial points. GWR may fail.")
    exit()

# Scale features for numerical stability
print("Scaling features for numerical stability...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_data[vif_columns])
X_test_scaled = scaler.transform(test_data[vif_columns])
print(f"Condition number after scaling: {cond(X_train_scaled):.2e}")

# Extract geographical coordinates
print("Extracting geographical coordinates...")
coords_train = train_data[['Longitude', 'Latitude']].values
coords_test = test_data[['Longitude', 'Latitude']].values

# Check matrix rank before proceeding
if matrix_rank(X_train_scaled) < X_train_scaled.shape[1]:
    print("❌ ERROR: Feature matrix is still singular. Skipping GWR model.")
    exit()

# Select optimal bandwidth using cross-validation
print("Selecting optimal bandwidth using cross-validation...")
try:
    selector = Sel_BW(coords_train, train_data[target_variable].values.reshape(-1, 1), X_train_scaled)
    optimal_bandwidth = selector.search()
    print(f"Optimal Bandwidth: {optimal_bandwidth}")
except np.linalg.LinAlgError:
    print("❌ ERROR: Matrix is still singular after preprocessing.")
    print("Possible cause: Check feature correlation or spatial diversity.")
    optimal_bandwidth = None

# Fit the GWR model only if bandwidth selection was successful
if optimal_bandwidth is not None:
    print("Fitting the GWR model...")
    gwr_model = GWR(coords_train, train_data[target_variable].values.reshape(-1, 1), X_train_scaled, bw=optimal_bandwidth)
    gwr_results = gwr_model.fit()
    print("GWR Model Fitted Successfully!")
else:
    print("Skipping GWR model fitting due to singular matrix issue.")

Loading new datasets...
Defining target variable and features...
Converting features and target variable to numeric...
Checking for zero-variance columns...
Initial Condition number of X_train: 1.51e+05
Checking for multicollinearity using VIF...
Dropping year due to high VIF (55.94)
Dropping PreSchool_within_1km due to high VIF (20.27)
Dropping lease_commence_date due to high VIF (12.15)
Dropping flat_type_LE due to high VIF (9.53)
Dropping Primary_within_1km due to high VIF (8.78)
Dropping price_per_sqm due to high VIF (6.65)
Dropping flat_model_LE due to high VIF (5.76)
Applying jitter to geographical coordinates...
Scaling features for numerical stability...
Condition number after scaling: 2.01e+00
Extracting geographical coordinates...
Selecting optimal bandwidth using cross-validation...
Optimal Bandwidth: 1434.0
Fitting the GWR model...
GWR Model Fitted Successfully!


In [64]:
# GWR VIF

import numpy as np
import pandas as pd
from numpy.linalg import cond, matrix_rank
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from mgwr.gwr import GWR
from mgwr.sel_bw import Sel_BW
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load new datasets
print("Loading new datasets...")
train_data = pd.read_csv("new_normalized_train.csv")
test_data = pd.read_csv("new_normalized_test.csv")

# Define target variable and features
print("Defining target variable and features...")
target_variable = "resale_price"
model_columns = [
    "month", "year", "town_LE", "flat_type_LE", "storey_range_LE",  
    "price_per_sqm", "flat_model_LE", "lease_commence_date", "Latitude", "Longitude", 
    "LTAMRTStation_within_1km", "MallCoordinates_within_1km", "Hawker_within_1km", 
    "PreSchool_within_1km", "Primary_within_1km", "Secondary_within_1km", 
    "JuniorCollege_within_1km", "MixedLevel_within_1km", "NParks_within_1km", "Sports_within_1km"
]

# Convert features and target variable to numeric
print("Converting features and target variable to numeric...")
train_data[model_columns] = train_data[model_columns].apply(pd.to_numeric, errors='coerce')
test_data[model_columns] = test_data[model_columns].apply(pd.to_numeric, errors='coerce')
train_data[target_variable] = pd.to_numeric(train_data[target_variable], errors='coerce')
test_data[target_variable] = pd.to_numeric(test_data[target_variable], errors='coerce')

# Remove zero-variance columns
print("Checking for zero-variance columns...")
zero_var_cols = [col for col in model_columns if train_data[col].nunique() == 1]
if zero_var_cols:
    print(f"Dropping zero-variance columns: {zero_var_cols}")
    train_data.drop(columns=zero_var_cols, inplace=True)
    test_data.drop(columns=zero_var_cols, inplace=True)
    model_columns = [col for col in model_columns if col not in zero_var_cols]

# Check condition number before processing
cond_number = np.linalg.cond(train_data[model_columns].values)
print(f"Initial Condition number of X_train: {cond_number:.2e}")

# Remove highly collinear features using VIF (excluding Longitude and Latitude)
print("Checking for multicollinearity using VIF...")
def calculate_vif(df):
    vif_data = pd.DataFrame()
    vif_data["Feature"] = df.columns
    vif_data["VIF"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    return vif_data

vif_columns = [col for col in model_columns if col not in ["Longitude", "Latitude"]]  # Keep these in the model
while True:
    vif_df = calculate_vif(train_data[vif_columns])
    max_vif = vif_df["VIF"].max()
    if max_vif > 4:  # Lower VIF threshold to remove high collinearity
        feature_to_drop = vif_df.loc[vif_df["VIF"].idxmax(), "Feature"]
        print(f"Dropping {feature_to_drop} due to high VIF ({max_vif:.2f})")
        train_data.drop(columns=[feature_to_drop], inplace=True)
        test_data.drop(columns=[feature_to_drop], inplace=True)
        vif_columns.remove(feature_to_drop)
    else:
        break

# Check for low-variance features again after VIF filtering
low_var_cols = [col for col in vif_columns if train_data[col].std() < 1e-4]
if low_var_cols:
    print(f"Dropping low-variance columns: {low_var_cols}")
    train_data.drop(columns=low_var_cols, inplace=True)
    test_data.drop(columns=low_var_cols, inplace=True)
    vif_columns = [col for col in vif_columns if col not in low_var_cols]

# Slightly jitter geographical coordinates to ensure uniqueness
print("Applying jitter to geographical coordinates...")
train_data[["Longitude", "Latitude"]] += np.random.normal(0, 0.0001, train_data[["Longitude", "Latitude"]].shape)
test_data[["Longitude", "Latitude"]] += np.random.normal(0, 0.0001, test_data[["Longitude", "Latitude"]].shape)

# Ensure minimum spatial uniqueness threshold
unique_locations = len(train_data[["Longitude", "Latitude"]].drop_duplicates())
if unique_locations / len(train_data) < 0.95:
    print("❌ ERROR: Too many duplicate spatial points. GWR may fail.")
    exit()

# Scale features for numerical stability
print("Scaling features for numerical stability...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_data[vif_columns])
X_test_scaled = scaler.transform(test_data[vif_columns])
print(f"Condition number after scaling: {cond(X_train_scaled):.2e}")

# Extract geographical coordinates
print("Extracting geographical coordinates...")
coords_train = train_data[['Longitude', 'Latitude']].values
coords_test = test_data[['Longitude', 'Latitude']].values

# Check matrix rank before proceeding
if matrix_rank(X_train_scaled) < X_train_scaled.shape[1]:
    print("❌ ERROR: Feature matrix is still singular. Skipping GWR model.")
    exit()

# Select optimal bandwidth using cross-validation
print("Selecting optimal bandwidth using cross-validation...")
try:
    selector = Sel_BW(coords_train, train_data[target_variable].values.reshape(-1, 1), X_train_scaled)
    optimal_bandwidth = selector.search()
    print(f"Optimal Bandwidth: {optimal_bandwidth}")
except np.linalg.LinAlgError:
    print("❌ ERROR: Matrix is still singular after preprocessing.")
    print("Possible cause: Check feature correlation or spatial diversity.")
    optimal_bandwidth = None

# Fit the GWR model only if bandwidth selection was successful
if optimal_bandwidth is not None:
    print("Fitting the GWR model...")
    gwr_model = GWR(coords_train, train_data[target_variable].values.reshape(-1, 1), X_train_scaled, bw=optimal_bandwidth)
    gwr_results = gwr_model.fit()
    print("GWR Model Fitted Successfully!")
    gwr_results.summary()
    
    # Evaluate model performance
    print("Evaluating GWR model...")

    print('Mean R2 =', gwr_results.R2)
    print('AIC =', gwr_results.aic)
    print('AICc =', gwr_results.aicc)

    # Generate predictions correctly
    scale = gwr_results.scale
    residuals = gwr_results.resid_response

    y_test_pred = gwr_model.predict(coords_test, X_test_scaled, scale, residuals)

    # print(f"y_test_pred: {y_test_pred.predictions}")

    # Compute evaluation metrics
    mae = mean_absolute_error(test_data[target_variable], y_test_pred.predictions.flatten())
    mse = mean_squared_error(test_data[target_variable], y_test_pred.predictions.flatten())
    rmse = np.sqrt(mse)
    r2 = r2_score(test_data[target_variable], y_test_pred.predictions.flatten())

    # Print evaluation metrics
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
    print(f"R² Score: {r2:.4f}")



else:
    print("Skipping GWR model fitting due to singular matrix issue.")


Loading new datasets...
Defining target variable and features...
Converting features and target variable to numeric...
Checking for zero-variance columns...
Initial Condition number of X_train: 1.20e+05
Checking for multicollinearity using VIF...
Dropping year due to high VIF (60.30)
Dropping PreSchool_within_1km due to high VIF (21.03)
Dropping lease_commence_date due to high VIF (13.58)
Dropping flat_type_LE due to high VIF (10.02)
Dropping Primary_within_1km due to high VIF (9.21)
Dropping price_per_sqm due to high VIF (6.55)
Dropping flat_model_LE due to high VIF (5.61)
Applying jitter to geographical coordinates...
Scaling features for numerical stability...
Condition number after scaling: 2.13e+00
Extracting geographical coordinates...
Selecting optimal bandwidth using cross-validation...
Optimal Bandwidth: 301.0
Fitting the GWR model...
GWR Model Fitted Successfully!
Model type                                                         Gaussian
Number of observations:              

In [78]:
# import numpy as np
# import pandas as pd
# from numpy.linalg import cond, matrix_rank
# from sklearn.preprocessing import StandardScaler
# from statsmodels.stats.outliers_influence import variance_inflation_factor
# from mgwr.gwr import GWR
# from mgwr.sel_bw import Sel_BW
# from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# # Load new datasets
# print("Loading new datasets...")
# train_data = pd.read_csv("../../Data/normalized_test.csv").sample(1000, random_state=42)
# test_data = pd.read_csv("../../Data/normalized_train.csv").sample(1000, random_state=42)

# # Define target variable and features
# print("Defining target variable and features...")
# target_variable = "resale_price"
# model_columns = [
#     "month", "year", 

#     "town_YISHUN", "town_WOODLANDS", "town_TOA PAYOH", "town_TAMPINES", "town_SERANGOON", "town_SENGKANG", "town_SEMBAWANG", "town_QUEENSTOWN", "town_PUNGGOL", "town_PASIR RIS", 
#     "town_MARINE PARADE", "town_KALLANG/WHAMPOA", "town_JURONG WEST", "town_JURONG EAST", "town_HOUGANG", "town_GEYLANG", "town_CLEMENTI", "town_CHOA CHU KANG", "town_CENTRAL AREA", 
#     "town_BUKIT TIMAH", "town_BUKIT PANJANG", "town_BUKIT MERAH", "town_BUKIT BATOK", "town_BISHAN", "town_BEDOK", 

#     "flat_type_MULTI-GENERATION", "flat_type_EXECUTIVE", "flat_type_5 ROOM", "flat_type_4 ROOM", "flat_type_3 ROOM", "flat_type_2 ROOM", 

#     "storey_range_LE", 
#     "price_per_sqm", 

#     "flat_model_Type S2", "flat_model_Type S1", "flat_model_Terrace", "flat_model_Standard", "flat_model_Simplified", "flat_model_Premium Maisonette", "flat_model_Premium Apartment Loft", 
#     "flat_model_Premium Apartment", "flat_model_New Generation", "flat_model_Multi Generation", "flat_model_Model A2", "flat_model_Model A-Maisonette", "flat_model_Model A", 
#     "flat_model_Maisonette", "flat_model_Improved-Maisonette", "flat_model_Improved", "flat_model_DBSS", "flat_model_Apartment", "flat_model_Adjoined flat", "flat_model_3Gen", 

#     "lease_commence_date",

#     "Latitude", "Longitude", 

#     "LTAMRTStation_within_1km",
#     "MallCoordinates_within_1km", "Hawker_within_1km", 
#     "PreSchool_within_1km", "Primary_within_1km", "Secondary_within_1km", 
#     "JuniorCollege_within_1km","MixedLevel_within_1km", 
#     "NParks_within_1km", "Sports_within_1km", 
# ]


# # Convert boolean columns to integers (0 or 1)
# print("Converting boolean columns to integers...")
# bool_cols = train_data.select_dtypes(include=['bool']).columns
# train_data[bool_cols] = train_data[bool_cols].astype(int)
# test_data[bool_cols] = test_data[bool_cols].astype(int)

# # Convert all features and target variable to numeric
# print("Ensuring all features are numeric...")
# train_data[model_columns] = train_data[model_columns].apply(pd.to_numeric, errors='coerce')
# test_data[model_columns] = test_data[model_columns].apply(pd.to_numeric, errors='coerce')
# train_data[target_variable] = pd.to_numeric(train_data[target_variable], errors='coerce')
# test_data[target_variable] = pd.to_numeric(test_data[target_variable], errors='coerce')

# # Remove zero-variance columns
# print("Checking for zero-variance columns...")
# zero_var_cols = [col for col in model_columns if train_data[col].nunique() == 1]
# if zero_var_cols:
#     print(f"Dropping zero-variance columns: {zero_var_cols}")
#     train_data.drop(columns=zero_var_cols, inplace=True)
#     test_data.drop(columns=zero_var_cols, inplace=True)
#     model_columns = [col for col in model_columns if col not in zero_var_cols]

# # Check condition number before processing
# cond_number = np.linalg.cond(train_data[model_columns].values)
# print(f"Initial Condition number of X_train: {cond_number:.2e}")

# # Remove highly collinear features using VIF (excluding Longitude and Latitude)
# print("Checking for multicollinearity using VIF...")
# def calculate_vif(df):
#     vif_data = pd.DataFrame()
#     vif_data["Feature"] = df.columns
#     vif_data["VIF"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
#     return vif_data

# vif_columns = [col for col in model_columns if col not in ["Longitude", "Latitude"]]  # Keep these in the model
# while True:
#     vif_df = calculate_vif(train_data[vif_columns])
#     max_vif = vif_df["VIF"].max()
#     if max_vif > 4:  # Lower VIF threshold to remove high collinearity
#         feature_to_drop = vif_df.loc[vif_df["VIF"].idxmax(), "Feature"]
#         print(f"Dropping {feature_to_drop} due to high VIF ({max_vif:.2f})")
#         train_data.drop(columns=[feature_to_drop], inplace=True)
#         test_data.drop(columns=[feature_to_drop], inplace=True)
#         vif_columns.remove(feature_to_drop)
#     else:
#         break

# # Check for low-variance features again after VIF filtering
# low_var_cols = [col for col in vif_columns if train_data[col].std() < 1]
# if low_var_cols:
#     print(f"Dropping low-variance columns: {low_var_cols}")
#     train_data.drop(columns=low_var_cols, inplace=True)
#     test_data.drop(columns=low_var_cols, inplace=True)
#     vif_columns = [col for col in vif_columns if col not in low_var_cols]

# # Slightly jitter geographical coordinates to ensure uniqueness
# print("Applying jitter to geographical coordinates...")
# train_data[["Longitude", "Latitude"]] += np.random.normal(0, 0.0001, train_data[["Longitude", "Latitude"]].shape)
# test_data[["Longitude", "Latitude"]] += np.random.normal(0, 0.0001, test_data[["Longitude", "Latitude"]].shape)

# # Ensure minimum spatial uniqueness threshold
# unique_locations = len(train_data[["Longitude", "Latitude"]].drop_duplicates())
# if unique_locations / len(train_data) < 0.95:
#     print("❌ ERROR: Too many duplicate spatial points. GWR may fail.")
#     exit()

# # Scale features for numerical stability
# print("Scaling features for numerical stability...")
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(train_data[vif_columns])
# X_test_scaled = scaler.transform(test_data[vif_columns])
# print(f"Condition number after scaling: {cond(X_train_scaled):.2e}")

# # Extract geographical coordinates
# print("Extracting geographical coordinates...")
# coords_train = train_data[['Longitude', 'Latitude']].values
# coords_test = test_data[['Longitude', 'Latitude']].values

# # Check matrix rank before proceeding
# if matrix_rank(X_train_scaled) < X_train_scaled.shape[1]:
#     print("❌ ERROR: Feature matrix is still singular. Skipping GWR model.")
#     exit()

# # Select optimal bandwidth using cross-validation
# print("Selecting optimal bandwidth using cross-validation...")
# try:
#     selector = Sel_BW(coords_train, train_data[target_variable].values.reshape(-1, 1), X_train_scaled)
#     optimal_bandwidth = selector.search()
#     print(f"Optimal Bandwidth: {optimal_bandwidth}")
# except np.linalg.LinAlgError:
#     print("❌ ERROR: Matrix is still singular after preprocessing.")
#     print("Possible cause: Check feature correlation or spatial diversity.")
#     optimal_bandwidth = None

# # Fit the GWR model only if bandwidth selection was successful
# if optimal_bandwidth is not None:
#     print("Fitting the GWR model...")
#     gwr_model = GWR(coords_train, train_data[target_variable].values.reshape(-1, 1), X_train_scaled, bw=optimal_bandwidth)
#     gwr_results = gwr_model.fit()
#     print("GWR Model Fitted Successfully!")
#     gwr_results.summary()
    
#     # Evaluate model performance
#     print("Evaluating GWR model...")

#     print('Mean R2 =', gwr_results.R2)
#     print('AIC =', gwr_results.aic)
#     print('AICc =', gwr_results.aicc)

#     # Generate predictions correctly
#     scale = gwr_results.scale
#     residuals = gwr_results.resid_response

#     y_test_pred = gwr_model.predict(coords_test, X_test_scaled, scale, residuals)

#     # print(f"y_test_pred: {y_test_pred.predictions}")

#     # Compute evaluation metrics
#     mae = mean_absolute_error(test_data[target_variable], y_test_pred.predictions.flatten())
#     mse = mean_squared_error(test_data[target_variable], y_test_pred.predictions.flatten())
#     rmse = np.sqrt(mse)
#     r2 = r2_score(test_data[target_variable], y_test_pred.predictions.flatten())

#     # Print evaluation metrics
#     print(f"Mean Absolute Error (MAE): {mae:.2f}")
#     print(f"Mean Squared Error (MSE): {mse:.2f}")
#     print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
#     print(f"R² Score: {r2:.4f}")



# else:
#     print("Skipping GWR model fitting due to singular matrix issue.")

Loading new datasets...
Defining target variable and features...
Converting boolean columns to integers...
Ensuring all features are numeric...
Checking for zero-variance columns...
Dropping zero-variance columns: ['flat_type_MULTI-GENERATION', 'flat_model_Type S2', 'flat_model_Terrace', 'flat_model_Premium Maisonette', 'flat_model_Multi Generation', 'flat_model_Model A-Maisonette', 'flat_model_Improved-Maisonette', 'flat_model_Adjoined flat', 'flat_model_3Gen']
Initial Condition number of X_train: 1.58e+19
Checking for multicollinearity using VIF...


  vif = 1. / (1. - r_squared_i)


Dropping flat_type_EXECUTIVE due to high VIF (inf)
Dropping flat_model_Model A due to high VIF (727074.61)
Dropping year due to high VIF (264.68)
Dropping lease_commence_date due to high VIF (32.60)
Dropping flat_type_4 ROOM due to high VIF (27.74)
Dropping PreSchool_within_1km due to high VIF (24.47)
Dropping Primary_within_1km due to high VIF (12.41)
Dropping LTAMRTStation_within_1km due to high VIF (11.14)
Dropping price_per_sqm due to high VIF (10.94)
Dropping Hawker_within_1km due to high VIF (9.66)
Dropping NParks_within_1km due to high VIF (5.28)
Dropping month due to high VIF (4.61)
Dropping Secondary_within_1km due to high VIF (4.52)
Dropping storey_range_LE due to high VIF (4.22)
Dropping low-variance columns: ['town_YISHUN', 'town_WOODLANDS', 'town_TOA PAYOH', 'town_TAMPINES', 'town_SERANGOON', 'town_SENGKANG', 'town_SEMBAWANG', 'town_QUEENSTOWN', 'town_PUNGGOL', 'town_PASIR RIS', 'town_MARINE PARADE', 'town_KALLANG/WHAMPOA', 'town_JURONG WEST', 'town_JURONG EAST', 'town_HOU

In [None]:
# GWR PCA

import numpy as np
import pandas as pd
from mgwr.gwr import GWR
from mgwr.sel_bw import Sel_BW
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.decomposition import PCA

# Load new datasets
print("Loading new datasets...")
train_data = pd.read_csv("../../Data/normalized_test.csv")
test_data = pd.read_csv("../../Data/normalized_train.csv")

# Define target variable and features
print("Defining target variable and features...")
target_variable = "resale_price"
model_columns = [
    "month", "year", 
    "town_YISHUN", "town_WOODLANDS", "town_TOA PAYOH", "town_TAMPINES", "town_SERANGOON", "town_SENGKANG", "town_SEMBAWANG", "town_QUEENSTOWN", "town_PUNGGOL", "town_PASIR RIS", 
    "town_MARINE PARADE", "town_KALLANG/WHAMPOA", "town_JURONG WEST", "town_JURONG EAST", "town_HOUGANG", "town_GEYLANG", "town_CLEMENTI", "town_CHOA CHU KANG", "town_CENTRAL AREA", 
    "town_BUKIT TIMAH", "town_BUKIT PANJANG", "town_BUKIT MERAH", "town_BUKIT BATOK", "town_BISHAN", "town_BEDOK", 
    "flat_type_MULTI-GENERATION", "flat_type_EXECUTIVE", "flat_type_5 ROOM", "flat_type_4 ROOM", "flat_type_3 ROOM", "flat_type_2 ROOM", 
    "storey_range_LE", 
    "price_per_sqm", 
    "flat_model_Type S2", "flat_model_Type S1", "flat_model_Terrace", "flat_model_Standard", "flat_model_Simplified", "flat_model_Premium Maisonette", "flat_model_Premium Apartment Loft", 
    "flat_model_Premium Apartment", "flat_model_New Generation", "flat_model_Multi Generation", "flat_model_Model A2", "flat_model_Model A-Maisonette", "flat_model_Model A", 
    "flat_model_Maisonette", "flat_model_Improved-Maisonette", "flat_model_Improved", "flat_model_DBSS", "flat_model_Apartment", "flat_model_Adjoined flat", "flat_model_3Gen", 
    "lease_commence_date",
    "Latitude", "Longitude", 
    "LTAMRTStation_within_1km",
    "MallCoordinates_within_1km", "Hawker_within_1km", 
    "PreSchool_within_1km", "Primary_within_1km", "Secondary_within_1km", 
    "JuniorCollege_within_1km","MixedLevel_within_1km", 
    "NParks_within_1km", "Sports_within_1km", 
]

# Convert boolean columns to integers (0 or 1)
print("Converting boolean columns to integers...")
bool_cols = train_data.select_dtypes(include=['bool']).columns
train_data[bool_cols] = train_data[bool_cols].astype(int)
test_data[bool_cols] = test_data[bool_cols].astype(int)

# Convert all features and target variable to numeric
print("Ensuring all features are numeric...")
train_data[model_columns] = train_data[model_columns].apply(pd.to_numeric, errors='coerce')
test_data[model_columns] = test_data[model_columns].apply(pd.to_numeric, errors='coerce')
train_data[target_variable] = pd.to_numeric(train_data[target_variable], errors='coerce')
test_data[target_variable] = pd.to_numeric(test_data[target_variable], errors='coerce')

# Extract feature matrix and coordinates
X_train = train_data[model_columns].values
X_test = test_data[model_columns].values
coords_train = train_data[['Longitude', 'Latitude']].values
coords_test = test_data[['Longitude', 'Latitude']].values

# Apply PCA for Dimensionality Reduction (Preserving 90% Variance)
print("Applying PCA to reduce feature dimensions...")
pca = PCA(n_components=0.97)  # Preserve 90% variance
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)


print(f"Number of Principal Components Selected: {X_train_pca.shape[1]}")
print(f"Explained Variance: {sum(pca.explained_variance_ratio_):.2f}")

# kernels = ['gaussian', 'bisquare', 'exponential']
# adaptive_options = [True, False]

kernel ='gaussian'
adaptive = False

try:
        
        # Select optimal bandwidth using cross-validation
        print("Selecting optimal bandwidth using cross-validation...")
        # selector = Sel_BW(coords_train, train_data[target_variable].values.reshape(-1, 1), X_train_pca, kernel=kernel, fixed=not adaptive)
        
        # Randomly select sample_size rows for bandwidth selection
        sample_indices = np.random.choice(len(train_data), 5000, replace=False)

        selector = Sel_BW(
            coords_train[sample_indices], 
            train_data[target_variable].values[sample_indices].reshape(-1, 1), 
            X_train_pca[sample_indices],
            kernel=kernel, 
            fixed=not adaptive
        )
        
        opt_bw = selector.search()
        print(f"Optimal Bandwidth: {opt_bw}")
        
        print("Training the GWR model...")
        gwr_model = GWR(coords_train, train_data[target_variable].values.reshape(-1, 1), X_train_pca, bw=opt_bw, kernel=kernel)
        gwr_results = gwr_model.fit()
        print("GWR Model Fitted Successfully!", )
        gwr_results.summary()
        
        # Generate predictions correctly
        scale = gwr_results.scale
        residuals = gwr_results.resid_response
        y_pred = gwr_model.predict(coords_test, X_test_pca, scale, residuals)
        
        mae = mean_absolute_error(test_data[target_variable], y_pred.predictions.flatten())
        mse = mean_squared_error(test_data[target_variable], y_pred.predictions.flatten())
        rmse = np.sqrt(mse)
        r2 = r2_score(test_data[target_variable], y_pred.predictions.flatten())
        mape = np.mean(np.abs((test_data[target_variable] - y_pred.predictions.flatten()) / test_data[target_variable])) * 100
        aic = gwr_results.aic

        # Print evaluation metrics

        print(f"Mean Absolute Error (MAE): {mae}")
        print(f"Mean Absolute Percentage Error (MAPE): {mape}")
        print(f"Mean Squared Error (MSE): {mse}")
        print(f"Root Mean Squared Error (RMSE): {rmse}")
        print(f"R² Score: {r2}")

except Exception as e:
        print(f"Error occurred with kernel={kernel}, Adaptive={adaptive} - {str(e)}")
    

Loading new datasets...
Defining target variable and features...
Converting boolean columns to integers...
Ensuring all features are numeric...
Applying PCA to reduce feature dimensions...
Number of Principal Components Selected: 7
Explained Variance: 0.98
Selecting optimal bandwidth using cross-validation...
Optimal Bandwidth: 0.4
Training the GWR model...


In [19]:
print("Checking for NaN or Inf in train data...")
print(train_data.isna().sum())  # Count NaNs
print(np.isinf(train_data).sum())  # Count Inf values

print("Checking for NaN or Inf in PCA-transformed training data...")
print(np.isnan(X_train_pca).sum())  # Count NaNs
print(np.isinf(X_train_pca).sum())  # Count Inf values

print("Checking for NaN or Inf in X_train before PCA...")
print(np.isnan(X_train).sum())
print(np.isinf(X_train).sum())

print("Checking for NaN or Inf in target variable...")
print(np.isnan(train_data[target_variable]).sum())
print(np.isinf(train_data[target_variable]).sum())

print("Checking for NaN or Inf in coordinates...")
print(np.isnan(coords_train).sum())
print(np.isinf(coords_train).sum())


Checking for NaN or Inf in train data...
month                 0
year                  0
town                  0
town_LE               0
town_YISHUN           0
                     ..
MixedLevel_nearest    0
NParks_within_1km     0
NParks_nearest        0
Sports_within_1km     0
Sports_nearest        0
Length: 91, dtype: int64


TypeError: ufunc 'isinf' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [None]:
=

Checking for NaN or Inf values in PCA-transformed data...
NaN in X_train_pca: 0 | Inf in X_train_pca: 0
NaN in X_test_pca: 0 | Inf in X_test_pca: 0



GUASSIAN
Mean Absolute Error (MAE): 0.07886822348806316
Mean Squared Error (MSE): 0.010459145242201446
Root Mean Squared Error (RMSE): 0.10226996256086851
R² Score: 0.3386578854628215

BISQUARE
Mean Absolute Error (MAE): 0.07549681841499624
Mean Squared Error (MSE): 0.00996194775705299
Root Mean Squared Error (RMSE): 0.09980955744342819
R² Score: 0.3700961749746552

EXPONENTIAL
Mean Absolute Error (MAE): 0.07836792864710952
Mean Squared Error (MSE): 0.010350346436098199
Root Mean Squared Error (RMSE): 0.1017366523731649
R² Score: 0.34553734174927964

In [120]:
import numpy as np
import pandas as pd
from mgwr.gwr import GWR
from mgwr.sel_bw import Sel_BW
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.decomposition import PCA
from itertools import product

# Load new datasets
print("Loading new datasets...")
train_data = pd.read_csv("../../Data/normalized_test.csv")
test_data = pd.read_csv("../../Data/normalized_train.csv")

# Define target variable and features
print("Defining target variable and features...")
target_variable = "resale_price"
model_columns = [
    "month", "year", 
    "town_YISHUN", "town_WOODLANDS", "town_TOA PAYOH", "town_TAMPINES", "town_SERANGOON", "town_SENGKANG", "town_SEMBAWANG", "town_QUEENSTOWN", "town_PUNGGOL", "town_PASIR RIS", 
    "town_MARINE PARADE", "town_KALLANG/WHAMPOA", "town_JURONG WEST", "town_JURONG EAST", "town_HOUGANG", "town_GEYLANG", "town_CLEMENTI", "town_CHOA CHU KANG", "town_CENTRAL AREA", 
    "town_BUKIT TIMAH", "town_BUKIT PANJANG", "town_BUKIT MERAH", "town_BUKIT BATOK", "town_BISHAN", "town_BEDOK", 
    "flat_type_MULTI-GENERATION", "flat_type_EXECUTIVE", "flat_type_5 ROOM", "flat_type_4 ROOM", "flat_type_3 ROOM", "flat_type_2 ROOM", 
    "storey_range_LE", 
    "price_per_sqm", 
    "flat_model_Type S2", "flat_model_Type S1", "flat_model_Terrace", "flat_model_Standard", "flat_model_Simplified", "flat_model_Premium Maisonette", "flat_model_Premium Apartment Loft", 
    "flat_model_Premium Apartment", "flat_model_New Generation", "flat_model_Multi Generation", "flat_model_Model A2", "flat_model_Model A-Maisonette", "flat_model_Model A", 
    "flat_model_Maisonette", "flat_model_Improved-Maisonette", "flat_model_Improved", "flat_model_DBSS", "flat_model_Apartment", "flat_model_Adjoined flat", "flat_model_3Gen", 
    "lease_commence_date",
    "Latitude", "Longitude", 
    "LTAMRTStation_within_1km",
    "MallCoordinates_within_1km", "Hawker_within_1km", 
    "PreSchool_within_1km", "Primary_within_1km", "Secondary_within_1km", 
    "JuniorCollege_within_1km","MixedLevel_within_1km", 
    "NParks_within_1km", "Sports_within_1km", 
]

# Convert boolean columns to integers (0 or 1)
print("Converting boolean columns to integers...")
bool_cols = train_data.select_dtypes(include=['bool']).columns
train_data[bool_cols] = train_data[bool_cols].astype(int)
test_data[bool_cols] = test_data[bool_cols].astype(int)

# Convert all features and target variable to numeric
print("Ensuring all features are numeric...")
train_data[model_columns] = train_data[model_columns].apply(pd.to_numeric, errors='coerce')
test_data[model_columns] = test_data[model_columns].apply(pd.to_numeric, errors='coerce')
train_data[target_variable] = pd.to_numeric(train_data[target_variable], errors='coerce')
test_data[target_variable] = pd.to_numeric(test_data[target_variable], errors='coerce')

# Extract feature matrix and coordinates
X_train = train_data[model_columns].values
X_test = test_data[model_columns].values
coords_train = train_data[['Longitude', 'Latitude']].values
coords_test = test_data[['Longitude', 'Latitude']].values

# Define PCA variance thresholds and sample sizes to test
pca_variance_options = [0.95, 0.97, 0.99]
sample_sizes = [8000, 9000, 10000]

# Store results
results = []

for pca_variance, sample_size in product(pca_variance_options, sample_sizes):
    print(f"Testing PCA Variance = {pca_variance}, Sample Size = {sample_size}")

    # Apply PCA for Dimensionality Reduction
    print("Applying PCA to reduce feature dimensions...")
    pca = PCA(n_components=pca_variance)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    num_components = X_train_pca.shape[1]
    explained_variance = sum(pca.explained_variance_ratio_)

    print(f"Number of Principal Components Selected: {num_components}")
    print(f"Explained Variance: {explained_variance:.2f}")

    # Select optimal bandwidth using cross-validation
    print("Selecting optimal bandwidth using cross-validation...")
    try:
        # Randomly select sample_size rows for bandwidth selection
        sample_indices = np.random.choice(len(train_data), sample_size, replace=False)

        selector = Sel_BW(
            coords_train[sample_indices], 
            train_data[target_variable].values[sample_indices].reshape(-1, 1), 
            X_train_pca[sample_indices]
        )

        optimal_bandwidth = selector.search()
        print(f"Optimal Bandwidth: {optimal_bandwidth}")

        # Store results
        results.append({
            "PCA Variance": pca_variance,
            "Sample Size": sample_size,
            "Num Components": num_components,
            "Explained Variance": explained_variance,
            "Optimal Bandwidth": optimal_bandwidth
        })

    except np.linalg.LinAlgError:
        print("❌ ERROR: Matrix is still singular after preprocessing.")
        results.append({
            "PCA Variance": pca_variance,
            "Sample Size": sample_size,
            "Num Components": num_components,
            "Explained Variance": explained_variance,
            "Optimal Bandwidth": "Singular Matrix Error"
        })

# Save results to CSV
results_df = pd.DataFrame(results)
results_df.to_csv("gwr_pca_bandwidth_experiment.csv", index=False)
print("✅ Experimentation results saved to gwr_pca_bandwidth_experiment.csv")


Loading new datasets...
Defining target variable and features...
Converting boolean columns to integers...
Ensuring all features are numeric...
Testing PCA Variance = 0.95, Sample Size = 8000
Applying PCA to reduce feature dimensions...
Number of Principal Components Selected: 6
Explained Variance: 0.96
Selecting optimal bandwidth using cross-validation...
Optimal Bandwidth: 56.0
Testing PCA Variance = 0.95, Sample Size = 9000
Applying PCA to reduce feature dimensions...
Number of Principal Components Selected: 6
Explained Variance: 0.96
Selecting optimal bandwidth using cross-validation...
❌ ERROR: Matrix is still singular after preprocessing.
Testing PCA Variance = 0.95, Sample Size = 10000
Applying PCA to reduce feature dimensions...
Number of Principal Components Selected: 6
Explained Variance: 0.96
Selecting optimal bandwidth using cross-validation...


KeyboardInterrupt: 

In [24]:
import numpy as np
import pandas as pd
from mgwr.gwr import GWR
from mgwr.sel_bw import Sel_BW
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.decomposition import PCA
from itertools import product
import dask
from dask.distributed import Client

# Initialize Dask client for parallel processing
client = Client()

# Load new datasets
print("Loading new datasets...")
train_data = pd.read_csv("../../Data/normalized_test.csv")
test_data = pd.read_csv("../../Data/normalized_train.csv")

# Define target variable and features
print("Defining target variable and features...")
target_variable = "resale_price"
model_columns = [
    "month", "year", 
    "town_YISHUN", "town_WOODLANDS", "town_TOA PAYOH", "town_TAMPINES", "town_SERANGOON", "town_SENGKANG", "town_SEMBAWANG", "town_QUEENSTOWN", "town_PUNGGOL", "town_PASIR RIS", 
    "town_MARINE PARADE", "town_KALLANG/WHAMPOA", "town_JURONG WEST", "town_JURONG EAST", "town_HOUGANG", "town_GEYLANG", "town_CLEMENTI", "town_CHOA CHU KANG", "town_CENTRAL AREA", 
    "town_BUKIT TIMAH", "town_BUKIT PANJANG", "town_BUKIT MERAH", "town_BUKIT BATOK", "town_BISHAN", "town_BEDOK", 
    "flat_type_MULTI-GENERATION", "flat_type_EXECUTIVE", "flat_type_5 ROOM", "flat_type_4 ROOM", "flat_type_3 ROOM", "flat_type_2 ROOM", 
    "storey_range_LE", 
    "price_per_sqm", 
    "flat_model_Type S2", "flat_model_Type S1", "flat_model_Terrace", "flat_model_Standard", "flat_model_Simplified", "flat_model_Premium Maisonette", "flat_model_Premium Apartment Loft", 
    "flat_model_Premium Apartment", "flat_model_New Generation", "flat_model_Multi Generation", "flat_model_Model A2", "flat_model_Model A-Maisonette", "flat_model_Model A", 
    "flat_model_Maisonette", "flat_model_Improved-Maisonette", "flat_model_Improved", "flat_model_DBSS", "flat_model_Apartment", "flat_model_Adjoined flat", "flat_model_3Gen", 
    "lease_commence_date",
    "Latitude", "Longitude", 
    "LTAMRTStation_within_1km",
    "MallCoordinates_within_1km", "Hawker_within_1km", 
    "PreSchool_within_1km", "Primary_within_1km", "Secondary_within_1km", 
    "JuniorCollege_within_1km","MixedLevel_within_1km", 
    "NParks_within_1km", "Sports_within_1km", 
]

# Convert boolean columns to integers (0 or 1)
print("Converting boolean columns to integers...")
bool_cols = train_data.select_dtypes(include=['bool']).columns
train_data[bool_cols] = train_data[bool_cols].astype(int)
test_data[bool_cols] = test_data[bool_cols].astype(int)

# Convert all features and target variable to numeric
print("Ensuring all features are numeric...")
train_data[model_columns] = train_data[model_columns].apply(pd.to_numeric, errors='coerce')
test_data[model_columns] = test_data[model_columns].apply(pd.to_numeric, errors='coerce')
train_data[target_variable] = pd.to_numeric(train_data[target_variable], errors='coerce')
test_data[target_variable] = pd.to_numeric(test_data[target_variable], errors='coerce')

# Extract feature matrix and coordinates
X_train = train_data[model_columns].values
X_test = test_data[model_columns].values
coords_train = train_data[['Longitude', 'Latitude']].values
coords_test = test_data[['Longitude', 'Latitude']].values

# Apply PCA to retain 97% variance
print("Applying PCA for 0.97 variance...")
pca = PCA(n_components=0.97)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

print(f"Number of Principal Components Selected: {X_train_pca.shape[1]}")
print(f"Explained Variance: {sum(pca.explained_variance_ratio_):.2f}")


# kernels = ['gaussian', 'bisquare', 'exponential']
# adaptive_options = [True, False]

# Define hyperparameter grid
kernels = ['gaussian']
adaptive_options = [False]

print("Traning and Evaluation...")
# Function to train and evaluate a GWR model
@dask.delayed
def train_gwr(kernel, adaptive):
    print(f"Testing kernel: {kernel}, Adaptive: {adaptive}", flush=True)
    try:
        # selector = Sel_BW(coords_train, train_data[target_variable].values.reshape(-1, 1), X_train_pca, kernel=kernel, fixed=not adaptive)
        
        # Randomly select sample_size rows for bandwidth selection
        sample_indices = np.random.choice(len(train_data), 5000, replace=False)

        selector = Sel_BW(
            coords_train[sample_indices], 
            train_data[target_variable].values[sample_indices].reshape(-1, 1), 
            X_train_pca[sample_indices],
            kernel=kernel, 
            fixed=not adaptive
        )
        
        opt_bw = selector.search()
        
        print("Fitting the GWR model...", flush=True)
        gwr_model = GWR(coords_train, train_data[target_variable].values.reshape(-1, 1), X_train_pca, bw=opt_bw, kernel=kernel, fixed=not adaptive)
        gwr_results = gwr_model.fit()
        print("GWR Model Fitted Successfully!", flush=True)
        gwr_results.summary()
        
        # Generate predictions correctly
        scale = gwr_results.scale
        residuals = gwr_results.resid_response
        y_pred = gwr_model.predict(coords_test, X_test_pca, scale, residuals)
        
        mae = mean_absolute_error(test_data[target_variable], y_pred.predictions.flatten())
        mse = mean_squared_error(test_data[target_variable], y_pred.predictions.flatten())
        rmse = np.sqrt(mse)
        r2 = r2_score(test_data[target_variable], y_pred.predictions.flatten())
        mape = np.mean(np.abs((test_data[target_variable] - y_pred.predictions.flatten()) / test_data[target_variable])) * 100
        aic = gwr_results.aic
    except Exception as e:
        print(f"Error occurred with kernel: {kernel}, Adaptive: {adaptive}: {str(e)}", flush=True)
        opt_bw, mae, mse, rmse, r2, mape, aic = -1, -1, -1, -1, -1, -1, -1
    
    return {
        "Kernel": kernel,
        "Adaptive": adaptive,
        "Bandwidth": opt_bw,
        "MAE": mae,
        "MSE": mse,
        "RMSE": rmse,
        "R²": r2,
        "MAPE": mape,
        "AIC": aic
    }

# Parallel grid search using Dask
results = dask.compute(*[train_gwr(kernel, adaptive) for kernel, adaptive in product(kernels, adaptive_options)])

# Save results to CSV
results_df = pd.DataFrame(results)
results_df.to_csv("gwr_results_1.csv", index=False)

# Identify best model based on AIC
best_result = min(results, key=lambda x: x["AIC"])
print(f"Best Kernel: {best_result['Kernel']}, Adaptive: {best_result['Adaptive']}, Optimal Bandwidth: {best_result['Bandwidth']}")
print(f"Best AIC Score: {best_result['AIC']}")

# Shutdown the Dask client to free resources
client.shutdown()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 59812 instead


Loading new datasets...
Defining target variable and features...
Converting boolean columns to integers...
Ensuring all features are numeric...
Applying PCA for 0.97 variance...
Number of Principal Components Selected: 7
Explained Variance: 0.98
Traning and Evaluation...


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
Task exception was never retrieved
future: <Task finished name='Task-9136864' coro=<Client._gather.<locals>.wait() done, defined at c:\Users\Admin\AppData\Local\Programs\Python\Python312\Lib\site-packages\distributed\client.py:2394> exception=AllExit()>
Traceback (most recent call last):
  File "c:\Users\Admin\AppData\Local\Programs\Python\Python312\Lib\site-packages\distributed\client.py", line 2403, in wait
    raise AllExit()
distributed.client.AllExit


KeyboardInterrupt: 