In [8]:
pip install pandas numpy mgwr scikit-learn statsmodels


Collecting statsmodels
  Downloading statsmodels-0.14.4-cp311-cp311-win_amd64.whl.metadata (9.5 kB)
Collecting patsy>=0.5.6 (from statsmodels)
  Downloading patsy-1.0.1-py2.py3-none-any.whl.metadata (3.3 kB)
Downloading statsmodels-0.14.4-cp311-cp311-win_amd64.whl (9.9 MB)
   ---------------------------------------- 0.0/9.9 MB ? eta -:--:--
   --- ------------------------------------ 0.8/9.9 MB 4.2 MB/s eta 0:00:03
   ------ --------------------------------- 1.6/9.9 MB 4.2 MB/s eta 0:00:02
   --------- ------------------------------ 2.4/9.9 MB 4.2 MB/s eta 0:00:02
   ------------ --------------------------- 3.1/9.9 MB 4.2 MB/s eta 0:00:02
   ----------------- ---------------------- 4.2/9.9 MB 4.1 MB/s eta 0:00:02
   -------------------- ------------------- 5.0/9.9 MB 4.1 MB/s eta 0:00:02
   ----------------------- ---------------- 5.8/9.9 MB 4.2 MB/s eta 0:00:01
   --------------------------- ------------ 6.8/9.9 MB 4.2 MB/s eta 0:00:01
   ------------------------------ --------- 7.6/9


[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: C:\Users\SChoy\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
import numpy as np
import pandas as pd
from numpy.linalg import cond, matrix_rank
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from mgwr.gwr import GWR
from mgwr.sel_bw import Sel_BW

# Load new datasets
print("Loading new datasets...")
train_data = pd.read_csv("new_normalized_train.csv").sample(5000, random_state=42)
test_data = pd.read_csv("new_normalized_test.csv").sample(5000, random_state=42)

# Define target variable and features
print("Defining target variable and features...")
target_variable = "resale_price"
model_columns = [
    "month", "year", "town_LE", "flat_type_LE", "storey_range_LE",  
    "price_per_sqm", "flat_model_LE", "lease_commence_date", "Latitude", "Longitude", 
    "LTAMRTStation_within_1km", "MallCoordinates_within_1km", "Hawker_within_1km", 
    "PreSchool_within_1km", "Primary_within_1km", "Secondary_within_1km", 
    "JuniorCollege_within_1km", "MixedLevel_within_1km", "NParks_within_1km", "Sports_within_1km"
]

# Convert features and target variable to numeric
print("Converting features and target variable to numeric...")
train_data[model_columns] = train_data[model_columns].apply(pd.to_numeric, errors='coerce')
test_data[model_columns] = test_data[model_columns].apply(pd.to_numeric, errors='coerce')
train_data[target_variable] = pd.to_numeric(train_data[target_variable], errors='coerce')
test_data[target_variable] = pd.to_numeric(test_data[target_variable], errors='coerce')

# Remove zero-variance columns
print("Checking for zero-variance columns...")
zero_var_cols = [col for col in model_columns if train_data[col].nunique() == 1]
if zero_var_cols:
    print(f"Dropping zero-variance columns: {zero_var_cols}")
    train_data.drop(columns=zero_var_cols, inplace=True)
    test_data.drop(columns=zero_var_cols, inplace=True)
    model_columns = [col for col in model_columns if col not in zero_var_cols]

# Check condition number before processing
cond_number = np.linalg.cond(train_data[model_columns].values)
print(f"Initial Condition number of X_train: {cond_number:.2e}")

# Remove highly collinear features using VIF (excluding Longitude and Latitude)
print("Checking for multicollinearity using VIF...")
def calculate_vif(df):
    vif_data = pd.DataFrame()
    vif_data["Feature"] = df.columns
    vif_data["VIF"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    return vif_data

vif_columns = [col for col in model_columns if col not in ["Longitude", "Latitude"]]  # Keep these in the model
while True:
    vif_df = calculate_vif(train_data[vif_columns])
    max_vif = vif_df["VIF"].max()
    if max_vif > 4:  # Lower VIF threshold to remove high collinearity
        feature_to_drop = vif_df.loc[vif_df["VIF"].idxmax(), "Feature"]
        print(f"Dropping {feature_to_drop} due to high VIF ({max_vif:.2f})")
        train_data.drop(columns=[feature_to_drop], inplace=True)
        test_data.drop(columns=[feature_to_drop], inplace=True)
        vif_columns.remove(feature_to_drop)
    else:
        break

# Check for low-variance features again after VIF filtering
low_var_cols = [col for col in vif_columns if train_data[col].std() < 1e-4]
if low_var_cols:
    print(f"Dropping low-variance columns: {low_var_cols}")
    train_data.drop(columns=low_var_cols, inplace=True)
    test_data.drop(columns=low_var_cols, inplace=True)
    vif_columns = [col for col in vif_columns if col not in low_var_cols]

# Slightly jitter geographical coordinates to ensure uniqueness
print("Applying jitter to geographical coordinates...")
train_data[["Longitude", "Latitude"]] += np.random.normal(0, 0.0001, train_data[["Longitude", "Latitude"]].shape)
test_data[["Longitude", "Latitude"]] += np.random.normal(0, 0.0001, test_data[["Longitude", "Latitude"]].shape)

# Ensure minimum spatial uniqueness threshold
unique_locations = len(train_data[["Longitude", "Latitude"]].drop_duplicates())
if unique_locations / len(train_data) < 0.95:
    print("❌ ERROR: Too many duplicate spatial points. GWR may fail.")
    exit()

# Scale features for numerical stability
print("Scaling features for numerical stability...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_data[vif_columns])
X_test_scaled = scaler.transform(test_data[vif_columns])
print(f"Condition number after scaling: {cond(X_train_scaled):.2e}")

# Extract geographical coordinates
print("Extracting geographical coordinates...")
coords_train = train_data[['Longitude', 'Latitude']].values
coords_test = test_data[['Longitude', 'Latitude']].values

# Check matrix rank before proceeding
if matrix_rank(X_train_scaled) < X_train_scaled.shape[1]:
    print("❌ ERROR: Feature matrix is still singular. Skipping GWR model.")
    exit()

# Select optimal bandwidth using cross-validation
print("Selecting optimal bandwidth using cross-validation...")
try:
    selector = Sel_BW(coords_train, train_data[target_variable].values.reshape(-1, 1), X_train_scaled)
    optimal_bandwidth = selector.search()
    print(f"Optimal Bandwidth: {optimal_bandwidth}")
except np.linalg.LinAlgError:
    print("❌ ERROR: Matrix is still singular after preprocessing.")
    print("Possible cause: Check feature correlation or spatial diversity.")
    optimal_bandwidth = None

# Fit the GWR model only if bandwidth selection was successful
if optimal_bandwidth is not None:
    print("Fitting the GWR model...")
    gwr_model = GWR(coords_train, train_data[target_variable].values.reshape(-1, 1), X_train_scaled, bw=optimal_bandwidth)
    gwr_results = gwr_model.fit()
    print("GWR Model Fitted Successfully!")
else:
    print("Skipping GWR model fitting due to singular matrix issue.")

Loading new datasets...
Defining target variable and features...
Converting features and target variable to numeric...
Checking for zero-variance columns...
Initial Condition number of X_train: 1.51e+05
Checking for multicollinearity using VIF...
Dropping year due to high VIF (55.94)
Dropping PreSchool_within_1km due to high VIF (20.27)
Dropping lease_commence_date due to high VIF (12.15)
Dropping flat_type_LE due to high VIF (9.53)
Dropping Primary_within_1km due to high VIF (8.78)
Dropping price_per_sqm due to high VIF (6.65)
Dropping flat_model_LE due to high VIF (5.76)
Applying jitter to geographical coordinates...
Scaling features for numerical stability...
Condition number after scaling: 2.01e+00
Extracting geographical coordinates...
Selecting optimal bandwidth using cross-validation...
❌ ERROR: Matrix is still singular after preprocessing.
Possible cause: Check feature correlation or spatial diversity.
Skipping GWR model fitting due to singular matrix issue.
