<a href="https://colab.research.google.com/github/kush1305/Satellite-Imagery-Based-Property-Valuation-/blob/main/model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install xgboost --quiet

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

# Fill in these three locations before running the cell.
TRAIN_PATH = ""  # CSV with the training rows ("price", tabular and cnn_* columns)
TEST_PATH = ""   # CSV with the test rows ("id", tabular and cnn_* columns)
SUB_PATH = ""    # destination CSV for the predicted prices

# Load the training frame first, then the test frame.
df, df_test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# Regression target: log1p(price); predictions are mapped back with expm1.
y = np.log1p(df["price"])

# Hand-picked tabular columns; the order fixes the column order of the
# feature matrices built from them.
tabular_features = [
    "bedrooms",
    "bathrooms",
    "sqft_living",
    "sqft_lot",
    "floors",
    "condition",
    "grade",
    "view",
    "waterfront",
    "sqft_living15",
    "sqft_lot15",
    "green_cover_percent",
    "urban_density_percent",
]
X_tab, X_tab_test = df[tabular_features], df_test[tabular_features]

# Every training column prefixed with "cnn_" is treated as an image feature;
# the test frame is expected to carry the same columns.
cnn_features = [name for name in df.columns if name.startswith("cnn_")]
X_cnn, X_cnn_test = df[cnn_features], df_test[cnn_features]

# Hold out 20% of the rows for validation. Tabular features, CNN features and
# the target are split in a single call so their rows stay aligned.
X_tab_train, X_tab_val, X_cnn_train, X_cnn_val, y_train, y_val = train_test_split(
    X_tab, X_cnn, y, test_size=0.2, random_state=42
)

# Standardize the CNN features with statistics fitted on the training split
# only, then apply the same transform to the validation and test rows.
scaler = StandardScaler()
X_cnn_train = scaler.fit_transform(X_cnn_train)
X_cnn_val   = scaler.transform(X_cnn_val)
X_cnn_test  = scaler.transform(X_cnn_test)

# Compress the CNN features to at most 100 principal components. PCA rejects
# n_components larger than min(n_samples, n_features) of the data it is fitted
# on, so cap the request instead of failing on small datasets or short
# embeddings. With >= 100 rows and >= 100 cnn_ columns this is exactly 100.
n_pca_components = min(100, *X_cnn_train.shape)
pca = PCA(n_components=n_pca_components, random_state=42)
X_cnn_train_pca = pca.fit_transform(X_cnn_train)
X_cnn_val_pca   = pca.transform(X_cnn_val)
X_cnn_test_pca  = pca.transform(X_cnn_test)

# Final design matrices: raw tabular columns followed by the PCA components.
X_train = np.hstack([X_tab_train.values, X_cnn_train_pca])
X_val   = np.hstack([X_tab_val.values, X_cnn_val_pca])
X_test  = np.hstack([X_tab_test.values, X_cnn_test_pca])

# Hyper-parameters for the boosted-tree regressor, kept together so they are
# easy to inspect or tweak in one place.
xgb_params = {
    "n_estimators": 900,
    "learning_rate": 0.03,
    "max_depth": 7,
    "min_child_weight": 1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": "reg:squarederror",
    "random_state": 42,
    "n_jobs": -1,
}
model = xgb.XGBRegressor(**xgb_params)

# Fit on the training split; y_train holds log1p(price) values.
model.fit(X_train, y_train)

# Score the held-out split. y_val is log1p(price), so every metric below is
# measured in log space rather than in price units.
y_pred = model.predict(X_val)

mse  = mean_squared_error(y_val, y_pred)
rmse = mse ** 0.5
mae  = mean_absolute_error(y_val, y_pred)
r2   = r2_score(y_val, y_pred)

for label, value in (("RMSE:", rmse), ("MAE :", mae), ("R2  :", r2)):
    print(label, value)

# Predict in log space, then undo the log1p transform applied to the target.
test_pred_log = model.predict(X_test)
test_pred = np.expm1(test_pred_log)

# Two-column submission: the test ids alongside the predicted prices.
submission = df_test[["id"]].assign(price=test_pred)

submission.to_csv(SUB_PATH, index=False)
print("Saved:", SUB_PATH)
