# Pipeline

In [10]:
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, classification_report

### SpatialRefiner: refine the score given by the Random Forest by looking at the neighbours using KNN

In [None]:
class SpatialRefiner(BaseEstimator, TransformerMixin):
    """
    Taking into consideration the neighbors with KNN to refine the output
    """
    def __init__(self, n_neighbors=5):
        self.n_neighbors = n_neighbors
        self.knn = KNeighborsClassifier(n_neighbors=n_neighbors)

    def fit(self, X_spatial, y):
        self.knn.fit(X_spatial, y)
        return self

    def transform(self, X):
        return X

    def refine(self, x_spatial, y_all):
        """
        Compare predictions from RF with neighbors
        x_spatial → samples' coordinates to be refined
        y_all → labels
        """
        refined = []
        neighbors = self.knn.kneighbors(x_spatial, return_distance=False)
        for neigh_idx in neighbors:
            neighbor_classes = y_all[neigh_idx]
            new_class = np.bincount(neighbor_classes).argmax()
            refined.append(new_class)
        return np.array(refined)


In [17]:
# EXAMPLE DATA - TO BE SUBSTITUTED WITH OURS
np.random.seed(42)
n_samples = 500

# Intrinsic features (ex: "vani", materials, m^2, ...)
x_intrinsic = np.random.rand(n_samples, 10)

# Spatial coordinates (lat, long)
x_spatial = np.random.rand(n_samples, 2)

# Labels
y = np.random.randint(0, 4, size=n_samples)


In [18]:
# Train - test split for the Random Forest training
X_intr_train, X_intr_test, X_spat_train, X_spat_test, y_train, y_test = train_test_split(
    x_intrinsic, x_spatial, y, test_size=0.2, random_state=42
)

In [19]:
# Pipeline for intrinsic data
# 1. Scale the features so that they have the same scale (ex 4 "vani" & 137 m^2 don't have the same scale...)
# 2. With PCA select the X most importat features (the one that provide the most information)
# 3. Train the Random Forest to predict the class 
intrinsic_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=8)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Training
intrinsic_pipeline.fit(X_intr_train, y_train)

# Predict
y_pred_base = intrinsic_pipeline.predict(X_intr_test)


In [None]:
# Spatial Refinement

# Fit the KNN on the whole dataset
spatial_refiner = SpatialRefiner(n_neighbors=7)
spatial_refiner.fit(x_spatial, y)

# Refine predictions of the test set - in our case will be of the new house.
y_pred_refined = spatial_refiner.refine(X_spat_test, y_pred_base, y)


In [21]:
print("Accuracy (base RF):", accuracy_score(y_test, y_pred_base))
print("Accuracy (refined with KNN):", accuracy_score(y_test, y_pred_refined))

print("\nClassification report (refined):")
print(classification_report(y_test, y_pred_refined))


Accuracy (base RF): 0.31
Accuracy (refined with KNN): 0.42

Classification report (refined):
              precision    recall  f1-score   support

           0       0.44      0.57      0.50        21
           1       0.43      0.45      0.44        29
           2       0.31      0.20      0.24        25
           3       0.44      0.48      0.46        25

    accuracy                           0.42       100
   macro avg       0.41      0.42      0.41       100
weighted avg       0.41      0.42      0.41       100

