# **CLASSIFICATION**

## Imports

In [93]:
# Bascic imports
import pandas as pd
import numpy as np

# Model imports
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Preprocessing imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_validate, StratifiedKFold

## Load Dataset

In [94]:
data = pd.read_csv('../data/cleaned/data_final.csv')

# We drop the columns that we won't use for classification
data = data.drop(columns=["url", "description"])
data.head()

Unnamed: 0,price,zone,neighborhood,built_area,usable_area,bedrooms,bathrooms,floor,exterior,elevator,garage,storage_room,balcony,new,condition,year,agency,consumption_label,emissions_label
0,440000.0,deusto,"La Ribera-Ibarrekolanda, Bilbao",76.0,70.0,2,2,3.0,exterior,True,True,True,True,False,Buen estado,2025.0,ORDUNTE Inmobiliaria,A,A
1,442000.0,deusto,"La Ribera-Ibarrekolanda, Bilbao",82.0,67.104025,2,2,1.0,exterior,True,True,False,True,True,Nuevo,2025.0,LOIOLA GESTIÓN INMOBILIARIA,A,A
2,381000.0,deusto,"La Ribera-Ibarrekolanda, Bilbao",58.0,44.04577,1,1,4.0,exterior,True,True,False,True,True,Nuevo,2025.0,LOIOLA GESTIÓN INMOBILIARIA,A,A
3,575000.0,deusto,"La Ribera-Ibarrekolanda, Bilbao",104.0,88.240759,3,2,1.0,exterior,True,True,False,True,True,Nuevo,2025.0,LOIOLA GESTIÓN INMOBILIARIA,A,A
4,306000.0,deusto,"La Ribera-Ibarrekolanda, Bilbao",62.0,47.888812,1,1,1.0,exterior,True,True,False,False,True,Nuevo,2025.0,Loiola,A,A


## General Configuration

### Defining the Model

In [95]:
# We specify categorical columns for preprocessing
cat_cols = ["exterior", "condition", "agency", "consumption_label", "emissions_label"]

# Métricas para evaluación (ajustadas para clasificación multiclase)
scoring = {
    'accuracy': 'accuracy',
    'recall_weighted': 'recall_weighted',
    'precision_weighted': 'precision_weighted',
    'f1_weighted': 'f1',
    'matthews': "matthews_corrcoef",
    'balanced_accuracy': 'balanced_accuracy'
}

# Probar múltiples modelos
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Random Forest (tuned)': RandomForestClassifier(n_estimators=300, max_depth=15, min_samples_split=5, random_state=42, n_jobs=-1),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42, multi_class='multinomial'),
    'Naive Bayes': GaussianNB(),
#    'SVM (Linear)': SVC(kernel='linear', random_state=42, probability=True),
#    'SVM (RBF)': SVC(kernel='rbf', random_state=42, probability=True),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# Usar StratifiedKFold para mantener la distribución de clases
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

## Based on Zone

First we will try to classify the houses according to their zone

### Modify data

In [96]:
# We drop the neighbourhood column to classify by zone
data_zone = data.drop(columns=["neighborhood"])

data_zone = data_zone.drop(columns=["agency", "consumption_label", "emissions_label"])

cat_cols = ["exterior", "condition"]

# We also split the data into features and target
X_zone = data_zone.drop(columns=["zone"])
y_zone = data_zone["zone"]

# Check everything is ok
print(y_zone.head())
X_zone.head()

0    deusto
1    deusto
2    deusto
3    deusto
4    deusto
Name: zone, dtype: object


Unnamed: 0,price,built_area,usable_area,bedrooms,bathrooms,floor,exterior,elevator,garage,storage_room,balcony,new,condition,year
0,440000.0,76.0,70.0,2,2,3.0,exterior,True,True,True,True,False,Buen estado,2025.0
1,442000.0,82.0,67.104025,2,2,1.0,exterior,True,True,False,True,True,Nuevo,2025.0
2,381000.0,58.0,44.04577,1,1,4.0,exterior,True,True,False,True,True,Nuevo,2025.0
3,575000.0,104.0,88.240759,3,2,1.0,exterior,True,True,False,True,True,Nuevo,2025.0
4,306000.0,62.0,47.888812,1,1,1.0,exterior,True,True,False,False,True,Nuevo,2025.0


### Apply the Models

In [99]:
# ESTRATEGIA 1:
print("="*80)
print("ESTRATEGIA 1: Eliminar todas las filas con valores nulos")
print("="*80)

X_consumption_1encoded = pd.get_dummies(X_zone, columns=cat_cols)
y_consumption_1encoded = LabelEncoder().fit_transform(y_zone)

print(f"\nNúmero de muestras: {len(X_consumption_1encoded)}")
print(f"Número de features: {X_consumption_1encoded.shape[1]}")
print(f"Distribución de clases: {np.bincount(y_consumption_1encoded)}")

# Probar múltiples modelos
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Random Forest (tuned)': RandomForestClassifier(n_estimators=300, max_depth=15, min_samples_split=5, random_state=42, n_jobs=-1),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42, multi_class='multinomial'),
    'Naive Bayes': GaussianNB(),
#    'SVM (Linear)': SVC(kernel='linear', random_state=42, probability=True),
#    'SVM (RBF)': SVC(kernel='rbf', random_state=42, probability=True),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
}

results_subset1 = {}
for name, model in models.items():
    print(f"\n{name}:")
    scores = cross_validate(model, X_consumption_1encoded, y_consumption_1encoded, 
                           cv=cv_strategy, scoring=scoring, return_train_score=False, n_jobs=-1)
    
    results_subset1[name] = {}
    for metric_name, metric_scores in scores.items():
        if metric_name.startswith('test_'):
            metric = metric_name.replace('test_', '')
            results_subset1[name][metric] = (metric_scores.mean(), metric_scores.std())
            print(f"  {metric}: {metric_scores.mean():.4f} (+/- {metric_scores.std():.4f})")

print("\n" + "="*80)

ESTRATEGIA 1: Eliminar todas las filas con valores nulos

Número de muestras: 1230
Número de features: 17
Distribución de clases: [237 127  76  61  90 156 136  34 167  33 113]

Random Forest:
  accuracy: 0.6285 (+/- 0.0190)
  recall_weighted: 0.6285 (+/- 0.0190)
  precision_weighted: 0.6370 (+/- 0.0185)
  f1_weighted: nan (+/- nan)
  matthews: 0.5780 (+/- 0.0218)
  balanced_accuracy: 0.5852 (+/- 0.0192)

Random Forest (tuned):
  accuracy: 0.6260 (+/- 0.0217)
  recall_weighted: 0.6260 (+/- 0.0217)
  precision_weighted: 0.6404 (+/- 0.0207)
  f1_weighted: nan (+/- nan)
  matthews: 0.5755 (+/- 0.0247)
  balanced_accuracy: 0.5737 (+/- 0.0254)

Logistic Regression:
  accuracy: 0.2984 (+/- 0.0202)
  recall_weighted: 0.2984 (+/- 0.0202)
  precision_weighted: 0.1557 (+/- 0.0212)
  f1_weighted: nan (+/- nan)
  matthews: 0.1871 (+/- 0.0273)
  balanced_accuracy: 0.1746 (+/- 0.0145)

Naive Bayes:
  accuracy: 0.3317 (+/- 0.0208)
  recall_weighted: 0.3317 (+/- 0.0208)
  precision_weighted: 0.3323 (+/

## Based on Neighbourhood

If we achieve good results trying to classify by zone, we will try to be more specific and classify according to the neighbourhood

### Modify data

In [15]:
# We drop the zone column to classify by neighbourhood (as it would be too easy otherwise)
data_neighbourhood = data.drop(columns=["zone"])

# We also split the data into features and target
X_neighbourhood = data_neighbourhood.drop(columns=["neighborhood"])
y_neighbourhood = data_neighbourhood["neighborhood"]
# Check everything is ok
print(y_neighbourhood.head())
X_neighbourhood.head()

0    La Ribera-Ibarrekolanda, Bilbao
1    La Ribera-Ibarrekolanda, Bilbao
2    La Ribera-Ibarrekolanda, Bilbao
3    La Ribera-Ibarrekolanda, Bilbao
4    La Ribera-Ibarrekolanda, Bilbao
Name: neighborhood, dtype: object


Unnamed: 0,price,built_area,usable_area,bedrooms,bathrooms,floor,exterior,elevator,garage,storage_room,balcony,new,condition,year,agency,consumption_label,emissions_label
0,440000.0,76.0,70.0,2,2,3.0,exterior,True,True,True,True,False,Buen estado,2025.0,ORDUNTE Inmobiliaria,A,A
1,442000.0,82.0,67.104025,2,2,1.0,exterior,True,True,False,True,True,Nuevo,2025.0,LOIOLA GESTIÓN INMOBILIARIA,A,A
2,381000.0,58.0,44.04577,1,1,4.0,exterior,True,True,False,True,True,Nuevo,2025.0,LOIOLA GESTIÓN INMOBILIARIA,A,A
3,575000.0,104.0,88.240759,3,2,1.0,exterior,True,True,False,True,True,Nuevo,2025.0,LOIOLA GESTIÓN INMOBILIARIA,A,A
4,306000.0,62.0,47.888812,1,1,1.0,exterior,True,True,False,False,True,Nuevo,2025.0,Loiola,A,A
