# **CLASSIFICATION**

## Imports

In [81]:
# Bascic imports
import pandas as pd
import numpy as np
import re

# Model imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# More robust model imports
from sklearn.ensemble import HistGradientBoostingClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Preprocessing imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_validate, StratifiedKFold

## Load Dataset

In [82]:
data = pd.read_csv('../data/cleaned/data_final.csv')

# We drop the columns that we won't use for classification
data = data.drop(columns=["url", "description"])

# Convertimos las columnas de texto a categorías
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].astype('category')

# data.info()
# data.head() 

## Based on Zone

First we will try to classify the houses according to their zone

### Modify data

In [83]:
# We drop the neighbourhood column to classify by zone
data_zone = data.drop(columns=["neighborhood"])

# We also split the data into features and target
X_zone = data_zone.drop(columns=["zone"])
y_zone = data_zone["zone"]

# Check everything is ok
#print(y_zone.head())
#X_zone.head()

### Defining models

In [93]:
# Métricas para evaluación (ajustadas para clasificación multiclase)
SCORING = {
    'accuracy': 'accuracy',
    'recall_weighted': 'recall_weighted',
    'precision_weighted': 'precision_weighted',
    'f1_weighted': 'f1_weighted',
    'f1_macro': 'f1_macro',       # CRÍTICO: Te dirá si estás fallando en las clases pequeñas
    'matthews': "matthews_corrcoef",
    'balanced_accuracy': 'balanced_accuracy'
}

# Probar múltiples modelos
MODELS = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Random Forest (tuned)': RandomForestClassifier(n_estimators=300, max_depth=15, min_samples_split=5, random_state=42, n_jobs=-1),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42, multi_class='multinomial'),
    'Naive Bayes': GaussianNB(),
#    'SVM (Linear)': SVC(kernel='linear', random_state=42, probability=True),
#    'SVM (RBF)': SVC(kernel='rbf', random_state=42, probability=True),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'Hist Gradient Boosting': HistGradientBoostingClassifier(max_iter=100, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42, n_jobs=-1),
    'LightGBM': LGBMClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'CatBoost': CatBoostClassifier(iterations=100, random_state=42, verbose=0, allow_writing_files=False), # (Verbose=0 para que no llene la pantalla de logs)
    'Neural Network (MLP)': make_pipeline(StandardScaler(), MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42))
}

# Usar StratifiedKFold para mantener la distribución de clases
cv_strategy = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

### Defining function to evaluate the models

In [85]:
def evaluate_models(X, y, cv_strategy=cv_strategy, scoring=SCORING, models=MODELS):
    # We are going to try all defined models
    results = {}
    # We evaluate each model using cross-validation
    for name, model in models.items():
        print(f"\n{name}:")
        scores = cross_validate(model, X, y, cv=cv_strategy, scoring=scoring, return_train_score=False, n_jobs=-1)
        
        # We check the results with several metrics
        results[name] = {}
        for metric_name, metric_scores in scores.items():
            if metric_name.startswith('test_'):
                metric = metric_name.replace('test_', '')
                results[name][metric] = (metric_scores.mean(), metric_scores.std())
                print(f"  {metric}: {metric_scores.mean():.4f} (+/- {metric_scores.std():.4f})")
    return results


### Strategy 1
Use all the variables, encoding categorical ones as dummies

In [86]:
# ESTRATEGIA 1:
print("="*80)
print("ESTRATEGIA 1: Utilizar todas las variables, codificando categóricas con dummies")
print("="*80)

cat_cols1 = ["exterior", "condition", "agency", "consumption_label", "emissions_label"]

X_zone_encoded_1 = pd.get_dummies(X_zone, columns=cat_cols1)
# Eliminamos caracteres especiales de los nombres de las columnas que pueden hacer fallar algunos modelos
X_zone_encoded_1.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_zone_encoded_1.columns]
y_zone_encoded_1 = LabelEncoder().fit_transform(y_zone)

print(f"\nNúmero de muestras: {len(X_zone_encoded_1)}")
print(f"Número de features: {X_zone_encoded_1.shape[1]}")
print(f"Distribución de clases: {np.bincount(y_zone_encoded_1)}")

result1 = evaluate_models(X_zone_encoded_1, y_zone_encoded_1, models=MODELS, cv_strategy=cv_strategy)

print("\n" + "="*80)

ESTRATEGIA 1: Utilizar todas las variables, codificando categóricas con dummies

Número de muestras: 1230
Número de features: 324
Distribución de clases: [237 127  76  61  90 156 136  34 167  33 113]

Random Forest:
  accuracy: 0.6073 (+/- 0.0225)
  recall_weighted: 0.6073 (+/- 0.0225)
  precision_weighted: 0.6346 (+/- 0.0297)
  f1_weighted: 0.5966 (+/- 0.0253)
  f1_macro: 0.5697 (+/- 0.0380)
  matthews: 0.5531 (+/- 0.0262)
  balanced_accuracy: 0.5439 (+/- 0.0370)

Random Forest (tuned):
  accuracy: 0.5707 (+/- 0.0186)
  recall_weighted: 0.5707 (+/- 0.0186)
  precision_weighted: 0.6337 (+/- 0.0337)
  f1_weighted: 0.5493 (+/- 0.0209)
  f1_macro: 0.5133 (+/- 0.0309)
  matthews: 0.5133 (+/- 0.0213)
  balanced_accuracy: 0.4814 (+/- 0.0293)

Logistic Regression:
  accuracy: 0.2976 (+/- 0.0167)
  recall_weighted: 0.2976 (+/- 0.0167)
  precision_weighted: 0.1599 (+/- 0.0158)
  f1_weighted: 0.1957 (+/- 0.0071)
  f1_macro: 0.1168 (+/- 0.0055)
  matthews: 0.1862 (+/- 0.0223)
  balanced_accuracy:

### Strategy 1.1
We have seen that some models handle categorical columns automatically (without having to use the dummies), which can improve both the time required to fit the model and the result, so we will try it

In [None]:
# Models that handle categorical features natively
MODELS_NATIVE = {
    'XGBoost': XGBClassifier( n_estimators=100, learning_rate=0.1, random_state=42, n_jobs=-1, tree_method="hist", # <--- Necesario para el modo rápido
        enable_categorical=True, use_label_encoder=False, eval_metric='mlogloss'), # <--- Habilita manejo de categóricas
    'LightGBM': LGBMClassifier(n_estimators=100, random_state=42, n_jobs=-1, verbose=-1),
    'CatBoost': CatBoostClassifier(iterations=100, random_state=42, verbose=0, allow_writing_files=False,
        cat_features=cat_cols1), # Evita crear carpetas 'catboost_info'
}

print("="*80)
print("ESTRATEGIA 1.1: Utilizar todas las variables, manteniendo categóricas como categorías (modelos que lo soportan)")
print("="*80)

X_zone_unencoded_1 = X_zone.copy()

# Eliminamos caracteres especiales de los nombres de las columnas que pueden hacer fallar algunos modelos
X_zone_unencoded_1.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_zone_uncoded_1.columns]
y_zone_unencoded_1 = LabelEncoder().fit_transform(y_zone)

print(f"\nNúmero de muestras: {len(X_zone_unencoded_1)}")
print(f"Número de features: {X_zone_unencoded_1.shape[1]}")
print(f"Distribución de clases: {np.bincount(y_zone_unencoded_1)}")

result11 = evaluate_models(X_zone_unencoded_1, y_zone_unencoded_1, models=MODELS_NATIVE, scoring=SCORING)
print("\n" + "="*80)


ESTRATEGIA 1.1: Utilizar todas las variables, manteniendo categóricas como categorías (modelos que lo soportan)

Número de muestras: 1230
Número de features: 17
Distribución de clases: [237 127  76  61  90 156 136  34 167  33 113]

XGBoost:
  accuracy: 0.6285 (+/- 0.0211)
  recall_weighted: 0.6285 (+/- 0.0211)
  precision_weighted: 0.6354 (+/- 0.0232)
  f1_weighted: 0.6253 (+/- 0.0221)
  f1_macro: 0.6213 (+/- 0.0228)
  matthews: 0.5792 (+/- 0.0242)
  balanced_accuracy: 0.6127 (+/- 0.0271)

LightGBM:
  accuracy: 0.6699 (+/- 0.0151)
  recall_weighted: 0.6699 (+/- 0.0151)
  precision_weighted: 0.6793 (+/- 0.0130)
  f1_weighted: 0.6680 (+/- 0.0145)
  f1_macro: 0.6538 (+/- 0.0222)
  matthews: 0.6260 (+/- 0.0169)
  balanced_accuracy: 0.6401 (+/- 0.0238)

CatBoost:
  accuracy: 0.5764 (+/- 0.0181)
  recall_weighted: 0.5764 (+/- 0.0181)
  precision_weighted: 0.5914 (+/- 0.0205)
  f1_weighted: 0.5667 (+/- 0.0183)
  f1_macro: 0.5343 (+/- 0.0277)
  matthews: 0.5199 (+/- 0.0206)
  balanced_accuracy

### Strategy 1.2
En la estrategia 1 no hemos podido usar SVM porque el tiempo de entrenamiento era demasiado largo, aun probando a ejecutarlo en la GPU en colab. Asi que vamos a reducir un poco la cantidad de dimensiones eliminando "agency"

In [95]:
# Models SVM
MODELS_SVM = {
    'SVM (Linear)': SVC(kernel='linear', random_state=42, probability=True),
    'SVM (RBF)': SVC(kernel='rbf', random_state=42, probability=True)
}

print("="*80)
print("ESTRATEGIA 1.2: Utilizar svm")
print("="*80)

X_zone_encoded_12 = X_zone.copy()
X_zone_encoded_12[0:500]
X_zone_encoded_12.drop(columns=["agency", "consumption_label", "emissions_label"], inplace=True)  # Eliminamos "agency" para reducir dimensionalidad

cat_cols12 = ["exterior", "condition"]
X_zone_encoded_12 = pd.get_dummies(X_zone_encoded_12, columns=cat_cols12)
# Eliminamos caracteres especiales de los nombres de las columnas que pueden hacer fallar algunos modelos
X_zone_encoded_12.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_zone_encoded_12.columns]

y_zone_encoded_12 = LabelEncoder().fit_transform(y_zone)
print(f"\nNúmero de muestras: {len(X_zone_encoded_12)}")
print(f"Número de features: {X_zone_encoded_12.shape[1]}")
print(f"Distribución de clases: {np.bincount(y_zone_encoded_12)}")

result12 = evaluate_models(X_zone_encoded_12, y_zone_encoded_12, models=MODELS_SVM, scoring=SCORING)

print("\n" + "="*80)


ESTRATEGIA 1.2: Utilizar svm

Número de muestras: 1230
Número de features: 17
Distribución de clases: [237 127  76  61  90 156 136  34 167  33 113]

SVM (Linear):


KeyboardInterrupt: 