# **CLASSIFICATION**

## Imports

In [49]:
# Basic imports
import pandas as pd
import numpy as np
import re

# Model imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# More robust model imports
from sklearn.ensemble import HistGradientBoostingClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Preprocessing imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_validate, StratifiedKFold

## Load Dataset

In [None]:
# Load the cleaned dataset and discard the columns that are not used for
# classification, in a single chained expression
data = (
    pd.read_csv('../data/cleaned/data_final.csv')
    .drop(columns=["url", "description"])
)

# Cast every text (object) column to the pandas 'category' dtype
data = data.astype(
    {text_col: 'category' for text_col in data.select_dtypes(include=['object']).columns}
)

# data.info()
# data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1230 entries, 0 to 1229
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   price              1230 non-null   float64 
 1   zone               1230 non-null   category
 2   neighborhood       1230 non-null   category
 3   built_area         1230 non-null   float64 
 4   usable_area        1230 non-null   float64 
 5   bedrooms           1230 non-null   int64   
 6   bathrooms          1230 non-null   int64   
 7   floor              1230 non-null   float64 
 8   exterior           1230 non-null   category
 9   elevator           1230 non-null   bool    
 10  garage             1230 non-null   bool    
 11  storage_room       1230 non-null   bool    
 12  balcony            1230 non-null   bool    
 13  new                1230 non-null   bool    
 14  condition          1230 non-null   category
 15  year               1230 non-null   float64 
 16  agency

## Based on Zone

First, we will try to classify the houses according to their zone.

### Modify data

In [51]:
# Classifying by zone: the neighbourhood column is removed from the features
data_zone = data.drop("neighborhood", axis=1)

# Separate the target (zone) from the remaining feature columns
y_zone = data_zone.loc[:, "zone"]
X_zone = data_zone.drop("zone", axis=1)

# Quick sanity checks (kept disabled)
#print(y_zone.head())
#X_zone.head()

### Defining models

In [55]:
# Evaluation metrics (chosen for multiclass classification).
# Keys are the names reported by evaluate_models; values are scikit-learn scorer names.
SCORING = {
    'accuracy': 'accuracy',
    'recall_weighted': 'recall_weighted',
    'precision_weighted': 'precision_weighted',
    'f1_weighted': 'f1_weighted',
    'matthews': "matthews_corrcoef",
    'balanced_accuracy': 'balanced_accuracy'
}

# Single seed shared by every stochastic model and by the CV splitter
RANDOM_STATE = 42

# Candidate models to compare
MODELS = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    'Random Forest (tuned)': RandomForestClassifier(n_estimators=300, max_depth=15, min_samples_split=5, random_state=RANDOM_STATE, n_jobs=-1),
    # `multi_class='multinomial'` was dropped: the parameter is deprecated since
    # scikit-learn 1.5 and multinomial is already what the default solver uses
    # for multiclass targets.
    # NOTE(review): features are not scaled before this model — consider a
    # scaling step if its scores matter; confirm with the author.
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    'Naive Bayes': GaussianNB(),
    # SVM variants are currently disabled
#    'SVM (Linear)': SVC(kernel='linear', random_state=42, probability=True),
#    'SVM (RBF)': SVC(kernel='rbf', random_state=42, probability=True),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=RANDOM_STATE),
    'Hist Gradient Boosting': HistGradientBoostingClassifier(max_iter=100, random_state=RANDOM_STATE),
    'XGBoost': XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=RANDOM_STATE, n_jobs=-1),
    'LightGBM': LGBMClassifier(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1),
    'CatBoost': CatBoostClassifier(iterations=100, random_state=RANDOM_STATE, verbose=0)  # verbose=0 keeps the training logs off the screen
}

# StratifiedKFold keeps the class distribution in every fold
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

### Defining function to evaluate the models

In [53]:
def evaluate_models(X, y, cv_strategy=None, scoring=None, models=None, error_score=np.nan):
    """Cross-validate every model and report the mean/std of each test metric.

    Parameters
    ----------
    X, y
        Features and target, passed unchanged to ``cross_validate``.
    cv_strategy
        Cross-validation splitter. ``None`` (default) uses the notebook-level
        ``cv_strategy``.
    scoring
        Mapping of metric name -> scorer. ``None`` (default) uses ``SCORING``.
    models
        Mapping of display name -> estimator. ``None`` (default) uses ``MODELS``.
    error_score
        Forwarded to ``cross_validate``. The default (NaN) matches
        ``cross_validate``'s own default; pass ``"raise"`` to surface the
        exception behind a NaN score.

    Returns
    -------
    dict
        ``{model_name: {metric_name: (mean, std)}}`` computed over the folds.

    Notes
    -----
    The notebook-level defaults are looked up when the function is *called*,
    not when it is defined, so re-running the cell that defines ``MODELS``,
    ``SCORING`` or ``cv_strategy`` takes effect without re-running this cell.
    """
    # Local import: `warnings` is not part of the notebook's import cell.
    import warnings

    # The parameter `cv_strategy` shadows the notebook global of the same
    # name, hence the explicit lookup through globals().
    if cv_strategy is None:
        cv_strategy = globals()["cv_strategy"]
    if scoring is None:
        scoring = SCORING
    if models is None:
        models = MODELS

    test_prefix = 'test_'
    results = {}
    # Evaluate each model using cross-validation
    for name, model in models.items():
        print(f"\n{name}:")
        scores = cross_validate(
            model, X, y,
            cv=cv_strategy,
            scoring=scoring,
            return_train_score=False,
            n_jobs=-1,
            error_score=error_score,
        )

        results[name] = {}
        for metric_name, metric_scores in scores.items():
            # Skip the timing entries (fit_time / score_time)
            if not metric_name.startswith(test_prefix):
                continue
            # Strip only the leading prefix; str.replace would also remove
            # any later 'test_' inside a user-chosen metric name.
            metric = metric_name[len(test_prefix):]
            mean, std = metric_scores.mean(), metric_scores.std()
            results[name][metric] = (mean, std)
            print(f"  {metric}: {mean:.4f} (+/- {std:.4f})")
            # A NaN fold means the fit or the scorer failed and error_score
            # was substituted; make that visible instead of a silent 'nan'.
            if np.isnan(metric_scores).any():
                warnings.warn(
                    f"{name}: metric '{metric}' is NaN in at least one fold "
                    "(fit or scorer failed); re-run with error_score='raise' "
                    "to see the underlying error.",
                    RuntimeWarning,
                    stacklevel=2,
                )
    return results


### Strategy 1
Use all the variables, encoding categorical ones as dummies

In [54]:
# STRATEGY 1: keep every variable and one-hot encode the categorical ones
print("=" * 80)
print("ESTRATEGIA 1: Utilizar todas las variables, codificando categóricas con dummies")
print("=" * 80)

# Categorical feature columns that get expanded into dummy indicators
cat_cols1 = [
    "exterior",
    "condition",
    "agency",
    "consumption_label",
    "emissions_label",
]
X_zone_encoded_1 = pd.get_dummies(X_zone, columns=cat_cols1)

# Replace every non-alphanumeric character in the column names with "_",
# because special characters can make some models fail
X_zone_encoded_1.columns = [
    "".join(ch if ch.isalnum() else "_" for ch in str(col_name))
    for col_name in X_zone_encoded_1.columns
]

# Integer-encode the target labels
y_zone_encoded_1 = LabelEncoder().fit_transform(y_zone)

print(f"\nNúmero de muestras: {X_zone_encoded_1.shape[0]}")
print(f"Número de features: {X_zone_encoded_1.shape[1]}")
print(f"Distribución de clases: {np.bincount(y_zone_encoded_1)}")

# Cross-validate every default model on the dummy-encoded features
result1s = evaluate_models(X_zone_encoded_1, y_zone_encoded_1)

print("\n" + "=" * 80)

ESTRATEGIA 1: Utilizar todas las variables, codificando categóricas con dummies

Número de muestras: 1230
Número de features: 324
Distribución de clases: [237 127  76  61  90 156 136  34 167  33 113]

Random Forest:
  accuracy: 0.6073 (+/- 0.0225)
  recall_weighted: 0.6073 (+/- 0.0225)
  precision_weighted: 0.6346 (+/- 0.0297)
  f1_weighted: nan (+/- nan)
  matthews: 0.5531 (+/- 0.0262)
  balanced_accuracy: 0.5439 (+/- 0.0370)

Random Forest (tuned):
  accuracy: 0.5707 (+/- 0.0186)
  recall_weighted: 0.5707 (+/- 0.0186)
  precision_weighted: 0.6337 (+/- 0.0337)
  f1_weighted: nan (+/- nan)
  matthews: 0.5133 (+/- 0.0213)
  balanced_accuracy: 0.4814 (+/- 0.0293)

Logistic Regression:
  accuracy: 0.2976 (+/- 0.0167)
  recall_weighted: 0.2976 (+/- 0.0167)
  precision_weighted: 0.1599 (+/- 0.0158)
  f1_weighted: nan (+/- nan)
  matthews: 0.1862 (+/- 0.0223)
  balanced_accuracy: 0.1768 (+/- 0.0092)

Naive Bayes:
  accuracy: 0.3317 (+/- 0.0208)
  recall_weighted: 0.3317 (+/- 0.0208)
  precis

### Strategy 1.1
We have seen that some models handle categorical columns natively (without dummy encoding), which can improve both the time required to fit the model and the results, so we will try that approach here.

In [None]:
# Models that handle categorical features natively (no dummy encoding needed)
MODELS_NATIVE = {
    'Hist Gradient Boosting': HistGradientBoostingClassifier(
        max_iter=100,
        random_state=42,
        categorical_features='from_dtype'  # key setting: reads the pandas 'category' dtype
    ),
    # `use_label_encoder=False` was removed: it is an obsolete XGBoost
    # parameter that recent releases ignore with a warning.
    'XGBoost': XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        random_state=42,
        n_jobs=-1,
        tree_method="hist",       # required for the fast histogram mode
        enable_categorical=True,  # turns on native categorical support
        eval_metric='mlogloss'
    ),
    'LightGBM': LGBMClassifier(
        n_estimators=100,
        random_state=42,
        n_jobs=-1,
        verbose=-1  # silence warnings
    ),
    # NOTE(review): CatBoost appears to need the categorical columns passed
    # explicitly via `cat_features` (constructor or fit); it does not infer
    # them from the pandas 'category' dtype. Confirm the evaluation call
    # supplies them, otherwise this entry may fail and score NaN.
    'CatBoost': CatBoostClassifier(
        iterations=100,
        random_state=42,
        verbose=0,                 # silent training
        allow_writing_files=False  # avoids creating 'catboost_info' folders
    )
}