# ENTREGABLE 4

# INSTRUCCIONES

Utilizar el archivo CSV (`dataset_banco_clean.csv`) con 45189 filas y 17 columnas y aplicar las técnicas de normalización del entregable 3.

In [19]:
# imports
%pip install xgboost
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report




Note: you may need to restart the kernel to use updated packages.


In [20]:
# Load the cleaned bank dataset into a DataFrame.
ruta = 'dataset_banco_clean.csv'  # Make sure this path is reachable from your environment
data = pd.read_csv(ruta)

# List the column names so the presence of the target 'y' can be confirmed.
print("Columns in the DataFrame:", list(data.columns))

# Preview the first rows of the frame.
print(data.head(5))

Columns in the DataFrame: ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']
   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no   2143.0     yes   no   
1   44    technician   single  secondary      no     29.0     yes   no   
2   33  entrepreneur  married  secondary      no      2.0     yes  yes   
3   47   blue-collar  married    unknown      no   1506.0     yes   no   
4   33       unknown   single    unknown      no      1.0      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may     261.0         1   -1.0         0  unknown  no  
1  unknown    5   may     151.0         1   -1.0         0  unknown  no  
2  unknown    5   may      76.0         1   -1.0         0  unknown  no  
3  unknown    5   may      92.0         1   -1.0         0  unkn

# Objetivo

Generar un modelo de clasificación capaz de predecir la variable objetivo binaria `y` en función de las características del dataset bancario

* Aplicar las técnicas oportunas de procesamiento de datos

* Generar split de los datos

* Valorar diferentes modelos de clasificación

* Comparación entre modelos

* Ensemble

* Métricas

* Conclusiones finales

# Preprocesamos

In [21]:
from sklearn.preprocessing import LabelEncoder

# Replace the string labels of the target column 'y' with integer codes.
# The fitted encoder stays available as `label_encoder`.
label_encoder = LabelEncoder()
label_encoder.fit(data['y'])
data['y'] = label_encoder.transform(data['y'])

# Show the distinct codes now stored in the target column.
print("Unique values in 'y' after encoding:", data['y'].unique())


Unique values in 'y' after encoding: [0 1]


In [22]:
from sklearn.model_selection import train_test_split

# Separate the target from the predictors:
# y is the encoded 'y' column, X is every remaining column.
y = data['y']
X = data.drop(columns=['y'])


# Hacemos el split de train-test 70%-30%

In [23]:
# 70/30 hold-out split, stratified on the target and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Report the dimensions of the training partition.
print("Shapes of X_train, y_train:", *(part.shape for part in (X_train, y_train)))

Shapes of X_train, y_train: (31632, 16) (31632,)


# Revisamos el pipeline

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

# Partition the predictor columns by dtype: numeric columns get scaled,
# object (string) columns get one-hot encoded.
numeric_features = list(X.select_dtypes(include=np.number).columns)
categorical_features = list(X.select_dtypes(include='object').columns)

# One single-step pipeline per column group.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
# handle_unknown='ignore': a category not seen during fit is encoded as all zeros
# instead of raising at transform time.
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group to its transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])


# Creamos diferentes modelos de clasificación y un ensemble con todos ellos

In [29]:
from sklearn.ensemble import VotingClassifier

# Hard-voting ensemble over four base classifiers (majority vote on predicted labels).
#
# The base pipelines are built inline rather than through the names `clf`, `svm_clf`,
# `rf_clf` and `xgb_clf`: those names are only assigned in a later cell, so using them
# here raises NameError on a fresh kernel (Restart & Run All). VotingClassifier fits
# clones of the estimators it is given, so these pipelines are trained independently
# of any other pipeline that shares the same `preprocessor`.
ensemble_clf = VotingClassifier(
    estimators=[
        ('lr', make_pipeline(preprocessor, LogisticRegression())),
        ('svm', make_pipeline(preprocessor, SVC(random_state=42))),
        ('rf', make_pipeline(preprocessor, RandomForestClassifier(random_state=42))),
        # The target is binary, hence the binary `logloss` evaluation metric.
        ('xgb', make_pipeline(preprocessor, XGBClassifier(eval_metric='logloss', random_state=42))),
    ],
    voting='hard'
)

# Alternatively, you could use 'soft' voting if all classifiers are capable of providing probabilities.
# This often provides a performance boost as it takes the predicted probabilities into account.
# Note: SVC only exposes predict_proba when constructed with probability=True.


In [30]:
# Base classifiers. Every pipeline starts with the same `preprocessor`, so each
# model is trained on identically transformed features.

# Logistic Regression
clf = make_pipeline(preprocessor, LogisticRegression())

# SVM Model
svm_clf = make_pipeline(preprocessor, SVC(random_state=42))

# Random Forest Model
rf_clf = make_pipeline(preprocessor, RandomForestClassifier(random_state=42))

# XGBoost Model
# The target `y` is binary (encoded as 0/1), so the evaluation metric is the binary
# `logloss`; `mlogloss` is the multiclass variant. The deprecated `use_label_encoder`
# flag is omitted because `y` is already integer-encoded.
xgb_clf = make_pipeline(preprocessor, XGBClassifier(eval_metric='logloss', random_state=42))



# Ahora entrenamos y evaluamos los modelos individuales y el ensemble.

In [31]:
# Candidates to compare, keyed by the label used in the printed report.
# Insertion order is the order in which they are trained and evaluated.
models = dict([
    ("Logistic Regression", clf),
    ("SVM", svm_clf),
    ("Random Forest", rf_clf),
    ("XGBoost", xgb_clf),
    ("Ensemble", ensemble_clf),
])

for name in models:
    model = models[name]

    # Fit on the training partition.
    model.fit(X_train, y_train)
    print(f"Model {name} trained.")

    # Predict on the held-out test partition.
    y_pred = model.predict(X_test)

    # Score the predictions against the true test labels.
    print(f"Evaluation of {name}:")
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print("Accuracy:", accuracy)
    print("Classification Report:\n", report)


Model Logistic Regression trained.
Evaluation of Logistic Regression:
Accuracy: 0.9027070885889209
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.98      0.95     11971
           1       0.66      0.35      0.46      1586

    accuracy                           0.90     13557
   macro avg       0.79      0.66      0.70     13557
weighted avg       0.89      0.90      0.89     13557

Model SVM trained.
Evaluation of SVM:
Accuracy: 0.9044773917533377
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.98      0.95     11971
           1       0.70      0.32      0.44      1586

    accuracy                           0.90     13557
   macro avg       0.81      0.65      0.69     13557
weighted avg       0.89      0.90      0.89     13557

Model Random Forest trained.
Evaluation of Random Forest:
Accuracy: 0.9049937301762927
Classification Report:
               precisi

# Conclusiones

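En el análisis comparativo de modelos de clasificación, el modelo de ensemble (voting classifier) obtuvo la mejor exactitud (accuracy), con un 91%, superando a los modelos individuales: Regresión Logística (LR), SVM, Random Forest (RF) y XGBoost. No obstante, las diferencias de exactitud entre modelos son pequeñas y el recall de la clase 1 es bajo (0.35 en LR y 0.32 en SVM), por lo que la exactitud por sí sola no basta para comparar los modelos. La elección del modelo adecuado dependerá del equilibrio entre la exactitud y las limitaciones prácticas.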

* El modelo de ensemble ofrece predicciones más robustas y precisas.
* XGBoost y RF tienen alta capacidad predictiva, pero mayor complejidad.
* SVM es eficiente en espacios dimensionales altos pero requiere más tiempo de entrenamiento.
* LR es sencilla de interpretar y rápida de entrenar, aunque menos precisa.
* La elección del modelo debe considerar la interpretabilidad y los recursos computacionales disponibles.