# ENTREGABLE 4

## INSTRUCCIONES

Utilizar el archivo CSV (`dataset_banco_clean.csv`) con 45189 filas y 17 columnas y aplicar las técnicas de normalización del entregable 3.

In [1]:
# imports
from sklearn.datasets import load_iris
import numpy as np
import pandas as pd

In [3]:
# Path of the cleaned bank dataset, relative to the notebook's working directory.
ruta = "dataset_banco_clean.csv"

# Read the CSV into a DataFrame; the bare name on the last line makes the
# notebook render it as the cell output.
df = pd.read_csv(filepath_or_buffer=ruta)
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143.0,yes,no,unknown,5,may,261.0,1,-1.0,0,unknown,no
1,44,technician,single,secondary,no,29.0,yes,no,unknown,5,may,151.0,1,-1.0,0,unknown,no
2,33,entrepreneur,married,secondary,no,2.0,yes,yes,unknown,5,may,76.0,1,-1.0,0,unknown,no
3,47,blue-collar,married,unknown,no,1506.0,yes,no,unknown,5,may,92.0,1,-1.0,0,unknown,no
4,33,unknown,single,unknown,no,1.0,no,no,unknown,5,may,198.0,1,-1.0,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45184,51,technician,married,tertiary,no,825.0,no,no,cellular,17,nov,977.0,3,-1.0,0,unknown,yes
45185,71,retired,divorced,primary,no,1729.0,no,no,cellular,17,nov,456.0,2,-1.0,0,unknown,yes
45186,72,retired,married,secondary,no,5715.0,no,no,cellular,17,nov,1127.0,5,184.0,3,success,yes
45187,57,blue-collar,married,secondary,no,668.0,no,no,telephone,17,nov,508.0,4,-1.0,0,unknown,no


# Objetivo

Generar un modelo de clasificación capaz de predecir la variable objetivo `y` en función de las características del dataset

* Aplicar las técnicas oportunas de procesamiento de datos

* Generar split de los datos

* Valorar diferentes modelos de clasificación

* Comparación entre modelos

* Ensemble

* Métricas

* Conclusiones finales

## Particiones

In [4]:
# Target vector: the 'y' column of the dataset.
y = df['y']
# Feature matrix: every remaining column.
x = df.drop(columns='y')

In [5]:
from sklearn.model_selection import train_test_split

# First cut: 70% of the rows go to training, 30% are held out.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    x, y, test_size=0.3, random_state=42
)
# Second cut: the held-out rows are halved into validation and test partitions.
x_val, x_test, y_val, y_test = train_test_split(
    x_holdout, y_holdout, test_size=0.5, random_state=42
)

# Sanity check: print (features shape, target shape) for each partition.
for part_x, part_y in ((x_train, y_train), (x_val, y_val), (x_test, y_test)):
    print(part_x.shape, part_y.shape)

(31632, 16) (31632,)
(6778, 16) (6778,)
(6779, 16) (6779,)


## Normalización

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelBinarizer

# Columns routed to each branch of the preprocessor.
# NOTE(review): the 'default' column of the dataset is in neither list, so the
# ColumnTransformer below (whose default is remainder='drop') discards it —
# confirm that this is intended.
numeric_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
categorical_features = ['marital', 'housing', 'loan', 'contact', 'job', 'month', 'poutcome', 'education']

# Numeric columns are standardised.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Categorical columns are one-hot encoded; categories not seen while fitting
# are ignored at transform time instead of raising.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine both branches. sparse_threshold=0 forces a dense array regardless of
# how sparse the one-hot block is: GaussianNB, fitted later on these matrices,
# does not accept sparse input, and the default threshold (0.3) would switch
# the output to sparse if more one-hot columns were added.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0,
)

# Encode the target as 0/1. LabelBinarizer returns a column vector of shape
# (n_samples, 1) for a two-class target; flatten it to 1-D so the classifiers
# do not emit a conversion warning about a column-vector y on every fit.
label_bin = LabelBinarizer()
y_train_bin = label_bin.fit_transform(y_train).ravel()
y_val_bin = label_bin.transform(y_val).ravel()
y_test_bin = label_bin.transform(y_test).ravel()

# Fit the preprocessor on the training partition only, then apply the fitted
# transformation to the validation and test partitions.
x_train_s = preprocessor.fit_transform(x_train)
x_val_s = preprocessor.transform(x_val)
x_test_s = preprocessor.transform(x_test)

## Modelos


### Decision tree

In [9]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with default hyperparameters and a fixed seed.
decision_tree = DecisionTreeClassifier(random_state=42)
y_pred = decision_tree.fit(x_train_s, y_train_bin).predict(x_val_s)

# Score the predictions made on the validation partition.
accuracy = accuracy_score(y_val_bin, y_pred)
conf_matrix = confusion_matrix(y_val_bin, y_pred)
class_report = classification_report(y_val_bin, y_pred)

for caption, value in (
    ('Accuracy:', accuracy),
    ('Confusion matrix:\n', conf_matrix),
    ('Classification report:\n', class_report),
):
    print(caption, value)

Accuracy: 0.8762171732074359
Confusion matrix:
 [[5568  422]
 [ 417  371]]
Classification report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93      5990
           1       0.47      0.47      0.47       788

    accuracy                           0.88      6778
   macro avg       0.70      0.70      0.70      6778
weighted avg       0.88      0.88      0.88      6778



### Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyperparameters and a fixed seed.
random_forest = RandomForestClassifier(random_state=42)
y_pred = random_forest.fit(x_train_s, y_train_bin).predict(x_val_s)

# Score the predictions made on the validation partition.
accuracy = accuracy_score(y_val_bin, y_pred)
conf_matrix = confusion_matrix(y_val_bin, y_pred)
class_report = classification_report(y_val_bin, y_pred)

for caption, value in (
    ('Accuracy:', accuracy),
    ('Confusion matrix:\n', conf_matrix),
    ('Classification report:\n', class_report),
):
    print(caption, value)

  return fit_method(estimator, *args, **kwargs)


Accuracy: 0.9094128061375036
Confusion matrix:
 [[5858  132]
 [ 482  306]]
Classification report:
               precision    recall  f1-score   support

           0       0.92      0.98      0.95      5990
           1       0.70      0.39      0.50       788

    accuracy                           0.91      6778
   macro avg       0.81      0.68      0.72      6778
weighted avg       0.90      0.91      0.90      6778



### KNN

In [11]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with default hyperparameters.
knn_model = KNeighborsClassifier()
y_pred = knn_model.fit(x_train_s, y_train_bin).predict(x_val_s)

# Score the predictions made on the validation partition.
accuracy = accuracy_score(y_val_bin, y_pred)
conf_matrix = confusion_matrix(y_val_bin, y_pred)
class_report = classification_report(y_val_bin, y_pred)

for caption, value in (
    ('Accuracy:', accuracy),
    ('Confusion matrix:\n', conf_matrix),
    ('Classification report:\n', class_report),
):
    print(caption, value)

  return self._fit(X, y)


Accuracy: 0.8965771614045441
Confusion matrix:
 [[5822  168]
 [ 533  255]]
Classification report:
               precision    recall  f1-score   support

           0       0.92      0.97      0.94      5990
           1       0.60      0.32      0.42       788

    accuracy                           0.90      6778
   macro avg       0.76      0.65      0.68      6778
weighted avg       0.88      0.90      0.88      6778



### SVM

In [12]:
from sklearn.svm import SVC

# Support vector classifier with default hyperparameters.
svm_model = SVC()
y_pred = svm_model.fit(x_train_s, y_train_bin).predict(x_val_s)

# Score the predictions made on the validation partition.
accuracy = accuracy_score(y_val_bin, y_pred)
conf_matrix = confusion_matrix(y_val_bin, y_pred)
class_report = classification_report(y_val_bin, y_pred)

for caption, value in (
    ('Accuracy:', accuracy),
    ('Confusion matrix:\n', conf_matrix),
    ('Classification report:\n', class_report),
):
    print(caption, value)

  y = column_or_1d(y, warn=True)


Accuracy: 0.9049867217468279
Confusion matrix:
 [[5883  107]
 [ 537  251]]
Classification report:
               precision    recall  f1-score   support

           0       0.92      0.98      0.95      5990
           1       0.70      0.32      0.44       788

    accuracy                           0.90      6778
   macro avg       0.81      0.65      0.69      6778
weighted avg       0.89      0.90      0.89      6778



### NAIVE BAYES

In [16]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default hyperparameters.
nb_model = GaussianNB()
y_pred = nb_model.fit(x_train_s, y_train_bin).predict(x_val_s)

# Score the predictions made on the validation partition.
accuracy = accuracy_score(y_val_bin, y_pred)
conf_matrix = confusion_matrix(y_val_bin, y_pred)
class_report = classification_report(y_val_bin, y_pred)

for caption, value in (
    ('Accuracy:', accuracy),
    ('Confusion matrix:\n', conf_matrix),
    ('Classification report:\n', class_report),
):
    print(caption, value)


Accuracy: 0.8577751549129536
Confusion matrix:
 [[5428  562]
 [ 402  386]]
Classification report:
               precision    recall  f1-score   support

           0       0.93      0.91      0.92      5990
           1       0.41      0.49      0.44       788

    accuracy                           0.86      6778
   macro avg       0.67      0.70      0.68      6778
weighted avg       0.87      0.86      0.86      6778



  y = column_or_1d(y, warn=True)


### XGBOOST

In [14]:
import xgboost as xgb

# Gradient-boosted trees (XGBoost) with default hyperparameters and a fixed seed.
xgb_model = xgb.XGBClassifier(random_state=42)
y_pred = xgb_model.fit(x_train_s, y_train_bin).predict(x_val_s)

# Score the predictions made on the validation partition.
accuracy = accuracy_score(y_val_bin, y_pred)
conf_matrix = confusion_matrix(y_val_bin, y_pred)
class_report = classification_report(y_val_bin, y_pred)

for caption, value in (
    ('Accuracy:', accuracy),
    ('Confusion matrix:\n', conf_matrix),
    ('Classification report:\n', class_report),
):
    print(caption, value)

Accuracy: 0.9063145470640307
Confusion matrix:
 [[5773  217]
 [ 418  370]]
Classification report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.95      5990
           1       0.63      0.47      0.54       788

    accuracy                           0.91      6778
   macro avg       0.78      0.72      0.74      6778
weighted avg       0.90      0.91      0.90      6778



### ENSEMBLE 

In [21]:
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# Base learners for the ensemble.
# NOTE: these assignments rebind `random_forest` and `xgb_model` to new
# instances, replacing whatever earlier cells stored under those names.
random_forest = RandomForestClassifier(random_state=42)
xgb_model = xgb.XGBClassifier(random_state=42)

# Voting ensemble over both learners. 'soft' voting combines the predicted
# class probabilities instead of the predicted labels.
base_estimators = [
    ('random_forest', random_forest),
    ('xgboost', xgb_model),
]
ensemble_model = VotingClassifier(estimators=base_estimators, voting='soft')

# Fit on the training partition, then predict the validation partition.
ensemble_model.fit(x_train_s, y_train_bin)
y_pred = ensemble_model.predict(x_val_s)

# Score the validation predictions: accuracy, confusion matrix and per-class report.
accuracy = accuracy_score(y_val_bin, y_pred)
conf_matrix = confusion_matrix(y_val_bin, y_pred)
class_report = classification_report(y_val_bin, y_pred)

for caption, value in (
    ('Accuracy:', accuracy),
    ('Confusion matrix:\n', conf_matrix),
    ('Classification report:\n', class_report),
):
    print(caption, value)

Accuracy: 0.9111832398937739
Confusion matrix:
 [[5825  165]
 [ 437  351]]
Classification report:
               precision    recall  f1-score   support

           0       0.93      0.97      0.95      5990
           1       0.68      0.45      0.54       788

    accuracy                           0.91      6778
   macro avg       0.81      0.71      0.74      6778
weighted avg       0.90      0.91      0.90      6778



# Conclusiones

Del análisis de clasificación que hemos hecho podemos destacar varios aspectos.
Se observa una amplia variedad de algoritmos, como SVM, RF, k-NN y XGBoost, cada uno con sus propias ventajas y desventajas. Aunque no era el foco principal, se reconoce la importancia del preprocesamiento de datos para una clasificación efectiva, abordando la normalización de características y el manejo de valores faltantes.

La evaluación del rendimiento de los modelos se revela crucial, utilizando métricas como precisión, exhaustividad, F1-score, AUC-ROC y Confusion Matrix. Además, el ajuste de hiperparámetros mediante técnicas como la búsqueda en cuadrícula o la optimización bayesiana es esencial para maximizar el rendimiento del modelo. La interpretación de modelos, especialmente en algoritmos como árboles de decisión y regresión logística, se destaca como una herramienta valiosa para comprender cómo se toman decisiones y obtener insights sobre los datos.

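En conclusión, para lograr un buen resultado en la clasificación en machine learning es crucial tener un conocimiento profundo de los algoritmos, técnicas de preprocesamiento, evaluación de modelos y estrategias para superar desafíos específicos. En este caso concreto, sobre el conjunto de validación, el ensemble (Random Forest + XGBoost) obtuvo la mayor exactitud (≈0.911), y tanto el ensemble como XGBoost alcanzaron el mejor F1 para la clase positiva (0.54).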