# ENTREGABLE 4

# INSTRUCCIONES

Utilizar el archivo CSV (`dataset_banco_clean.csv`) con 45189 filas y 17 columnas y aplicar las técnicas de normalización del entregable 3.

In [1]:
# imports
from sklearn.datasets import load_iris
import numpy as np
import pandas as pd

In [2]:
# Path to the cleaned bank dataset, given relative to the current working directory.
ruta = "./dataset_banco_clean.csv"
# Load the CSV into a DataFrame (the instructions above expect 45189 rows x 17 columns).
data_cleaned = pd.read_csv(ruta)

# Objetivo

Generar un modelo de clasificación capaz de predecir la variable objetivo `y` (sí/no) en función de las características del dataset

* Aplicar las técnicas oportunas de procesamiento de datos

* Generar split de los datos

* Valorar diferentes modelos de clasificación

* Comparación entre modelos

* Ensemble

* Métricas

* Conclusiones finales

In [3]:
# Preview the first rows to sanity-check the loaded columns and values.
data_cleaned.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143.0,yes,no,unknown,5,may,261.0,1,-1.0,0,unknown,no
1,44,technician,single,secondary,no,29.0,yes,no,unknown,5,may,151.0,1,-1.0,0,unknown,no
2,33,entrepreneur,married,secondary,no,2.0,yes,yes,unknown,5,may,76.0,1,-1.0,0,unknown,no
3,47,blue-collar,married,unknown,no,1506.0,yes,no,unknown,5,may,92.0,1,-1.0,0,unknown,no
4,33,unknown,single,unknown,no,1.0,no,no,unknown,5,may,198.0,1,-1.0,0,unknown,no


## Normalización:

In [11]:
# Make sure the columns that should be numeric really carry numeric dtypes.

# Columns expected to hold numbers (they may have been read as object dtype).
numeric_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# Convert column by column; values that cannot be parsed become NaN (errors='coerce').
data_cleaned[numeric_cols] = data_cleaned[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Report the resulting dtypes and how many NaNs each numeric column holds.
# These are printed explicitly because a bare expression in the middle of a cell
# is evaluated and then discarded, so the check would otherwise never be shown.
print(data_cleaned.dtypes)
print(data_cleaned[numeric_cols].isnull().sum())

# Last expression of the cell: rendered as the cell output.
data_cleaned.columns


Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [4]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Categorical features that are expanded into one indicator column per category.
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

# Target: LabelEncoder turns the string labels of 'y' into integer codes,
# stored next to the original column as 'y_encoded'.
label_encoder = LabelEncoder()
data_cleaned['y_encoded'] = label_encoder.fit_transform(data_cleaned['y'])

# One-hot encode with the encoder's default settings; the result is densified below.
encoder_corrected = OneHotEncoder()
one_hot_matrix = encoder_corrected.fit_transform(data_cleaned[categorical_columns])

# Build the indicator frame on the index of data_cleaned so the column-wise
# concat below lines the rows up correctly.
data_encoded_corrected = pd.DataFrame(
    one_hot_matrix.toarray(),
    columns=encoder_corrected.get_feature_names_out(categorical_columns),
    index=data_cleaned.index,
)

# Keep the non-categorical columns (including 'y_encoded') and swap the raw
# categoricals and the original 'y' for the indicator columns.
columns_to_drop = categorical_columns + ['y']
data_final_corrected = pd.concat(
    [data_cleaned.drop(columns=columns_to_drop), data_encoded_corrected],
    axis=1,
)

# Sanity check: the encoded target must still be present after the concat.
print(data_final_corrected['y_encoded'].head())



0    0
1    0
2    0
3    0
4    0
Name: y_encoded, dtype: int64


In [6]:
# Preview the encoded frame: numeric columns, 'y_encoded' and the one-hot indicator columns.
data_final_corrected.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y_encoded,job_administrative,job_blue-collar,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,58,2143.0,5,261.0,1,-1.0,0,0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,44,29.0,5,151.0,1,-1.0,0,0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,33,2.0,5,76.0,1,-1.0,0,0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,47,1506.0,5,92.0,1,-1.0,0,0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,33,1.0,5,198.0,1,-1.0,0,0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# List of numeric columns to normalize
numeric_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# Fit the scaler on the training rows only. Fitting on the whole dataset would let
# the min/max of the validation and test rows leak into the features the models
# are trained on.
# The split cell below calls train_test_split(X, y, test_size=0.2, random_state=42)
# on this same frame without stratification, so splitting the row positions with the
# same arguments selects exactly the same training rows. Keep both calls in sync.
train_positions, _ = train_test_split(
    np.arange(len(data_final_corrected)), test_size=0.2, random_state=42
)
scaler = MinMaxScaler()
scaler.fit(data_final_corrected.iloc[train_positions][numeric_cols])

# Apply the training-set scaling to every row. Validation/test values may fall
# slightly outside [0, 1] when they exceed the range seen in training.
data_final_corrected[numeric_cols] = scaler.transform(data_final_corrected[numeric_cols])

# Show summary statistics of the normalized columns to verify the transformation
data_final_corrected[numeric_cols].describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45189.0,45189.0,45189.0,45189.0,45189.0,45189.0,45189.0
mean,0.297876,0.017539,0.493573,0.052291,0.02844,0.047224,0.009899
std,0.137903,0.007328,0.277418,0.052339,0.049945,0.114802,0.032896
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.194805,0.015108,0.233333,0.020744,0.0,0.0,0.0
50%,0.272727,0.01581,0.5,0.036404,0.016129,0.0,0.0
75%,0.38961,0.01764,0.666667,0.064674,0.032258,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Split de datos:

In [8]:
from sklearn.model_selection import train_test_split

# Separate the encoded target from the feature matrix.
target_column = 'y_encoded'
y = data_final_corrected[target_column]
X = data_final_corrected.drop(columns=[target_column])

# Stage 1: hold out 20% of the rows; the remaining 80% is the training set.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Stage 2: split the hold-out in half to get the validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Display the three shapes to confirm the partition sizes.
X_train.shape, X_val.shape, X_test.shape


((36151, 51), (4519, 51), (4519, 51))

In [15]:
# List the distinct values of the encoded target to confirm it is binary.
data_final_corrected['y_encoded'].unique()

array([0, 1])

# Modelos

## Regresión Logística:

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Baseline model: logistic regression with max_iter=1000 and a fixed seed,
# trained on the training split.
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train, y_train)

# Evaluate on the validation split: overall accuracy plus the per-class report.
y_val_pred = log_reg.predict(X_val)
accuracy_val = accuracy_score(y_val, y_val_pred)
classification_rep = classification_report(y_val, y_val_pred)

print(accuracy_val, classification_rep)


0.9006417348971011               precision    recall  f1-score   support

           0       0.92      0.98      0.95      3987
           1       0.66      0.32      0.43       532

    accuracy                           0.90      4519
   macro avg       0.79      0.65      0.69      4519
weighted avg       0.89      0.90      0.89      4519



El modelo de regresión logística ha sido entrenado y evaluado en el conjunto de validación, con los siguientes resultados:

- Exactitud (Accuracy): 90.06%

- Reporte de clasificación:
    Clase 0 (No):
    Precisión: 92%
    Recall: 98%
    F1-Score: 95%
    Clase 1 (Sí):
    Precisión: 66%
    Recall: 32%
    F1-Score: 43%

Aplicaremos Grid Search para encontrar mejores hiperparámetros:

In [None]:
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyperparameters are searched.
model = LogisticRegression(random_state=42, max_iter=1000)

# Candidate inverse regularization strengths.
c_values = [0.01, 0.1, 1, 10, 100]

# Only valid solver/penalty pairs are listed. A full cross product would include
# combinations scikit-learn rejects (e.g. lbfgs with l1, liblinear without penalty),
# and the string 'none' is not accepted by recent releases (None replaces it).
# Every sub-grid keeps the 'C', 'penalty' and 'solver' keys so that best_params
# always exposes all three to the cell that rebuilds the model.
param_grid = [
    {'solver': ['liblinear', 'saga'], 'penalty': ['l1', 'l2'], 'C': c_values},
    {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': c_values},
    # Unpenalized fit: C has no effect, so a single placeholder value is enough.
    {'solver': ['lbfgs', 'saga'], 'penalty': [None], 'C': [1.0]},
]

# 5-fold search scored with macro-averaged F1, i.e. the unweighted mean of the
# F1 of both classes, so the minority class counts as much as the majority one.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1_macro', verbose=1)

# Perform the grid search on the training data
grid_search.fit(X_train, y_train)

# Best parameters and best cross-validated score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

best_params, best_score


In [19]:
# Initialize the optimized Logistic Regression model with the best parameters from GridSearch
optimized_log_reg = LogisticRegression(
    C=best_params['C'],
    penalty=best_params['penalty'],
    solver=best_params['solver'],
    random_state=42,
    max_iter=1000
)

# Train the model using the training data
optimized_log_reg.fit(X_train, y_train)

# Predict on the validation set
y_val_pred_optimized = optimized_log_reg.predict(X_val)

# Calculate accuracy and other performance metrics on the validation set
accuracy_val_optimized = accuracy_score(y_val, y_val_pred_optimized)
classification_rep_optimized = classification_report(y_val, y_val_pred_optimized)

# Print the results
print("Optimized Model Accuracy:", accuracy_val_optimized)
print("Optimized Model Classification Report:\n", classification_rep_optimized)


Optimized Model Accuracy: 0.9010843106882054
Optimized Model Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.98      0.95      3987
           1       0.65      0.34      0.45       532

    accuracy                           0.90      4519
   macro avg       0.79      0.66      0.70      4519
weighted avg       0.89      0.90      0.89      4519



Gracias al ajuste de hiperparámetros se logra mejorar ligeramente la clasificación de la clase 1: el F1-score pasa de 0.43 a 0.45.

## Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Baseline forest: 150 trees, fixed seed, every other setting left at its default.
random_forest = RandomForestClassifier(random_state=42, n_estimators=150)
random_forest.fit(X_train, y_train)

# Evaluate on the validation split: overall accuracy plus the per-class report.
y_val_pred_rf = random_forest.predict(X_val)
accuracy_val_rf = accuracy_score(y_val, y_val_pred_rf)
classification_rep_rf = classification_report(y_val, y_val_pred_rf)

# Report both metrics.
print("Random Forest Model Accuracy:", accuracy_val_rf)
print("Random Forest Model Classification Report:\n", classification_rep_rf)



Random Forest Model Accuracy: 0.9108209780924983
Random Forest Model Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.98      0.95      3987
           1       0.71      0.41      0.52       532

    accuracy                           0.91      4519
   macro avg       0.82      0.69      0.74      4519
weighted avg       0.90      0.91      0.90      4519



In [None]:
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyperparameters are searched.
rf = RandomForestClassifier(random_state=42)

# Search space. 'auto' is intentionally not listed under max_features: for
# classifiers it was only an alias of 'sqrt', and recent scikit-learn releases
# reject it, which makes every fit using it fail.
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 5-fold search scored by accuracy, using all available cores (n_jobs=-1).
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# Perform the grid search on the training data
grid_search_rf.fit(X_train, y_train)

# Best parameters and best cross-validated score
best_params_rf = grid_search_rf.best_params_
best_score_rf = grid_search_rf.best_score_

print("Best Parameters:", best_params_rf)
print("Best Score:", best_score_rf)


In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fixed hyperparameters for the tuned forest, gathered in one place.
rf_settings = {
    'n_estimators': 100,
    'max_depth': 20,
    'max_features': 'sqrt',
    'min_samples_split': 2,
    'min_samples_leaf': 2,
    'random_state': 42,
}

# Note: this rebinds `random_forest` and the validation metric names used by the baseline forest.
random_forest = RandomForestClassifier(**rf_settings)
random_forest.fit(X_train, y_train)

# Evaluate on the validation split: overall accuracy plus the per-class report.
y_val_pred_rf = random_forest.predict(X_val)
accuracy_val_rf = accuracy_score(y_val, y_val_pred_rf)
classification_rep_rf = classification_report(y_val, y_val_pred_rf)

# Report both metrics.
print("Random Forest Model Accuracy:", accuracy_val_rf)
print("Random Forest Model Classification Report:\n", classification_rep_rf)


Random Forest Model Accuracy: 0.9112635538836026
Random Forest Model Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.98      0.95      3987
           1       0.74      0.38      0.50       532

    accuracy                           0.91      4519
   macro avg       0.83      0.68      0.73      4519
weighted avg       0.90      0.91      0.90      4519



In [25]:
# Predict on the test set
y_test_pred_rf = random_forest.predict(X_test)

# Calculate accuracy and other performance metrics on the test set
accuracy_test_rf = accuracy_score(y_test, y_test_pred_rf)
classification_rep_test_rf = classification_report(y_test, y_test_pred_rf)

# Print the results
print("Random Forest Model Test Accuracy:", accuracy_test_rf)
print("Random Forest Model Test Classification Report:\n", classification_rep_test_rf)


Random Forest Model Test Accuracy: 0.9061739322859039
Random Forest Model Test Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.98      0.95      3991
           1       0.70      0.35      0.46       528

    accuracy                           0.91      4519
   macro avg       0.81      0.66      0.71      4519
weighted avg       0.89      0.91      0.89      4519



# Conclusiones


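Es notable la importancia del ajuste de hiperparámetros, aunque en este caso las mejoras obtenidas fueron modestas: en la regresión logística el F1-score de la clase 1 pasó de 0.43 a 0.45, y en Random Forest la exactitud en validación pasó de 0.9108 a 0.9113.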

De igual manera, la normalización y el uso de OneHotEncoder para preparar los datos han permitido obtener una buena exactitud (accuracy) y buenas predicciones.

El modelo final (Random Forest) tiene un excelente desempeño general, especialmente en la predicción de la clase mayoritaria (clase 0). Sin embargo, tiene más dificultades con la clase minoritaria (clase 1), como lo indican sus métricas más bajas de recall y F1-score (0.35 y 0.46 en el conjunto de prueba).