# Classificação com o Dataset Bank Marketing 💰
Este notebook utiliza o dataset real Bank Marketing (originário da UCI e baixado aqui via Kaggle) para prever se um cliente aceitará uma oferta de depósito a prazo.

**Modelos comparados:**
- Árvore de Decisão
- Regressão Logística
- KNN

**Etapas:**
- Carga e limpeza de dados
- Pré-processamento e normalização
- Treinamento e avaliação dos modelos
- Comparação por acurácia e F1-score
- Ajuste de hiperparâmetros da Árvore de Decisão com `GridSearchCV`

In [5]:
import os
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

In [6]:
import kagglehub

# Download the latest published version of the Kaggle dataset. The returned
# value is bound to `path` and reused by the loading cell to locate the CSV.
path = kagglehub.dataset_download("janiobachmann/bank-marketing-dataset")

# Show where the files ended up so the location can be checked by hand.
print("Path to dataset files:", path)

Path to dataset files: /home/lecraizer/.cache/kagglehub/datasets/janiobachmann/bank-marketing-dataset/versions/1


In [15]:
# Location of the CSV inside the downloaded dataset directory
# (double-check that the file really has this name).
file_path = os.path.join(path, "bank.csv")

# Parse the comma-separated file and preview its first five rows.
df = pd.read_csv(file_path, sep=",")
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [16]:
# Print column dtypes and non-null counts, then display the number of
# missing values per column.
df.info()
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11162 entries, 0 to 11161
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        11162 non-null  int64 
 1   job        11162 non-null  object
 2   marital    11162 non-null  object
 3   education  11162 non-null  object
 4   default    11162 non-null  object
 5   balance    11162 non-null  int64 
 6   housing    11162 non-null  object
 7   loan       11162 non-null  object
 8   contact    11162 non-null  object
 9   day        11162 non-null  int64 
 10  month      11162 non-null  object
 11  duration   11162 non-null  int64 
 12  campaign   11162 non-null  int64 
 13  pdays      11162 non-null  int64 
 14  previous   11162 non-null  int64 
 15  poutcome   11162 non-null  object
 16  deposit    11162 non-null  object
dtypes: int64(7), object(10)
memory usage: 1.4+ MB


age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
deposit      0
dtype: int64

In [18]:
# Split the frame into the feature matrix and the binary target
# ('yes' -> 1, 'no' -> 0).
X = df.drop(columns='deposit')
y = df['deposit'].map({'yes': 1, 'no': 0})

# Series.map turns every label that is missing from the mapping into NaN.
# Fail loudly here instead of letting NaN targets reach the stratified split
# and the classifiers.
if y.isna().any():
    unexpected_labels = sorted(df.loc[y.isna(), 'deposit'].astype(str).unique())
    raise ValueError(f"Unexpected values in 'deposit': {unexpected_labels}")

# NOTE(review): the UCI description of this dataset recommends discarding
# 'duration' for a realistic predictive model, since it is only known after
# the call has ended. It is kept here to preserve the reported results;
# confirm whether it should stay in X.

# One-hot encode the categorical columns; drop_first removes one level per
# column so the remaining dummies are not redundant.
X_encoded = pd.get_dummies(X, drop_first=True)

# Stratified 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.3, random_state=42, stratify=y)

# Standardize the features: the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [20]:
# Build each classifier and fit it on the standardized training split.
# fit() returns the estimator itself, so construction and fitting are chained.
model_dt = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train)
model_knn = KNeighborsClassifier().fit(X_train_scaled, y_train)
model_lr = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Test-split predictions, one array per model.
y_pred_dt = model_dt.predict(X_test_scaled)
y_pred_knn = model_knn.predict(X_test_scaled)
y_pred_lr = model_lr.predict(X_test_scaled)

# Comparison table: one row per model with its accuracy and F1 on the test split.
results = pd.DataFrame(
    [
        {
            'Modelo': label,
            'Acurácia': accuracy_score(y_test, y_pred),
            'F1-Score': f1_score(y_test, y_pred),
        }
        for label, y_pred in (
            ('Árvore de Decisão', y_pred_dt),
            ('KNN', y_pred_knn),
            ('Regressão Logística', y_pred_lr),
        )
    ]
)
results

Unnamed: 0,Modelo,Acurácia,F1-Score
0,Árvore de Decisão,0.779934,0.76566
1,KNN,0.780532,0.754427
2,Regressão Logística,0.825321,0.811837


In [21]:
# NOTE(review): DecisionTreeClassifier, f1_score and accuracy_score are already
# imported in the first code cell; only GridSearchCV is new here. Consider
# hoisting it to that cell so all imports live in one place.
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Candidate hyperparameters for the decision tree (4 x 3 x 3 combinations).
param_grid = dict(
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search scored by F1 with 5-fold cross-validation on the training
# split, using all available cores.
tree = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=tree,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)

# Evaluate the best estimator found by the search on the held-out test split.
best_tree = grid_search.best_estimator_
y_pred_best = best_tree.predict(X_test_scaled)

print("Melhores parâmetros:", grid_search.best_params_)
print("F1-score:", round(f1_score(y_test, y_pred_best), 4))
print("Acurácia:", round(accuracy_score(y_test, y_pred_best), 4))


Melhores parâmetros: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
F1-score: 0.8116
Acurácia: 0.8193
