# Dataset Sintético com Regras Explícitas (Lógicas)
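Este notebook gera um dataset artificial usando regras explícitas do tipo `if/else`, ideal para que modelos como Árvores de Decisão se destaquem. A regra efetivamente usada para rotular os dados é: classe 1 quando `x1 + x2 > 1.0` e `x3 < 0.0`; caso contrário, classe 0.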

**Modelos comparados:**
- Árvore de Decisão
- Regressão Logística
- KNN

**Objetivo:** Demonstrar como Decision Trees capturam bem estruturas baseadas em regras.

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [61]:
# Synthetic data: 1000 samples with 5 features drawn from a standard normal
# distribution. The seed is fixed so the dataset is reproducible.
np.random.seed(42)
X = np.random.randn(1000, 5)

# Labelling rule: class 1 when (x1 + x2 > 1.0) AND (x3 < 0.0), otherwise class 0.
# x4 and x5 never enter the rule, so they carry no information about the label.
y = np.where((X[:, 0] + X[:, 1] > 1.0) & (X[:, 2] < 0.0), 1, 0)

# Wrap the array in a DataFrame so the features get names (x1..x5) and the
# label travels with them in a single table.
df = pd.DataFrame(X, columns=[f'x{i+1}' for i in range(X.shape[1])])
df['target'] = y

df.head()

Unnamed: 0,x1,x2,x3,x4,x5,target
0,0.496714,-0.138264,0.647689,1.52303,-0.234153,0
1,-0.234137,1.579213,0.767435,-0.469474,0.54256,0
2,-0.463418,-0.46573,0.241962,-1.91328,-1.724918,0
3,-0.562288,-1.012831,0.314247,-0.908024,-1.412304,0
4,1.465649,-0.225776,0.067528,-1.424748,-0.544383,0


In [62]:
# Separate the label column from the feature columns.
y = df['target']
X = df.drop(columns='target')

# Hold out 30% of the rows for testing. The split is stratified on the label
# and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [76]:
# Candidate models. The tree gets a fixed seed; logistic regression gets a
# higher iteration cap than the constructor default is given here.
dt = DecisionTreeClassifier(random_state=42)
knn = KNeighborsClassifier()
lr = LogisticRegression(max_iter=1000)

# One 5-fold cross-validation run per model, scoring accuracy and F1 on the
# same folds. This replaces two separate runs per model (one per metric), so
# each model is fitted 5 times instead of 10.
cv_dt, cv_knn, cv_lr = (
    cross_validate(model, X_train_scaled, y_train, cv=5, scoring=['accuracy', 'f1'])
    for model in (dt, knn, lr)
)

# Per-fold score arrays, exposed under the same names the notebook used before.
scores_dt_acc, scores_knn_acc, scores_lr_acc = (
    cv['test_accuracy'] for cv in (cv_dt, cv_knn, cv_lr)
)
scores_dt_f1, scores_knn_f1, scores_lr_f1 = (
    cv['test_f1'] for cv in (cv_dt, cv_knn, cv_lr)
)

# Summary table: mean of each metric across the 5 folds, one row per model.
results_cv = pd.DataFrame({
    'Modelo': ['Árvore de Decisão', 'KNN', 'Regressão Logística'],
    'Acurácia Média': [scores_dt_acc.mean(), scores_knn_acc.mean(), scores_lr_acc.mean()],
    'F1-Score Médio': [scores_dt_f1.mean(), scores_knn_f1.mean(), scores_lr_f1.mean()]
})
results_cv.round(4)

Unnamed: 0,Modelo,Acurácia Média,F1-Score Médio
0,Árvore de Decisão,0.9629,0.8325
1,KNN,0.9357,0.6371
2,Regressão Logística,0.9257,0.6176


In [77]:
# Fit a reference tree (no pruning strength specified) on the training split.
tree = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train)

# Ask the tree for its cost-complexity pruning path on the same data.
path = tree.cost_complexity_pruning_path(X_train_scaled, y_train)

# Keep every candidate alpha except the last one: per the original author's
# note, the final alpha collapses the tree to a single root node and is
# generally not useful.
ccp_alphas = path.ccp_alphas[:-1]

In [80]:
# Train one decision tree per candidate pruning strength, in the same order
# as ccp_alphas so that trees[i] corresponds to ccp_alphas[i].
trees = []
for alpha in ccp_alphas:
    clf = DecisionTreeClassifier(ccp_alpha=alpha, random_state=42).fit(
        X_train_scaled, y_train
    )
    trees += [clf]

# Number of candidate trees that were built.
print(len(trees))

11


In [81]:
# Mean 5-fold cross-validated score (estimator default scorer) on the training
# split for each candidate tree; val_scores[i] corresponds to ccp_alphas[i].
val_scores = [cross_val_score(clf, X_train_scaled, y_train, cv=5).mean() for clf in trees]

# np.argmax returns the first maximum, so if several alphas tie for the best
# score the one that appears earliest in ccp_alphas is selected.
best_index = np.argmax(val_scores)
best_alpha = ccp_alphas[best_index]
print(f"Melhor ccp_alpha encontrado: {best_alpha}")

Melhor ccp_alpha encontrado: 0.0


In [82]:
# Refit a single tree on the full training split using the selected alpha.
pruned_tree = DecisionTreeClassifier(ccp_alpha=best_alpha, random_state=42)
pruned_tree.fit(X_train_scaled, y_train)

# Predict on the held-out test split and report both metrics to 4 decimals.
y_pred = pruned_tree.predict(X_test_scaled)

test_accuracy = round(accuracy_score(y_test, y_pred), 4)
print("Acurácia:", test_accuracy)

test_f1 = round(f1_score(y_test, y_pred), 4)
print("F1-score:", test_f1)

Acurácia: 0.9567
F1-score: 0.8
