# QUESTÃO 2


# Análise de Risco de Crédito com Árvores de Decisão (ID3, C4.5, CART)

In [None]:
# 1) Base de Dados

import pandas as pd
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt


# Load the credit-risk table once, directly as a DataFrame, instead of
# converting it to a list of dicts and rebuilding a DataFrame from that.
df = pd.read_csv('base.csv')
# Record-oriented view kept so the name `dados` stays available to any cell
# that still refers to it.
dados = df.to_dict(orient='records')

: 

#### Árvore de Decisão ID3


In [None]:
# Predictors and target for the entropy-based tree.
X = df[["História de Crédito", "Renda"]]
y = df["Risco"]

# One-hot encode the selected columns so the estimator receives numeric input.
X_enc = pd.get_dummies(X)

# Entropy is the split criterion; the fixed seed makes tie-breaking between
# equally good splits repeatable.
clf = DecisionTreeClassifier(criterion="entropy", random_state=42)
clf.fit(X_enc, y)

plt.figure(figsize=(8,6))
plot_tree(
    clf,
    # Plain Python lists of strings: some scikit-learn releases reject a
    # pandas Index / NumPy array for these two arguments.
    feature_names=list(X_enc.columns),
    class_names=[str(c) for c in clf.classes_],
    filled=True,
    rounded=True,
    fontsize=10
)
plt.show()


#### Árvore de Decisão C4.5

In [None]:
# Predictors (including the collateral column) and target for this tree.
X_c45 = df[["Renda", "História de Crédito", "Garantia"]]
y_c45 = df["Risco"]

# One-hot encode the selected columns so the estimator receives numeric input.
X_c45_enc = pd.get_dummies(X_c45)

# Entropy criterion plus cost-complexity pruning (ccp_alpha); presumably the
# pruning is meant to stand in for C4.5's post-pruning step.
clf_c45 = DecisionTreeClassifier(
    criterion="entropy",
    min_samples_leaf=1,
    ccp_alpha=0.01,
    random_state=42
)
clf_c45.fit(X_c45_enc, y_c45)

plt.figure(figsize=(10,6))
plot_tree(
    clf_c45,
    # Plain Python lists of strings: some scikit-learn releases reject a
    # pandas Index / NumPy array for these two arguments.
    feature_names=list(X_c45_enc.columns),
    class_names=[str(c) for c in clf_c45.classes_],
    filled=True,
    rounded=True,
    fontsize=10
)
plt.show()


#### Árvore de Decisão CART

In [None]:
# Reload the credit-risk table directly as a DataFrame (no dict round trip).
df = pd.read_csv('base.csv')
# Record-oriented view kept so the name `dados` stays available.
dados = df.to_dict(orient='records')

# Predictors and target for the Gini-based tree.
X_cart = df[["História de Crédito", "Renda"]]
y_cart = df["Risco"]

# One-hot encode the selected columns so the estimator receives numeric input.
X_cart_enc = pd.get_dummies(X_cart)

# Gini impurity is the split criterion; the seed keeps tie-breaking repeatable.
clf_cart = DecisionTreeClassifier(
    criterion="gini",
    random_state=42
)
clf_cart.fit(X_cart_enc, y_cart)

plt.figure(figsize=(8,6))
plot_tree(
    clf_cart,
    # Plain Python lists of strings: some scikit-learn releases reject a
    # pandas Index / NumPy array for these two arguments.
    feature_names=list(X_cart_enc.columns),
    class_names=[str(c) for c in clf_cart.classes_],
    filled=True,
    rounded=True,
    fontsize=10
)
plt.show()


# QUESTÃO 3: ÁRVORE DE DECISÃO - DIABETES

#### BASE DE DADOS

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from sklearn import metrics 

In [None]:
# Short column names that replace the header found in the CSV.
col_names = ['gravida', 'glicose', 'bp', 'pele', 'insulina', 'bmi', 'pedigree', 'idade', 'label']

# header=0 tells pandas that the first line of the file is the header to be
# replaced by `names`. With header=1 the second line (the first data record)
# is consumed as the header and silently dropped.
# NOTE(review): assumes diabetes.csv has exactly one header line - confirm.
dados = pd.read_csv("diabetes.csv", header=0, names=col_names)
dados.head(10)

In [None]:
# Split the dataset into the feature matrix and the target column.
feature_cols = [
    'gravida', 'insulina', 'bmi', 'idade',
    'glicose', 'bp', 'pedigree',
]
X = dados.loc[:, feature_cols]  # predictors, in the order listed above
y = dados["label"]              # target

#### TREINAMENTO

In [None]:
# Hold out 30% of the rows for testing; the remaining 70% are used for
# training. The fixed seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)


#### ÁRVORE DE DECISÃO E TREINAMENTO

In [None]:
# Baseline decision tree with default hyper-parameters. The seed is fixed so
# the fitted tree, and therefore the reported accuracy, is the same on every
# Restart & Run All.
clf = DecisionTreeClassifier(random_state=42)

# Fit on the training split.
clf.fit(X_train, y_train)

# Predict the held-out test split and report plain accuracy.
y_pred = clf.predict(X_test)
print("Acurácia:",metrics.accuracy_score(y_test, y_pred))

#### Otimização do desempenho da árvore de decisão

In [None]:
# Shallower, entropy-based tree (depth capped at 3). The seed is fixed so that
# tie-breaking between equally good splits is repeatable across runs.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=42)

# Fit on the training split.
clf.fit(X_train, y_train)

# Predict the held-out test split and report plain accuracy.
y_pred = clf.predict(X_test)

print("Acuracia:",metrics.accuracy_score(y_test, y_pred))



In [None]:
# Show the class labels the fitted tree learned, in the order the estimator
# stores them.
print(clf.classes_)

#### Métricas

In [None]:
import numpy as np
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
)
import matplotlib.pyplot as plt

# Headline scores for the current predictions. zero_division=0 makes the
# score 0 (instead of emitting a warning) when a denominator is empty.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

# Print each score with three decimals; the labels are padded so the values
# line up in a column.
for _label, _score in (
    ("Acurácia:  ", acc),
    ("Precisão:  ", prec),
    ("Recall:    ", rec),
    ("F1-Score:  ", f1),
):
    print(f"{_label}{_score:.3f}")

# Per-class breakdown.
print("\nRelatório de Classificação:")
print(classification_report(y_test, y_pred, digits=3))


#### MATRIZ DE CONFUSÃO

In [None]:
# Confusion matrix of the current predictions, drawn on a small square figure.
fig, ax = plt.subplots(figsize=(4,4))

labels = ['baixo','alto']
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)

# Integer cell counts, no colour bar.
disp.plot(ax=ax, values_format="d", colorbar=False)
ax.set_title("Matriz de Confusão — Árvore de Decisão")
fig.tight_layout()
plt.show()

#### FIGURA DA ÁRVORE DE DECISÃO

In [None]:
!pip install pydotplus graphviz

from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus


# Export the fitted tree to Graphviz DOT text held in memory.
dot_data = StringIO()
export_graphviz(
    clf,
    out_file=dot_data,
    filled=True, rounded=True,
    special_characters=True,
    feature_names=feature_cols,
    # Same class labels as the confusion-matrix plots in this notebook; the
    # previous 'consumo' labels did not match this dataset.
    # NOTE(review): confirm the intended wording for classes 0 and 1.
    class_names=['baixo','alto']
)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

# Render the PNG once, then reuse the bytes for the file on disk and for the
# inline display, instead of invoking Graphviz twice.
png_bytes = graph.create_png()
with open("arvore_decisao.png", "wb") as png_file:
    png_file.write(png_bytes)
Image(png_bytes)


#### REGRAS

In [None]:
from sklearn.tree import export_text
# Render the fitted tree as indented if/else text rules, with two-decimal
# thresholds and the class weights shown at each leaf.
rules_text = export_text(
    clf,
    feature_names=feature_cols,
    decimals=2,
    show_weights=True,
)
print(rules_text)

# Persist the same text next to the notebook.
with open("regras_arvore.txt", "w", encoding="utf-8") as rules_file:
    rules_file.write(rules_text)
print("Regras salvas")

#### ALGORITMO RIPPER (JRip)

In [None]:
!pip install wittgenstein imbalanced-learn


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix, ConfusionMatrixDisplay
)

import wittgenstein as lw



In [None]:
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Work on copies so the original train/test frames stay untouched.
X_train_proc = X_train.copy()
X_test_proc  = X_test.copy()

# Impute missing values using statistics computed on the TRAINING split only
# (median for numeric columns, most frequent value otherwise) and apply the
# same fill value to the test split.
for col in X_train_proc.columns:
    if pd.api.types.is_numeric_dtype(X_train_proc[col]):
        med = X_train_proc[col].median()
        X_train_proc[col] = X_train_proc[col].fillna(med)
        X_test_proc[col]  = X_test_proc[col].fillna(med)
    else:
        modes = X_train_proc[col].mode(dropna=True)
        # Fall back to the literal "NA" when the column has no mode at all.
        fill = modes.iloc[0] if len(modes) else "NA"
        X_train_proc[col] = X_train_proc[col].fillna(fill)
        X_test_proc[col]  = X_test_proc[col].fillna(fill)

# Label value treated as the positive class by RIPPER.
pos_cl = 1

# RIPPER is fitted on a single frame that carries the target as a column.
df_train = X_train_proc.copy()
df_train["__target__"] = y_train.values

ripper = lw.RIPPER(random_state=RANDOM_STATE)
ripper.fit(df_train, class_feat="__target__", pos_class=pos_cl)

print(" REGRAS APRENDIDAS - RIPPER ")
# out_pretty() prints the rules itself and returns None, so it must not be
# wrapped in print() (that would emit an extra "None" line).
ripper.ruleset_.out_pretty()

In [None]:
# Predict on the imputed test split. The predictions are cast to integers so
# they share the label space of y_test before being scored.
# NOTE(review): assumes predict() yields truthy values for the positive class
# (pos_class=1) and falsy values otherwise - confirm against wittgenstein.
y_pred = np.asarray(ripper.predict(X_test_proc)).astype(int)

# Headline scores; zero_division=0 makes the score 0 when a denominator is
# empty instead of emitting a warning.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1  = f1_score(y_test, y_pred, zero_division=0)

print(f"Acurácia:  {acc:.3f}")
print(f"Precisão:  {prec:.3f}")
print(f"Recall:    {rec:.3f}")
print(f"F1-Score:  {f1:.3f}\n")

print("Relatório de Classificação:")
print(classification_report(y_test, y_pred, digits=3))

# Confusion matrix with the same class labels as the decision-tree plot
# (the stray trailing space in "baixo " was removed).
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=["baixo","alto"])
fig, ax = plt.subplots(figsize=(4,4))
disp.plot(ax=ax, values_format="d", colorbar=False)
plt.title("Matriz de Confusão — RIPPER")
plt.tight_layout()
plt.show()
