# 1. Instalação e importação de bibliotecas necessárias

*  Instalar e atualizar pacotes necessários
*  Importar bibliotecas do scikit-learn, XGBoost, seaborn e matplotlib

In [None]:
#!pip uninstall -y numpy pandas scikit-learn xgboost joblib seaborn
#!pip install numpy==1.26.4 pandas==2.2.2 scikit-learn==1.3.2 xgboost==1.7.6 seaborn joblib --force-reinstall

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix


Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: pandas 2.2.2
Uninstalling pandas-2.2.2:
  Successfully uninstalled pandas-2.2.2
Found existing installation: scikit-learn 1.6.1
Uninstalling scikit-learn-1.6.1:
  Successfully uninstalled scikit-learn-1.6.1
Found existing installation: xgboost 2.1.3
Uninstalling xgboost-2.1.3:
  Successfully uninstalled xgboost-2.1.3
Found existing installation: joblib 1.4.2
Uninstalling joblib-1.4.2:
  Successfully uninstalled joblib-1.4.2
Found existing installation: seaborn 0.13.2
Uninstalling seaborn-0.13.2:
  Successfully uninstalled seaborn-0.13.2
Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas==2.2.2
  Downloading pandas-2.2.2-cp311-cp311-m

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# 2. Carregar e explorar dados


In [5]:
# Carregar os dados
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/breast-cancer-wisconsin.data"
df = pd.read_csv(url, names=["uniformity_cell_size", "uniformity_cell_shape",
                             "area", "compactness", "class"])

# Tratar valores ausentes
df.replace("?", np.nan, inplace=True)
df.dropna(inplace=True)

# Converter para tipo numérico
df = df.astype(float)

# Mapear a variável alvo (2 = benigno, 4 = maligno)
df["class"] = df["class"].map({2: 0, 4: 1})

# Exibir as primeiras linhas do dataset
df.head()


Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,uniformity_cell_size,uniformity_cell_shape,area,compactness,class
1000025,5,1,1,1,2,1.0,3.0,1.0,1.0,0
1002945,5,4,4,5,7,10.0,3.0,2.0,1.0,0
1015425,3,1,1,1,2,2.0,3.0,1.0,1.0,0
1016277,6,8,8,1,3,4.0,3.0,7.0,1.0,0
1017023,4,1,1,3,2,1.0,3.0,1.0,1.0,0


# 3. Particionamento treino e teste

In [6]:
# Separar features e variável de interesse
X = df.drop(columns=["class"])
y = df["class"]

# Dividir em treino (80%) e teste (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Tamanho do conjunto de treino: {X_train.shape}")
print(f"Tamanho do conjunto de teste: {X_test.shape}")


Tamanho do conjunto de treino: (546, 4)
Tamanho do conjunto de teste: (137, 4)


# 4. Treinamento de modelos

*  Random Forest

In [7]:
# Treinando e fazendo o fit do modelo Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Fazendo previsões
y_pred_rf = rf.predict(X_test)

# Exibindo as métricas de avaliação
print("🔹 Random Forest - Classification Report")
print(classification_report(y_test, y_pred_rf))



🔹 Random Forest - Classification Report
              precision    recall  f1-score   support

           0       0.97      0.96      0.96        89
           1       0.92      0.94      0.93        48

    accuracy                           0.95       137
   macro avg       0.94      0.95      0.94       137
weighted avg       0.95      0.95      0.95       137



In [None]:
# Criando e exibindo a matriz de confusão
cm_rf = confusion_matrix(y_test, y_pred_rf)

plt.figure(figsize=(6, 4))
sns.heatmap(cm_rf, annot=True, fmt="d", cmap="Blues", xticklabels=["Benigno", "Maligno"], yticklabels=["Benigno", "Maligno"])
plt.xlabel("Predito")
plt.ylabel("Real")
plt.title("Matriz de Confusão - Random Forest")
plt.show()

*   XGBoost

In [None]:
# Treinando e fazendo o fit do modelo XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='auc', random_state=42)
xgb.fit(X_train, y_train)

# Fazendo previsões
y_pred_xgb = xgb.predict(X_test)

# Exibindo as métricas de avaliação
print("🔹 XGBoost - Classification Report")
print(classification_report(y_test, y_pred_xgb))



In [None]:
# Criando e exibindo a matriz de confusão
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
plt.figure(figsize=(6, 4))
sns.heatmap(cm_xgb, annot=True, fmt="d", cmap="Blues", xticklabels=["Benigno", "Maligno"], yticklabels=["Benigno", "Maligno"])
plt.xlabel("Predito")
plt.ylabel("Real")
plt.title("Matriz de Confusão - XGBoost")
plt.show()



*  AdaBoost


In [None]:
# Treinando e fazendo o fit do AdaBoost
ada = AdaBoostClassifier(n_estimators=100, random_state=42)
ada.fit(X_train, y_train)

# Fazendo previsões
y_pred_ada = ada.predict(X_test)

# Exibindo as métricas de avaliação
print("🔹 AdaBoost - Classification Report")
print(classification_report(y_test, y_pred_ada))

In [None]:
# Criando e exibindo a matriz de confusão
cm_ada = confusion_matrix(y_test, y_pred_ada)

plt.figure(figsize=(6, 4))
sns.heatmap(cm_ada, annot=True, fmt="d", cmap="Blues", xticklabels=["Benigno", "Maligno"], yticklabels=["Benigno", "Maligno"])
plt.xlabel("Predito")
plt.ylabel("Real")
plt.title("Matriz de Confusão - AdaBoost")
plt.show()

# 5. Desempenho dos modelos

In [None]:
# Avaliando as métricas de cada modelo
resultados = {
    "Modelo": ["Random Forest", "XGBoost", "AdaBoost"],
    "Acurácia": [
        accuracy_score(y_test, y_pred_rf),
        accuracy_score(y_test, y_pred_xgb),
        accuracy_score(y_test, y_pred_ada)
    ],
    "Precisão": [
        precision_score(y_test, y_pred_rf),
        precision_score(y_test, y_pred_xgb),
        precision_score(y_test, y_pred_ada)
    ],
    "Recall": [
        recall_score(y_test, y_pred_rf),
        recall_score(y_test, y_pred_xgb),
        recall_score(y_test, y_pred_ada)
    ],
    "F1-score": [
        f1_score(y_test, y_pred_rf),
        f1_score(y_test, y_pred_xgb),
        f1_score(y_test, y_pred_ada)
    ]
}


df_resultados = pd.DataFrame(resultados)

print("Comparação dos Modelos")
print(df_resultados)



# 6. Importância das variáveis

In [None]:
#Importância das variáveis no XGBoost
importances_xgb = xgb.feature_importances_


df_importance_xgb = pd.DataFrame({'Feature': X_train.columns, 'Importance': importances_xgb})
df_importance_xgb = df_importance_xgb.sort_values(by="Importance", ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=df_importance_xgb, palette="colorblind")
plt.title("Importância das Variáveis - XGBoost")
plt.xlabel("Importância")
plt.ylabel("Variável")
plt.show()
