In [15]:
#imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler 
from sklearn.decomposition import PCA 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import BaggingClassifier 
from sklearn.metrics import accuracy_score 

In [8]:
# Carregar o dataset
df = pd.read_csv('data-1.csv')

In [9]:
# Preparação dos dados
X = df.iloc[:, 2:-1]  # Removendo as 2 primeiras colunas e a última
y = df.iloc[:, 1].map({'B': 0, 'M': 1})  # Mapeando 'B' para 0 e 'M' para 1

In [10]:
# Divisão em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

In [11]:
# Padronização dos dados
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
# Aplicação do PCA
pca = PCA(n_components=10, random_state=10)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [13]:
pca.explained_variance_ratio_

array([0.4383292 , 0.18413908, 0.10158532, 0.06803093, 0.05624249,
       0.03953245, 0.02437355, 0.01630403, 0.01462998, 0.01175767])

In [16]:
# Determinar o número de componentes que explicam pelo menos 70% da variância
explained_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
n_components = np.argmax(explained_variance_ratio >= 0.7) + 1

In [17]:
n_components

3

In [18]:
# Aplicação do PCA com o número de componentes adequado
pca = PCA(n_components=n_components, random_state=10)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [20]:
# Treinamento do modelo base: Decision Tree com busca de hiperparâmetros
param_grid = {
    "max_depth": [3, None],
    "min_samples_split": [2, 10]
}

In [21]:
estimator = DecisionTreeClassifier(random_state=10)

In [22]:
grid_search = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_pca, y_train)

In [23]:
# Melhor modelo encontrado
best_tree = grid_search.best_estimator_

In [25]:
best_tree

In [26]:
grid_search.cv_results_

{'mean_fit_time': array([0.00763741, 0.00114827, 0.00100093, 0.00122981]),
 'std_fit_time': array([0.01242927, 0.00067881, 0.00063301, 0.00045989]),
 'mean_score_time': array([0.00039964, 0.00068827, 0.00051079, 0.00028877]),
 'std_score_time': array([0.00048945, 0.00058468, 0.00065007, 0.00039537]),
 'param_max_depth': masked_array(data=[3, 3, None, None],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[2, 10, 2, 10],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 3, 'min_samples_split': 2},
  {'max_depth': 3, 'min_samples_split': 10},
  {'max_depth': None, 'min_samples_split': 2},
  {'max_depth': None, 'min_samples_split': 10}],
 'split0_test_score': array([0.9125, 0.9125, 0.95  , 0.9375]),
 'split1_test_score': array([0.975 , 0.975 , 0.9375, 0.95  ]),
 'split2_test_score': array([0.9375, 0.9375, 0.9375, 0.

In [27]:
grid_search.best_index_

0

In [28]:
grid_search.best_params_

{'max_depth': 3, 'min_samples_split': 2}

In [29]:
grid_search.best_score_

0.9219620253164557

In [31]:
# Modelo ensemble Bagging com a melhor árvore de decisão
bagging_clf = BaggingClassifier(base_estimator=best_tree, n_estimators=100, random_state=10)
bagging_clf.fit(X_train_pca, y_train)



In [48]:
# Predição e avaliação do modelo
y_pred = bagging_clf.predict(X_test_pca)
accuracy = accuracy_score(y_test, y_pred)

print(f"Acurácia do modelo ensemble Bagging: {accuracy:.6f}")

Acurácia do modelo ensemble Bagging: 0.959064


In [42]:
X_train_pca_pd = pd.DataFrame(X_train_pca)
X_teste_pca_pd = pd.DataFrame(X_test_pca)

In [46]:
X_train_pca_pd.describe

<bound method NDFrame.describe of             0         1         2
0   -1.042415  2.471686  0.480738
1    4.639689 -0.073490  3.692609
2   -4.193943 -2.760039  0.324628
3   -0.208213  2.222910  2.200972
4    2.589621 -0.181513 -2.080272
..        ...       ...       ...
393  7.069605 -3.341167 -0.488189
394 -1.456313  3.437994  1.229494
395 -3.323080 -1.012015 -2.245417
396 -3.298653 -2.031116 -0.642998
397  6.352843 -5.709002  0.674043

[398 rows x 3 columns]>

In [47]:
X_teste_pca_pd.describe

<bound method NDFrame.describe of              0         1         2
0     2.658385  1.416781 -2.295888
1    -3.636605  1.632069  2.967052
2    -2.785135 -1.202990 -1.264840
3     4.073665  1.419762 -1.736211
4    -3.553124 -0.164603  0.612888
..         ...       ...       ...
166  12.906366  1.626557  0.171049
167  -3.338551 -2.259822  0.142457
168  -3.653081  1.400395  2.837132
169  -2.378689 -1.343133 -0.217631
170  -2.469740 -0.526129 -0.472275

[171 rows x 3 columns]>