# Dataset

In [1]:
import pandas as pd

# Load the exam results from a CSV path relative to the working directory,
# then preview the first rows.
resultados_exames = pd.read_csv("data-set/exames.csv")
resultados_exames.head()

Unnamed: 0,id,diagnostico,exame_1,exame_2,exame_3,exame_4,exame_5,exame_6,exame_7,exame_8,...,exame_24,exame_25,exame_26,exame_27,exame_28,exame_29,exame_30,exame_31,exame_32,exame_33
0,842302,M,17.99,10.38,122.8,103.78,1001.0,0.1184,0.2776,0.3001,...,184.6,2019.0,0.1622,0.6656,0.7119,0.786,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,103.78,1326.0,0.08474,0.07864,0.0869,...,158.8,1956.0,0.1238,0.1866,0.2416,0.786,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,103.78,1203.0,0.1096,0.1599,0.1974,...,152.5,1709.0,0.1444,0.4245,0.4504,0.786,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,103.78,386.1,0.1425,0.2839,0.2414,...,98.87,567.7,0.2098,0.8663,0.6869,0.786,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,103.78,1297.0,0.1003,0.1328,0.198,...,152.2,1575.0,0.1374,0.205,0.4,0.786,0.1625,0.2364,0.07678,0.854454


In [2]:
# Summarise column dtypes and non-null counts.
resultados_exames.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 35 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           569 non-null    int64  
 1   diagnostico  569 non-null    object 
 2   exame_1      569 non-null    float64
 3   exame_2      569 non-null    float64
 4   exame_3      569 non-null    float64
 5   exame_4      569 non-null    float64
 6   exame_5      569 non-null    float64
 7   exame_6      569 non-null    float64
 8   exame_7      569 non-null    float64
 9   exame_8      569 non-null    float64
 10  exame_9      569 non-null    float64
 11  exame_10     569 non-null    float64
 12  exame_11     569 non-null    float64
 13  exame_12     569 non-null    float64
 14  exame_13     569 non-null    float64
 15  exame_14     569 non-null    float64
 16  exame_15     569 non-null    float64
 17  exame_16     569 non-null    float64
 18  exame_17     569 non-null    float64
 19  exame_18

In [3]:
# Number of missing values in each column.
resultados_exames.isna().sum()

id               0
diagnostico      0
exame_1          0
exame_2          0
exame_3          0
exame_4          0
exame_5          0
exame_6          0
exame_7          0
exame_8          0
exame_9          0
exame_10         0
exame_11         0
exame_12         0
exame_13         0
exame_14         0
exame_15         0
exame_16         0
exame_17         0
exame_18         0
exame_19         0
exame_20         0
exame_21         0
exame_22         0
exame_23         0
exame_24         0
exame_25         0
exame_26         0
exame_27         0
exame_28         0
exame_29         0
exame_30         0
exame_31         0
exame_32         0
exame_33       419
dtype: int64

In [4]:
# Percentage of missing values in the column that has the most of them.
n_entradas = resultados_exames.shape[0]
nulos_por_coluna = resultados_exames.isnull().sum()
n_nulos = nulos_por_coluna.sum()
# Dividing the grand total of nulls (all columns) by the row count is only a
# valid percentage when a single column holds every null. Using the worst
# column's own count keeps the figure within 0-100% in every case and yields
# the same number when the nulls are concentrated in one column.
percentual_nulos = (nulos_por_coluna.max() / n_entradas) * 100
print(f"Percentual de nulos: {percentual_nulos:.2f}%")

Percentual de nulos: 73.64%


**Visto que o percentual de nulos é consideravelmente alto e está concentrado em uma única variável (`exame_33`), optou-se por remover completamente essa variável.**

In [5]:
# Remove the mostly-empty column. Rebinding (instead of inplace=True) together
# with errors="ignore" keeps this cell idempotent: re-running it after the
# column is already gone no longer raises KeyError.
resultados_exames = resultados_exames.drop(columns="exame_33", errors="ignore")

# Treino e Teste

## Algoritmo: RandomForest

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from numpy import random

# Seed NumPy's global generator; the split and the forest below are created
# without an explicit random_state.
SEED = 123143
random.seed(SEED)

# Features: every exam column. Target: the diagnosis, integer-encoded.
valores_exames = resultados_exames.drop(columns=["id", "diagnostico"])

label_encoder = LabelEncoder()
diagnostico = pd.Series(
    label_encoder.fit_transform(resultados_exames["diagnostico"]),
    name="diagnostico",
)

# Hold out 30% of the rows for evaluation.
treino_x, teste_x, treino_y, teste_y = train_test_split(
    valores_exames, diagnostico, test_size=0.3
)

# Baseline model: 100-tree random forest on all exams.
classificador = RandomForestClassifier(n_estimators=100).fit(treino_x, treino_y)
acuracia_classificador = 100 * classificador.score(teste_x, teste_y)
print(f"Resultado da classificação: {acuracia_classificador:.2f}%")

Resultado da classificação: 92.40%


## DummyClassifier

In [7]:
from sklearn.dummy import DummyClassifier

SEED = 123143
random.seed(SEED)

# Reference score: always predict the most frequent class of the training split.
classificador_bobo = DummyClassifier(strategy="most_frequent").fit(treino_x, treino_y)
acuracia_classificador_bobo = 100 * classificador_bobo.score(teste_x, teste_y)
print(f"Resultado do classificador bobo: {acuracia_classificador_bobo:.2f}%")

Resultado do classificador bobo: 66.67%


# Avaliando os exames

In [8]:
import plotly.express as px
from sklearn.preprocessing import StandardScaler

SEED = 123143
random.seed(SEED)

# Standardise every exam so that all of them share a comparable scale in the plot.
padronizador = StandardScaler()
valores_exames_padronizados = pd.DataFrame(
    padronizador.fit_transform(valores_exames), columns=valores_exames.columns
)

# Long format: one row per (sample, exam) pair, keeping the diagnosis alongside.
# (The former mid-cell `dados_plot.head()` was removed: an expression that is
# not the last line of a cell is evaluated and silently discarded.)
dados_plot = pd.concat([diagnostico, valores_exames_padronizados], axis=1)
dados_plot_melted = pd.melt(
    dados_plot, id_vars="diagnostico", var_name="exames", value_name="valores"
)

# Create the violin plot: one animation frame per exam, one violin per diagnosis.
fig = px.violin(
    dados_plot_melted,
    y="valores",
    box=True,
    points=False,
    color="diagnostico",
    orientation="v",
    violinmode="group",
    animation_frame="exames",
    title="Distribuição dos valores dos exames por diagnóstico",
)
fig.show()

In [9]:
def plot_violin(diagnostico, valores_exames_padronizados, inicio, fim):
    """Show overlaid violin plots for a positional slice of exam columns.

    Args:
        diagnostico: Series named "diagnostico", used to colour the violins.
        valores_exames_padronizados: DataFrame of exam values, one column per exam.
        inicio: First column position to include (inclusive).
        fim: Column position at which to stop (exclusive).

    Returns:
        None. The figure is displayed as a side effect.
    """
    exames_selecionados = valores_exames_padronizados.iloc[:, inicio:fim]
    tabela = pd.concat([diagnostico, exames_selecionados], axis=1)

    opcoes = dict(
        box=True,
        points=False,
        color="diagnostico",
        orientation="v",
        violinmode="overlay",
        title="Distribuição dos valores dos exames por diagnóstico",
    )
    px.violin(tabela, **opcoes).show()


# Exam columns at positions 0-9.
plot_violin(diagnostico, valores_exames_padronizados, 0, 10)

In [10]:
# Exam columns at positions 10-19.
plot_violin(diagnostico, valores_exames_padronizados, 10, 20)

In [11]:
# Exam columns at positions 20-31.
plot_violin(diagnostico, valores_exames_padronizados, 20, 32)

In [12]:
# Drop two exams that show the same value in every previewed row
# (presumably constant across the whole dataset - TODO confirm).
valores_exames_v2 = valores_exames.drop(columns=["exame_29", "exame_4"])


def classificar(valores):
    """Train a 100-tree random forest on `valores` and return its test accuracy.

    NumPy's global generator is re-seeded with a fixed value on every call, so
    repeated calls start from the same random state. The target is the
    module-level `diagnostico`; 30% of the rows are held out for scoring.

    Args:
        valores: Feature table with one row per entry of `diagnostico`.

    Returns:
        Accuracy on the held-out rows, as a percentage.
    """
    random.seed(123143)

    x_treino, x_teste, y_treino, y_teste = train_test_split(
        valores, diagnostico, test_size=0.3
    )

    floresta = RandomForestClassifier(n_estimators=100).fit(x_treino, y_treino)
    acuracia = floresta.score(x_teste, y_teste)
    return acuracia * 100


# Accuracy after removing exame_29 and exame_4.
acuracia_classificador_v2 = classificar(valores_exames_v2)
print(f"Resultado da classificação: {acuracia_classificador_v2:.2f}%")

Resultado da classificação: 91.81%


In [13]:
import plotly.graph_objects as go

# Pairwise correlation between the remaining exams.
matriz_correlacao = valores_exames_v2.corr()

# Render the correlation matrix as a heatmap, exams on both axes.
mapa_calor = go.Heatmap(
    z=matriz_correlacao.values,
    x=matriz_correlacao.columns,
    y=matriz_correlacao.index,
    colorscale="Viridis",
)
fig = go.Figure(data=mapa_calor)
fig.update_layout(title="Matriz de correlação dos exames", width=1300, height=700)
fig.show()

In [14]:
# Keep only coefficients above 0.99; every other cell becomes NaN.
matriz_correlacao_v1 = matriz_correlacao.where(matriz_correlacao > 0.99)
matriz_correlacao_v1

Unnamed: 0,exame_1,exame_2,exame_3,exame_5,exame_6,exame_7,exame_8,exame_9,exame_10,exame_11,...,exame_22,exame_23,exame_24,exame_25,exame_26,exame_27,exame_28,exame_30,exame_31,exame_32
exame_1,1.0,,0.997855,,,,,,,,...,,,,,,,,,,
exame_2,,1.0,,,,,,,,,...,,,,,,,,,,
exame_3,0.997855,,1.0,,,,,,,,...,,,,,,,,,,
exame_5,,,,1.0,,,,,,,...,,,,,,,,,,
exame_6,,,,,1.0,,,,,,...,,,,,,,,,,
exame_7,,,,,,1.0,,,,,...,,,,,,,,,,
exame_8,,,,,,,1.0,,,,...,,,,,,,,,,
exame_9,,,,,,,,1.0,,,...,,,,,,,,,,
exame_10,,,,,,,,,1.0,,...,,,,,,,,,,
exame_11,,,,,,,,,,1.0,...,,,,,,,,,,


In [15]:
# Column-wise sum of the surviving coefficients. Each exam's self-correlation
# contributes 1.0, so a total above 1 means at least one other exam passed the
# 0.99 threshold.
variaveis_correlacionadas = matriz_correlacao_v1.sum()
variaveis_correlacionadas

exame_1     1.997855
exame_2     1.000000
exame_3     1.997855
exame_5     1.000000
exame_6     1.000000
exame_7     1.000000
exame_8     1.000000
exame_9     1.000000
exame_10    1.000000
exame_11    1.000000
exame_12    1.000000
exame_13    1.000000
exame_14    1.000000
exame_15    1.000000
exame_16    1.000000
exame_17    1.000000
exame_18    1.000000
exame_19    1.000000
exame_20    1.000000
exame_21    1.000000
exame_22    1.993708
exame_23    1.000000
exame_24    1.993708
exame_25    1.000000
exame_26    1.000000
exame_27    1.000000
exame_28    1.000000
exame_30    1.000000
exame_31    1.000000
exame_32    1.000000
dtype: float64

In [16]:
# Keep only the exams whose total exceeds their own self-correlation of 1.
# NOTE(review): this rebinds the name to a filtered version of itself, which
# assumes the previous cell has just been run - confirm if cells are re-run.
variaveis_correlacionadas = variaveis_correlacionadas[variaveis_correlacionadas > 1]
variaveis_correlacionadas

exame_1     1.997855
exame_3     1.997855
exame_22    1.993708
exame_24    1.993708
dtype: float64

In [17]:
# Drop one exam from each highly correlated pair (exame_1/exame_3 and,
# presumably, exame_22/exame_24 - their totals match; TODO confirm).
valores_exames_v3 = valores_exames_v2.drop(columns=["exame_3", "exame_24"])

In [18]:
# Inspect the reduced feature table.
valores_exames_v3

Unnamed: 0,exame_1,exame_2,exame_5,exame_6,exame_7,exame_8,exame_9,exame_10,exame_11,exame_12,...,exame_21,exame_22,exame_23,exame_25,exame_26,exame_27,exame_28,exame_30,exame_31,exame_32
0,17.99,10.38,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,1.0950,...,0.006193,25.380,17.33,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,0.5435,...,0.003532,24.990,23.41,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,0.7456,...,0.004571,23.570,25.53,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,0.4956,...,0.009208,14.910,26.50,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,0.7572,...,0.005115,22.540,16.67,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,1.1760,...,0.004239,25.450,26.40,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,0.7655,...,0.002498,23.690,38.25,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,0.4564,...,0.003892,18.980,34.12,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,0.7260,...,0.006185,25.740,39.42,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [19]:
# Accuracy after also removing exame_3 and exame_24.
acuracia_classificador_v3 = classificar(valores_exames_v3)
print(f"Resultado da classificação: {acuracia_classificador_v3:.2f}%")

Resultado da classificação: 92.98%


# Implementando SelectKBest

In [20]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

SEED = 123143
random.seed(SEED)

# Prepare the data so that the raw (non-normalised) values are used
# (presumably because chi2 scoring expects non-negative inputs - confirm).
valores_exames_tratados = resultados_exames.drop(
    columns=["id", "diagnostico", "exame_3", "exame_24", "exame_29", "exame_4"]
)

treino_x, teste_x, treino_y, teste_y = train_test_split(
    valores_exames_tratados, diagnostico, test_size=0.3
)

# Score the exams on the training split only and keep the 5 best; the test
# split is reduced with the selector fitted on the training data.
selecionar_kmelhores = SelectKBest(chi2, k=5)
treino_kbest = selecionar_kmelhores.fit_transform(treino_x, treino_y)
teste_kbest = selecionar_kmelhores.transform(teste_x)

In [21]:
# Sanity check: the test split should now have k=5 columns.
teste_kbest.shape

(171, 5)

In [22]:
SEED = 123143
random.seed(SEED)

# Random forest trained on the 5 exams kept by SelectKBest.
classificador = RandomForestClassifier(n_estimators=100).fit(treino_kbest, treino_y)
acuracia_classificador_kbest = 100 * classificador.score(teste_kbest, teste_y)

print(f"Resultado da classificação: {acuracia_classificador_kbest:.2f}%")

Resultado da classificação: 96.49%


# Matriz de Confusão

In [23]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the KBest model on the held-out split.
predicoes_kbest = classificador.predict(teste_kbest)
matriz_confusao = confusion_matrix(teste_y, predicoes_kbest)

In [24]:
def plot_confusion_matrix(confusion_matrix, class_names=("Benigno", "Maligno")):
    """Display a confusion matrix as a labelled heatmap.

    Args:
        confusion_matrix: Square array with true classes on the rows and
            predicted classes on the columns, both in encoded-label order
            (the layout returned by sklearn.metrics.confusion_matrix).
        class_names: Display name of each class, in encoded-label order.
            Defaults to this notebook's encoding (0 = Benigno, 1 = Maligno).

    Returns:
        None. The figure is displayed as a side effect.
    """
    class_names = list(class_names)

    fig = go.Figure(
        data=go.Heatmap(
            # Rows are reversed so the first class ends up on the top row
            # (plotly draws the first y entry at the bottom).
            z=confusion_matrix[::-1],
            # Columns are NOT reversed, so x follows the encoded-label order.
            # The previous labels were swapped on both axes, which mirrored
            # every cell (e.g. false negatives were shown as false positives).
            x=class_names,
            # y must follow the reversed row order of z.
            y=class_names[::-1],
            name="",
            hovertemplate="REAL: %{y}<br>PREDITO: %{x}<br>QUANTIDADE: %{z}",
            colorscale="Greens",
        )
    )

    fig.update_layout(
        title="Confusion Matrix", xaxis_title="PREDITO", yaxis_title="REAL"
    )

    fig.show()

In [25]:
# Confusion matrix of the SelectKBest model.
plot_confusion_matrix(matriz_confusao)

# Implementando RFE

In [26]:
from sklearn.feature_selection import RFE

SEED = 123143
random.seed(SEED)

# Fresh 70/30 split of the reduced exam table.
treino_x, teste_x, treino_y, teste_y = train_test_split(
    valores_exames_tratados, diagnostico, test_size=0.3
)

# NOTE(review): this initial fit looks redundant because RFE fits the estimator
# itself, but it consumes numbers from the seeded global generator; removing
# it would presumably change the results below - confirm before changing.
classificador = RandomForestClassifier(n_estimators=100)
classificador.fit(treino_x, treino_y)

# Recursive feature elimination on the training split: remove one exam per
# step until 5 remain.
selecionador_rfe = RFE(estimator=classificador, n_features_to_select=5, step=1)
selecionador_rfe.fit(treino_x, treino_y)

# Reduce both splits to the selected exams.
treino_rfe = selecionador_rfe.transform(treino_x)
teste_rfe = selecionador_rfe.transform(teste_x)

# Retrain on the reduced training split and evaluate on the reduced test split.
classificador.fit(treino_rfe, treino_y)
print(f"Acurácia após RFE: {classificador.score(teste_rfe, teste_y) * 100:.2f}%")

matriz_confusao = confusion_matrix(teste_y, classificador.predict(teste_rfe))
plot_confusion_matrix(matriz_confusao)

Acurácia após RFE: 92.40%


# Implementando RFECV

In [27]:
from sklearn.feature_selection import RFECV

SEED = 123143
random.seed(SEED)

# Fresh 70/30 split of the reduced exam table.
treino_x, teste_x, treino_y, teste_y = train_test_split(
    valores_exames_tratados, diagnostico, test_size=0.3
)

# Kept as-is: this fit consumes numbers from the seeded global generator, so
# removing it would shift the random state seen by the steps below.
classificador = RandomForestClassifier(n_estimators=100)
classificador.fit(treino_x, treino_y)

# Recursive feature elimination with 5-fold cross-validation on the training
# split; the number of exams to keep is chosen by mean accuracy.
selecionador_rfecv = RFECV(estimator=classificador, cv=5, scoring="accuracy", step=1)
selecionador_rfecv.fit(treino_x, treino_y)

# Reduce both splits to the selected exams.
treino_rfecv = selecionador_rfecv.transform(treino_x)
teste_rfecv = selecionador_rfecv.transform(teste_x)

# Retrain on the reduced training split and evaluate on the reduced test split.
# The message previously said "RFE", mislabelling this RFECV result.
classificador.fit(treino_rfecv, treino_y)
print(f"Acurácia após RFECV: {classificador.score(teste_rfecv, teste_y) * 100:.2f}%")

matriz_confusao = confusion_matrix(teste_y, classificador.predict(teste_rfecv))
plot_confusion_matrix(matriz_confusao)

Acurácia após RFE: 93.57%


In [28]:
# Names of the exams kept by RFECV (boolean support mask applied to the columns).
treino_x.columns[selecionador_rfecv.support_]

Index(['exame_1', 'exame_2', 'exame_5', 'exame_6', 'exame_7', 'exame_8',
       'exame_9', 'exame_11', 'exame_12', 'exame_13', 'exame_14', 'exame_15',
       'exame_16', 'exame_17', 'exame_18', 'exame_19', 'exame_20', 'exame_21',
       'exame_22', 'exame_23', 'exame_25', 'exame_26', 'exame_27', 'exame_28',
       'exame_30', 'exame_31', 'exame_32'],
      dtype='object')

In [29]:
# Mean cross-validated score recorded at each elimination stage.
selecionador_rfecv.cv_results_["mean_test_score"]

array([0.87939873, 0.93471519, 0.95221519, 0.95227848, 0.95724684,
       0.95727848, 0.94971519, 0.95218354, 0.94971519, 0.94971519,
       0.95731013, 0.94718354, 0.95727848, 0.96224684, 0.94718354,
       0.95977848, 0.95727848, 0.95721519, 0.95221519, 0.96224684,
       0.95977848, 0.95974684, 0.95477848, 0.96224684, 0.9546519 ,
       0.96227848, 0.96484177, 0.95471519])

In [30]:
# Mean cross-validated accuracy plotted against a 1-based position on the x axis.
pontuacoes_cv = selecionador_rfecv.cv_results_["mean_test_score"]
n = len(pontuacoes_cv)
x = list(range(1, n + 1))

fig = go.Figure(data=go.Scatter(x=x, y=pontuacoes_cv, mode="lines"))
fig.update_layout(
    title="Acurácia em função do número de exames",
    xaxis_title="Número de exames",
    yaxis_title="Acurácia",
)
fig.show()


# Observando duas features em um plano

In [31]:
SEED = 123143
random.seed(SEED)

# Fresh 70/30 split of the reduced exam table.
treino_x, teste_x, treino_y, teste_y = train_test_split(
    valores_exames_tratados, diagnostico, test_size=0.3
)

# NOTE(review): this initial fit looks redundant, but it consumes numbers from
# the seeded global generator - confirm before removing.
classificador = RandomForestClassifier(n_estimators=100)
classificador.fit(treino_x, treino_y)

# Keep only 2 exams so the samples can be drawn on a plane. The selector is
# fitted on the training split and then applied to the full table.
# NOTE(review): this rebinds `selecionador_rfe`, replacing the 5-feature
# selector created earlier in the notebook.
selecionador_rfe = RFE(estimator=classificador, n_features_to_select=2, step=1)
selecionador_rfe.fit(treino_x, treino_y)
valores_exames_tratados_2_features = selecionador_rfe.transform(valores_exames_tratados)

valores_exames_tratados_2_features.shape

(569, 2)

In [32]:
# Real names of the two exams kept by RFE, in the same order as the columns
# produced by `transform`.
exames_selecionados = valores_exames_tratados.columns[selecionador_rfe.support_]

df_plot = pd.DataFrame(
    {
        "exame_1": valores_exames_tratados_2_features[:, 0],
        "exame_2": valores_exames_tratados_2_features[:, 1],
        # Translate the encoded target here, on this column only. The previous
        # frame-wide replace({0: ..., 1: ...}) also rewrote any exam value
        # equal to 0 or 1 into a string, corrupting the numeric axes.
        "diagnostico": diagnostico.map({0: "Benigno", 1: "Maligno"}),
    }
)

# "exame_1"/"exame_2" are only positional column keys; label the axes with
# the names of the exams that were actually selected.
px.scatter(
    data_frame=df_plot,
    x="exame_1",
    y="exame_2",
    color="diagnostico",
    labels={
        "exame_1": exames_selecionados[0],
        "exame_2": exames_selecionados[1],
    },
)

# Implementando Principal Component Analysis (PCA)

In [33]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

SEED = 123143
random.seed(SEED)

# Standardise the exams before projecting them.
scaler = StandardScaler()
valores_exames_tratados_padronizados = scaler.fit_transform(valores_exames_tratados)

# Project the standardised exams onto their first two principal components.
pca = PCA(n_components=2)
pca_exames = pca.fit_transform(valores_exames_tratados_padronizados)

df_plot = pd.DataFrame(
    {
        "exame_1": pca_exames[:, 0],
        "exame_2": pca_exames[:, 1],
        # Translate the encoded target on this column only. A frame-wide
        # replace({0: ..., 1: ...}) would also rewrite any component value
        # equal to 0 or 1 into a string.
        "diagnostico": diagnostico.map({0: "Benigno", 1: "Maligno"}),
    }
)

# The two plotted columns are principal components, not individual exams, so
# the axes are labelled accordingly.
px.scatter(
    data_frame=df_plot,
    x="exame_1",
    y="exame_2",
    color="diagnostico",
    labels={
        "exame_1": "Componente principal 1",
        "exame_2": "Componente principal 2",
    },
)

# Implementando t-distributed Stochastic Neighbor Embedding

from sklearn.manifold import TSNE

