## Variáveis

- **Class**: Variável alvo que indica se ocorreu recorrência do câncer de mama (no-recurrence-events = sem recorrência, recurrence-events = recorrência).
- **age**: Idade do paciente em faixas etárias (10-19, 20-29, 30-39, 40-49, 50-59, 60-69, 70-79, 80-89, 90-99 anos).
- **menopause**: Estado da menopausa do paciente (lt40 = antes dos 40 anos, ge40 = igual ou acima dos 40 anos, premeno = pré-menopausa).
- **tumor-size**: Tamanho do tumor em milímetros (0-4, 5-9, 10-14, 15-19, 20-24, 25-29, 30-34, 35-39, 40-44, 45-49, 50-54, 55-59 mm).
- **inv-nodes**: Número de linfonodos invadidos (0-2, 3-5, 6-8, 9-11, 12-14, 15-17, 18-20, 21-23, 24-26, 27-29, 30-32, 33-35, 36-39).
- **node-caps**: Se os linfonodos estão encapsulados ou não (yes = sim, no = não).
- **deg-malig**: Grau de malignidade do tumor (1, 2, 3).
- **breast**: Lado do seio afetado pelo câncer (left = esquerdo, right = direito).
- **breast-quad**: Quadrante do seio afetado pelo câncer (left-up = superior esquerdo, left-low = inferior esquerdo, right-up = superior direito, right-low = inferior direito, central = central).
- **irradiat**: Se o paciente recebeu tratamento de radioterapia ou não (yes = sim, no = não).


In [35]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

In [36]:
warnings.filterwarnings('ignore') 
pd.options.display.max_columns = None
pd.options.display.max_rows = None

seed=42

In [37]:
# Carregando os dados
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data"
column_names = ["Class", "age", "menopause", "tumor-size", "inv-nodes", "node-caps", "deg-malig", "breast", "breast-quad", "irradiat"]
data = pd.read_csv(data_url, names=column_names)
display(data.head(), data.shape)

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


(286, 10)

In [38]:
# Verificar alores ausentes
df_null = data.isnull().mean(axis = 0)
df_null = df_null[df_null >= 0] * 100
print('Colunas com valores ausentes (qtd relativa): \n\n{}\n'.format(df_null.sort_values(axis=0, ascending=False)))


Colunas com valores ausentes (qtd relativa): 

Class          0.0
age            0.0
menopause      0.0
tumor-size     0.0
inv-nodes      0.0
node-caps      0.0
deg-malig      0.0
breast         0.0
breast-quad    0.0
irradiat       0.0
dtype: float64



In [39]:
#Transformar varável em dummys
import pandas as pd
def criar_dummy_variables(dataframe, colunas_categoricas):
    df = dataframe.copy()


    for coluna in colunas_categoricas:
        dummies = pd.get_dummies(df[coluna], prefix=f'dummy_{coluna}', dtype=int)
        df = pd.concat([df, dummies], axis=1)
        #df.drop(coluna, axis=1, inplace=True)
    return df
# Lista de variáveis categóricas para transformar em dummy
variaveis_categoricas = ['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'deg-malig', 'breast', 'breast-quad', 'irradiat']

# Chamando a função
data = criar_dummy_variables(data, variaveis_categoricas)

# Exibindo o resultado
display(data.head(), data.shape)


Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,dummy_age_20-29,dummy_age_30-39,dummy_age_40-49,dummy_age_50-59,dummy_age_60-69,dummy_age_70-79,dummy_menopause_ge40,dummy_menopause_lt40,dummy_menopause_premeno,dummy_tumor-size_0-4,dummy_tumor-size_10-14,dummy_tumor-size_15-19,dummy_tumor-size_20-24,dummy_tumor-size_25-29,dummy_tumor-size_30-34,dummy_tumor-size_35-39,dummy_tumor-size_40-44,dummy_tumor-size_45-49,dummy_tumor-size_5-9,dummy_tumor-size_50-54,dummy_inv-nodes_0-2,dummy_inv-nodes_12-14,dummy_inv-nodes_15-17,dummy_inv-nodes_24-26,dummy_inv-nodes_3-5,dummy_inv-nodes_6-8,dummy_inv-nodes_9-11,dummy_node-caps_?,dummy_node-caps_no,dummy_node-caps_yes,dummy_deg-malig_1,dummy_deg-malig_2,dummy_deg-malig_3,dummy_breast_left,dummy_breast_right,dummy_breast-quad_?,dummy_breast-quad_central,dummy_breast-quad_left_low,dummy_breast-quad_left_up,dummy_breast-quad_right_low,dummy_breast-quad_right_up,dummy_irradiat_no,dummy_irradiat_yes
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,0,1,0
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,1,0
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,0
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0


(286, 53)

In [40]:
import numpy as np
# ANTES
display(data.Class.value_counts(dropna = False, sort = False).to_frame())

 #Codificação em classes
def classes_enc(x):
    if x == 'no-recurrence-events':
        return 0
    elif x == 'recurrence-events':
        return 1

func = np.vectorize(classes_enc)
data['Class'] = func(data['Class'])

# DEPOIS
display(data.Class.value_counts(dropna = False).to_frame().sort_index())

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
no-recurrence-events,201
recurrence-events,85


Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,201
1,85


In [41]:
#Renomear variável
data = data.rename(columns = {'Class':'target'})

from sklearn.model_selection import train_test_split
# Dividir base de dados em treino e teste
X = data[data.columns.difference(['target'])]
y = data['target']


# Divide a base em partições de treino (70%) e teste (30%) com dados estratificados pela variável alvo
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed, stratify = y)


print('Número de registros em X_train: ', X_train.shape)
print('Número de registros em y_train: ', y_train.shape)
print('Número de registros em X_test: ', X_test.shape)
print('Número de registros em y_test: ', y_test.shape)


del X
del y


print('Distribuição da variável resposta na partição de treino: ')
display(pd.DataFrame(y_train.value_counts(normalize=True)))
print('Distribuição da variável resposta na partição de teste: ')
display(pd.DataFrame(y_test.value_counts(normalize=True)))


X_train=X_train.join(y_train)
X_test=X_test.join(y_test)

print('Treino')
display(X_train.head(), X_train.shape)
print('Teste')
display(X_test.head(), X_test.shape)

Número de registros em X_train:  (228, 52)
Número de registros em y_train:  (228,)
Número de registros em X_test:  (58, 52)
Número de registros em y_test:  (58,)
Distribuição da variável resposta na partição de treino: 


Unnamed: 0_level_0,proportion
target,Unnamed: 1_level_1
0,0.701754
1,0.298246


Distribuição da variável resposta na partição de teste: 


Unnamed: 0_level_0,proportion
target,Unnamed: 1_level_1
0,0.706897
1,0.293103


Treino


Unnamed: 0,age,breast,breast-quad,deg-malig,dummy_age_20-29,dummy_age_30-39,dummy_age_40-49,dummy_age_50-59,dummy_age_60-69,dummy_age_70-79,dummy_breast-quad_?,dummy_breast-quad_central,dummy_breast-quad_left_low,dummy_breast-quad_left_up,dummy_breast-quad_right_low,dummy_breast-quad_right_up,dummy_breast_left,dummy_breast_right,dummy_deg-malig_1,dummy_deg-malig_2,dummy_deg-malig_3,dummy_inv-nodes_0-2,dummy_inv-nodes_12-14,dummy_inv-nodes_15-17,dummy_inv-nodes_24-26,dummy_inv-nodes_3-5,dummy_inv-nodes_6-8,dummy_inv-nodes_9-11,dummy_irradiat_no,dummy_irradiat_yes,dummy_menopause_ge40,dummy_menopause_lt40,dummy_menopause_premeno,dummy_node-caps_?,dummy_node-caps_no,dummy_node-caps_yes,dummy_tumor-size_0-4,dummy_tumor-size_10-14,dummy_tumor-size_15-19,dummy_tumor-size_20-24,dummy_tumor-size_25-29,dummy_tumor-size_30-34,dummy_tumor-size_35-39,dummy_tumor-size_40-44,dummy_tumor-size_45-49,dummy_tumor-size_5-9,dummy_tumor-size_50-54,inv-nodes,irradiat,menopause,node-caps,tumor-size,target
215,40-49,right,left_up,2,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0-2,no,ge40,no,20-24,1
41,60-69,left,left_low,2,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0-2,no,ge40,no,25-29,0
24,50-59,right,left_up,2,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0-2,no,premeno,no,35-39,0
161,40-49,right,left_low,2,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,3-5,no,premeno,yes,30-34,0
144,60-69,left,central,3,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,6-8,no,ge40,yes,45-49,0


(228, 53)

Teste


Unnamed: 0,age,breast,breast-quad,deg-malig,dummy_age_20-29,dummy_age_30-39,dummy_age_40-49,dummy_age_50-59,dummy_age_60-69,dummy_age_70-79,dummy_breast-quad_?,dummy_breast-quad_central,dummy_breast-quad_left_low,dummy_breast-quad_left_up,dummy_breast-quad_right_low,dummy_breast-quad_right_up,dummy_breast_left,dummy_breast_right,dummy_deg-malig_1,dummy_deg-malig_2,dummy_deg-malig_3,dummy_inv-nodes_0-2,dummy_inv-nodes_12-14,dummy_inv-nodes_15-17,dummy_inv-nodes_24-26,dummy_inv-nodes_3-5,dummy_inv-nodes_6-8,dummy_inv-nodes_9-11,dummy_irradiat_no,dummy_irradiat_yes,dummy_menopause_ge40,dummy_menopause_lt40,dummy_menopause_premeno,dummy_node-caps_?,dummy_node-caps_no,dummy_node-caps_yes,dummy_tumor-size_0-4,dummy_tumor-size_10-14,dummy_tumor-size_15-19,dummy_tumor-size_20-24,dummy_tumor-size_25-29,dummy_tumor-size_30-34,dummy_tumor-size_35-39,dummy_tumor-size_40-44,dummy_tumor-size_45-49,dummy_tumor-size_5-9,dummy_tumor-size_50-54,inv-nodes,irradiat,menopause,node-caps,tumor-size,target
74,30-39,left,right_low,2,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0-2,no,premeno,no,5-9,0
134,60-69,left,left_low,2,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0-2,yes,ge40,no,30-34,0
54,70-79,right,left_up,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0-2,no,ge40,no,40-44,0
50,50-59,left,left_low,2,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0-2,no,lt40,no,15-19,0
57,50-59,right,right_up,2,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0-2,no,ge40,no,5-9,0


(58, 53)

In [42]:

# Exportar o CSV com os parâmetros especificados
X_train.to_csv(f'../data/train.csv.gz', sep=';', index=None,
        compression='gzip',
)
X_test.to_csv(f'../data/test.csv.gz', sep=';', index=None,
        compression='gzip',
)

In [1]:
#!pip install black

Collecting black
  Downloading black-24.4.2-cp312-cp312-win_amd64.whl.metadata (77 kB)
     ---------------------------------------- 0.0/77.1 kB ? eta -:--:--
     ---------- --------------------------- 20.5/77.1 kB 682.7 kB/s eta 0:00:01
     ---------------------------------------- 77.1/77.1 kB 1.1 MB/s eta 0:00:00
Collecting click>=8.0.0 (from black)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting mypy-extensions>=0.4.3 (from black)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)
Collecting pathspec>=0.9.0 (from black)
  Downloading pathspec-0.12.1-py3-none-any.whl.metadata (21 kB)
Downloading black-24.4.2-cp312-cp312-win_amd64.whl (1.4 MB)
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   -------------------------------------- - 1.4/1.4 MB 42.6 MB/s eta 0:00:01
   ---------------------------------------- 1.4/1.4 MB 30.2 MB/s eta 0:00:00
Downloading click-8.1.7-py3-none-any.whl (97 kB)
   --------------------------