In [42]:
import pandas as pd
import numpy as np

## Leitura do arquivo

In [2]:
# First attempt at loading the CSV, relying on read_csv's defaults
# (comma separator, first line of the file taken as the header row).
data = pd.read_csv('breast_cancer_data.csv')
data.head()

Unnamed: 0,1000025,5.0,1.0,1,1.1,2,1.2,3.0,1.0.1,1.3,benign,Dr. Doe
0,1002945,5.0,4.0,4,5,7,10,3.0,2.0,1,benign,Dr. Smith
1,1015425,3.0,1.0,1,1,2,2,3.0,1.0,1,benign,Dr. Lee
2,1016277,6.0,8.0,8,1,3,4,3.0,7.0,1,benign,Dr. Smith
3,1017023,4.0,1.0,1,3,2,1,3.0,1.0,1,benign,Dr. Wong
4,1017122,8.0,10.0,10,8,7,10,9.0,7.0,1,malignant,Dr. Smith


In [4]:
# Show the DataFrame's shape as (rows, columns)
print('formato: ', data.shape, '\n')

formato:  (698, 12) 



### Não temos nomes de colunas no arquivo, então precisamos fornecer esses nomes e passá-los como parâmetro (`names`) na função `read_csv`

In [5]:
# The file has no header line, so the column names are supplied explicitly
# through `names`; header=None states that no row of the file holds labels.
colunas = [
    'id',
    'clump_thickness',
    'cell_size_uniformity',
    'cell_shape_uniformity',
    'marginal_adhesion',
    'single_ep_cell_size',
    'bare_nuclei',
    'bland_chromatin',
    'normal_nucleoli',
    'mitoses',
    'class',
    'doctor_name',
]

data = pd.read_csv('breast_cancer_data.csv', names=colunas, header=None)
data.head()

Unnamed: 0,id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class,doctor_name
0,1000025,5.0,1.0,1,1,2,1,3.0,1.0,1,benign,Dr. Doe
1,1002945,5.0,4.0,4,5,7,10,3.0,2.0,1,benign,Dr. Smith
2,1015425,3.0,1.0,1,1,2,2,3.0,1.0,1,benign,Dr. Lee
3,1016277,6.0,8.0,8,1,3,4,3.0,7.0,1,benign,Dr. Smith
4,1017023,4.0,1.0,1,3,2,1,3.0,1.0,1,benign,Dr. Wong


In [6]:
# Count the columns on the DataFrame itself rather than on the `colunas` list,
# so the figure reflects what was actually loaded and does not change if
# `colunas` is rebound to a different list elsewhere in the notebook.
print("Quantidade de colunas: ",len(data.columns),'\n')

# Display the column labels
data.columns

Quantidade de colunas:  12 



Index(['id', 'clump_thickness', 'cell_size_uniformity',
       'cell_shape_uniformity', 'marginal_adhesion', 'single_ep_cell_size',
       'bare_nuclei', 'bland_chromatin', 'normal_nucleoli', 'mitoses', 'class',
       'doctor_name'],
      dtype='object')

### Vamos entender nossos dados

In [None]:
# Data type of each column
data.dtypes

In [None]:
# Distinct dtypes present in the DataFrame (duplicates collapsed through a set)
list(set(data.dtypes))

In [None]:
# Correlation of every numeric feature with clump_thickness.
numeric_cols = data.select_dtypes(include=['float64', 'int64'])

# Drop by label instead of slicing by position ([2:]): this removes the `id`
# identifier and clump_thickness' self-correlation regardless of column order.
clump_corr = (
    numeric_cols.corr()['clump_thickness']
    .drop(labels=['id', 'clump_thickness'], errors='ignore')
)

# Keep only correlations whose absolute value exceeds 0.5, strongest first.
data_num = clump_corr[clump_corr.abs() > 0.5].sort_values(ascending=False)
print("Existem {} variáveis correlacionadas com clump_thickness:\n{}".format(len(data_num), data_num))

In [None]:
# Summary statistics produced by describe()
data.describe()

In [None]:
# Concise overview of the DataFrame (columns, non-null counts, dtypes)
data.info()

In [None]:
# Number of records for each (class, doctor_name) combination
data.groupby(['class', 'doctor_name']).size()

In [10]:
# Absolute frequency of each diagnosis class
# (for the doctors, use data['doctor_name'].value_counts() instead)
class_counts = data['class'].value_counts()
print(class_counts, '\n')

# Passing normalize=True returns relative frequencies instead of raw counts
class_shares = data['class'].value_counts(normalize=True)
print('Dados normalizados: \n', class_shares)

benign       458
malignant    241
Name: class, dtype: int64 

Dados normalizados: 
 benign       0.655222
malignant    0.344778
Name: class, dtype: float64


In [11]:
# Sort by a single column, largest values first, and show the top rows
data.sort_values('cell_size_uniformity', ascending=False).head()

Unnamed: 0,id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class,doctor_name
106,1170419,10.0,10.0,10,8,2,10,4.0,1.0,1,malignant,Dr. Smith
44,1103608,10.0,10.0,10,4,8,1,8.0,10.0,1,malignant,Dr. Doe
669,1350423,5.0,10.0,10,8,5,5,7.0,10.0,1,malignant,Dr. Smith
104,1168736,10.0,10.0,10,10,10,1,8.0,8.0,8,malignant,Dr. Wong
266,1198641,10.0,10.0,6,3,3,10,4.0,3.0,2,malignant,Dr. Smith


In [14]:
# Sort by several columns at once: cell_size_uniformity descending,
# with ties ordered by bare_nuclei ascending
(
    data
    .sort_values(by=['cell_size_uniformity', 'bare_nuclei'], ascending=[False, True])
    .head()
)

Unnamed: 0,id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class,doctor_name
36,1080185,10.0,10.0,10,8,6,1,8.0,9.0,1,malignant,Dr. Doe
44,1103608,10.0,10.0,10,4,8,1,8.0,10.0,1,malignant,Dr. Doe
104,1168736,10.0,10.0,10,10,10,1,8.0,8.0,8,malignant,Dr. Wong
177,1201936,5.0,10.0,10,3,8,1,5.0,10.0,3,malignant,Dr. Lee
436,1295186,10.0,10.0,10,1,6,1,2.0,8.0,1,malignant,Dr. Doe


In [18]:
# Mean of the clump_thickness column
data['clump_thickness'].mean()

4.416905444126074

In [23]:
# Mean cell_size_uniformity for the benign and the malignant records,
# selecting rows and column in a single .loc call
print('Tamanho - benigno: ', data.loc[data['class'] == 'benign', 'cell_size_uniformity'].mean(), '\n')
print('Tamanho - maligno: ', data.loc[data['class'] == 'malignant', 'cell_size_uniformity'].mean(), '\n')

Tamanho - benigno:  1.3260393873085339 

Tamanho - maligno:  6.572614107883817 



In [29]:
# Largest mitoses value among the malignant records attributed to Dr. Wong
mask_wong_malignant = (data['class'] == 'malignant') & (data['doctor_name'] == 'Dr. Wong')
data.loc[mask_wong_malignant, 'mitoses'].max()

10

In [39]:
# Look at the first 10 rows of columns 2 through 5 (label-based slicing)
data.loc[0:9,'cell_size_uniformity':'single_ep_cell_size']
# Positional equivalent: data.iloc[0:10, 2:6] -- iloc excludes the stop position, whereas loc includes the end label

Unnamed: 0,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size
0,1.0,1,1,2
1,4.0,4,5,7
2,1.0,1,1,2
3,8.0,8,1,3
4,1.0,1,3,2
5,10.0,10,8,7
6,,1,1,2
7,1.0,2,1,2
8,1.0,1,1,2
9,2.0,1,1,2


In [40]:
# Last row of the DataFrame
data.tail(1)

Unnamed: 0,id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class,doctor_name
698,897471,4.0,8.0,8,5,4,5,10.0,4.0,1,malignant,Dr. Wong


### Lidando com valores faltantes

In [44]:
# Count the missing values in each column
data.isna().sum()

id                       0
clump_thickness          1
cell_size_uniformity     1
cell_shape_uniformity    0
marginal_adhesion        0
single_ep_cell_size      0
bare_nuclei              2
bland_chromatin          4
normal_nucleoli          1
mitoses                  0
class                    0
doctor_name              0
dtype: int64

In [45]:
# Discard every row that has at least one missing value
# (dropna's defaults are axis=0, how='any'), then report how many rows remain
data = data.dropna()
data.shape[0]

690

### Lidando com valores duplicados

In [46]:
# Rows that are exact repeats of an earlier row
data.loc[data.duplicated()]

Unnamed: 0,id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class,doctor_name
258,1198641,3.0,1.0,1,1,2,1,3.0,1.0,1,benign,Dr. Lee


In [47]:
# Remove repeated rows, keeping the first occurrence of each (the default)
data = data.drop_duplicates(keep='first')

In [48]:
# Same check again: lists any rows still flagged as duplicates
data.loc[data.duplicated()]

Unnamed: 0,id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_ep_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class,doctor_name


## Aplicando funções 

In [49]:
# Find the maximum value of each column
data.apply(np.max)

id                        13454352
clump_thickness                 10
cell_size_uniformity            10
cell_shape_uniformity           10
marginal_adhesion               10
single_ep_cell_size             10
bare_nuclei                      ?
bland_chromatin                 10
normal_nucleoli                 10
mitoses                         10
class                    malignant
doctor_name               Dr. Wong
dtype: object

In [57]:
# Distinct values found in the bare_nuclei column
data['bare_nuclei'].unique()

array(['1', '10', '2', '4', '3', '9', '7', 6, '5', '8', '6'], dtype=object)

In [61]:
# Swap the '?' entries of bare_nuclei (presumably placeholders for missing
# values) for '6', then convert the whole column to 64-bit integers.
# NOTE(review): the replacement value 6 is hard-coded -- confirm how it was chosen.
data['bare_nuclei'] = data['bare_nuclei'].replace({'?': '6'}).astype('int64')

In [62]:
# Run the per-column maximum again to check the result
data.apply(np.max)

id                        13454352
clump_thickness                 10
cell_size_uniformity            10
cell_shape_uniformity           10
marginal_adhesion               10
single_ep_cell_size             10
bare_nuclei                     10
bland_chromatin                 10
normal_nucleoli                 10
mitoses                         10
class                    malignant
doctor_name               Dr. Wong
dtype: object

### Funções de agregação

In [63]:
# Mean, minimum and maximum of selected features within each diagnosis class.
# NOTE: this rebinds `colunas`, which earlier held the full list of column names.
colunas = ['cell_size_uniformity','cell_shape_uniformity','mitoses']

# Aggregations are requested by name: recent pandas versions emit a
# FutureWarning when NumPy callables (np.mean, np.min, np.max) are passed to
# agg, and older versions label the resulting columns "amin"/"amax".
data.groupby(['class'])[colunas].agg(['mean', 'min', 'max'])

Unnamed: 0_level_0,cell_size_uniformity,cell_size_uniformity,cell_size_uniformity,cell_shape_uniformity,cell_shape_uniformity,cell_shape_uniformity,mitoses,mitoses,mitoses
Unnamed: 0_level_1,mean,amin,amax,mean,amin,amax,mean,amin,amax
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
benign,1.330377,1.0,9.0,1.450111,1,8,1.064302,1,8
malignant,6.605042,1.0,10.0,6.592437,1,10,2.592437,1,10


### Filtrando dados

In [None]:
# Number of records per patient id, most frequent ids first
repeat_patients = (
    data
    .groupby('id')
    .size()
    .sort_values(ascending=False)
)
repeat_patients

In [None]:
# Ids that occur more than twice, as a DataFrame with `id` as a regular column
filtered_patients = repeat_patients.loc[repeat_patients > 2].to_frame().reset_index()

# Keep only the records whose id is not among those ids
filtered_data = data.loc[~data['id'].isin(filtered_patients['id'])]

In [None]:
# Display the filtered DataFrame
filtered_data