## Importação dos pacotes

In [1]:
# importar pacotes necessários
import numpy as np
import pandas as pd

In [2]:
# extra display settings
# Use the fully-qualified option key: the short alias 'precision' was
# deprecated and is rejected by recent pandas releases.
pd.set_option('display.precision', 3)
pd.set_option('display.max_columns', 100)

## Carga dos dados

In [3]:
# load the training data file; the '?' placeholder is parsed as a missing
# value at read time, so no in-place replace pass is needed afterwards
data = pd.read_csv('heart-train.csv', index_col='Paciente', na_values='?')

# force the measurement columns to float (float so that NaN can be stored);
# a single astype call replaces the two loops that did the same conversion
float_cols = ['PAR', 'CS', 'FCM', 'DST',
              'ASJ', 'ECG', 'AIE', 'IST', 'NVP', 'Talassemia']
data = data.astype({name: float for name in float_cols})

# show a few sample records
data.head()

Unnamed: 0_level_0,Hospital,Idade,Sexo,TDP,PAR,CS,ASJ,ECG,FCM,AIE,DST,IST,NVP,Talassemia,Diagnóstico
Paciente,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
262,C,60,F,1,150.0,240.0,0.0,0.0,171.0,0.0,0.9,1.0,0.0,3.0,A
520,H,54,M,4,125.0,224.0,0.0,0.0,122.0,0.0,2.0,2.0,,,P
264,C,61,M,4,138.0,166.0,0.0,2.0,125.0,1.0,3.6,2.0,1.0,3.0,P
674,S,60,M,3,115.0,0.0,,0.0,143.0,0.0,2.4,1.0,,,P
102,C,57,F,4,128.0,303.0,0.0,2.0,159.0,0.0,0.0,1.0,1.0,3.0,A


In [4]:
# how many rows and columns are there?
data.shape

(613, 15)

## Análise dos dados

In [5]:
# what are the columns and their data types (with non-null counts)?
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 613 entries, 262 to 173
Data columns (total 15 columns):
Hospital       613 non-null object
Idade          613 non-null int64
Sexo           613 non-null object
TDP            613 non-null int64
PAR            571 non-null float64
CS             593 non-null float64
ASJ            553 non-null float64
ECG            611 non-null float64
FCM            574 non-null float64
AIE            574 non-null float64
DST            569 non-null float64
IST            398 non-null float64
NVP            207 non-null float64
Talassemia     293 non-null float64
Diagnóstico    613 non-null object
dtypes: float64(10), int64(2), object(3)
memory usage: 76.6+ KB


In [6]:
# which columns contain missing values, and how many in each?
# count nulls once per column, then keep only the columns with at least one
data.isnull().sum().loc[lambda counts: counts > 0]

PAR            42
CS             20
ASJ            60
ECG             2
FCM            39
AIE            39
DST            44
IST           215
NVP           406
Talassemia    320
dtype: int64

In [7]:
# missing-value count for every column, including those with none
data.isna().sum()

Hospital         0
Idade            0
Sexo             0
TDP              0
PAR             42
CS              20
ASJ             60
ECG              2
FCM             39
AIE             39
DST             44
IST            215
NVP            406
Talassemia     320
Diagnóstico      0
dtype: int64

In [8]:
# statistical summary of the numeric features (transposed: one row per feature)
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Idade,613.0,53.723,9.473,28.0,47.0,54.0,60.0,77.0
TDP,613.0,3.269,0.903,1.0,3.0,4.0,4.0,4.0
PAR,571.0,132.361,18.024,80.0,120.0,130.0,142.0,200.0
CS,593.0,199.779,112.728,0.0,173.0,223.0,267.0,603.0
ASJ,553.0,0.177,0.382,0.0,0.0,0.0,0.0,1.0
ECG,611.0,0.609,0.809,0.0,0.0,0.0,1.0,2.0
FCM,574.0,136.455,26.348,60.0,119.25,138.0,157.0,202.0
AIE,574.0,0.389,0.488,0.0,0.0,0.0,1.0,1.0
DST,569.0,0.871,1.099,-2.6,0.0,0.5,1.5,6.2
IST,398.0,1.754,0.606,1.0,1.0,2.0,2.0,3.0


In [9]:
# what are the correlations between the numeric features?
# The frame also holds text columns (Hospital, Sexo, Diagnóstico). Selecting
# the numeric columns explicitly keeps this working on pandas versions where
# corr() no longer drops non-numeric columns silently.
data.select_dtypes(include='number').corr()

Unnamed: 0,Idade,TDP,PAR,CS,ASJ,ECG,FCM,AIE,DST,IST,NVP,Talassemia
Idade,1.0,0.184,0.267,-0.104,0.198,0.211,-0.362,0.228,0.252,0.11,0.371,0.113
TDP,0.184,1.0,0.056,-0.126,0.066,0.063,-0.327,0.427,0.278,0.21,0.19,0.404
PAR,0.267,0.056,1.0,0.057,0.112,0.078,-0.113,0.149,0.163,0.046,0.094,0.065
CS,-0.104,-0.126,0.057,1.0,-0.008,0.114,0.279,-0.076,0.035,-0.049,0.014,-0.193
ASJ,0.198,0.066,0.112,-0.008,1.0,0.111,-0.019,0.024,0.038,0.046,0.17,0.101
ECG,0.211,0.063,0.078,0.114,0.111,1.0,0.057,0.007,0.145,-0.056,0.21,-0.052
FCM,-0.362,-0.327,-0.113,0.279,-0.019,0.057,1.0,-0.371,-0.12,-0.341,-0.217,-0.347
AIE,0.228,0.427,0.149,-0.076,0.024,0.007,-0.371,1.0,0.381,0.32,0.107,0.306
DST,0.252,0.278,0.163,0.035,0.038,0.145,-0.12,0.381,1.0,0.432,0.268,0.276
IST,0.11,0.21,0.046,-0.049,0.046,-0.056,-0.341,0.32,0.432,1.0,0.099,0.296


In [10]:
# show variable pairs whose correlation is stronger than 0.7 in absolute
# value (positive or negative)
# NOTE(review): the original comment said 0.6 while the code used 0.7; the
# comment now matches the code — confirm which threshold was intended.
corr = data.select_dtypes(include='number').corr()
# mask out the self-correlations (== 1) and everything at or below the
# threshold, then drop the rows/columns left completely empty
corr.where((corr != 1) & (corr.abs() > 0.7)).dropna(how='all', axis=1).dropna(how='all', axis=0)

In [11]:
# mean of each numeric feature per diagnosis class
# numeric_only=True skips the text columns (Hospital, Sexo), which recent
# pandas versions refuse to average instead of dropping them silently
data.groupby('Diagnóstico').mean(numeric_only=True)

Unnamed: 0_level_0,Idade,TDP,PAR,CS,ASJ,ECG,FCM,AIE,DST,IST,NVP,Talassemia
Diagnóstico,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
A,50.806,2.798,129.73,231.532,0.115,0.555,148.413,0.131,0.411,1.464,0.294,3.992
P,55.914,3.623,134.439,176.636,0.23,0.649,127.096,0.59,1.234,1.908,1.194,6.0
