In [1]:
import pandas as pd
import numpy as np

In [2]:
# carregar dados das 4 fontes distintas
files = ['cleveland', 'hungarian', 'switzerland', 'va']

data = None
first = True

for file in files:
    file_name = 'data/processed.%s.data' % file
    print("Lendo [%s]..." % file_name)
    pdata = pd.read_csv(file_name, sep=',', header=None)
    pdata['Hospital'] = file[0].upper()
    
    #print(pdata.shape)
    if first:
        data = pdata
        first = False
    else:
        data = data.append(pdata, ignore_index=True)
    #print(data.shape)

Lendo [data/processed.cleveland.data]...
Lendo [data/processed.hungarian.data]...
Lendo [data/processed.switzerland.data]...
Lendo [data/processed.va.data]...


In [3]:
# embaralhar dados
data = data.sample(frac=1)

# mostrar quantidade de linhas e colunas
print(data.shape)

# mostrar exemplos de dados
data.head()

(920, 15)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,Hospital
262,60.0,0.0,1.0,150,240,0,0,171,0,0.9,1,0.0,3.0,0,C
520,54.0,1.0,4.0,125,224,0,0,122,0,2.0,2,?,?,1,H
264,61.0,1.0,4.0,138,166,0,2,125,1,3.6,2,1.0,3.0,4,C
674,60.0,1.0,3.0,115,0,?,0,143,0,2.4,1,?,?,1,S
102,57.0,0.0,4.0,128,303,0,2,159,0,0.0,1,1.0,3.0,0,C


In [4]:
# renomear coluna índice
data.index.names = ['Paciente']

In [5]:
# modificar nomes das colunas
cols = {
    0: 'Idade', # idade (em anos)
    1: 'Sexo', # sexo (0: mulher, 1: homem)
    2: 'TDP', # tipo de dor no peito (1: angina típica, 2: angina atípica, 3: dor não anginal, 4: assintomática)
    3: 'PAR', # pressão arterial em repouso ao chegar no hospital (em mmHg)
    4: 'CS', # colesterol sérico (em mg/dL)
    5: 'ASJ', # açúcar no sangue em jejum > 120 mg/dL (0: não, 1: sim)
    6: 'ECG', # resultados eletrocardiográficos em repouso (0: normal, 1: anormalidade da onda ST-T, 2: hipertrofia ventricular esquerda)
    7: 'FCM', # frequência cardíaca máxima alcançada
    8: 'AIE', # angina induzida por exercício (0: não, 1: sim)
    9: 'DST', # depressão do segmento ST induzida por exercício em relação ao repouso
    10: 'IST', # inclinação do segmento ST do pico do exercício (1: subida, 2: plana, 3: descida)
    11: 'NVP', # número de vasos principais (0-3) coloridos por fluoroscopia
    12: 'Talassemia', # talassemia: 3 = normal; 6 = defeito consertado; 7 = defeito reversível
    13: 'Diagnóstico' # diagnóstico de doença cardíaca / status da doença angiográfica (0: ausência, 1-4: presença)
}
data.rename(columns=cols, inplace=True)

In [6]:
# converter para inteiro
for col in ['Idade', 'TDP']:
    data[col] = data[col].astype(int)

In [7]:
# converter para inteiro
for col in ['NVP', 'Talassemia']:
    data[col] = data[col].apply(lambda x: x.replace('.0', ''))

In [8]:
# M: masculino, F: feminino
col = 'Sexo'
data[col] = data[col].apply(lambda x: 'M' if x > 0 else 'F')

In [9]:
# A: ausente, P: presente
col = 'Diagnóstico'
data[col] = data[col].apply(lambda x: 'P' if x > 0 else 'A')

In [10]:
# modificar a ordem de aparição das colunas
data = data[['Hospital', 'Idade', 'Sexo', 'TDP', 'PAR', 'CS', 'ASJ', 'ECG',
     'FCM', 'AIE', 'DST', 'IST', 'NVP', 'Talassemia', 'Diagnóstico']]

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 920 entries, 262 to 392
Data columns (total 15 columns):
Hospital       920 non-null object
Idade          920 non-null int64
Sexo           920 non-null object
TDP            920 non-null int64
PAR            920 non-null object
CS             920 non-null object
ASJ            920 non-null object
ECG            920 non-null object
FCM            920 non-null object
AIE            920 non-null object
DST            920 non-null object
IST            920 non-null object
NVP            920 non-null object
Talassemia     920 non-null object
Diagnóstico    920 non-null object
dtypes: int64(2), object(13)
memory usage: 115.0+ KB


In [12]:
data.describe()

Unnamed: 0,Idade,TDP
count,920.0,920.0
mean,53.51087,3.25
std,9.424685,0.930969
min,28.0,1.0
25%,47.0,3.0
50%,54.0,4.0
75%,60.0,4.0
max,77.0,4.0


In [13]:
data.describe(include=['O'])

Unnamed: 0,Hospital,Sexo,PAR,CS,ASJ,ECG,FCM,AIE,DST,IST,NVP,Talassemia,Diagnóstico
count,920,920,920,920,920,920,920,920,920.0,920,920,920,920
unique,4,2,103,336,5,7,198,5,82.0,7,5,4,2
top,C,M,120,0,0,0,?,0,0.0,?,?,?,P
freq,303,726,94,123,434,320,55,324,288.0,309,611,486,509


In [14]:
data.head(50)

Unnamed: 0_level_0,Hospital,Idade,Sexo,TDP,PAR,CS,ASJ,ECG,FCM,AIE,DST,IST,NVP,Talassemia,Diagnóstico
Paciente,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
262,C,60,F,1,150,240,0,0,171,0,0.9,1,0,3,A
520,H,54,M,4,125,224,0,0,122,0,2,2,?,?,P
264,C,61,M,4,138,166,0,2,125,1,3.6,2,1,3,P
674,S,60,M,3,115,0,?,0,143,0,2.4,1,?,?,P
102,C,57,F,4,128,303,0,2,159,0,0,1,1,3,A
315,H,35,F,1,120,160,0,1,185,0,0,?,?,?,A
471,H,56,M,3,130,?,0,0,114,0,0,?,?,?,A
914,V,46,M,4,134,310,0,0,126,0,0,?,?,3,P
31,C,60,M,4,117,230,1,0,160,1,1.4,1,2,7,P
90,C,66,M,4,120,302,0,2,151,0,0.4,2,0,3,A


In [15]:
# dividir os dados de treino e teste

divisao = int(data.shape[0] * 2 / 3)

train = data[:divisao]
test = data[divisao:]

print(train.shape, test.shape)

(613, 15) (307, 15)


In [16]:
train.head()

Unnamed: 0_level_0,Hospital,Idade,Sexo,TDP,PAR,CS,ASJ,ECG,FCM,AIE,DST,IST,NVP,Talassemia,Diagnóstico
Paciente,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
262,C,60,F,1,150,240,0,0,171,0,0.9,1,0,3,A
520,H,54,M,4,125,224,0,0,122,0,2.0,2,?,?,P
264,C,61,M,4,138,166,0,2,125,1,3.6,2,1,3,P
674,S,60,M,3,115,0,?,0,143,0,2.4,1,?,?,P
102,C,57,F,4,128,303,0,2,159,0,0.0,1,1,3,A


In [17]:
test.head()

Unnamed: 0_level_0,Hospital,Idade,Sexo,TDP,PAR,CS,ASJ,ECG,FCM,AIE,DST,IST,NVP,Talassemia,Diagnóstico
Paciente,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
25,C,50,F,3,120,219,0,0,158,0,1.6,2,0,3,A
249,C,62,M,2,128,208,1,2,140,0,0.0,1,0,3,A
166,C,52,M,3,138,223,0,0,169,0,0.0,1,?,3,A
87,C,53,F,3,128,216,0,2,115,0,0.0,1,0,?,A
529,H,38,M,4,110,?,0,0,150,1,1.0,2,?,?,P


In [18]:
# parâmetros
file_prefix = 'heart'
target_column = 'Diagnóstico'

In [19]:
example = test[[target_column]]
#example[target_column] = example.index.map(lambda x: x % 5)
example[target_column] = example.index.map(lambda x: 'P' if (x % 3 == 0) else 'A')
example.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,Diagnóstico
Paciente,Unnamed: 1_level_1
25,A
249,P
166,A
87,P
529,A


In [20]:
example.describe()

Unnamed: 0,Diagnóstico
count,307
unique,2
top,A
freq,206


In [21]:
train.to_csv(file_prefix + '-train.csv')

test.drop([target_column], axis=1).to_csv(file_prefix + '-test.csv')
test[[target_column]].to_csv(file_prefix + '-solution.csv')

example.to_csv(file_prefix + '-example.csv')

In [22]:
!head heart-*.csv

==> heart-example.csv <==
Paciente,Diagnóstico
25,A
249,P
166,A
87,P
529,A
183,P
55,A
163,A
616,A

==> heart-solution.csv <==
Paciente,Diagnóstico
25,A
249,A
166,A
87,A
529,P
183,A
55,P
163,A
616,P

==> heart-test.csv <==
Paciente,Hospital,Idade,Sexo,TDP,PAR,CS,ASJ,ECG,FCM,AIE,DST,IST,NVP,Talassemia
25,C,50,F,3,120.0,219.0,0.0,0.0,158.0,0.0,1.6,2.0,0,3
249,C,62,M,2,128.0,208.0,1.0,2.0,140.0,0.0,0.0,1.0,0,3
166,C,52,M,3,138.0,223.0,0.0,0.0,169.0,0.0,0.0,1.0,?,3
87,C,53,F,3,128.0,216.0,0.0,2.0,115.0,0.0,0.0,1.0,0,?
529,H,38,M,4,110,?,0,0,150,1,1.0,2,?,?
183,C,59,M,1,178.0,270.0,0.0,2.0,145.0,0.0,4.2,3.0,0,7
55,C,54,M,4,124.0,266.0,0.0,2.0,109.0,1.0,2.2,2.0,1,7
163,C,58,F,4,100.0,248.0,0.0,2.0,122.0,0.0,1.0,2.0,0,3
616,S,46,M,4,115,0,0,0,113,1,1.5,2,?,7

==> heart-train.csv <==
Paciente,Hospital,Idade,Sexo,TDP,PAR,CS,ASJ,ECG,FCM,AIE,DST,IST,NVP,Talassemia,Diagnóstico
262,C,60,F,1,150.0,240.0,0.0,0.0,171.0,0.0,0.9,1.0,0,3,A
520,H,54,M,4,125,224,0,0,12