In [1]:
# Importing the libraries
import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Load the dataset (comma-separated; pandas' default separator and row handling apply).
data = pd.read_csv('heart_data.csv')
data.head()

Unnamed: 0,index,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [4]:
%%timeit
# Transformando cm e m

def cm_to_m(value):
    return value/100

data['height'] = data['height'].apply(cm_to_m)
data.head()

19.4 ms ± 1.06 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
%%timeit
data['height_2'] = data['height']/100
data.head()

359 µs ± 38.1 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [56]:
# Criar a coluna IMC a parta da divida da coluna de peso pela altura ao quadrada
# Com vetorização

%timeit data['imc'] =  data['weight']/(data['height']**2)
data.head()

570 µs ± 58.2 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


Unnamed: 0,index,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,imc
0,0,0,18393,2,1.68,62.0,110,80,1,1,0,0,1,0,21.96712
1,1,1,20228,1,1.56,85.0,140,90,3,1,0,0,1,1,34.927679
2,2,2,18857,1,1.65,64.0,130,70,3,1,0,0,0,1,23.507805
3,3,3,17623,2,1.69,82.0,150,100,1,1,0,0,1,1,28.710479
4,4,4,17474,1,1.56,56.0,100,60,1,1,0,0,0,0,23.011177


### DICA

O parâmetro axis na função apply do pandas especifica ao longo de qual eixo a função deve ser aplicada. No contexto do método apply, axis=1 indica que a função será aplicada a cada linha do DataFrame.<br>

Quando axis=0, a função é aplicada às colunas.<br>
Quando axis=1, a função é aplicada às linhas.

In [6]:
%%timeit
# Sem vetorização

def calc_imc(row):
    """Return the BMI for one row: its 'weight' divided by its 'height' squared."""
    weight = row['weight']
    height = row['height']
    return weight / (height ** 2)

# Keep the whole Series returned by apply: truncating it (e.g. with .head()) before the
# assignment would leave every row after the first five as NaN once pandas aligns on the index.
data['imc2'] = data.apply(calc_imc, axis=1)
data.head()



806 ms ± 31.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### DICA 

O método iterrows() no pandas é um iterador que permite iterar sobre as linhas de um DataFrame, retornando um par contendo o índice da linha e uma Series representando os dados daquela linha.<br>

No entanto, é importante notar que iterrows() não é a abordagem mais eficiente para percorrer um DataFrame, especialmente para conjuntos de dados grandes, porque envolve a criação de objetos Series para cada linha.

In [60]:
# Sem vetorização parte 2

imc_list = []

%timeit
for i, row in data.iterrows():
    imc =  row['weight']/(row['height']**2)
    imc_list.append(imc)

data['imc3'] = imc_list
data.head()

Unnamed: 0,index,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,imc,imc2,imc3
0,0,0,18393,2,1.68,62.0,110,80,1,1,0,0,1,0,21.96712,21.96712,21.96712
1,1,1,20228,1,1.56,85.0,140,90,3,1,0,0,1,1,34.927679,34.927679,34.927679
2,2,2,18857,1,1.65,64.0,130,70,3,1,0,0,0,1,23.507805,23.507805,23.507805
3,3,3,17623,2,1.69,82.0,150,100,1,1,0,0,1,1,28.710479,28.710479,28.710479
4,4,4,17474,1,1.56,56.0,100,60,1,1,0,0,0,0,23.011177,23.011177,23.011177


Agora vamos colocar uma condicional, ou seja, classificar entre ['Baixo Peso', 'Normal', 'Sobrepeso', 'Obesidade'] de acordo com o valor do IMC.

<img src= 'imc.png'>

In [25]:
# Vectorized BMI classification with np.select.
# The bands are contiguous (each lower bound equals the previous upper bound), so every
# non-missing value matches exactly one condition.
bmi = data['imc']
condicoes = [
    bmi < 18.5,
    (bmi >= 18.5) & (bmi < 25),
    (bmi >= 25) & (bmi < 30),
    bmi >= 30,
]

classificacao = ['Baixo Peso', 'Normal', 'Sobrepeso', 'Obesidade']
# Only missing (NaN) BMI values reach the default; use a string label so the column holds
# text only (passing a function such as np.mean would store the function object itself).
data['class_imc'] = np.select(condicoes, classificacao, default='Indefinido')
data.head()

Unnamed: 0,index,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,imc2,imc,imc3,class_imc
0,0,0,18393,2,1.68,62.0,110,80,1,1,0,0,1,0,21.96712,21.96712,21.96712,Normal
1,1,1,20228,1,1.56,85.0,140,90,3,1,0,0,1,1,34.927679,34.927679,34.927679,Obesidade
2,2,2,18857,1,1.65,64.0,130,70,3,1,0,0,0,1,23.507805,23.507805,23.507805,Normal
3,3,3,17623,2,1.69,82.0,150,100,1,1,0,0,1,1,28.710479,28.710479,28.710479,Sobrepeso
4,4,4,17474,1,1.56,56.0,100,60,1,1,0,0,0,0,23.011177,23.011177,23.011177,Normal


In [29]:
# Agora sem o uso do numpy

def class_imc(df):
    """Classify one row by its BMI (`imc`) value.

    Args:
        df: A single row (despite the name): any mapping or Series with an 'imc' entry.

    Returns:
        'Baixo Peso' (imc < 18.5), 'Normal' (18.5 <= imc < 25),
        'Sobrepeso' (25 <= imc < 30), 'Obesidade' (imc >= 30),
        or 'Indefinido' when the BMI is missing (NaN).
    """
    imc = df['imc']
    # NaN fails every comparison, so without this guard it would fall through to the
    # last label and be reported as 'Obesidade'.
    if pd.isna(imc):
        return 'Indefinido'
    # Contiguous bands: each check only needs the upper bound because the lower bound
    # was excluded by the previous branch.
    if imc < 18.5:
        return 'Baixo Peso'
    if imc < 25:
        return 'Normal'
    if imc < 30:
        return 'Sobrepeso'
    return 'Obesidade'

# Label every row with the pure-Python classifier (row-wise apply, not vectorized).
row_labels = data.apply(class_imc, axis=1)
data['class_imc2'] = row_labels
data.head()

Unnamed: 0,index,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,imc2,imc,imc3,class_imc,class_imc2
0,0,0,18393,2,1.68,62.0,110,80,1,1,0,0,1,0,21.96712,21.96712,21.96712,Normal,Normal
1,1,1,20228,1,1.56,85.0,140,90,3,1,0,0,1,1,34.927679,34.927679,34.927679,Obesidade,Obesidade
2,2,2,18857,1,1.65,64.0,130,70,3,1,0,0,0,1,23.507805,23.507805,23.507805,Normal,Normal
3,3,3,17623,2,1.69,82.0,150,100,1,1,0,0,1,1,28.710479,28.710479,28.710479,Sobrepeso,Sobrepeso
4,4,4,17474,1,1.56,56.0,100,60,1,1,0,0,0,0,23.011177,23.011177,23.011177,Normal,Normal


### DICA

Utilizar np.where() ou np.select() no lugar de IF-ELIF-ELSE.