# Feature engineering

In [None]:
# Função para calcular Z-Score dentro do grupo
def z_score_group(x):
    return (x - x.mean()) / x.std()

# --- Ideia 1 e 4: Faixas Etárias e Z-Score Proxy ---
# Criando faixas etárias fisiológicas
bins = [-1, 2, 5, 12, 20]
labels = ['Bebê (0-2)', 'Pre-Escolar (2-5)', 'Escolar (5-12)', 'Adolescente (12+)']
df['Faixa_Etaria'] = pd.cut(df['IDADE'], bins=bins, labels=labels)

# Cria o desvio padrão relativo à idade (O quanto foge da média daquele grupo)
df['IMC_Z_Score_Proxy'] = df.groupby('Faixa_Etaria')['IMC'].transform(z_score_group)
df['FC_Z_Score_Proxy'] = df.groupby('Faixa_Etaria')['FC'].transform(z_score_group)

In [8]:
import pandas as pd
import numpy as np

# Carregando o dataset
df = pd.read_csv(r'C:\Users\João Pedro\Documents\UFG\MD\estudo_caso\data\UCMF_cleaned.csv')


# --- Ideia 2: Flags de Texto (One-Hot Encoding manual) ---
# Normaliza para minúsculas para facilitar a busca
df['HDA_lower'] = df['HDA'].str.lower()

df['Sintoma_Dor'] = df['HDA_lower'].str.contains('dor').astype(int)
df['Sintoma_Cianose'] = df['HDA_lower'].str.contains('cianose').astype(int)
df['Sintoma_Dispn'] = df['HDA_lower'].str.contains('dispneia|dispnéia').astype(int)
df['Checkup_Assintomatico'] = df['HDA_lower'].str.contains('assintomático|check-up').astype(int)

# --- Ideia 3: Limpeza da PA ---
# Lista de termos que indicam pressão alta
termos_hipertensao = ['Hipertensão', 'Elevada', 'HAS']
df['PPA_Alterada'] = df['PPA'].apply(lambda x: 1 if any(t in str(x) for t in termos_hipertensao) else 0)

# Visualizando o resultado
df.head()

Unnamed: 0,Peso,Altura,IMC,IDADE,PULSOS,PPA,NORMAL X ANORMAL,B2,SOPRO,FC,MOTIVO,HDA,HDA_lower,Sintoma_Dor,Sintoma_Cianose,Sintoma_Dispn,Checkup_Assintomatico,PPA_Alterada
0,5.0,51.0,19.0,0.115068,Normais,Não Calculado,Anormal,Normal,Sistólico,112.0,palpitação/taquicardia/arritmia,palpitacao,palpitacao,0,0,0,0,0
1,3.5,50.0,14.0,0.016438,Normais,Não Calculado,Anormal,Normal,ausente,128.0,dispnéia,dispneia,dispneia,0,0,1,0,0
2,16.0,99.0,17.0,3.823288,Normais,Não Calculado,Anormal,Normal,Sistólico,88.0,check-up,assintomático,assintomático,0,0,0,1,0
3,8.1,65.0,19.0,0.484932,Normais,Não Calculado,Anormal,Normal,ausente,92.0,parecer cardiológico,assintomático,assintomático,0,0,0,1,0
4,40.0,151.0,18.0,12.427397,Normais,Não Calculado,Anormal,Normal,ausente,96.0,parecer cardiológico,dor precordial,dor precordial,1,0,0,0,0


In [9]:
# --- Tratamento da Variável MOTIVO (Novo) ---
# Normalizando texto
df['MOTIVO_lower'] = df['MOTIVO'].str.lower()

# Criando as flags que trazem informação EXTRA à HDA
df['Motivo_Cirurgia'] = df['MOTIVO_lower'].str.contains('cirurgia').astype(int)
df['Motivo_Sopro'] = df['MOTIVO_lower'].str.contains('sopro').astype(int)
df['Motivo_Congenita'] = df['MOTIVO_lower'].str.contains('congenita|congênita').astype(int)


In [10]:
df.head()

Unnamed: 0,Peso,Altura,IMC,IDADE,PULSOS,PPA,NORMAL X ANORMAL,B2,SOPRO,FC,...,HDA_lower,Sintoma_Dor,Sintoma_Cianose,Sintoma_Dispn,Checkup_Assintomatico,PPA_Alterada,MOTIVO_lower,Motivo_Cirurgia,Motivo_Sopro,Motivo_Congenita
0,5.0,51.0,19.0,0.115068,Normais,Não Calculado,Anormal,Normal,Sistólico,112.0,...,palpitacao,0,0,0,0,0,palpitação/taquicardia/arritmia,0,0,0
1,3.5,50.0,14.0,0.016438,Normais,Não Calculado,Anormal,Normal,ausente,128.0,...,dispneia,0,0,1,0,0,dispnéia,0,0,0
2,16.0,99.0,17.0,3.823288,Normais,Não Calculado,Anormal,Normal,Sistólico,88.0,...,assintomático,0,0,0,1,0,check-up,0,0,0
3,8.1,65.0,19.0,0.484932,Normais,Não Calculado,Anormal,Normal,ausente,92.0,...,assintomático,0,0,0,1,0,parecer cardiológico,0,0,0
4,40.0,151.0,18.0,12.427397,Normais,Não Calculado,Anormal,Normal,ausente,96.0,...,dor precordial,1,0,0,0,0,parecer cardiológico,0,0,0


In [5]:
df.columns

Index(['Peso', 'Altura', 'IMC', 'IDADE', 'PULSOS', 'PPA', 'NORMAL X ANORMAL',
       'B2', 'SOPRO', 'FC', 'MOTIVO', 'HDA', 'HDA_lower', 'Sintoma_Dor',
       'Sintoma_Cianose', 'Sintoma_Dispn', 'Checkup_Assintomatico',
       'PPA_Alterada', 'MOTIVO_lower', 'Motivo_Cirurgia', 'Motivo_Sopro',
       'Motivo_Congenita'],
      dtype='object')

In [6]:
correcoes = {
    'anormal': 'Anormal',
    'Normais': 'Normal',
    'Anormal': 'Anormal',
    'Normal': 'Normal'
}
df['NORMAL X ANORMAL'] = df['NORMAL X ANORMAL'].map(correcoes)

# 2. Aplicar Label Encoding (0 e 1)
# Anormal = 1 (Classe Positiva/Doente)
# Normal = 0 (Classe Negativa/Saudável)
df['Target'] = df['NORMAL X ANORMAL'].map({'Anormal': 1, 'Normal': 0})

# Verificar se sobrou algum valor nulo
print(df['Target'].value_counts())

# Agora você pode excluir a coluna de texto original
df = df.drop(columns=['NORMAL X ANORMAL'])

Target
0    6726
1    4952
Name: count, dtype: int64


In [11]:
mapa_sopro = {
    'sistólico': 'Sistolico',  # Corrige minúscula e remove acento
    'Sistólico': 'Sistolico',  # Remove acento
    'ausente': 'Ausente',      # Capitaliza
    'Não Informado': 'Ausente',# Trata nulo como ausente
    'contínuo': 'Continuo',
    'Contínuo': 'Continuo',
    'diastólico': 'Diastolico',
    'Sistolico e diastólico': 'Sistolico_Diastolico' # Caso raro combinado
}

df['SOPRO'] = df['SOPRO'].map(mapa_sopro).fillna('Ausente')

print("Valores únicos após limpeza:")
print(df['SOPRO'].value_counts())

Valores únicos após limpeza:
SOPRO
Ausente                 7257
Sistolico               4362
Continuo                  47
Diastolico                 9
Sistolico_Diastolico       3
Name: count, dtype: int64


In [8]:
cols_to_drop = [
    'Peso', 'Altura', 'IMC', 'FC', 'PULSOS', 'PPA', 'MOTIVO', 'HDA', 'Faixa_Etaria','HDA_lower', 'MOTIVO_lower'
]

df_final = df.drop(columns=cols_to_drop)

In [9]:
df_final

Unnamed: 0,IDADE,NORMAL X ANORMAL,B2,SOPRO,IMC_Z_Score_Proxy,FC_Z_Score_Proxy,Sintoma_Dor,Sintoma_Cianose,Sintoma_Dispn,Checkup_Assintomatico,PPA_Alterada,Motivo_Cirurgia,Motivo_Sopro,Motivo_Congenita
0,0.115068,Anormal,Normal,Sistólico,0.804795,0.097480,0,0,0,0,0,0,0,0
1,0.016438,Anormal,Normal,ausente,-0.751108,0.962465,0,0,1,0,0,0,0,0
2,3.823288,Anormal,Normal,Sistólico,0.052897,-0.463518,0,0,0,1,0,0,0,0
3,0.484932,Anormal,Normal,ausente,0.804795,-0.983750,0,0,0,1,0,0,0,0
4,12.427397,Anormal,Normal,ausente,-0.083274,1.618005,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11673,1.161644,Normal,Normal,ausente,-0.128747,-0.983750,0,0,0,1,0,1,0,0
11674,0.778082,Normal,Normal,ausente,1.427156,-1.091873,0,0,0,1,0,1,0,0
11675,23.476712,Normal,Normal,ausente,,,1,0,0,0,1,0,0,0
11676,3.823288,Normal,Normal,Sistólico,-0.438714,-0.131457,0,0,0,0,0,0,0,0


In [10]:
normal_col = df_final.pop('NORMAL X ANORMAL')

df_final['NORMAL X ANORMAL'] = normal_col

In [16]:
df_final.to_csv(r'C:\Users\João Pedro\Documents\UFG\MD\estudo_caso\data\UCMF_feature_engineered.csv', index=False)

In [26]:
df_final = pd.read_csv(r'C:\Users\João Pedro\Documents\UFG\MD\estudo_caso\data\UCMF_feature_engineered.csv')
df_final

Unnamed: 0,IDADE,B2,SOPRO,IMC_Z_Score_Proxy,FC_Z_Score_Proxy,Sintoma_Dor,Sintoma_Cianose,Sintoma_Dispn,Checkup_Assintomatico,PPA_Alterada,Motivo_Cirurgia,Motivo_Sopro,Motivo_Congenita,NORMAL X ANORMAL
0,0.115068,Normal,Sistólico,0.804795,0.097480,0,0,0,0,0,0,0,0,Anormal
1,0.016438,Normal,ausente,-0.751108,0.962465,0,0,1,0,0,0,0,0,Anormal
2,3.823288,Normal,Sistólico,0.052897,-0.463518,0,0,0,1,0,0,0,0,Anormal
3,0.484932,Normal,ausente,0.804795,-0.983750,0,0,0,1,0,0,0,0,Anormal
4,12.427397,Normal,ausente,-0.083274,1.618005,1,0,0,0,0,0,0,0,Anormal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11673,1.161644,Normal,ausente,-0.128747,-0.983750,0,0,0,1,0,1,0,0,Normal
11674,0.778082,Normal,ausente,1.427156,-1.091873,0,0,0,1,0,1,0,0,Normal
11675,23.476712,Normal,ausente,,,1,0,0,0,1,0,0,0,Normal
11676,3.823288,Normal,Sistólico,-0.438714,-0.131457,0,0,0,0,0,0,0,0,Normal


In [28]:
df_final

Unnamed: 0,IDADE,B2,SOPRO,IMC_Z_Score_Proxy,FC_Z_Score_Proxy,Sintoma_Dor,Sintoma_Cianose,Sintoma_Dispn,Checkup_Assintomatico,PPA_Alterada,Motivo_Cirurgia,Motivo_Sopro,Motivo_Congenita,Target
0,0.115068,Normal,Sistólico,0.804795,0.097480,0,0,0,0,0,0,0,0,1
1,0.016438,Normal,ausente,-0.751108,0.962465,0,0,1,0,0,0,0,0,1
2,3.823288,Normal,Sistólico,0.052897,-0.463518,0,0,0,1,0,0,0,0,1
3,0.484932,Normal,ausente,0.804795,-0.983750,0,0,0,1,0,0,0,0,1
4,12.427397,Normal,ausente,-0.083274,1.618005,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11673,1.161644,Normal,ausente,-0.128747,-0.983750,0,0,0,1,0,1,0,0,0
11674,0.778082,Normal,ausente,1.427156,-1.091873,0,0,0,1,0,1,0,0,0
11675,23.476712,Normal,ausente,,,1,0,0,0,1,0,0,0,0
11676,3.823288,Normal,Sistólico,-0.438714,-0.131457,0,0,0,0,0,0,0,0,0


In [29]:
df_final['SOPRO'].value_counts()

SOPRO
ausente                   7257
Sistólico                 3639
sistólico                  723
contínuo                    28
Contínuo                    19
diastólico                   9
Sistolico e diastólico       3
Name: count, dtype: int64

In [31]:
df_final['SOPRO'] = df_final['SOPRO'].replace('Não Informado', 'ausente')

df_final = pd.get_dummies(df_final, columns=['SOPRO'], prefix='SOPRO')

cols_sopro = [c for c in df_final.columns if 'SOPRO_' in c]
df_final[cols_sopro] = df_final[cols_sopro].astype(int)


In [32]:
df_final

Unnamed: 0,IDADE,B2,IMC_Z_Score_Proxy,FC_Z_Score_Proxy,Sintoma_Dor,Sintoma_Cianose,Sintoma_Dispn,Checkup_Assintomatico,PPA_Alterada,Motivo_Cirurgia,Motivo_Sopro,Motivo_Congenita,Target,SOPRO_Ausente,SOPRO_Continuo,SOPRO_Diastolico,SOPRO_Sistolico,SOPRO_Sistolico_Diastolico
0,0.115068,Normal,0.804795,0.097480,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1,0.016438,Normal,-0.751108,0.962465,0,0,1,0,0,0,0,0,1,1,0,0,0,0
2,3.823288,Normal,0.052897,-0.463518,0,0,0,1,0,0,0,0,1,0,0,0,1,0
3,0.484932,Normal,0.804795,-0.983750,0,0,0,1,0,0,0,0,1,1,0,0,0,0
4,12.427397,Normal,-0.083274,1.618005,1,0,0,0,0,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11673,1.161644,Normal,-0.128747,-0.983750,0,0,0,1,0,1,0,0,0,1,0,0,0,0
11674,0.778082,Normal,1.427156,-1.091873,0,0,0,1,0,1,0,0,0,1,0,0,0,0
11675,23.476712,Normal,,,1,0,0,0,1,0,0,0,0,1,0,0,0,0
11676,3.823288,Normal,-0.438714,-0.131457,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [36]:
df_final['B2'].value_counts()

B2
Normal           11081
Hiperfonética      295
Desdob fixo        149
Única               77
Outro               76
Name: count, dtype: int64

In [37]:
mapa_b2 = {
    'Normal': 'Normal',
    'Hiperfonética': 'Hiperfonetica', # Tirando acento
    'Desdob fixo': 'Desdobramento_Fixo', # Nome mais descritivo
    'Única': 'Unica', # Tirando acento
    'Outro': 'Outro'
}
df_final['B2'] = df_final['B2'].map(mapa_b2).fillna('Outro')

df_final = pd.get_dummies(df_final, columns=['B2'], prefix='B2')

# Converter para 0 e 1
cols_b2 = [c for c in df_final.columns if 'B2_' in c]
df_final[cols_b2] = df_final[cols_b2].astype(int)

In [38]:
df_final

Unnamed: 0,IDADE,IMC_Z_Score_Proxy,FC_Z_Score_Proxy,Sintoma_Dor,Sintoma_Cianose,Sintoma_Dispn,Checkup_Assintomatico,PPA_Alterada,Motivo_Cirurgia,Motivo_Sopro,...,SOPRO_Ausente,SOPRO_Continuo,SOPRO_Diastolico,SOPRO_Sistolico,SOPRO_Sistolico_Diastolico,B2_Desdobramento_Fixo,B2_Hiperfonetica,B2_Normal,B2_Outro,B2_Unica
0,0.115068,0.804795,0.097480,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
1,0.016438,-0.751108,0.962465,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
2,3.823288,0.052897,-0.463518,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0
3,0.484932,0.804795,-0.983750,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4,12.427397,-0.083274,1.618005,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11673,1.161644,-0.128747,-0.983750,0,0,0,1,0,1,0,...,1,0,0,0,0,0,0,1,0,0
11674,0.778082,1.427156,-1.091873,0,0,0,1,0,1,0,...,1,0,0,0,0,0,0,1,0,0
11675,23.476712,,,1,0,0,0,1,0,0,...,1,0,0,0,0,0,0,1,0,0
11676,3.823288,-0.438714,-0.131457,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [40]:
normal_col = df_final.pop('Target')

df_final['Target'] = normal_col

In [41]:
df_final

Unnamed: 0,IDADE,IMC_Z_Score_Proxy,FC_Z_Score_Proxy,Sintoma_Dor,Sintoma_Cianose,Sintoma_Dispn,Checkup_Assintomatico,PPA_Alterada,Motivo_Cirurgia,Motivo_Sopro,...,SOPRO_Continuo,SOPRO_Diastolico,SOPRO_Sistolico,SOPRO_Sistolico_Diastolico,B2_Desdobramento_Fixo,B2_Hiperfonetica,B2_Normal,B2_Outro,B2_Unica,Target
0,0.115068,0.804795,0.097480,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,1
1,0.016438,-0.751108,0.962465,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
2,3.823288,0.052897,-0.463518,0,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,1
3,0.484932,0.804795,-0.983750,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,1
4,12.427397,-0.083274,1.618005,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11673,1.161644,-0.128747,-0.983750,0,0,0,1,0,1,0,...,0,0,0,0,0,0,1,0,0,0
11674,0.778082,1.427156,-1.091873,0,0,0,1,0,1,0,...,0,0,0,0,0,0,1,0,0,0
11675,23.476712,,,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
11676,3.823288,-0.438714,-0.131457,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0


In [46]:
df_final.isna().sum()

IDADE                         0
IMC_Z_Score_Proxy             0
FC_Z_Score_Proxy              0
Sintoma_Dor                   0
Sintoma_Cianose               0
Sintoma_Dispn                 0
Checkup_Assintomatico         0
PPA_Alterada                  0
Motivo_Cirurgia               0
Motivo_Sopro                  0
Motivo_Congenita              0
SOPRO_Ausente                 0
SOPRO_Continuo                0
SOPRO_Diastolico              0
SOPRO_Sistolico               0
SOPRO_Sistolico_Diastolico    0
B2_Desdobramento_Fixo         0
B2_Hiperfonetica              0
B2_Normal                     0
B2_Outro                      0
B2_Unica                      0
Target                        0
dtype: int64

In [45]:
df_final = df_final.fillna(0)

cols_to_fix = ['IMC_Z_Score_Proxy', 'FC_Z_Score_Proxy']
df_final[cols_to_fix] = df_final[cols_to_fix].fillna(0)

In [47]:
df_final.to_csv(r'C:\Users\João Pedro\Documents\UFG\MD\estudo_caso\data\UCMF_feature_engineered.csv', index=False)