# Fase 1: Processamento dos Dados

Este arquivo realiza o **pré-processamento** dos dados, preparando-os para as fases subsequentes de modelagem e análise. Abaixo está um resumo das principais etapas e funcionalidades implementadas nesta etapa.

## Etapas do Processamento

### 1. Importação das Bibliotecas
As bibliotecas necessárias para o processamento dos dados são importadas no início do arquivo, como:
- `pandas` para manipulação de dados
- `numpy` para operações numéricas

### 2. Carregamento do Dataset
O arquivo carrega os dados a partir de um arquivo CSV utilizando `pandas`


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [6]:
data = pd.read_csv('../data/heart_2022_with_nans.csv')

In [7]:
data.head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,,No,...,,,,No,No,Yes,No,"Yes, received tetanus shot but not sure what type",No,No
1,Alabama,Female,Excellent,0.0,0.0,,No,6.0,,No,...,1.6,68.04,26.57,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No
2,Alabama,Female,Very good,2.0,3.0,Within past year (anytime less than 12 months ...,Yes,5.0,,No,...,1.57,63.5,25.61,No,No,No,No,,No,Yes
3,Alabama,Female,Excellent,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,,No,...,1.65,63.5,23.3,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
4,Alabama,Female,Fair,2.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,,No,...,1.57,53.98,21.77,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,No


## Substituir valores nulos

In [8]:
# Substituir valores nulos nas variáveis numéricas pela média
data.fillna(data.select_dtypes(include='number').mean(), inplace=True)

# Substituir valores nulos nas variáveis categóricas pela moda
data.fillna(data.select_dtypes(include='object').mode().iloc[0], inplace=True)

## Transformação de variáveis categóricas em numéricas

In [9]:
# Cópia do dataframe
data_copy = data.copy()
data_copy.head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,None of them,No,...,1.702691,83.07447,28.529842,No,No,Yes,No,"Yes, received tetanus shot but not sure what type",No,No
1,Alabama,Female,Excellent,0.0,0.0,Within past year (anytime less than 12 months ...,No,6.0,None of them,No,...,1.6,68.04,26.57,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No
2,Alabama,Female,Very good,2.0,3.0,Within past year (anytime less than 12 months ...,Yes,5.0,None of them,No,...,1.57,63.5,25.61,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,Yes
3,Alabama,Female,Excellent,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,No,...,1.65,63.5,23.3,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
4,Alabama,Female,Fair,2.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.57,53.98,21.77,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,No


In [10]:
# Lista de colunas de Sim ou Não a serem mapeadas
yes_no_columns = [
    'PhysicalActivities', 'HadHeartAttack', 'HadAngina', 'HadStroke',
    'HadAsthma', 'HadSkinCancer', 'HadCOPD', 'HadDepressiveDisorder',
    'HadKidneyDisease', 'HadArthritis', 'DeafOrHardOfHearing',
    'BlindOrVisionDifficulty', 'DifficultyConcentrating',
    'DifficultyWalking', 'DifficultyDressingBathing',
    'DifficultyErrands', 'ChestScan', 'AlcoholDrinkers',
    'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver',
    'HighRiskLastYear'
]

# Mapeando "Yes" para 1 e "No" para 0
data_copy[yes_no_columns] = data_copy[yes_no_columns].replace({'Yes': 1, 'No': 0})  #.astype(int) para converter para inteiro após remoção de valores nulos

  data_copy[yes_no_columns] = data_copy[yes_no_columns].replace({'Yes': 1, 'No': 0})  #.astype(int) para converter para inteiro após remoção de valores nulos


In [11]:
# Dicionário de mapeamento coluna GeneralHealth
health_mapping = {
    'Excellent': 5,
    'Very good': 4,
    'Good': 3,
    'Fair': 2,
    'Poor': 1
}

data_copy['GeneralHealth'] = data_copy['GeneralHealth'].map(health_mapping)

In [12]:
# Mapeamento coluna Sex
sex_mapping = {
    'Male': 0,
    'Female': 1
}

data_copy['Sex'] = data_copy['Sex'].map(sex_mapping)

In [13]:
# Mapeamento coluna RemovedTeeth
removed_teeth_mapping = {
    'None of them': 0,
    '1 to 5': 1,
    '6 or more, but not all': 2,
    'All': 3
}

data_copy['RemovedTeeth'] = data_copy['RemovedTeeth'].map(removed_teeth_mapping)

In [14]:
# Mapeamento coluna AgeCategory
age_mapping = {
    'Age 18 to 24': 0,
    'Age 25 to 29': 1,
    'Age 30 to 34': 2,
    'Age 35 to 39': 3,
    'Age 40 to 44': 4,
    'Age 45 to 49': 5,
    'Age 50 to 54': 6,
    'Age 55 to 59': 7,
    'Age 60 to 64': 8,
    'Age 65 to 69': 9,
    'Age 70 to 74': 10,
    'Age 75 to 79': 11,
    'Age 80 or older': 12
}

data_copy['AgeCategory'] = data_copy['AgeCategory'].map(age_mapping)

In [15]:
# Mapeamento coluna TetanusLast10Tdap
tetanus_mapping = {
    'No, did not receive any tetanus shot in the past 10 years': 0,
    'Yes, received tetanus shot but not sure what type': 1,
    'Yes, received Tdap': 2,
    'Yes, received tetanus shot, but not Tdap': 3
}

data_copy['TetanusLast10Tdap'] = data_copy['TetanusLast10Tdap'].map(tetanus_mapping)

In [16]:
# Mapeamento da coluna LastCheckupTime
checkup_mapping = {
    "Within past year (anytime less than 12 months ago)": 0,
    "Within past 2 years (1 year but less than 2 years ago)": 1,
    "Within past 5 years (2 years but less than 5 years ago)": 2,
    "5 or more years ago": 3
}

# Aplicar o mapeamento à coluna LastCheckupTime
data_copy['LastCheckupTime'] = data_copy['LastCheckupTime'].map(checkup_mapping)

In [17]:
# Aplicar one-hot encoding
data_copy = pd.get_dummies(data_copy, columns=['SmokerStatus', 'ECigaretteUsage', 'RaceEthnicityCategory', 'CovidPos', 'HadDiabetes'], drop_first=True, dtype='int')

In [18]:
data_copy.head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,ECigaretteUsage_Use them some days,RaceEthnicityCategory_Hispanic,"RaceEthnicityCategory_Multiracial, Non-Hispanic","RaceEthnicityCategory_Other race only, Non-Hispanic","RaceEthnicityCategory_White only, Non-Hispanic",CovidPos_Tested positive using home test without a health professional,CovidPos_Yes,"HadDiabetes_No, pre-diabetes or borderline diabetes",HadDiabetes_Yes,"HadDiabetes_Yes, but only during pregnancy (female)"
0,Alabama,1,4,0.0,0.0,0,0,8.0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,Alabama,1,5,0.0,0.0,0,0,6.0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,Alabama,1,4,2.0,3.0,0,1,5.0,0,0,...,0,0,0,0,1,0,1,0,0,0
3,Alabama,1,5,0.0,0.0,0,1,7.0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,Alabama,1,2,2.0,0.0,0,1,9.0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [19]:
if 'State' in data_copy.columns:
  data_copy = data_copy.drop('State', axis=1)

In [20]:
data_copy.to_csv('../data/Fase1-output_processed_data.csv', index=False)