# Data Transformation and Cleaning

#### Read the downloaded data

In [40]:
import pandas as pd

# Load the raw heart-disease CSV and rename the target column to 'label'
# in a single chained expression, so the cell produces `heart_df` in one step.
heart_df = (
    pd.read_csv('../data/heart.csv', sep=',')
      .rename(columns={'HeartDisease': 'label'})
)

# Preview the first five rows.
heart_df.head(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,label
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


#### Split into train and test

In [41]:
import sys
import os

# Add the parent directory to Python's module search path so the local
# `sampling` package can be imported. The membership check keeps re-runs of
# this cell from appending the same entry again.
project_root = os.path.abspath(os.path.join('..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from sampling.SplitData import SplitData


# Two partitions (0.8 / 0.2) handed to the project's SplitData helper;
# presumably the train and test fractions, in that order (see SplitData).
splitter = SplitData(partitions = [0.8, 0.2])


# Separate the target from the predictors WITHOUT mutating `heart_df`.
# Using `heart_df.pop('label')` here would remove the column in place, so a
# second run of this cell would fail with KeyError: 'label'. Selecting the
# column and dropping it into a new frame keeps the cell re-runnable.
# NOTE: `heart_df` therefore still contains 'label' after this cell; use
# `heart_features` when a predictors-only frame is needed.
heart_df_label = heart_df['label']
heart_features = heart_df.drop(columns=['label'])

train, train_label, test, test_label = \
                                    splitter.get_two_sets(heart_features,
                                                          heart_df_label)

print(train.shape)
print(test.shape)

Training data size: (734, 11)
Test data size: (184, 11)
 
(734, 11)
(184, 11)


#### Perform the transformation over the numerical and categorical variables

In [42]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np

# Transformers: a one-hot encoder for the categorical predictors (dense
# output; handle_unknown='ignore' so categories unseen at fit time do not
# raise at transform time) and a standard scaler for the numeric predictors.
ohe_obj = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
std_scaler_obj = StandardScaler()

# Numeric columns of the training predictors.
train_num = train.select_dtypes(include=[np.number])
num_cols = train_num.columns

# Categorical (object-dtype) columns of the training predictors.
train_cat = train.select_dtypes(include=['object'])
cat_cols = train_cat.columns

# Route each column group to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('stdscaler', std_scaler_obj, num_cols),
        ('ohe', ohe_obj, cat_cols),
    ],
    remainder='drop',  # columns not listed above are discarded
)

# Fit on the training set only, then apply the fitted transformer to the
# test set so that no test-set statistics are used during fitting.
train_values = preprocessor.fit_transform(train)
test_values = preprocessor.transform(test)
feature_names = preprocessor.get_feature_names_out()

# Rebuild DataFrames with the generated feature names and the original row
# indices of each split.
train_transf_output = pd.DataFrame(train_values, columns=feature_names, index=train.index)
test_transf_output = pd.DataFrame(test_values, columns=feature_names, index=test.index)

# Show the resulting columns and shape for train, then test.
for transformed in (train_transf_output, test_transf_output):
    print(transformed.columns)
    print(transformed.shape)

Index(['stdscaler__Age', 'stdscaler__RestingBP', 'stdscaler__Cholesterol',
       'stdscaler__FastingBS', 'stdscaler__MaxHR', 'stdscaler__Oldpeak',
       'ohe__Sex_F', 'ohe__Sex_M', 'ohe__ChestPainType_ASY',
       'ohe__ChestPainType_ATA', 'ohe__ChestPainType_NAP',
       'ohe__ChestPainType_TA', 'ohe__RestingECG_LVH',
       'ohe__RestingECG_Normal', 'ohe__RestingECG_ST', 'ohe__ExerciseAngina_N',
       'ohe__ExerciseAngina_Y', 'ohe__ST_Slope_Down', 'ohe__ST_Slope_Flat',
       'ohe__ST_Slope_Up'],
      dtype='object')
(734, 20)
Index(['stdscaler__Age', 'stdscaler__RestingBP', 'stdscaler__Cholesterol',
       'stdscaler__FastingBS', 'stdscaler__MaxHR', 'stdscaler__Oldpeak',
       'ohe__Sex_F', 'ohe__Sex_M', 'ohe__ChestPainType_ASY',
       'ohe__ChestPainType_ATA', 'ohe__ChestPainType_NAP',
       'ohe__ChestPainType_TA', 'ohe__RestingECG_LVH',
       'ohe__RestingECG_Normal', 'ohe__RestingECG_ST', 'ohe__ExerciseAngina_N',
       'ohe__ExerciseAngina_Y', 'ohe__ST_Slope_Down', 'ohe

#### Save the transformed dataset predictors

In [43]:

# Write the transformed predictors to ';'-separated CSV files, keeping the
# row index as the first column (train first, then test).
for predictors_path, predictors_frame in (
    ('../data/train_transformed.csv', train_transf_output),
    ('../data/test_transformed.csv', test_transf_output),
):
    predictors_frame.to_csv(predictors_path, sep=';', index=True)


#### Save the transformed dataset labels

In [44]:

# Wrap each label set in a DataFrame that keeps the labels' own index.
train_label_df = pd.DataFrame(data=train_label, index=train_label.index)
test_label_df = pd.DataFrame(data=test_label, index=test_label.index)

# Write the labels to ';'-separated CSV files, keeping the row index as the
# first column (train first, then test).
for labels_path, labels_frame in (
    ('../data/train_label_df.csv', train_label_df),
    ('../data/test_label_df.csv', test_label_df),
):
    labels_frame.to_csv(labels_path, sep=';', index=True)

