# <h4>Bibliotecas necessárias</h4>

In [1]:
# Bibliotecas para preparar os dados
import numpy as np
import pandas as pd

# Bibliotecas para processar os dados para treino
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Classes para classificação
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

# Bibliotecas para medir desempenho dos algoritmos
from sklearn.metrics import classification_report

# <h4>Importação do dataset</h4>

In [2]:
# Load the raw dataset; the path is relative to this notebook's directory.
dataset_path = '../data/dataset.csv'
df = pd.read_csv(dataset_path)

# <h3>Pré-processamento dos dados</h3>

# <h4>Busca por NaN e valores nulos</h4>

In [73]:
# Count the missing values in each column.
pd.isna(df).sum()

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

In [74]:
# isnull is an alias of isna, so this repeats the previous cell's per-column
# missing-value count and is expected to show the same numbers.
pd.isnull(df).sum()

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

# <h4>Renomear colunas</h4>

In [3]:
# 'HeartDisease' is the label and becomes 'target'; 'BMI' becomes 'IMC'.
column_aliases = {'HeartDisease': 'target', 'BMI': 'IMC'}
df.rename(columns=column_aliases, inplace=True)

# <h4>Substituir faixas de idade por média</h4>

In [4]:
# List the distinct age-range labels present in the data.
df['AgeCategory'].unique()

array(['55-59', '80 or older', '65-69', '75-79', '40-44', '70-74',
       '60-64', '50-54', '45-49', '18-24', '35-39', '30-34', '25-29'],
      dtype=object)

In [157]:
def get_median_age(interval):
  """Return a single representative age for an age-range label.

  The result is the arithmetic mean of every integer found in the label
  (for a two-bound range such as '55-59' this equals the median, hence the
  name). Open-ended labels that contain a single number, such as
  '80 or older', yield that number.

  Args:
    interval: age-range label, e.g. '55-59' or '80 or older'.

  Returns:
    The mean of the integers in the label, as a NumPy float.

  Raises:
    ValueError: if the label contains no integer at all.
  """
  import re

  # Extract the numeric bounds instead of splitting on '-', so labels that
  # are not strictly 'low-high' can still be parsed.
  bounds = [int(token) for token in re.findall(r'\d+', interval)]
  if not bounds:
    raise ValueError(f'no age found in {interval!r}')

  return np.mean(bounds)

In [158]:
# Map every age-range label to one representative age and write the result
# back with a plain column assignment. The previous version called
# `df.AgeCategory.replace(..., inplace=True)`, i.e. an in-place method on a
# column selected from the frame, which pandas does not guarantee to
# propagate to `df` (it warns on recent 2.x releases and has no effect under
# Copy-on-Write).
#
# Only string labels are translated, so re-running this cell after the column
# is already numeric is a no-op instead of an AttributeError.
age_by_category = {
  category: 80 if category == '80 or older' else get_median_age(category)
  for category in df['AgeCategory'].unique()
  if isinstance(category, str)
}

# Cast to float here so the later `dtype != 'float64'` check does not treat
# the age column as a categorical column to be label-encoded.
df['AgeCategory'] = (
  df['AgeCategory']
  .map(lambda value: age_by_category.get(value, value))
  .astype(float)
)

In [159]:
# astype returns a new Series; assign it back so the column really becomes
# float64 (the bare expression only displayed the converted copy).
df['AgeCategory'] = df['AgeCategory'].astype(float)
df['AgeCategory']

0         57.0
1         80.0
2         67.0
3         77.0
4         42.0
          ... 
319790    62.0
319791    37.0
319792    47.0
319793    27.0
319794    80.0
Name: AgeCategory, Length: 319795, dtype: float64

# <h4>Converter valores em numéricos</h4>

In [5]:
# Keep a reference to the full column index; later cells iterate over it to
# pick the columns that still need to be converted to numbers.
ALL_COLUMNS = df.columns
ALL_COLUMNS

Index(['target', 'IMC', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer'],
      dtype='object')

In [148]:
def convert_in_number(column, frame=None):
  """Label-encode one column in place with the integer codes 1..k.

  Each distinct value receives a code according to the order in which it
  first appears in the column (first value seen -> 1, second -> 2, ...).

  Args:
    column: name of the column to encode.
    frame: DataFrame to modify. Defaults to the notebook-level `df`.

  Returns:
    None. The column of `frame` is overwritten with the integer codes.
  """
  if frame is None:
    frame = df

  # unique() preserves first-appearance order, so the codes are deterministic
  # for a given dataset.
  # Assumes the column has no missing values (the notebook's isna() check
  # reports none) -- TODO confirm if the dataset changes.
  code_by_value = {
    value: code
    for code, value in enumerate(frame[column].unique(), start=1)
  }

  # Apply the whole mapping in one pass and assign the result back.
  # Replacing one value at a time let a freshly written code be replaced again
  # when it equalled a later original value (e.g. [2, 1, 2] ended up as
  # [2, 2, 2]), and relied on an in-place method of a selected column, which
  # pandas does not guarantee to propagate to the frame.
  frame[column] = frame[column].map(code_by_value)

  # The original author inspected value_counts() before and after the
  # conversion to read off the label -> code correspondence (presumably for
  # the input form -- confirm).

In [149]:
# Names of every column whose dtype is not float64; these are the columns
# that the next cell converts to integer codes.
columns = [
  str(column_name)
  for column_name in ALL_COLUMNS
  if df[column_name].dtype != 'float64'
]
columns

['target',
 'Smoking',
 'AlcoholDrinking',
 'Stroke',
 'DiffWalking',
 'Sex',
 'Race',
 'Diabetic',
 'PhysicalActivity',
 'GenHealth',
 'Asthma',
 'KidneyDisease',
 'SkinCancer']

In [None]:
# Convert each selected column to integer codes, one column at a time.
for column_name in columns:
  convert_in_number(column_name)

In [161]:
# Preview the first 50 rows of the frame.
df.head(50)

Unnamed: 0,target,IMC,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,57.0,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80.0,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,67.0,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,77.0,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,42.0,White,No,Yes,Very good,8.0,No,No,No
5,Yes,28.87,Yes,No,No,6.0,0.0,Yes,Female,77.0,Black,No,No,Fair,12.0,No,No,No
6,No,21.63,No,No,No,15.0,0.0,No,Female,72.0,White,No,Yes,Fair,4.0,Yes,No,Yes
7,No,31.64,Yes,No,No,5.0,0.0,Yes,Female,80.0,White,Yes,No,Good,9.0,Yes,No,No
8,No,26.45,No,No,No,0.0,0.0,No,Female,80.0,White,"No, borderline diabetes",No,Fair,5.0,No,Yes,No
9,No,40.69,No,No,No,0.0,0.0,Yes,Male,67.0,White,No,Yes,Good,10.0,No,No,No


# <h4>Alterando a escala dos dados - Padronização</h4>

In [85]:
scale_dataset = StandardScaler()

# Standardise every column except the label. The label is excluded by name
# rather than by position, so the cell keeps working if the column order
# changes.
# NOTE(review): the scaler is fitted on all rows before the train/validation
# split performed further down, so statistics from the validation rows leak
# into the training features. Consider fitting on the training split only.
columns = df.columns.drop('target')
df[columns] = scale_dataset.fit_transform(df[columns].astype(float))
df

Unnamed: 0,target,IMC,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,1,-1.844750,-1.193474,-0.27032,-0.198040,-0.046751,3.281069,-0.401578,-0.951711,0.149220,-0.468951,-2.189273,-0.538256,-1.083956,-1.460354,-2.541515,-0.195554,-3.118419
1,1,-1.256338,0.837890,-0.27032,5.049478,-0.424070,-0.490039,-0.401578,-0.951711,1.447159,-0.468951,0.217526,-0.538256,-1.083956,-0.067601,0.393466,-0.195554,0.320675
2,1,-0.274603,-1.193474,-0.27032,-0.198040,2.091388,3.281069,-0.401578,1.050739,0.713541,-0.468951,-2.189273,-0.538256,-0.420104,0.628776,-2.541515,-0.195554,0.320675
3,1,-0.647473,0.837890,-0.27032,-0.198040,-0.424070,-0.490039,-0.401578,-0.951711,1.277863,-0.468951,0.217526,1.857852,0.243748,-0.763977,0.393466,-0.195554,-3.118419
4,1,-0.726138,0.837890,-0.27032,-0.198040,3.097572,-0.490039,2.490174,-0.951711,-0.697262,-0.468951,0.217526,-0.538256,-1.083956,0.628776,0.393466,-0.195554,0.320675
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,2,-0.144019,-1.193474,-0.27032,-0.198040,0.456341,-0.490039,2.490174,1.050739,0.431381,2.713424,-2.189273,1.857852,-0.420104,-0.763977,-2.541515,-0.195554,0.320675
319791,1,0.238291,-1.193474,-0.27032,-0.198040,-0.424070,-0.490039,-0.401578,1.050739,-0.979422,2.713424,0.217526,-0.538256,-1.083956,-1.460354,-2.541515,-0.195554,0.320675
319792,1,-0.642753,0.837890,-0.27032,-0.198040,-0.424070,-0.490039,-0.401578,-0.951711,-0.415101,2.713424,0.217526,-0.538256,0.243748,-0.763977,0.393466,-0.195554,0.320675
319793,1,0.705560,0.837890,-0.27032,-0.198040,-0.424070,-0.490039,-0.401578,-0.951711,-1.543744,2.713424,0.217526,1.857852,0.243748,3.414282,0.393466,-0.195554,0.320675


# <h3>Algoritmos de classificação</h3>

# <h4>Preparação</h4>

In [86]:
# Features are all columns except the label; the label is selected by name
# instead of assuming it is the first column.
columns = df.columns.drop('target')
X = df[columns]
y = df['target']

In [87]:
# Hold out 45% of the rows for validation. Stratifying on the label keeps the
# class proportions equal in both splits, consistent with the StratifiedKFold
# used by the cross-validation helper.
train_X, val_X, train_y, val_y = train_test_split(
  X, y, test_size=0.45, random_state=42, stratify=y
)

# <h4>Definição do cross-validation</h4>

In [88]:
def set_cross_validate(model):
  """Cross-validate `model` on the notebook-level X and y and print the results.

  Uses a shuffled 5-fold StratifiedKFold (seed 42) and scores accuracy plus
  macro-averaged precision, recall and F1. For every entry returned by
  cross_validate (timings and test scores) it prints the entry name, the
  per-fold values, and their mean +- standard deviation.

  Args:
    model: estimator to evaluate.

  Returns:
    None; the results are only printed.
  """
  from sklearn.model_selection import StratifiedKFold, cross_validate

  scoring = ['accuracy', 'precision_macro', 'recall_macro','f1_macro']
  folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
  results = cross_validate(model, X, y, cv=folds, scoring=scoring)

  for name, per_fold in results.items():
    print(f"- {name}:")
    print(f"-- {per_fold}")
    print(f"-- {np.mean(per_fold)} +- {np.std(per_fold)}\n")


# <h4>Decision Tree</h4>

In [89]:
# Decision tree with a fixed seed, fitted on the training split.
model_tree = DecisionTreeClassifier(random_state=42)
model_tree.fit(X=train_X, y=train_y)

In [90]:
# Cross-validate the decision tree on the full X, y and print the per-fold metrics.
set_cross_validate(model_tree)

- fit_time:
-- [2.80127311 2.33856034 2.68734455 2.40052056 3.66174388]
-- 2.7778884887695314 +- 0.474417783731477

- score_time:
-- [0.14690924 0.12891984 0.14191365 0.1219275  0.17789054]
-- 0.14351215362548828 +- 0.01936389256634092

- test_accuracy:
-- [0.86374083 0.86267765 0.86632061 0.8646633  0.86438187]
-- 0.8643568536093434 +- 0.0011958324355292992

- test_precision_macro:
-- [0.57997607 0.58019779 0.58323546 0.58199833 0.58027949]
-- 0.5811374294409817 +- 0.0012736281602973355

- test_recall_macro:
-- [0.58704131 0.58894372 0.58827116 0.58885487 0.58654884]
-- 0.5879319802826916 +- 0.0009692010944167679

- test_f1_macro:
-- [0.58323254 0.58415219 0.58561928 0.58517496 0.58319707]
-- 0.5842752073964441 +- 0.0009880036131906106



# <h4>Random Forest</h4>

In [91]:
# Random forest using the entropy split criterion and a fixed seed, fitted on
# the training split.
model_forest = RandomForestClassifier(criterion='entropy', random_state=42)
model_forest.fit(X=train_X, y=train_y)

In [92]:
# Cross-validate the random forest on the full X, y and print the per-fold metrics.
set_cross_validate(model_forest)

- fit_time:
-- [63.21105266 55.78263235 55.4861095  55.55976772 55.01828814]
-- 57.011570072174074 +- 3.109720410063553

- score_time:
-- [2.24861503 2.27359939 2.23862123 2.26660156 2.27559733]
-- 2.2606069087982177 +- 0.01453704430838072

- test_accuracy:
-- [0.90678403 0.90575212 0.90689348 0.90514236 0.90568958]
-- 0.9060523147641458 +- 0.0006771532548072959

- test_precision_macro:
-- [0.64196128 0.63215024 0.64140512 0.63308525 0.63492296]
-- 0.6367049691481192 +- 0.0041651810187408635

- test_recall_macro:
-- [0.54864908 0.54518713 0.54738227 0.54890799 0.54788282]
-- 0.547601859499399 +- 0.0013236877506575264

- test_f1_macro:
-- [0.56344421 0.55838206 0.56173617 0.5632903  0.56207019]
-- 0.5617845839989919 +- 0.0018266143773715841



# <h4>Ada Boost - DecisionTree</h4>

In [93]:
# AdaBoost with its default base estimator, the SAMME algorithm and a fixed
# seed, fitted on the training split.
model_ada = AdaBoostClassifier(algorithm='SAMME', random_state=42)
model_ada.fit(X=train_X, y=train_y)


In [94]:
# Cross-validate the AdaBoost model on the full X, y and print the per-fold metrics.
set_cross_validate(model_ada)

- fit_time:
-- [12.98100305 11.35300899 11.47892785 11.40297532 12.04858255]
-- 11.852899551391602 +- 0.6169419026282139

- score_time:
-- [0.35277891 0.35777855 0.35977936 0.35078263 0.35977387]
-- 0.35617866516113283 +- 0.0037182155694548743

- test_accuracy:
-- [0.9154302  0.91510186 0.91580544 0.91505496 0.91478916]
-- 0.9152363232695946 +- 0.0003499729952292643

- test_precision_macro:
-- [0.72684119 0.72103104 0.73301    0.72131955 0.71661917]
-- 0.7237641894304115 +- 0.005647133103412824

- test_recall_macro:
-- [0.54518042 0.54756742 0.54662671 0.54257425 0.54648488]
-- 0.5456867351070336 +- 0.0017321380525583223

- test_f1_macro:
-- [0.56081735 0.56438432 0.56314847 0.55666854 0.56265392]
-- 0.5615345199611892 +- 0.002689659929398254



# <h4>Naive Bayes</h4>

In [95]:
# Gaussian Naive Bayes with default settings, fitted on the training split.
model_nb = GaussianNB()
model_nb.fit(X=train_X, y=train_y)

In [96]:
# Cross-validate Gaussian Naive Bayes on the full X, y and print the per-fold metrics.
set_cross_validate(model_nb)

- fit_time:
-- [0.31780529 0.20386958 0.27682924 0.27083516 0.26683712]
-- 0.26723527908325195 +- 0.03654503936759986

- score_time:
-- [0.133919   0.13791561 0.18288684 0.16289449 0.18888259]
-- 0.1612997055053711 +- 0.022476129567572663

- test_accuracy:
-- [0.84712081 0.84607327 0.84896574 0.84618271 0.84532278]
-- 0.8467330633687207 +- 0.0012539789838762457

- test_precision_macro:
-- [0.60816237 0.60502033 0.61137669 0.60408923 0.60462158]
-- 0.6066540404186661 +- 0.0027552372064494368

- test_recall_macro:
-- [0.67274999 0.66679574 0.67736623 0.66425597 0.66709675]
-- 0.6696529357560614 +- 0.004752086504474939

- test_f1_macro:
-- [0.62731692 0.62337331 0.63116646 0.62210593 0.62299997]
-- 0.6253925161451728 +- 0.00339543380187503



# <h4>Ada Boost - NaiveBayes</h4>

In [98]:
# AdaBoost boosting a Gaussian Naive Bayes base estimator, fitted on the
# training split.
# NOTE: this rebinds `model_ada`, replacing the AdaBoost model created in the
# earlier "Ada Boost - DecisionTree" section.
nb_base = GaussianNB()
model_ada = AdaBoostClassifier(nb_base, random_state=42)
model_ada.fit(X=train_X, y=train_y)

In [99]:
# Cross-validate the current `model_ada` (AdaBoost over Gaussian Naive Bayes)
# on the full X, y and print the per-fold metrics.
set_cross_validate(model_ada)

  _warn_prf(average, modifier, msg_start, len(result))


- fit_time:
-- [33.31547618 31.75043941 32.94870234 26.91641998 23.67641163]
-- 29.721489906311035 +- 3.791033160464504

- score_time:
-- [4.15643764 4.04450727 3.48685074 3.13506556 2.88822079]
-- 3.542216396331787 +- 0.49519920588494254

- test_accuracy:
-- [0.799606   0.28576119 0.91439829 0.08571116 0.14584343]
-- 0.44626401288325324 +- 0.34351486105025714

- test_precision_macro:
-- [0.50949178 0.44958439 0.45719914 0.54280554 0.45384586]
-- 0.48258534275375287 +- 0.03717501555480263

- test_recall_macro:
-- [0.51496461 0.36687578 0.5        0.50005985 0.45115908]
-- 0.46661186331367804 +- 0.05432788898255401

- test_f1_macro:
-- [0.50729859 0.25404103 0.47764266 0.07897949 0.14581726]
-- 0.2927558071101526 +- 0.1726278309669599



# <h3>Gerar o modelo</h3>

In [100]:
import pickle

# Persist the fitted Gaussian Naive Bayes model.
with open('../models/modelo.pkl', 'wb') as file:
    pickle.dump(model_nb, file)

# model_nb was trained on features standardised by `scale_dataset`, so new
# inputs must go through the same fitted scaler before prediction. Save it in
# a separate file so the existing model file keeps its current content.
# NOTE(review): the integer codes assigned to the categorical columns are not
# persisted anywhere; whoever consumes the model must reproduce them.
with open('../models/scaler.pkl', 'wb') as file:
    pickle.dump(scale_dataset, file)