# <h4>Bibliotecas necessárias</h4>

In [1]:
# Standard library
import pickle

# Libraries for preparing the data
import numpy as np
import pandas as pd

# Libraries for processing the data for training
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

# Classifier classes
from sklearn.tree import ExtraTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Libraries for measuring model performance
from sklearn.metrics import classification_report

# <h4>Importação do dataset</h4>

In [2]:
# Load the raw dataset from a CSV file in the working directory.
df = pd.read_csv('dataset.csv')

In [None]:
# Preview the first 15 rows of the freshly loaded data.
df.head(15)

# <h3>Pré-processamento dos dados</h3>

# <h4>Busca por NaN e valores nulos</h4>

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Count missing values per column via isnull().
# NOTE(review): pandas documents isnull() as an alias of isna(), so this cell
# presumably repeats the previous check — confirm and consider dropping one.
df.isnull().sum()

# <h4>Renomear colunas</h4>

In [3]:
# Rename 'HeartDisease' to 'target' and 'BMI' to 'IMC'.
# The result is rebound to df instead of using inplace=True, so the cell reads
# as an explicit "old frame -> new frame" step.
df = df.rename(columns={'HeartDisease': 'target', 'BMI': 'IMC'})

# <h4>Substituir faixas de idade por média</h4>

In [None]:
# List the distinct age-bracket labels present in the data.
df['AgeCategory'].unique()

In [4]:
def get_median_age(interval):
  """Return a single representative age for an age-bracket label.

  For a closed bracket such as '55-59' the result is the arithmetic mean of
  the bounds (57.0). With exactly two bounds this equals the median, which is
  where the function name comes from.

  For an open-ended bracket such as '80 or older' the lower bound is returned
  (80.0), because there is no upper bound to average with.

  Parameters
  ----------
  interval : str
    Label in the form '<low>-<high>' or '<low> or older'.

  Returns
  -------
  float
    Representative age for the bracket.

  Raises
  ------
  ValueError
    If a bound cannot be parsed as an integer.
  """
  label = interval.strip()

  if label.endswith('or older'):
    # Open-ended bracket: the first token is the lower bound.
    return float(label.split()[0])

  bounds = [int(part) for part in label.split('-')]
  return np.mean(bounds)

In [5]:
# Build a label -> representative-age lookup once, then convert the whole
# column in a single pass. Assigning the mapped result back to df avoids
# calling replace(..., inplace=True) on the intermediate Series returned by
# df.AgeCategory, which does not update df when pandas Copy-on-Write is active.
age_by_category = {
  category: 80 if category == '80 or older' else get_median_age(category)
  for category in df['AgeCategory'].unique()
}
df['AgeCategory'] = df['AgeCategory'].map(age_by_category)

In [6]:
# Store the float cast back into the frame (astype returns a new Series and
# the result was previously discarded), then display the converted column.
df['AgeCategory'] = df['AgeCategory'].astype(float)
df['AgeCategory']

0         57.0
1         80.0
2         67.0
3         77.0
4         42.0
          ... 
319790    62.0
319791    37.0
319792    47.0
319793    27.0
319794    80.0
Name: AgeCategory, Length: 319795, dtype: float64

# <h4>Converter valores em numéricos</h4>

In [8]:
# Capture the frame's column index for the dtype scan in a later cell; the
# bare name on the last line displays it.
ALL_COLUMNS = df.columns
ALL_COLUMNS

Index(['target', 'IMC', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer'],
      dtype='object')

In [9]:
def convert_in_number(column, frame=None):
  """Encode the distinct values of a column as integer codes, in place.

  Each distinct value receives a code starting at 1, numbered in order of
  first appearance in the column. All values are translated through a single
  lookup table, so a value that happens to equal an already-assigned code
  (e.g. an integer column containing 3, 1, 2) is never re-mapped a second
  time.

  Parameters
  ----------
  column : str
    Name of the column to encode.
  frame : pandas.DataFrame, optional
    Frame to modify. Defaults to the notebook-level ``df``.

  Returns
  -------
  None
    The column is overwritten in ``frame``.
  """
  # Fall back to the notebook-level DataFrame when no frame is supplied.
  target = df if frame is None else frame

  codes = {
    value: code
    for code, value in enumerate(target[column].unique(), start=1)
  }
  # Assign the result back instead of calling replace(..., inplace=True) on
  # the Series returned by target[column], which may be a temporary copy.
  target[column] = target[column].map(codes)

In [10]:
# Names of every column whose dtype is not float64; these are the ones passed
# to convert_in_number in a later cell. The bare name displays the list.
columns = [str(name) for name in ALL_COLUMNS if df[name].dtype != 'float64']
columns

['target',
 'Smoking',
 'AlcoholDrinking',
 'Stroke',
 'DiffWalking',
 'Sex',
 'Race',
 'Diabetic',
 'PhysicalActivity',
 'GenHealth',
 'Asthma',
 'KidneyDisease',
 'SkinCancer']

In [11]:
# Encode each selected column as integer codes.
for column_name in columns:
  convert_in_number(column_name)

In [None]:
# Inspect the first 50 rows after the conversion to integer codes.
df.head(50)

# <h3>Algoritmos de classificação</h3>

# <h4>Decision Tree</h4>

In [13]:
# Standardize every column after the first one (the first column is 'target')
# and write the scaled values back into df.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/validation split and before cross-validation, so held-out rows
# contribute to the scaling statistics. Consider fitting it on the training
# data only (e.g. inside a Pipeline).
columns = df.columns[1:]
scale_dataset = StandardScaler()
df[columns] = scale_dataset.fit_transform(df[columns].astype(float))
df

Unnamed: 0,target,IMC,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,1,-1.844750,-1.193474,-0.27032,-0.198040,-0.046751,3.281069,-0.401578,-0.951711,0.149220,-0.468951,-2.189273,-0.538256,-1.083956,-1.460354,-2.541515,-0.195554,-3.118419
1,1,-1.256338,0.837890,-0.27032,5.049478,-0.424070,-0.490039,-0.401578,-0.951711,1.447159,-0.468951,0.217526,-0.538256,-1.083956,-0.067601,0.393466,-0.195554,0.320675
2,1,-0.274603,-1.193474,-0.27032,-0.198040,2.091388,3.281069,-0.401578,1.050739,0.713541,-0.468951,-2.189273,-0.538256,-0.420104,0.628776,-2.541515,-0.195554,0.320675
3,1,-0.647473,0.837890,-0.27032,-0.198040,-0.424070,-0.490039,-0.401578,-0.951711,1.277863,-0.468951,0.217526,1.857852,0.243748,-0.763977,0.393466,-0.195554,-3.118419
4,1,-0.726138,0.837890,-0.27032,-0.198040,3.097572,-0.490039,2.490174,-0.951711,-0.697262,-0.468951,0.217526,-0.538256,-1.083956,0.628776,0.393466,-0.195554,0.320675
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,2,-0.144019,-1.193474,-0.27032,-0.198040,0.456341,-0.490039,2.490174,1.050739,0.431381,2.713424,-2.189273,1.857852,-0.420104,-0.763977,-2.541515,-0.195554,0.320675
319791,1,0.238291,-1.193474,-0.27032,-0.198040,-0.424070,-0.490039,-0.401578,1.050739,-0.979422,2.713424,0.217526,-0.538256,-1.083956,-1.460354,-2.541515,-0.195554,0.320675
319792,1,-0.642753,0.837890,-0.27032,-0.198040,-0.424070,-0.490039,-0.401578,-0.951711,-0.415101,2.713424,0.217526,-0.538256,0.243748,-0.763977,0.393466,-0.195554,0.320675
319793,1,0.705560,0.837890,-0.27032,-0.198040,-0.424070,-0.490039,-0.401578,-0.951711,-1.543744,2.713424,0.217526,1.857852,0.243748,3.414282,0.393466,-0.195554,0.320675


In [14]:
# Features are every column after the first; the label is the 'target' column.
columns = df.columns[1:]
y = df['target']
X = df[columns]

In [15]:
# Hold out 45% of the rows for validation. The target is heavily imbalanced,
# so the split is stratified on y to keep the class proportions the same in
# the training and validation parts.
train_X, val_X, train_y, val_y = train_test_split(
  X, y, test_size=0.45, random_state=42, stratify=y
)

In [16]:
# Single extremely-randomized tree, fitted on the training split.
model_tree = ExtraTreeClassifier(
  max_features=5,
  max_depth=20,
  min_samples_split=30,
  random_state=42,
)
model_tree.fit(train_X, train_y)

In [18]:
# Persist the fitted tree to 'modelo.pkl'. pickle is imported in the
# notebook's import cell rather than in the middle of the analysis.
with open('modelo.pkl', 'wb') as model_file:
    pickle.dump(model_tree, model_file)

In [19]:
# k-nearest-neighbours classifier using 50 neighbours, fitted on the training split.
knn_model = KNeighborsClassifier(n_neighbors=50)
knn_model.fit(train_X, train_y)

In [73]:
# Random forest with entropy splits and out-of-bag scoring enabled, fitted on
# the training split.
forest_model = RandomForestClassifier(
  criterion='entropy',
  oob_score=True,
  warm_start=True,
  bootstrap=True,
  max_depth=15,
  min_samples_split=35,
  random_state=0,
)
forest_model.fit(train_X, train_y)

In [17]:
def set_cross_validate(model):
  """Cross-validate ``model`` on the notebook-level X and y and print a report.

  Uses a shuffled, stratified 5-fold split with a fixed seed and scores
  accuracy, macro precision, macro recall and macro F1. For every entry
  returned by ``cross_validate`` (timings included) it prints the per-fold
  values followed by their mean and standard deviation.

  ``StratifiedKFold`` and ``cross_validate`` come from the notebook's import
  cell instead of being imported inside the function.

  Parameters
  ----------
  model : estimator
    Estimator passed to ``cross_validate``.

  Returns
  -------
  None
    Results are printed, not returned.
  """
  metric_names = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
  cross_val = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

  metricas = cross_validate(model, X, y, cv=cross_val, scoring=metric_names)
  for nome, valores in metricas.items():
    print(f"- {nome}:")
    print(f"-- {valores}")
    print(f"-- {np.mean(valores)} +- {np.std(valores)}\n")


In [18]:
# Cross-validation report for the ExtraTree model.
set_cross_validate(model_tree)

- fit_time:
-- [0.45471931 0.46571398 0.45172334 0.49868989 0.60562801]
-- 0.4952949047088623 +- 0.057627793410051635

- score_time:
-- [0.08994365 0.09594202 0.08794761 0.12392521 0.09094524]
-- 0.0977407455444336 +- 0.013354704683144368

- test_accuracy:
-- [0.91139636 0.91138073 0.91228756 0.91142763 0.91180287]
-- 0.9116590315670976 +- 0.00035092166011536913

- test_precision_macro:
-- [0.67165771 0.67264181 0.68386584 0.67023221 0.67746753]
-- 0.675173020929336 +- 0.004980079388121115

- test_recall_macro:
-- [0.54173285 0.54313176 0.54511698 0.53951473 0.54344478]
-- 0.5425882179679433 +- 0.0018762615026616126

- test_f1_macro:
-- [0.55465612 0.55675913 0.55994014 0.55127674 0.557322  ]
-- 0.555990825984974 +- 0.002897013585067725



In [20]:
# Cross-validation report for the k-nearest-neighbours model.
set_cross_validate(knn_model)

- fit_time:
-- [0.16190028 0.09194541 0.09094262 0.08894539 0.09993768]
-- 0.1067342758178711 +- 0.02783527759153037

- score_time:
-- [60.51952505 60.60464525 60.82451034 61.38498211 62.52200198]
-- 61.1711329460144 +- 0.7397603674082986

- test_accuracy:
-- [0.91477353 0.91447646 0.91580544 0.91527385 0.91494551]
-- 0.9150549570818806 +- 0.0004554059127816624

- test_precision_macro:
-- [0.7216706  0.71088681 0.76861872 0.74053562 0.73026214]
-- 0.7343947770939894 +- 0.019700739261676925

- test_recall_macro:
-- [0.51956989 0.51957303 0.52005598 0.5210897  0.5185097 ]
-- 0.5197596598038738 +- 0.0008354728937618089

- test_f1_macro:
-- [0.51719336 0.51729482 0.51776469 0.5198733  0.51512683]
-- 0.5174506006525551 +- 0.0015138364399505874



In [27]:
# AdaBoost ensemble of 300 boosting rounds over depth-20 decision trees,
# fitted on the training split.
ada_model = AdaBoostClassifier(
  # Base learner, passed positionally.
  DecisionTreeClassifier(max_depth=20, random_state=42),
  n_estimators=300,
  algorithm='SAMME',
  random_state=42
)
ada_model.fit(train_X, train_y)


In [28]:
# Cross-validation report for the AdaBoost model.
set_cross_validate(ada_model)

In [74]:
# Cross-validation report for the random forest model.
set_cross_validate(forest_model)

- fit_time:
-- [56.3023541  53.71594596 46.6233058  82.54920292 78.77952099]
-- 63.59406595230102 +- 14.34327911383388

- score_time:
-- [1.88883734 1.36016035 1.63499832 2.25960851 1.7938962 ]
-- 1.7875001430511475 +- 0.29633513040675974

- test_accuracy:
-- [0.91553964 0.91561782 0.91596179 0.91544583 0.91569599]
-- 0.9156522147000423 +- 0.00017561401625539895

- test_precision_macro:
-- [0.74582496 0.74926162 0.76367241 0.73955975 0.75348251]
-- 0.7503602478744066 +- 0.008067079548038661

- test_recall_macro:
-- [0.52388001 0.52367438 0.52394912 0.52623301 0.52347268]
-- 0.5242418385251147 +- 0.0010094545758772098

- test_f1_macro:
-- [0.52496833 0.52457394 0.52499372 0.52924274 0.524178  ]
-- 0.5255913476790218 +- 0.0018498982556751282



In [23]:
# Print the classification report of the random forest's predictions on the
# validation split.
print(classification_report(val_y, forest_model.predict(val_X)))

NameError: name 'forest_model' is not defined

In [108]:
# Preview the first 15 rows of the frame after encoding and scaling.
df.head(15)

Unnamed: 0,target,IMC,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,1,-1.84475,-1.193474,-0.27032,-0.19804,-0.046751,3.281069,-0.401578,-0.951711,0.14922,-0.468951,-2.189273,-0.538256,-1.083956,-1.460354,-2.541515,-0.195554,-3.118419
1,1,-1.256338,0.83789,-0.27032,5.049478,-0.42407,-0.490039,-0.401578,-0.951711,1.447159,-0.468951,0.217526,-0.538256,-1.083956,-0.067601,0.393466,-0.195554,0.320675
2,1,-0.274603,-1.193474,-0.27032,-0.19804,2.091388,3.281069,-0.401578,1.050739,0.713541,-0.468951,-2.189273,-0.538256,-0.420104,0.628776,-2.541515,-0.195554,0.320675
3,1,-0.647473,0.83789,-0.27032,-0.19804,-0.42407,-0.490039,-0.401578,-0.951711,1.277863,-0.468951,0.217526,1.857852,0.243748,-0.763977,0.393466,-0.195554,-3.118419
4,1,-0.726138,0.83789,-0.27032,-0.19804,3.097572,-0.490039,2.490174,-0.951711,-0.697262,-0.468951,0.217526,-0.538256,-1.083956,0.628776,0.393466,-0.195554,0.320675
5,2,0.085682,-1.193474,-0.27032,-0.19804,0.330568,-0.490039,2.490174,-0.951711,1.277863,0.167524,0.217526,1.857852,-0.420104,3.414282,0.393466,-0.195554,0.320675
6,1,-1.053383,0.83789,-0.27032,-0.19804,1.462524,-0.490039,-0.401578,-0.951711,0.995702,-0.468951,0.217526,-0.538256,-0.420104,-2.15673,-2.541515,-0.195554,-3.118419
7,1,0.521484,-1.193474,-0.27032,-0.19804,0.204795,-0.490039,2.490174,-0.951711,1.447159,-0.468951,-2.189273,1.857852,0.243748,1.325152,-2.541515,-0.195554,0.320675
8,1,-0.295055,0.83789,-0.27032,-0.19804,-0.42407,-0.490039,-0.401578,-0.951711,1.447159,-0.468951,2.624324,1.857852,-0.420104,-1.460354,0.393466,5.113667,0.320675
9,1,1.945316,0.83789,-0.27032,-0.19804,-0.42407,-0.490039,2.490174,1.050739,0.713541,-0.468951,0.217526,-0.538256,0.243748,2.021529,0.393466,-0.195554,0.320675


In [112]:
# Number of rows per value of the encoded target column (class balance).
df.target.value_counts()

1    292422
2     27373
Name: target, dtype: int64