In [1]:
from sklearn.model_selection import train_test_split

import pandas as pd

data = pd.read_csv("./dataset/fetal_health.csv")
data = data.dropna()

target = "fetal_health"

X = data.drop(target, axis=1)
y = data[target]
print(f"shape X={X.shape}, shape y={y.shape}")

X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=42)
print(f"TR: shape X_tr={X_tr.shape}, shape y_tr={y_tr.shape}")
print(f"TE: shape X_te={X_te.shape}, shape y_te={y_te.shape}")

shape X=(2126, 21), shape y=(2126,)
TR: shape X_tr=(1594, 21), shape y_tr=(1594,)
TE: shape X_te=(532, 21), shape y_te=(532,)


In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

import numpy as np

classificadores = { "knn": KNeighborsClassifier(), "tree": DecisionTreeClassifier(), "svc": SVC() }

parametros = { "knn": { "modelo__n_neighbors": list(range(1,15,2)),
                        "modelo__weights": ['uniform', 'distance'] }, 
               "tree": { "modelo__splitter": ["best", "random"],
                         "modelo__min_samples_split": list(range(2,6,2)), 
                         "modelo__max_depth": [None] + list(range(7,15,2)),
                         "modelo__criterion": ["gini", "entropy"],
                         "modelo__max_features": [None] + ["auto", "sqrt", "log2"] }, 
               "svc": { "modelo__kernel": ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
                        "modelo__degree": list(range(1,5,2)),
                        "modelo__gamma": ['scale', 'auto'],
                        "modelo__C": np.arange(0.05, 1.0, 0.1) } 
             }

In [3]:
import warnings

warnings.filterwarnings('ignore')

In [4]:

import matplotlib.pylab as plt

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.pipeline import Pipeline

for key in classificadores.keys():

    pipeTotal = Pipeline([
        ("padronizacao", StandardScaler()),
        ("modelo", classificadores[key])
    ])

    param_grid = parametros[key]

    modelo = GridSearchCV(pipeTotal, param_grid=param_grid)
    modelo.fit(X_tr, y_tr)
    y_pr = modelo.predict(X_te)

    print(key)
    hits = y_pr == y_te
    print(f'accuracy={sum(hits)/len(hits)}')
    print(f"best_params={modelo.best_params_}\n")

knn
accuracy=0.9304511278195489
best_params={'modelo__n_neighbors': 5, 'modelo__weights': 'distance'}

tree
accuracy=0.9285714285714286
best_params={'modelo__criterion': 'entropy', 'modelo__max_depth': 7, 'modelo__max_features': None, 'modelo__min_samples_split': 4, 'modelo__splitter': 'best'}

svc
accuracy=0.8966165413533834
best_params={'modelo__C': 0.9500000000000002, 'modelo__degree': 1, 'modelo__gamma': 'scale', 'modelo__kernel': 'linear'}



In [6]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier

import itertools

estimators = [
    ("knn", KNeighborsClassifier(n_neighbors=5, weights="distance")),
    ("tree", DecisionTreeClassifier(criterion='entropy', max_depth=None, max_features=None, min_samples_split=4, splitter='best')),
    ("svc", SVC(C=0.95, degree=1, gamma='scale', kernel='linear'))]

ensembles = { "voting": VotingClassifier(estimators=estimators), "stacking": StackingClassifier(estimators=estimators) }

weights = [[1,1,1], [3,2,2], [2,2,3], [2,3,2]] + [list(t) for t in itertools.permutations([1,2,3], 3)]

parametros["voting"] = { "voting": ["hard", "soft"], "weights": weights }

cv_5 = RepeatedKFold(n_splits=5, n_repeats=1, random_state=42)
cv_7 = RepeatedKFold(n_splits=7, n_repeats=1, random_state=42)
cv_9 = RepeatedKFold(n_splits=9, n_repeats=1, random_state=42)

parametros["stacking"] = { "cv": [cv_5, cv_7, cv_9], "passthrough": [True, False]}

for key in ensembles.keys():

    param_grid = parametros[key]

    modelo = GridSearchCV(ensembles[key], param_grid=param_grid)
    modelo.fit(X_tr, y_tr)
    en_pr = modelo.predict(X_te)

    print(key)
    en_hits = en_pr == y_te
    print(f'accuracy={sum(en_hits)/len(en_hits)}')
    print(f"best_params={modelo.best_params_}\n")


voting
accuracy=0.9266917293233082
best_params={'voting': 'hard', 'weights': [1, 3, 2]}

stacking
accuracy=0.9342105263157895
best_params={'cv': RepeatedKFold(n_repeats=1, n_splits=9, random_state=42), 'passthrough': False}

