In [3]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot  as plt

In [14]:
# Load the dataset from "diabetes.csv" (a path relative to the current working directory).
df_diabet = pd.read_csv("diabetes.csv")

In [15]:
# Preview the first five rows (five is the default row count for head()).
df_diabet.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [16]:
# Print column dtypes and non-null counts to check for missing values.
# NOTE(review): the preview above shows literal zeros in SkinThickness and Insulin;
# these may encode missing measurements rather than real readings - TODO confirm.
df_diabet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
Pregnancies                 768 non-null int64
Glucose                     768 non-null int64
BloodPressure               768 non-null int64
SkinThickness               768 non-null int64
Insulin                     768 non-null int64
BMI                         768 non-null float64
DiabetesPedigreeFunction    768 non-null float64
Age                         768 non-null int64
Outcome                     768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


### Normalização dos dados em uma escala de 0-1

In [17]:
from sklearn import preprocessing

In [18]:
# Learn the per-column minimum and maximum used for the 0-1 scaling.
# NOTE(review): the scaler is fitted on the whole frame, i.e. on the Outcome
# column and on rows that later land in the test split; consider fitting on
# the training features only to avoid leaking test statistics.
normalizer = preprocessing.MinMaxScaler()
# fit() returns the scaler itself; assigning the result keeps the cell silent.
normalizer = normalizer.fit(df_diabet)

In [19]:
# Rebuild a labelled frame from the scaled values, restoring both the column
# names and the original row index (previously the index was silently reset
# to 0..n-1, which would misalign rows if the frame were ever filtered first).
# NOTE(review): this rebinds df_diabet, so re-running the cell rescales data
# that is already scaled; a separate name for the scaled frame would make the
# cell idempotent, but later cells read df_diabet.
df_diabet = pd.DataFrame(
    normalizer.transform(df_diabet),
    columns=df_diabet.columns,
    index=df_diabet.index,
)

In [20]:
# Preview the scaled frame; every column, Outcome included, went through the scaler.
df_diabet.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.352941,0.743719,0.590164,0.353535,0.0,0.500745,0.234415,0.483333,1.0
1,0.058824,0.427136,0.540984,0.292929,0.0,0.396423,0.116567,0.166667,0.0
2,0.470588,0.919598,0.52459,0.0,0.0,0.347243,0.253629,0.183333,1.0
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,0.0,0.0
4,0.0,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,0.2,1.0


### Separação da base de teste e base de treino

In [49]:
from sklearn.model_selection import train_test_split

# Predictor columns used as model input.
# NOTE(review): SkinThickness is absent from this list - confirm the omission
# is intentional.
colunas_preditoras = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]
X = df_diabet[colunas_preditoras]

# Target vector; Outcome was scaled together with the features, so it holds floats.
y = df_diabet["Outcome"]

In [50]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=69,
)

### Modelagem

In [32]:
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier

In [54]:
# K Neighbors Classifier: each prediction is decided by the 3 nearest training rows.
# fit() returns the estimator itself, so construction and training can be chained.
knc = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Bare expression so the cell still echoes the fitted estimator's parameters.
knc

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [55]:
# Predicted labels for the held-out test rows from the 3-nearest-neighbours model.
predicao = knc.predict(X_test)

In [56]:
from sklearn import metrics

# Confusion matrix of the k-NN test predictions, wrapped in a DataFrame for display.
# Per scikit-learn's convention, rows are the true classes and columns the predicted ones.
pd.DataFrame(metrics.confusion_matrix(y_test, predicao))

Unnamed: 0,0,1
0,85,14
1,22,33


In [57]:
print("Precisao do kNeighborsClassifier: ",metrics.accuracy_score(y_test, predicao))

Precisao do kNeighborsClassifier:  0.7662337662337663


In [89]:
# Radius Neighbors Classifier: each prediction is decided by every training row
# within distance 0.45 of the query row.
# NOTE(review): outlier_label is left at its default (None); per the
# scikit-learn docs predict() then raises for a query row that has no training
# neighbour inside the radius - confirm that is acceptable for new data.
rnc = RadiusNeighborsClassifier(radius=0.45).fit(X_train, y_train)

# Bare expression so the cell still echoes the fitted estimator's parameters.
rnc

RadiusNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                          metric_params=None, n_jobs=None, outlier_label=None,
                          p=2, radius=0.45, weights='uniform')

In [90]:
# Predicted labels for the held-out test rows from the radius-based model.
predicaor = rnc.predict(X_test)

In [91]:
# Confusion matrix of the radius-based test predictions, wrapped in a DataFrame for display.
# Per scikit-learn's convention, rows are the true classes and columns the predicted ones.
pd.DataFrame(metrics.confusion_matrix(y_test, predicaor))

Unnamed: 0,0,1
0,93,6
1,34,21


In [92]:
print("Precisao do RadiusNeighborsClassifier: ",metrics.accuracy_score(y_test, predicaor))

Precisao do RadiusNeighborsClassifier:  0.7402597402597403


### Testando diversos hiperparâmetros

In [94]:
from sklearn.model_selection import GridSearchCV

In [107]:
# Search space: 5 neighbour counts x 2 weighting schemes x 2 distance metrics
# = 20 candidate configurations.
hyperparametros = {
    "n_neighbors": [1, 3, 5, 7, 9],
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan"],
}

# Exhaustive search over the grid with 3-fold cross-validation;
# verbose=1 makes fit() report its progress.
gs = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=hyperparametros,
    cv=3,
    verbose=1,
)

In [108]:
# Run the grid search on the training split only; the test split stays untouched.
gs.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    0.5s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': [1, 3, 5, 7, 9],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [110]:
# Estimator configured with the best-scoring parameter combination found by the search.
gs.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='distance')

In [113]:
# Mean cross-validated score of the best parameter combination.
# NOTE(review): this is measured on the training folds, not on X_test, so it
# is not directly comparable to the test-set figures printed earlier.
gs.best_score_

0.744299674267101

In [114]:
# Parameter values from the grid that achieved the best cross-validated score.
gs.best_params_

{'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}

In [1]:
import tensorflow as tf