In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV, KFold, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [2]:
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Questão 1

### Sem escalonamento

In [3]:
X = df.drop(['Outcome'],axis=1).values
y = df['Outcome'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train_holdout, X_validacao_holdout, y_train_holdout, y_validacao_holdout = train_test_split(X_test, y_test, test_size=0.3)

In [4]:
scores = []
for i in range(1,30):
    pipe_cls = make_pipeline(KNeighborsClassifier(n_neighbors=i))
    pipe_cls.fit(X_train, y_train)
    y_pred = pipe_cls.predict(X_validacao_holdout)
    scores.append(f1_score(y_validacao_holdout, y_pred))

print('Test F1-score: %.3f' % max(scores))
print('Better k value =', scores.index(max(scores)))

better_k = scores.index(max(scores))

scores.sort(reverse = True)
for item in scores:
    print(f'value: {item}')

Test F1-score: 0.500
Better k value = 12
value: 0.5
value: 0.5
value: 0.4666666666666667
value: 0.4666666666666667
value: 0.4666666666666667
value: 0.4666666666666667
value: 0.4516129032258065
value: 0.4516129032258065
value: 0.4516129032258065
value: 0.4516129032258065
value: 0.4516129032258065
value: 0.4516129032258065
value: 0.42857142857142855
value: 0.42857142857142855
value: 0.4242424242424242
value: 0.41379310344827586
value: 0.41379310344827586
value: 0.39999999999999997
value: 0.39999999999999997
value: 0.39999999999999997
value: 0.39999999999999997
value: 0.3870967741935484
value: 0.3870967741935484
value: 0.3846153846153846
value: 0.37499999999999994
value: 0.37499999999999994
value: 0.36363636363636365
value: 0.35714285714285715
value: 0.3076923076923077


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

pipe_cls = make_pipeline(KNeighborsClassifier(n_neighbors=better_k))
pipe_cls.fit(X_train, y_train)
y_pred = pipe_cls.predict(X_test)
print(f'final model f1-score : {f1_score(y_test, y_pred)*100}%')

final model f1-score : 51.06382978723405%


### Com escalonamento

In [6]:
X_ = df.drop(['Outcome'],axis=1)
y_ = df['Outcome']

X_train_, X_test_, y_train_, y_test_ = train_test_split(X_, y_, test_size=0.2, random_state=42, stratify=y)
X_train_holdout_, X_validacao_holdout_, y_train_holdout_, y_validacao_holdout_ = train_test_split(X_test_, y_test_, test_size=0.3)

In [7]:
scores_ = []
for i in range(1,30):
    pipe_cls = make_pipeline(StandardScaler(), SimpleImputer(strategy='mean'),KNeighborsClassifier(n_neighbors=i))
    pipe_cls.fit(X_train_, y_train_)
    y_pred_ = pipe_cls.predict(X_validacao_holdout_)
    scores_.append(f1_score(y_validacao_holdout_, y_pred_))
    #print('Test Accuracy: %.3f' % pipe_cls.score(X_test_, y_test_))
    #print('Test F1-score: %.3f' % f1_score(y_test_, y_pred_))
print('Test F1-score: %.3f' % max(scores_))
print('Better k value =', scores_.index(max(scores_)))

better_k_ = scores_.index(max(scores_))

scores_.sort(reverse = True)
for item in scores_:
    print(f'value: {item}')

Test F1-score: 0.621
Better k value = 11
value: 0.6206896551724138
value: 0.6206896551724138
value: 0.6206896551724138
value: 0.6206896551724138
value: 0.5999999999999999
value: 0.5925925925925927
value: 0.5925925925925927
value: 0.5925925925925927
value: 0.5925925925925927
value: 0.5806451612903226
value: 0.5806451612903226
value: 0.5806451612903226
value: 0.5806451612903226
value: 0.5714285714285715
value: 0.5714285714285715
value: 0.5714285714285715
value: 0.5625
value: 0.5599999999999999
value: 0.5599999999999999
value: 0.5599999999999999
value: 0.5517241379310345
value: 0.5517241379310345
value: 0.5454545454545455
value: 0.5384615384615384
value: 0.5185185185185186
value: 0.5161290322580646
value: 0.5
value: 0.48000000000000004
value: 0.4545454545454546


In [8]:
X_train_, X_test_, y_train_, y_test_ = train_test_split(X_, y_, test_size=0.2, random_state=42, stratify=y)

pipe_cls = make_pipeline(StandardScaler(), SimpleImputer(strategy='mean'),KNeighborsClassifier(n_neighbors=better_k_))
pipe_cls.fit(X_train_, y_train_)
y_pred_ = pipe_cls.predict(X_test_)
print(f'final model f1-score : {f1_score(y_test_, y_pred_)*100}%')

final model f1-score : 59.04761904761905%


# Questão 2

### sem escalonamento

In [9]:
kfold = StratifiedKFold(n_splits=5).split(X_train, y_train)

#sc = []
f1_sc = []

for j in range(1,30):
    pipe_lr = make_pipeline(SimpleImputer(strategy='mean'),KNeighborsClassifier(n_neighbors=j))
    for k, (train, test) in enumerate(kfold):
        pipe_lr.fit(X_train[train], y_train[train])
        y_pred_2 = pipe_lr.predict(X_test)
        #score = pipe_cls.score(X_train[test], y_train[test])
        #sc.append(score)
        f1_sc.append(f1_score(y_test, y_pred_2))


print(f'max f1-score in array {max(f1_sc) : .2f}')
kfold_result = cross_val_score(pipe_lr, X, y, cv=5)
print(f'mean of cross_val_score with n_split=5: {np.mean(cross_val_score(pipe_lr, X, y, cv=5)) : .2f}')
max_kfold_index = np.where(kfold_result == np.amax(kfold_result))
print(f'better kfold :  {int(max_kfold_index[0])}')

pipe = make_pipeline(SimpleImputer(strategy='mean'),KNeighborsClassifier(n_neighbors=int(max_kfold_index[0])))
pipe.fit(X_train_, y_train_)
y_pred = pipe.predict(X_test)
print(f'f1 score kfold = 5 without scaler {f1_score(y_test, y_pred) : .2f}')


f1_sc.sort(reverse = True)
for item in f1_sc:
    print(f'value: {item}')

max f1-score in array  0.52
mean of cross_val_score with n_split=5:  0.74
better kfold :  2
f1 score kfold = 5 without scaler  0.48
value: 0.5225225225225226
value: 0.5225225225225226
value: 0.5172413793103449
value: 0.509433962264151
value: 0.509090909090909


### Com StandardScaler

In [10]:
pipe = make_pipeline(StandardScaler(),SimpleImputer(strategy='mean'),KNeighborsClassifier(n_neighbors=int(max_kfold_index[0])))
pipe.fit(X_train_, y_train_)
y_pred = pipe.predict(X_test)
print(f'f1 score kfold = 5 with StandardScaler {f1_score(y_test, y_pred) : .2f}')

f1 score kfold = 5 with StandardScaler  0.50


In [11]:
print(f'mean of cross_val_score with n_split=5: {np.mean(cross_val_score(pipe_lr, X, y, cv=5)) : .2f}')
max_kfold_index = np.where(kfold_result == np.amax(kfold_result))
print(f'better kfold :  {int(max_kfold_index[0])}')

f1_sc.sort(reverse = True)
for item in f1_sc:
    print(f'value: {item}')

mean of cross_val_score with n_split=5:  0.74
better kfold :  2
value: 0.5225225225225226
value: 0.5225225225225226
value: 0.5172413793103449
value: 0.509433962264151
value: 0.509090909090909


# Questão 3

In [31]:
X = df.drop(['Outcome'],axis=1).values
y = df['Outcome'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train_holdout, X_validacao_holdout, y_train_holdout, y_validacao_holdout = train_test_split(X_test, y_test, test_size=0.3)

### Sem escalonamento

In [32]:
scores_gsv = []
for i in range(1,30):
    pipe_knn = make_pipeline(KNeighborsClassifier(n_neighbors=i))
    pipe_knn.fit(X_train, y_train)
    y_pred = pipe_knn.predict(X_validacao_holdout)
    scores_gsv.append(f1_score(y_validacao_holdout, y_pred))

print('Test F1-score: %.3f' % max(scores_gsv))
print('Better k value =', scores_gsv.index(max(scores_gsv)))

Test F1-score: 0.703
Better k value = 2


In [52]:
arr = []
for elem in range(1,30):
    arr.append(elem)

grid_params = {
    'n_neighbors' : arr
}

gs = GridSearchCV(
    KNeighborsClassifier(),
    grid_params,
    verbose=1,
    cv = 5,
    n_jobs=-1)

gs_result = gs.fit(X_train, y_train)
gs_pred = gs_result.predict(X_validacao_holdout)
print(f'f1-score with GridsearchCV = 5: {f1_score(gs_pred, y_pred)*100 : .2f}%')

Fitting 5 folds for each of 29 candidates, totalling 145 fits
f1-score with GridsearchCV = 5:  64.00%


### Com escalonamento

In [55]:
X_ = df.drop(['Outcome'],axis=1).values
y_ = df['Outcome'].values

X_train_, X_test_, y_train_, y_test_ = train_test_split(X_, y_, test_size=0.2, random_state=42, stratify=y)
X_train_holdout_, X_validacao_holdout_, y_train_holdout_, y_validacao_holdout_ = train_test_split(X_test_, y_test_, test_size=0.3)

In [56]:
scores_gsv_ = []
for i in range(1,30):
    pipe_knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=i))
    pipe_knn.fit(X_train_, y_train_)
    y_pred_ = pipe_knn.predict(X_validacao_holdout_)
    scores_gsv_.append(f1_score(y_validacao_holdout_, y_pred_))

print('Test F1-score: %.3f' % max(scores_gsv_))
print('Better k value =', scores_gsv_.index(max(scores_gsv_)))

Test F1-score: 0.733
Better k value = 5


In [59]:
arr_ = []
for elem in range(1,30):
    arr_.append(elem)

grid_params = {
    'n_neighbors' : arr_
}

gs_ = GridSearchCV(
    KNeighborsClassifier(),
    grid_params,
    verbose=1,
    cv = 5,
    n_jobs=-1)

gs_result_ = gs_.fit(X_train_, y_train_)
gs_pred_ = gs_result_.predict(X_validacao_holdout_)
print(f'f1-score with GridsearchCV = 5: {f1_score(gs_pred_, y_pred_)*100 : .2f}%')

Fitting 5 folds for each of 29 candidates, totalling 145 fits
f1-score with GridsearchCV = 5:  63.16%
