# 이진분류 (Binary Classification)
- diabetes dataset

In [382]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [383]:
diabetes = pd.read_csv('diabetes.csv')
diabetes.shape

(768, 9)

In [384]:
df = diabetes.copy()
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [385]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [386]:
df.isna().sum(axis=0)

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [387]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [388]:
df.loc[df['BMI']==0, 'BMI'] = df['BMI'].mean()

In [389]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [390]:
x = df[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age']]
y = df['Outcome']

In [391]:
x.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [392]:
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [393]:
idx = int(len(x) * 0.8)
idx

614

In [394]:
x_train = x.iloc[:idx, :]
x_test = x.iloc[idx:,:]

In [395]:
x_train.shape, x_test.shape

((614, 8), (154, 8))

In [396]:
y_train = y[:idx]
y_test = y[idx:]
y_train.shape, y_test.shape

((614,), (154,))

In [397]:
u_p = x_train['Pregnancies'].mean()
std_p = x_train['Pregnancies'].std()
x_train['Pregnancies'] = (x_train['Pregnancies'] - u_p) / std_p
x_test['Pregnancies'] = (x_test['Pregnancies'] - u_p) / std_p

u_g = x_train['Glucose'].mean()
std_g = x_train['Glucose'].std()
x_train['Glucose'] = (x_train['Glucose'] - u_g) / std_g
x_test['Glucose'] = (x_test['Glucose'] - u_g) / std_g

u_bp = x_train['BloodPressure'].mean()
std_bp = x_train['BloodPressure'].std()
x_train['BloodPressure'] = (x_train['BloodPressure'] - u_bp) / std_bp
x_test['BloodPressure'] = (x_test['BloodPressure'] - u_bp) / std_bp

u_st = x_train['SkinThickness'].mean()
std_st = x_train['SkinThickness'].std()
x_train['SkinThickness'] = (x_train['SkinThickness'] - u_st) / std_st
x_test['SkinThickness'] = (x_test['SkinThickness'] - u_st) / std_st

u_i = x_train['Insulin'].mean()
std_i = x_train['Insulin'].std()
x_train['Insulin'] = (x_train['Insulin'] - u_i) / std_i
x_test['Insulin'] = (x_test['Insulin'] - u_i) / std_i

u_bmi = x_train['BMI'].mean()
std_bmi = x_train['BMI'].std()
x_train['BMI'] = (x_train['BMI'] - u_bmi) / std_bmi
x_test['BMI'] = (x_test['BMI'] - u_bmi) / std_bmi

u_dpf = x_train['DiabetesPedigreeFunction'].mean()
std_dpf = x_train['DiabetesPedigreeFunction'].std()
x_train['DiabetesPedigreeFunction'] = (x_train['DiabetesPedigreeFunction'] - u_dpf) / std_dpf
x_test['DiabetesPedigreeFunction'] = (x_test['DiabetesPedigreeFunction'] - u_dpf) / std_dpf

u_a = x_train['Age'].mean()
std_a = x_train['Age'].std()
x_train['Age'] = (x_train['Age'] - u_a) / std_a
x_test['Age'] = (x_test['Age'] - u_a) / std_a

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train['Pregnancies'] = (x_train['Pregnancies'] - u_p) / std_p
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test['Pregnancies'] = (x_test['Pregnancies'] - u_p) / std_p
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train['Glucose'] = (x_train['Glucose'] - u_g) / std_g
A value is trying to be

In [398]:
x_train.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.657075,0.842473,0.177335,0.898828,-0.688416,0.178234,0.432875,1.427267
1,-0.837162,-1.083905,-0.128662,0.522845,-0.688416,-0.827391,-0.386442,-0.186622
2,1.254769,1.912683,-0.230661,-1.294406,-0.688416,-1.301471,0.56646,-0.101681
3,-0.837162,-0.961595,-0.128662,0.146862,0.117454,-0.6119,-0.932654,-1.036038
4,-1.136009,0.506121,-1.454649,0.898828,0.751863,1.543011,5.363623,-0.016739


In [399]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

def print_matrics(y_test, y_pred):
  acc = accuracy_score(y_test, y_pred)
  re = recall_score(y_test, y_pred)
  pr = precision_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)

  print('정확도:', acc)
  print('재현율:', np.around(re, 4))
  print('정밀도:', np.around(pr, 4))
  print('f1:', np.around(f1, 4))

In [400]:
from sklearn.model_selection import GridSearchCV

grid_params = {
    'n_neighbors':list(range(3, 100)),
    'algorithm' : ['ball_tree', 'kd_tree', 'brute']
}

clf = KNeighborsClassifier()
grid = GridSearchCV(clf, grid_params, cv=5) # 데이터를 5등분한다.
grid.fit(x_train, y_train)

In [410]:
print('Best parameter:', grid.best_params_)
print('Best score:', grid.best_score_)

Best parameter: {'algorithm': 'ball_tree', 'n_neighbors': 13}
Best score: 0.7622684259629481


In [411]:
clf = KNeighborsClassifier(n_neighbors=15, algorithm='ball_tree')
clf.fit(x_train, y_train)

In [412]:
y_pred = clf.predict(x_test)
print_matrics(y_test, y_pred)

정확도: 0.7467532467532467
재현율: 0.4727
정밀도: 0.7222
f1: 0.5714
