In [None]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated in recent IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np
import pandas as pd

## K-Nearest Neighbors Classification
### 1 Digits Classification Exercise

https://scikit-learn.org/stable/auto_examples/exercises/plot_digits_classification_exercise.html#sphx-glr-auto-examples-exercises-plot-digits-classification-exercise-py

In [None]:
from sklearn import datasets, neighbors, linear_model
from sklearn.neighbors import KNeighborsClassifier

# Fetch the digits dataset and unpack its feature matrix and label vector.
digits = datasets.load_digits()
X_digits, y_digits = digits.data, digits.target
# Rescale all features by the single largest value in the matrix.
X_digits = X_digits / X_digits.max()

In [None]:
# # For later comparison (standardization instead of max-scaling)
# from sklearn.preprocessing import StandardScaler
# sc_X = StandardScaler()
# X_digits = sc_X.fit_transform(X_digits,)

In [None]:
# Hold out the last 10% of the samples (in stored order, no shuffling) as the test set.
n_samples = len(X_digits)
n_train = int(.9 * n_samples)

X_train, X_test = X_digits[:n_train], X_digits[n_train:]
y_train, y_test = y_digits[:n_train], y_digits[n_train:]

knn      = neighbors.KNeighborsClassifier()
logistic = linear_model.LogisticRegression(max_iter=1000)

# Fit each model on the training part and print its score on the held-out part.
for label, model in (('KNN score               ', knn),
                     ('LogisticRegression score', logistic)):
    print(f'{label}: {model.fit(X_train, y_train).score(X_test, y_test)}')

In [None]:
# List the hyperparameters of the KNN classifier, which was created with default arguments.
knn.get_params()

### 2 Step by Step Diabetes Classification-KNN-detailed
https://www.kaggle.com/shrutimechlearn/step-by-step-diabetes-classification-knn-detailed

In [None]:
# Load the diabetes dataset from the CSV file in the sibling data directory.
diabetes_data = pd.read_csv('../data/diabetes.csv')
# Preview the first rows of the DataFrame (head() defaults to five).
diabetes_data.head()

In [None]:
def df_quailty(df):
    """Summarise the per-column data quality of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with four columns: the dtype, the number
        of non-missing values, the number of missing values and the number of
        distinct values.
    """
    # Insertion order of the dict fixes the column order of the report.
    report_columns = {
        '데이터형태(dtypes)': df.dtypes,
        '비 결측치 수(notnull)': df.notna().sum(),
        '결측치 수(null)': df.isna().sum(),
        '고유값 수(nunique)': df.nunique(),
    }
    return pd.DataFrame(report_columns)

# Per-column overview of the diabetes data: dtype, non-null count, null count, unique values.
df_quailty(diabetes_data)

In [None]:
# Draw one histogram per numeric column on a shared 15x15-inch figure.
diabetes_data.hist(figsize = (15,15))

In [None]:
# Annotated heatmap of the pairwise column correlations, drawn on an explicit 8x6 axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(diabetes_data.corr(), annot=True, cmap='Blues', ax=ax)

#### heatmap 대신 간단히 사용할 수 있는 방법

In [None]:
# Render the correlation matrix as a colour-graded table via the pandas Styler.
diabetes_data.corr().style.background_gradient(cmap='Blues')

In [None]:
# Features are every column except the label; the label is the "Outcome" column.
X = diabetes_data.drop(columns=["Outcome"])
y = diabetes_data["Outcome"]

In [None]:
from sklearn.model_selection import KFold, train_test_split

# 80/20 hold-out split, stratified on the label and seeded for repeatability.
split = train_test_split(X, y, test_size=0.2, stratify=y, random_state=11)
X_train, X_test, y_train, y_test = split

# Shuffled 3-fold splitter reused as the `cv` argument of the hyperparameter search.
kf = KFold(n_splits=3, shuffle=True, random_state=11)

#### Randomized search on the raw (unscaled) features

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate hyperparameters for the KNN classifier.
# NOTE(review): `algorithm` and `leaf_size` choose the neighbour-lookup structure,
# so they presumably affect speed rather than predictions; confirm whether they
# are worth spending search iterations on.
param_dist = dict(n_neighbors = list(range(5, 105, 2)),     # odd k from 5 to 103
                  weights     = ['uniform', 'distance'],
                  algorithm   = ['ball_tree', 'kd_tree'],
                  leaf_size   = list(range(5, 50)))

# Seed the search itself rather than NumPy's global RNG: the sampled candidates
# then do not depend on whatever else has touched the global random state.
randomized = RandomizedSearchCV(KNeighborsClassifier(),
                                param_distributions=param_dist, cv=kf,
                                n_iter=64, scoring='recall', verbose=True,
                                random_state=1357)
randomized.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score, score on the held-out split, and the winning parameters.
best_cv_score = randomized.best_score_
holdout_score = randomized.score(X_test, y_test)
print(f'optimal train score: {best_cv_score:.3f}')
print(f'test score         : {holdout_score:.3f}')
print(f'optimal parameter  : {randomized.best_params_}')

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column (all columns except the "Outcome" label).
# Column names and index are taken from the source frame instead of a
# hand-written list, so X cannot drift out of sync with the CSV schema or
# lose row alignment with y.
# NOTE(review): the scaler is fitted on ALL rows before the train/test split
# below, so test-set statistics leak into training; consider fitting it on the
# training data only (e.g. a Pipeline inside the search).
features = diabetes_data.drop(columns=["Outcome"])
sc_X = StandardScaler()
X = pd.DataFrame(sc_X.fit_transform(features),
                 columns=features.columns, index=features.index)
y = diabetes_data.Outcome

In [None]:
# Class balance of the target, shown as proportions rather than raw counts.
y.value_counts(normalize=True)

In [None]:
from sklearn.model_selection import KFold, train_test_split

# 80/20 hold-out split, stratified on the label and seeded for repeatability.
split = train_test_split(X, y, test_size=0.2, stratify=y, random_state=11)
X_train, X_test, y_train, y_test = split

# Shuffled 3-fold splitter reused as the `cv` argument of the hyperparameter search.
kf = KFold(n_splits=3, shuffle=True, random_state=11)

#### Randomized search on the standardized features

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate hyperparameters for the KNN classifier.
# NOTE(review): `algorithm` and `leaf_size` choose the neighbour-lookup structure,
# so they presumably affect speed rather than predictions; confirm whether they
# are worth spending search iterations on.
param_dist = dict(n_neighbors = list(range(5, 105, 2)),     # odd k from 5 to 103
                  weights     = ['uniform', 'distance'],
                  algorithm   = ['ball_tree', 'kd_tree'],
                  leaf_size   = list(range(5, 50)))

# Seed the search itself rather than NumPy's global RNG: the sampled candidates
# then do not depend on whatever else has touched the global random state.
randomized = RandomizedSearchCV(KNeighborsClassifier(),
                                param_distributions=param_dist, cv=kf,
                                n_iter=64, scoring='recall', verbose=True,
                                random_state=1357)
randomized.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score, score on the held-out split, and the winning parameters.
best_cv_score = randomized.best_score_
holdout_score = randomized.score(X_test, y_test)
print(f'optimal train score: {best_cv_score:.3f}')
print(f'test score         : {holdout_score:.3f}')
print(f'optimal parameter  : {randomized.best_params_}')

In [None]:
# Load the search's full cross-validation log into a DataFrame and preview the first rows.
results = pd.DataFrame(randomized.cv_results_)
results.head()

In [None]:
# Keep only the parameter set, its mean CV score and its rank, then show the ten best-ranked rows.
summary_columns = ['params', 'mean_test_score', 'rank_test_score']
results = pd.DataFrame(randomized.cv_results_)[summary_columns]
results.sort_values('rank_test_score').round(4).head(10)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, classification_report

# Restore the default rc settings (undoes the seaborn theme) before drawing the matrix.
sns.reset_defaults()

# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back to the old helper only on versions that
# predate `ConfusionMatrixDisplay.from_estimator`.
if hasattr(ConfusionMatrixDisplay, "from_estimator"):
    ConfusionMatrixDisplay.from_estimator(randomized, X_test, y_test)
else:
    from sklearn.metrics import plot_confusion_matrix
    plot_confusion_matrix(randomized, X_test, y_test)

# Per-class precision / recall / F1 on the held-out split.
y_pred = randomized.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
import scikitplot as skplt
# Class-membership probabilities predicted by the fitted search for the test rows.
y_probas = randomized.predict_proba(X_test)
# ROC curves computed from the true labels and those probabilities.
skplt.metrics.plot_roc(y_test, y_probas)

In [None]:
# Precision-recall curves from the same test-set probabilities used for the ROC plot.
skplt.metrics.plot_precision_recall(y_test, y_probas)

## K-Nearest Neighbors Regression

In [None]:
# Synthetic 1-D regression data: 40 sorted x-values drawn from [0, 5) with y = sin(x),
# plus a dense grid T of 500 points on which the fitted models are evaluated.
np.random.seed(0)
X = np.sort(5 * np.random.rand(40, 1), axis=0)
T = np.linspace(0, 5, 500)[:, np.newaxis]
y = np.sin(X).ravel()

# Add noise to every fifth target.
y[::5] += 0.5 - np.random.rand(8)


# Fit a regressor for each combination of weighting scheme and neighbour count:
# grid rows vary the weights, grid columns vary k.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10,6))

for axes_row, weights in zip(axes, ['uniform', 'distance']):
    for ax, n_neighbors in zip(axes_row, [3, 5]):
        knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights)
        knn.fit(X, y)
        y_ = knn.predict(T)

        ax.scatter(X, y, color='darkorange', label='data')
        ax.plot(T, y_, color='navy', label='prediction')
        ax.axis('tight')
        ax.legend()
        ax.title.set_text(f"KNeighborsRegressor (k = {n_neighbors}, weights = '{weights}')")

plt.tight_layout()
plt.show()

## KNN Imputer

In [None]:
from sklearn.impute import KNNImputer

# Toy matrix with two missing entries (np.nan).
X = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]]
# A bare `X` in the middle of a cell is evaluated and discarded, so print the
# input explicitly to make the before/after comparison visible.
print(np.array(X))

# Fill each missing value from the 2 nearest rows; the imputed matrix is the cell output.
imputer = KNNImputer(n_neighbors=2)
imputer.fit_transform(X)