## Create K-Nearest Neighbor Classifier Model
References: 
- Documentation scikit-learn (http://scikit-learn.org/stable/documentation.html)
- Introduction to Machine Learning with Python (http://shop.oreilly.com/product/0636920030515.do)

Scikit-Learn References:
- https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
- https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
- https://scikit-learn.org/stable/modules/generated/sklearn.metrics.plot_confusion_matrix.html

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import joblib

In [None]:
# Restore the weather data that was serialized at the end of the exploration step.
# joblib.load opens the path itself, so no explicit file handle is needed.
weather = joblib.load("data/weather.pkl")

# Preview the first rows as a quick sanity check
weather.head()

### Prepare Data for Modeling

- Divide the data into train and test subsets
- Fit the one-hot encoder on the training subset only, then use it to encode the categorical features of both the training and testing subsets
- Fit the scaler on the training subset only, then use it to scale the numeric features of both the training and testing subsets
- Concatenate the encoded categorical features with the scaled numeric features in each subset

- __We handled the missing values during Data Cleaning and Exploration__
    - Next week, we will look at a better way to handle missing values

In [None]:
from sklearn.model_selection import train_test_split

# Feature groups used throughout the notebook, plus the label to predict
categorical_cols = ["WindGustDir", "RainToday", "Month"]
continuous_cols = ["Sunshine", "Humidity3pm", "MaxTemp"]
predictor_cols = categorical_cols + continuous_cols
target_col = "RainTomorrow"

X = weather[predictor_cols]
y = weather[target_col]

# Stratify on the target so both subsets keep the population's class balance;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=0
)

# Class percentages for the full population and for each subset
for heading, target_subset in (("Population:\n", y),
                               ("Train:\n", y_train),
                               ("Test:\n", y_test)):
    print(heading, target_subset.value_counts(normalize=True) * 100)

In [None]:
# (rows, columns) of the train and test feature frames produced by the split
X_train.shape, X_test.shape

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

# Fit both transformers on the training rows only, so nothing from the test
# subset influences the encoding or the scaling.
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`
    encoder = OneHotEncoder(sparse_output=False, dtype=np.int32, handle_unknown='ignore')
except TypeError:
    # older releases only accept the original `sparse` keyword
    encoder = OneHotEncoder(sparse=False, dtype=np.int32, handle_unknown='ignore')
encoder.fit(X_train[categorical_cols])

scaler = MinMaxScaler().fit(X_train[continuous_cols])

# `get_feature_names` was replaced by `get_feature_names_out` in scikit-learn 1.0
# and removed in 1.2; use whichever one this installation provides.
if hasattr(encoder, "get_feature_names_out"):
    discrete_names = list(encoder.get_feature_names_out(categorical_cols))
else:
    discrete_names = list(encoder.get_feature_names(input_features=categorical_cols))

# create numeric indicator features for the discrete features to be used in modeling
# (index=... keeps the original row labels so X_train stays aligned with y_train)
Xtn_discrete = pd.DataFrame(
    encoder.transform(X_train[categorical_cols]),
    columns=discrete_names,
    index=X_train.index,
)

# scale the continuous features since I'm using a distance-based algorithm
Xtn_continuous = pd.DataFrame(
    scaler.transform(X_train[continuous_cols]),
    columns=list(continuous_cols),
    index=X_train.index,
)

# concatenate the continuous and discrete features into one dataframe
X_train = pd.concat([Xtn_continuous, Xtn_discrete], axis=1)

X_train.head()

In [None]:
# (rows, columns) of the training features after encoding and scaling
X_train.shape

### Prepare the test data set

In [None]:
# Reuse the encoder and scaler that were fitted on the training subset; the test
# rows are only transformed, never used for fitting.

# `get_feature_names` was replaced by `get_feature_names_out` in scikit-learn 1.0
# and removed in 1.2; use whichever one this installation provides.
if hasattr(encoder, "get_feature_names_out"):
    discrete_names = list(encoder.get_feature_names_out(categorical_cols))
else:
    discrete_names = list(encoder.get_feature_names(input_features=categorical_cols))

# create numeric indicator features for the discrete features to be used in modeling
# (index=... keeps the original row labels so X_test stays aligned with y_test)
Xt_discrete = pd.DataFrame(
    encoder.transform(X_test[categorical_cols]),
    columns=discrete_names,
    index=X_test.index,
)
Xt_continuous = pd.DataFrame(
    scaler.transform(X_test[continuous_cols]),
    columns=list(continuous_cols),
    index=X_test.index,
)

# concatenate the continuous and discrete features into one dataframe
X_test = pd.concat([Xt_continuous, Xt_discrete], axis=1)

X_test.head(1)

In [None]:
# (rows, columns) of the test features after encoding and scaling
X_test.shape

### Define and Fit a KNN Classifier

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

Distance metrics
https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.DistanceMetric.html#sklearn.neighbors.DistanceMetric

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier that looks at the 3 closest training samples,
# measuring closeness with Euclidean distance
knn_model = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
# Fit on the encoded/scaled training features and their RainTomorrow labels
knn_model.fit(X_train, y_train)

### Now let's get a prediction

In [None]:
# Predicted class label for the first row of the test set
knn_model.predict(X_test.head(1))

In [None]:
# Class probabilities for the first test row; columns follow knn_model.classes_.
# Append [:, 1] to keep only the second class's column
# (presumably 'Yes' -- verify against knn_model.classes_).
knn_model.predict_proba(X_test.head(1))

### Confusion Matrix & Accuracy

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report

# Class counts in the held-out test set, for reference when reading the matrix
print(y_test.value_counts())

# Make predictions against the test set
pred = knn_model.predict(X_test)

# Confusion matrix is returned with Predicted as the columns and Actual as the rows:
#          PN  PP
#      AN [tn  fp]
#      AP [fn  tp]
# Compute it once and reuse it for both the printout and the unpacking.
cm = confusion_matrix(y_test, pred)
print("confusion matrix:")
print(cm)
tn, fp, fn, tp = cm.ravel()
for caption, count in (('tn: ', tn), ('fp: ', fp), ('fn: ', fn), ('tp: ', tp)):
    print(caption, count)

# Overall accuracy, followed by the metrics that treat 'Yes' as the positive class
print("accuracy: %0.3f" % accuracy_score(y_test, pred))
for template, scorer in (("recall: %0.3f", recall_score),
                         ("precision: %0.3f", precision_score),
                         ("f-measure: %0.3f", f1_score)):
    print(template % scorer(y_test, pred, pos_label='Yes'))
print(classification_report(y_test, pred))

### Scikit-Learn Plot Confusion Matrix

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.plot_confusion_matrix.html

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of ConfusionMatrixDisplay.from_estimator, which accepts the same arguments.
# Fall back to the old function on releases that predate from_estimator.
try:
    draw_confusion_matrix = ConfusionMatrixDisplay.from_estimator
except AttributeError:
    from sklearn.metrics import plot_confusion_matrix as draw_confusion_matrix

# Axis tick labels, listed in the same order as the model's classes
# (assumes knn_model.classes_ is ['No', 'Yes'] -- verify before relabelling)
negative_label = 'No Rain'
positive_label = 'Rain'

# Plot the raw counts first, then the matrix normalized over the actual (true) classes
titles_options = [("Confusion matrix, without normalization", None),
                  ("Normalized confusion matrix", 'true')]
for title, normalize in titles_options:
    disp = draw_confusion_matrix(knn_model, X_test, y_test,
                                 display_labels=[negative_label, positive_label],
                                 cmap=plt.cm.Blues,
                                 normalize=normalize)
    disp.ax_.set_title(title)
    disp.ax_.set_xlabel('Predicted')
    disp.ax_.set_ylabel('Actual')

    print(title)
    print(disp.confusion_matrix)

plt.show()

### Build a Pipeline

In [None]:
# Re-read the raw CSV (parsing Date as datetimes) and derive a categorical Month
# feature from the calendar month of each Date value.
weather = pd.read_csv('data/weather.csv', parse_dates=['Date']).assign(
    Month=lambda frame: pd.Categorical(frame['Date'].dt.month)
)
weather.head()

In [None]:
from sklearn.model_selection import train_test_split

# Feature groups handed to the pipeline's column transformer, plus the label to predict
categorical_cols = ["WindGustDir", "RainToday", "Month"]
continuous_cols = ["Sunshine", "Humidity3pm", "MaxTemp"]
predictor_cols = categorical_cols + continuous_cols
target_col = "RainTomorrow"

X = weather[predictor_cols]
y = weather[target_col]

# Stratify on the target so both subsets keep the population's class balance;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=0
)

# Class percentages for the full population and for each subset
for heading, target_subset in (("Population:\n", y),
                               ("Train:\n", y_train),
                               ("Test:\n", y_test)):
    print(heading, target_subset.value_counts(normalize=True) * 100)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier

# Continuous features: fill missing values with the column mean, then min-max scale
num_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler()),
])

# One-hot encoder settings shared by both constructor spellings below.
# handle_unknown='error' makes transform raise on a category not seen during fit.
onehot_options = dict(drop='first', handle_unknown='error', dtype=np.int32)
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`
    onehot = OneHotEncoder(sparse_output=False, **onehot_options)
except TypeError:
    # older releases only accept the original `sparse` keyword
    onehot = OneHotEncoder(sparse=False, **onehot_options)

# Categorical features: fill missing values with the most frequent level, then one-hot encode
cat_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('enc', onehot),
])

# Route each column group through its own transformer; any other columns pass through untouched
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, continuous_cols),
        ('cat', cat_transformer, categorical_cols),
    ],
    remainder='passthrough',
)

# Full model: preprocessing followed by a 3-nearest-neighbour classifier (Euclidean distance)
pipe_knn = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('knn', KNeighborsClassifier(n_neighbors=3, metric='euclidean')),
])

pipe_knn.steps

In [None]:
# Fit the preprocessing steps and the classifier in one call on the raw training rows.
# Pipeline.fit returns the pipeline itself, so knn_model and pipe_knn are the same object.
knn_model = pipe_knn.fit(X_train, y_train)

### Now let's get a prediction

In [None]:
# Predicted class label for the first raw test row (the pipeline preprocesses it first)
knn_model.predict(X_test.head(1))

In [None]:
# Class probabilities for the first test row; columns follow knn_model.classes_.
# Append [:, 1] to keep only the second class's column
# (presumably 'Yes' -- verify against knn_model.classes_).
knn_model.predict_proba(X_test.head(1))

### Confusion Matrix & Accuracy

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report

# Class counts in the held-out test set, for reference when reading the matrix
print(y_test.value_counts())

# Make predictions against the test set
pred = knn_model.predict(X_test)

# Confusion matrix is returned with Predicted as the columns and Actual as the rows:
#          PN  PP
#      AN [tn  fp]
#      AP [fn  tp]
# Compute it once and reuse it for both the printout and the unpacking.
cm = confusion_matrix(y_test, pred)
print("confusion matrix:")
print(cm)
tn, fp, fn, tp = cm.ravel()
for caption, count in (('tn: ', tn), ('fp: ', fp), ('fn: ', fn), ('tp: ', tp)):
    print(caption, count)

# Overall accuracy, followed by the metrics that treat 'Yes' as the positive class
print("accuracy: %0.3f" % accuracy_score(y_test, pred))
for template, scorer in (("recall: %0.3f", recall_score),
                         ("precision: %0.3f", precision_score),
                         ("f-measure: %0.3f", f1_score)):
    print(template % scorer(y_test, pred, pos_label='Yes'))
print(classification_report(y_test, pred))

### Serialize My Model

In [None]:
# Display the fitted pipeline that is about to be serialized
knn_model

In [None]:
# Persist the fitted pipeline (preprocessing + KNN) to disk with joblib
with open("data/knn_model.pkl", "wb") as fwb:
    joblib.dump(knn_model, fwb)