# Optimize KNN
This notebook tunes a K-nearest-neighbours (KNN) classifier on the 51 selected features. The dataset contains multiple categorical variables and a few numerical variables. Since churn prediction is a supervised classification problem, we can apply a popular classification algorithm such as KNN. KNN is one of the simplest classification methods: it assigns a sample to the class that dominates among its closest neighbours under a distance measure.
Our underlying assumption is that the histories of other customers, together with the current customer's data, are a good basis for predicting customer churn.

In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import math

# own modules
import eda_methods as eda

# visualization
import seaborn as sns
sns.set(style="white")  
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
from pandas.plotting import scatter_matrix

# warnings handler
import warnings
warnings.filterwarnings("ignore")

# Machine Learning Libraries
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import fbeta_score, accuracy_score, f1_score, recall_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer 
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

#Pipeline
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import fbeta_score, make_scorer

random_state=1

In [2]:
from sklearn.metrics import fbeta_score, accuracy_score

def train_predict(modelname, y_train, y_test, predictions_train, predictions_test):
    '''
    Collect train and test classification metrics for a set of predictions.

    The function does not fit or call any model; it only scores the
    predictions it is given against the true labels.

    inputs:
       - modelname: label stored under the 'model' key of the result
       - y_train: true labels of the training set
       - y_test: true labels of the test set
       - predictions_train: predicted labels for the training set
       - predictions_test: predicted labels for the test set

    returns:
       dict with the keys 'model', 'acc_*', 'f1_*', 'recall_*',
       'precision_*' and 'fbeta_*' (each with a '_train' and '_test' variant)
    '''
    splits = (
        ('train', y_train, predictions_train),
        ('test', y_test, predictions_test),
    )
    # scored with scikit-learn's default averaging (no `average` argument passed)
    scorers = (
        ('acc', accuracy_score),
        ('f1', f1_score),
        ('recall', recall_score),
        ('precision', precision_score),
    )

    results = {'model': modelname}
    for metric_name, scorer in scorers:
        for split_name, y_true, y_pred in splits:
            results['{}_{}'.format(metric_name, split_name)] = scorer(y_true, y_pred)

    # F-beta (beta = 0.5 weights precision higher than recall).
    # NOTE: unlike the scores above this one is weighted-averaged over the
    # classes, so it is not directly comparable with f1/recall/precision.
    for split_name, y_true, y_pred in splits:
        score = fbeta_score(y_true, y_pred, beta=.5, average='weighted')
        # round via float() so this works whether the metric returns a
        # NumPy scalar or a plain Python float
        results['fbeta_{}'.format(split_name)] = round(float(score), 2)

    # Return the results
    return results

## Data

In [3]:
# Load the cleaned, feature-engineered table
df = pd.read_csv('data/df_clean_engineered_all.csv')

# Target column
y = df['churn']

# Remove the target and the columns that are not used as features, then
# one-hot encode the categorical columns (first level of each is dropped)
df = pd.get_dummies(
    df.drop(columns=['churn', 'plz_3', 'abo_registrierung_min', 'nl_registrierung_min', 'ort']),
    columns=[
        'kanal',
        'objekt_name',
        'aboform_name',
        'zahlung_rhythmus_name',
        'zahlung_weg_name',
        'plz_1',
        'plz_2',
        'land_iso_code',
        'anrede',
        'titel',
    ],
    drop_first=True,
)

In [7]:
# (rows, columns) of the dummy-encoded feature frame
df.shape

(184660, 307)

In [9]:
# Restrict the feature frame to the 51 selected columns
X = df.loc[:, [
    'zahlung_weg_name_Rechnung',
    'zahlung_rhythmus_name_halbjährlich',
    'rechnungsmonat',
    'received_anzahl_6m',
    'openedanzahl_6m',
    'objekt_name_ZEIT Digital',
    'nl_zeitbrief',
    'nl_aktivitaet',
    'liefer_beginn_evt',
    'cnt_umwandlungsstatus2_dkey',
    'clickrate_3m',
    'anrede_Frau',
    'aboform_name_Geschenkabo',
    'unsubscribed_anzahl_1m',
    'studentenabo',
    'received_anzahl_bestandskunden_6m',
    'openrate_produktnews_3m',
    'opened_anzahl_bestandskunden_6m',
    'objekt_name_DIE ZEIT - CHRIST & WELT',
    'nl_zeitshop',
    'nl_opt_in_sum',
    'nl_opened_1m',
    'kanal_andere',
    'kanal_B2B',
    'clicked_anzahl_6m',
    'che_reg',
    'MONTH_DELTA_nl_min',
    'zon_zp_red',
    'zahlung_rhythmus_name_vierteljährlich',
    'unsubscribed_anzahl_hamburg_1m',
    'unsubscribed_anzahl_6m',
    'sum_zon',
    'sum_reg',
    'shop_kauf',
    'plz_2_10',
    'plz_1_7',
    'plz_1_1',
    'openrate_zeitbrief_3m',
    'openrate_produktnews_1m',
    'openrate_3m',
    'openrate_1m',
    'nl_unsubscribed_6m',
    'nl_fdz_organisch',
    'metropole',
    'cnt_abo_magazin',
    'cnt_abo_diezeit_digital',
    'cnt_abo',
    'clicked_anzahl_bestandskunden_3m',
    'aboform_name_Probeabo',
    'aboform_name_Negative Option',
    'MONTH_DELTA_abo_min',
]]

# (rows, columns) of the selected feature matrix
X.shape

(184660, 51)

## RandomSearch CV

In [13]:
def pipeline_optimization(X,y,balance=None):
    '''
    Tune a KNN classifier with a randomized hyperparameter search and
    report train/test metrics.

    Steps: stratified train/test split -> optional random resampling of the
    training split -> preprocessing + KNN pipeline -> RandomizedSearchCV
    (3 folds, scored on precision) -> metrics via train_predict. The best
    score, best parameters and the test confusion matrix are printed.

    inputs:
       - X: feature DataFrame; object-dtype columns are treated as
            categorical, all other columns as numeric
       - y: target labels aligned with X
       - balance: 'over' oversamples the minority class, 'under' undersamples
                  the majority class; any other value (default None) leaves
                  the training split as it is

    returns:
       DataFrame with one row of metrics per tuned model (keys as produced
       by train_predict)
    '''

    # divide features by dtype
    categoric_features = list(X.columns[X.dtypes==object])
    numeric_features = list(X.columns[X.dtypes != object])

    # split train and test set (stratified on the target)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state,stratify=y)

    # Only the training split is resampled; the test split is left untouched.
    # NOTE(review): resampling happens before cross-validation, so the CV
    # folds are drawn from the resampled data. With 'over' this can place
    # copies of the same row in both the fit and the validation part of a fold.
    if balance == 'over':
        # imported locally: the notebook's import cell only brings in RandomUnderSampler
        from imblearn.over_sampling import RandomOverSampler
        # define oversampling strategy
        print('Oversampling')
        oversample = RandomOverSampler(sampling_strategy='minority', random_state=random_state)
        X_train, y_train = oversample.fit_resample(X_train, y_train)
    elif balance == 'under':
        print('Undersampling')
        # define undersample strategy
        undersample = RandomUnderSampler(sampling_strategy='majority', random_state=random_state)
        X_train, y_train = undersample.fit_resample(X_train, y_train)

    # Hyperparameter grid (keys are prefixed with the pipeline step name 'KNN')
    param_knn = {'KNN__n_neighbors':[10,15,20],
                 'KNN__weights':['uniform', 'distance'],
                 'KNN__p': [1,2],
                 'KNN__metric':['euclidean', 'manhattan', 'minkowski']
                }

    models={
        'KNN' : KNeighborsClassifier(n_jobs=-1)
        }

    # create preprocessors
    numeric_transformer = Pipeline(steps=[
        ('imputer_num', SimpleImputer(strategy='median')),
        ('scaler', MinMaxScaler())
    ])

    categorical_transformer = Pipeline(steps=[
            ('imputer_cat', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categoric_features)
        ])

    # one metrics dict per model; turned into a DataFrame once at the end
    records = []

    # process pipeline for every model
    for name, estimator in models.items():

        print(name)
        pipe = Pipeline(steps=[('preprocessor', preprocessor),
                               (name, estimator)
                               ])

        # seeded so that the sampled candidates are reproducible
        grid_knn = RandomizedSearchCV(pipe, param_knn, cv=3, scoring='precision',
                           verbose=5, n_jobs=-1, n_iter = 100,
                           random_state=random_state)
        # fit model
        grid_knn.fit(X_train, y_train)

        # Show best parameters
        print('Best score:\n{:.2f}'.format(grid_knn.best_score_))
        print("Best parameters:\n{}".format(grid_knn.best_params_))

        # predict results
        # NOTE: the train predictions are made on the same rows the search was
        # fitted on, so the train metrics are optimistic compared with test.
        y_train_pred = grid_knn.predict(X_train)
        y_test_pred = grid_knn.predict(X_test)

        records.append(train_predict(name, y_train, y_test, y_train_pred, y_test_pred))

        # print results
        print("\nConfusion matrix on test")
        print(confusion_matrix(y_test, y_test_pred))
        print("\n")

    # columns follow the key order of the train_predict result
    model_results = pd.DataFrame(records)
    return model_results

In [14]:
RandomizedSearch_knn = pipeline_optimization(X,y,balance='under')

Undersampling
KNN
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 34.5min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed: 59.4min finished


Best score:
0.70
Best parameters:
{'KNN__weights': 'distance', 'KNN__p': 1, 'KNN__n_neighbors': 20, 'KNN__metric': 'manhattan'}

Confusion matrix on test
[[22084 10086]
 [ 3174 10821]]




In [15]:
RandomizedSearch_knn

Unnamed: 0,model,acc_train,acc_test,f1_train,f1_test,recall_train,recall_test,precision_train,precision_test,fbeta_train,fbeta_test
0,KNN,0.999321,0.712769,0.999321,0.620079,0.998833,0.773205,0.999809,0.517578,1.0,0.75


## Conclusion
The randomized hyperparameter search yields, on the test set, __recall: 0.77__, __f1: 0.62__ and __precision: 0.52__. Compared with the first results of __recall: 0.46__, __f1: 0.51__ and __precision: 0.58__ from the untuned and unengineered baseline model, recall and F1 improve markedly, while precision decreases.