# Imports

In [None]:
%matplotlib notebook

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as sstats
import seaborn as sns
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn import ensemble
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve 
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
from mpl_toolkits.mplot3d import axes3d

from sklearn.feature_selection import SelectFromModel

from xgboost import XGBClassifier
from sklearn.feature_selection import SequentialFeatureSelector
import time

# Load the raw customer table and widen the notebook display to 30 columns.
customers = pd.read_csv('customers.csv')
pd.options.display.max_columns = 30

# Remove the two "Naive_Bayes_Classifier_..." columns from the raw table (in place).
customers.drop(columns=["Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2",
                       "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1"], 
               inplace=True)

# `data` is the working copy; `customers` keeps the original string columns.
data = customers.copy()
# Binary target: 1 where Attrition_Flag == "Attrited Customer", 0 otherwise.
data["Target"] = (customers["Attrition_Flag"] == "Attrited Customer").astype(np.int8)
data.drop(columns="Attrition_Flag", inplace=True)
# Binary gender flag: 1 where Gender == "M", 0 otherwise.
data["Flag_Male"] = (customers["Gender"] == "M").astype(np.int8)
data.drop(columns="Gender", inplace=True)
# Ordinal encoding of the card tier: Blue=0, Silver=1, Gold=2, Platinum=3.
data["Card_Category"] = data["Card_Category"].replace(["Blue", 
                                                       "Silver",
                                                       "Gold",
                                                       "Platinum"],
                                                      [0,1,2,3])
# NOTE(review): disabled alternative that would map Income_Category to numeric
# values (and "Unknown" to None). It refers to `data_dummies`, which is only
# created below, so it cannot simply be uncommented in this position.
# data_dummies["Income_Category"] = data_dummies["Income_Category"].replace(["Less than $40K", 
#                                                            "$40K - $60K",
#                                                            "$60K - $80K",
#                                                            "$80K - $120K",
#                                                            "$120K +",
#                                                            "Unknown"],
#                                                           [20,50,70,100,140,None])

# Variant 1: one-hot encode the three categorical columns, keeping every level.
data_dummies = data.copy()

data_dummies = pd.get_dummies(data_dummies, prefix="Flag_Inc", columns=["Income_Category"])
data_dummies = pd.get_dummies(data_dummies, prefix="Flag_Edu", columns=["Education_Level"])
data_dummies = pd.get_dummies(data_dummies, prefix="Flag_Mar", columns=["Marital_Status"])
# Bare expression: has no effect as a script; presumably a notebook cell display.
data_dummies



# Variant 2: same encoding with drop_first=True, i.e. the first level of each
# categorical column is omitted from the dummy columns.
data_dummies_without_first = data.copy()
data_dummies_without_first = pd.get_dummies(data_dummies_without_first, drop_first=True, 
                                                  prefix="Flag_Inc", columns=["Income_Category"])
data_dummies_without_first = pd.get_dummies(data_dummies_without_first, drop_first=True, 
                                                  prefix="Flag_Edu", columns=["Education_Level"])
data_dummies_without_first = pd.get_dummies(data_dummies_without_first, drop_first=True, 
                                                  prefix="Flag_Mar", columns=["Marital_Status"])
# Bare expression: has no effect as a script; presumably a notebook cell display.
data_dummies_without_first



def KNNFillerMissingData(data, cols=None, missing_indicator="Unknown"):
    """Impute placeholder categories with one k-nearest-neighbours model per column.

    For every column in ``cols`` a ``KNeighborsClassifier`` (k=5) is trained on
    the rows whose value differs from ``missing_indicator`` and then used to
    predict the value of the rows that equal it. The features are all columns
    of ``data`` except the columns in ``cols`` and ``CLIENTNUM`` (if present),
    standardised with a ``StandardScaler``; they are expected to be numeric.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table. It is not modified.
    cols : list of str, optional
        Columns to impute. Defaults to
        ``["Education_Level", "Marital_Status", "Income_Category"]``.
    missing_indicator : str, default "Unknown"
        Value that marks a missing entry.

    Returns
    -------
    data_knn : pandas.DataFrame
        Copy of ``data`` with the marked entries replaced by predictions and
        fully duplicated rows dropped.
    fitted_KNeighborsClassifiers : list
        The fitted classifiers, one per entry of ``cols`` and in that order.
    """
    # ``None`` default instead of a list literal: a mutable default would be
    # shared between calls.
    if cols is None:
        cols = ["Education_Level", "Marital_Status", "Income_Category"]
    cols = list(cols)

    fitted_KNeighborsClassifiers = []
    data_knn = data.copy()

    # The feature matrix is the same for every imputed column, so build it once.
    features = data.drop(columns=cols).drop(columns=["CLIENTNUM"], errors="ignore")

    for column_name in cols:
        is_missing = data[column_name] == missing_indicator

        scaler = StandardScaler()
        X_train = scaler.fit_transform(features[~is_missing])
        y_train = data.loc[~is_missing, column_name]

        model = KNeighborsClassifier(n_neighbors=5)
        model.fit(X_train, y_train)
        fitted_KNeighborsClassifiers.append(model)

        # Nothing to impute: scaling / predicting on zero rows would raise.
        if not is_missing.any():
            continue

        # Re-use the scaler fitted on the training rows. Fitting a second
        # scaler on the rows to impute would put them on a different scale
        # and make the neighbour distances meaningless.
        X_test = scaler.transform(features[is_missing])
        data_knn.loc[is_missing, column_name] = model.predict(X_test)

    data_knn.drop_duplicates(inplace=True)
    return data_knn, fitted_KNeighborsClassifiers


# Impute the "Unknown" entries with KNN; only the imputed frame is kept, the
# fitted classifiers (second return value) are discarded.
data_imputed = KNNFillerMissingData(data)[0]
# Replace each income bracket label by a single representative number.
data_imputed["Income_Category"] = data_imputed["Income_Category"].replace(["Less than $40K", 
                                                           "$40K - $60K",
                                                           "$60K - $80K",
                                                           "$80K - $120K",
                                                           "$120K +"],
                                                          [20,50,70,100,140])
# Bare expression: has no effect as a script; presumably a notebook cell display.
data_imputed


# Variant 1 of the imputed data: one-hot encode education and marital status,
# keeping every level (both use the same "Flag" prefix).
data_imputed_dummy = data_imputed.copy()
data_imputed_dummy = pd.get_dummies(data_imputed_dummy, prefix="Flag", columns=["Education_Level"])
data_imputed_dummy = pd.get_dummies(data_imputed_dummy, prefix="Flag", columns=["Marital_Status"])
# Bare expression: has no effect as a script; presumably a notebook cell display.
data_imputed_dummy


# Variant 2 of the imputed data: same encoding with drop_first=True, i.e. the
# first level of each categorical column is omitted.
data_imputed_dummy_without_first = data_imputed.copy()
data_imputed_dummy_without_first = pd.get_dummies(data_imputed_dummy_without_first, drop_first=True, 
                                                  prefix="Flag", columns=["Education_Level"])
data_imputed_dummy_without_first = pd.get_dummies(data_imputed_dummy_without_first, drop_first=True, 
                                                  prefix="Flag", columns=["Marital_Status"])
# Bare expression: has no effect as a script; presumably a notebook cell display.
data_imputed_dummy_without_first

def cross_validation(model, sampling, data, k=5, return_importances=False):
    """Evaluate ``model`` with shuffled, stratified k-fold cross-validation.

    In every fold the training part is passed through ``sampling`` before the
    model is fitted, and the metrics are computed on the test part returned by
    ``sampling``.

    Parameters
    ----------
    model : estimator
        Object with ``fit``, ``predict`` and ``score``; it is refitted in
        every fold. Must expose ``feature_importances_`` after fitting when
        ``return_importances`` is true.
    sampling : callable
        ``sampling(X_train, y_train, X_test, y_test)`` returning
        ``(X_resampled, y_resampled, X_test, y_test)``.
    data : pandas.DataFrame
        Features plus a ``Target`` column. A ``CLIENTNUM`` column, if present,
        is dropped.
    k : int, default 5
        Number of folds.
    return_importances : bool, default False
        Also return the feature importances averaged over the folds.

    Returns
    -------
    tuple
        ``(mean model.score, mean balanced accuracy, mean per-class recall)``.
        With ``return_importances`` a fourth element is appended: a list of
        ``(feature, mean importance)`` pairs sorted by decreasing importance.
        The recall accumulator has two entries, i.e. a binary target is
        assumed.
    """
    if "CLIENTNUM" in data.columns:
        data = data.drop(columns=["CLIENTNUM"])
    X = data.drop(columns=["Target"])
    y = data["Target"]

    score = 0.
    balanced_acc = 0.
    rec_scor = np.array([0., 0.])
    importances = {}

    skf = StratifiedKFold(n_splits=k, shuffle=True)
    for train_index, test_index in skf.split(X, y):
        # split() yields *positional* indices, so they must be used with
        # .iloc; .loc is only equivalent for a default RangeIndex and fails
        # (or selects other rows) once rows were dropped from ``data``.
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        X_resampled, y_resampled, X_test, y_test = sampling(X_train, y_train, X_test, y_test)

        model.fit(X_resampled, y_resampled)
        y_pred = model.predict(X_test)

        score += model.score(X_test, y_test)
        balanced_acc += balanced_accuracy_score(y_test, y_pred)
        rec_scor += recall_score(y_test, y_pred, average=None)

        if return_importances:
            # Importances of the model fitted in THIS fold (not of some
            # unrelated global estimator).
            for key, val in zip(X_resampled.columns, model.feature_importances_):
                importances[key] = importances.get(key, 0.) + val

    if return_importances:
        averaged = [(key, val / k) for key, val in importances.items()]
        averaged.sort(key=lambda item: -item[1])
        return score/k, balanced_acc/k, rec_scor/k, averaged
    return score/k, balanced_acc/k, rec_scor/k



def over_sampling(X_train, y_train, X_test, y_test):
    """Balance the training fold with random over-sampling.

    The training data is resampled by ``RandomOverSampler`` (fixed seed 0);
    the test data is handed back unchanged.

    Returns ``(X_resampled, y_resampled, X_test, y_test)``.
    """
    oversampler = RandomOverSampler(random_state=0)
    X_balanced, y_balanced = oversampler.fit_resample(X_train, y_train)
    return X_balanced, y_balanced, X_test, y_test


def under_sampling(X_train, y_train, X_test, y_test):
    """Balance the training fold with random under-sampling.

    The training data is resampled by ``RandomUnderSampler`` (fixed seed 0).
    Training rows that the sampler did not select are not thrown away: they
    are appended to the test data so that they are still used for evaluation.

    The inputs are not modified.

    Returns ``(X_resampled, y_resampled, X_test, y_test)`` where the test
    parts are the original test data followed by the discarded training rows.
    """
    # ``sample_indices_`` are positions in the data given to fit_resample.
    # Re-index copies with 0..n-1 so that positions and labels coincide,
    # without resetting the index of the caller's objects in place.
    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)

    rus = RandomUnderSampler(random_state=0)
    X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

    # Training rows left out by the sampler (computed once, in index order,
    # so X and y stay aligned).
    discarded = X_train.index.difference(rus.sample_indices_)
    X_test = pd.concat([X_test, X_train.loc[discarded]], axis=0)
    y_test = pd.concat([y_test, y_train.loc[discarded]], axis=0)
    return X_resampled, y_resampled, X_test, y_test


def smote_sampling(categorical_columns, columns_to_dummy):
    """Build a SMOTENC-based sampling function.

    Parameters
    ----------
    categorical_columns
        Passed to ``SMOTENC`` as ``categorical_features``.
    columns_to_dummy : iterable of str
        Columns that are one-hot encoded AFTER resampling, in both the
        resampled training data and the test data. The dummy columns get the
        prefix ``"Flag_"`` plus the first three characters of the column name.

    Returns
    -------
    callable
        ``f(X_train, y_train, X_test, y_test)`` returning
        ``(X_resampled, y_resampled, X_test, y_test)``.
    """
    def f(X_train, y_train, X_test, y_test):
        sm = SMOTENC(categorical_features=categorical_columns)
        X_resampled, y_resampled = sm.fit_resample(X_train, y_train)
        for col in columns_to_dummy:
            prefix = "Flag_" + col[:3]
            X_resampled = pd.get_dummies(X_resampled, prefix=prefix, columns=[col])
            X_test = pd.get_dummies(X_test, prefix=prefix, columns=[col])
        # The two frames are encoded independently, so a category that is
        # absent from one of them would yield different column sets. Align the
        # test columns to the training columns: missing dummies become 0 and
        # dummies unseen in training are dropped.
        X_test = X_test.reindex(columns=X_resampled.columns, fill_value=0)
        return X_resampled, y_resampled, X_test, y_test
    return f


def model_score_recall_with_wrapper(data, model, num_features, sampling=over_sampling, direction='forward'):
    """Select ``num_features`` features with a wrapper method and score the model on them.

    A ``SequentialFeatureSelector`` around ``model`` is fitted on a randomly
    over-sampled, stratified training split of ``data``. The model is then
    cross-validated (5 folds) on the selected features only.

    Parameters
    ----------
    data : pandas.DataFrame
        Features plus a ``Target`` column; a ``CLIENTNUM`` column is ignored
        if present.
    model : estimator
        Classifier used both inside the selector and for the evaluation.
    num_features : int
        Number of features to select.
    sampling : callable, default ``over_sampling``
        Resampling function used in the cross-validation. The feature
        selection step itself always uses ``RandomOverSampler``.
    direction : str, default 'forward'
        Passed to ``SequentialFeatureSelector``.

    Returns
    -------
    tuple
        ``(score, recalls)`` as returned by ``cross_validation``: the mean
        score and the mean per-class recall.
    """
    y = data["Target"]
    # CLIENTNUM is optional, in line with cross_validation.
    X = data.drop(columns=["CLIENTNUM", "Target"], errors="ignore")

    # Only the training part of the split is needed for feature selection.
    X_train, _, y_train, _ = train_test_split(X, y, stratify=y)
    ros = RandomOverSampler(random_state=2137)
    X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

    sfs = SequentialFeatureSelector(estimator=model, n_features_to_select=num_features,
                                    direction=direction, n_jobs=3)
    sfs.fit(X_resampled, y_resampled)

    columns = list(X.columns[sfs.get_support()])
    temp_data = data[columns + ['Target']]

    # Feature importances are not part of the result, so they are not
    # requested; this keeps the function usable with estimators that have no
    # ``feature_importances_`` attribute.
    score, _, recalls = cross_validation(model, sampling, temp_data, k=5)
    return score, recalls



def model_score_recall_with_wrapper_iterative(data, model, max_num_features=25, sampling=over_sampling, 
                                              direction='forward', verbose=True):
    """Run the wrapper evaluation for 2, 3, ..., ``max_num_features`` features.

    Each feature count is evaluated with ``model_score_recall_with_wrapper``.
    With ``verbose`` a progress line containing the feature count and the
    elapsed whole minutes is printed after every step. A ``KeyboardInterrupt``
    stops the sweep early and the results collected so far are returned.

    Returns
    -------
    tuple of lists
        ``(scores, recall_0, recall_1)``: the score and the recall of class 0
        and class 1 for every completed feature count.
    """
    scores, recall_0, recall_1 = [], [], []
    started_at = time.time() if verbose else None

    try:
        for feature_count in range(2, max_num_features + 1):
            step_score, step_recalls = model_score_recall_with_wrapper(
                data, model, feature_count, sampling, direction
            )
            scores.append(step_score)
            recall_0.append(step_recalls[0])
            recall_1.append(step_recalls[1])

            if not verbose:
                continue
            elapsed_minutes = int(time.time() - started_at) // 60
            print("One iteration", feature_count, ' Time:', elapsed_minutes)
    except KeyboardInterrupt:
        # Interrupted by the user: fall through and return the partial results.
        pass

    return scores, recall_0, recall_1

# Report