In [None]:
# to select columns based on dtypes
from sklearn.compose import make_column_selector as selector

In [None]:
%run ./Ready_Codes.ipynb
import classif_models

# Import

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pandas_bokeh
pandas_bokeh.output_notebook()

import scikitplot as skplt
import seaborn as sns
sns.set()

from sklearn import metrics
from sklearn.model_selection import cross_validate

import warnings
warnings.filterwarnings('ignore')

plt.rcParams['axes.labelsize'] = 15
plt.rcParams['axes.titlesize'] = 15

#Importing Classification algorithms

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import (
    LogisticRegression,
    LogisticRegressionCV,
    SGDClassifier,
    Perceptron,
    PassiveAggressiveClassifier,
    RidgeClassifier, 
    RidgeClassifierCV
)

from sklearn.svm import LinearSVC, SVC, NuSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import  GaussianNB, BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from rgf.sklearn import RGFClassifier, FastRGFClassifier
from gpboost import GPBoostClassifier
from sklearn.experimental import enable_hist_gradient_boosting

from sklearn.ensemble import (
    RandomForestClassifier, 
    AdaBoostClassifier, 
    GradientBoostingClassifier, 
    ExtraTreesClassifier, 
    IsolationForest, 
    BaggingClassifier, 
    HistGradientBoostingClassifier
)

from imblearn.ensemble import (
    EasyEnsembleClassifier, 
    RUSBoostClassifier, 
    BalancedBaggingClassifier, 
    BalancedRandomForestClassifier 
)

from numpy import interp

class Progress:
    """Print a one-line text progress bar that overwrites itself in place.

    Instantiating the class prints the bar immediately, so the usual call
    pattern is ``Progress(i, len(items))`` inside a ``for i, ...`` loop.

    Parameters
    ----------
    value : int
        Current 0-based loop index.
    end : int
        Total number of iterations (typically ``len(iterable)``).
    title : str, default 'Progress'
        Label printed in front of the bar.
    buffer : int, default 100
        Width of the bar in characters.
    """

    def __init__(self, value, end, title='Progress',buffer=100):
        self.title = title
        # ``value`` is a 0-based index, so the last value ever reached is end - 1
        self.end = end -1
        self.buffer = buffer
        self.value = value
        self.progress()

    def progress(self):
        """Render the bar for the current ``value`` (carriage return, no newline)."""
        if self.end > 0:
            maped = int(interp(self.value, [0, self.end], [0, self.buffer]))
            percent = (self.value / self.end) * 100
        else:
            # A loop with a single item (end - 1 == 0) is complete on its first
            # step; dividing by self.end here would raise ZeroDivisionError.
            maped = self.buffer
            percent = 100.0
        print(f'{self.title}: [{"#"*maped}{"-"*(self.buffer - maped)}]{self.value}/{self.end} {percent:.2f}%', end='\r')

from sklearn import set_config
set_config(display='diagram')        

# Global random seed used by the estimators and CV splitters in this notebook.
# The template left the value blank, which is a syntax error that prevents the
# whole cell from running; 42 is only a default -- change it as needed.
seed = 42

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# `classes` and `y` must be passed by keyword: current scikit-learn releases
# declare them keyword-only, so the positional form raises a TypeError there.
# NOTE: `y_train` (with a `Loan_Status` column) must already exist in the kernel.
loan_status = y_train.Loan_Status
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(loan_status),
                                     y=loan_status)

print(class_weights)

# Stacking Classifier

In [None]:
%%time
from mlxtend.classifier import StackingClassifier
from sklearn import metrics
import scikitplot as skplt

sclf = StackingClassifier(classifiers=[cat, bag_clf, gbc, et_clf], 
                          meta_classifier=LogisticRegression(random_state = seed), 
                          use_probas = True)

sclf.fit(X_train_pars, y_train)

y_pred = sclf.predict(X_test_pars)
y_probs = sclf.predict_proba(X_test_pars)

print()
print(f'Stacking Classifier F1 score on TEST set: {metrics.f1_score(y_test, y_pred)*100:.4f} %')
print() 
print(f'Stacking Classifier Accuracy on TEST set: {metrics.accuracy_score(y_test, y_pred)*100:.4f} %')
print()
print(f'Stacking Classifier ROC AUC Score: {metrics.roc_auc_score(y_test, y_probs[:,1])*100:.4f} %')
print()
print(metrics.classification_report(y_test, y_pred))
skplt.metrics.plot_roc(y_test, y_probs);
skplt.metrics.plot_confusion_matrix(y_test,y_pred, text_fontsize = 'large', cmap='YlGn');

# Ensemble Voting

In [None]:
%%time
from mlxtend.classifier import EnsembleVoteClassifier

evc = EnsembleVoteClassifier(clfs=calibrated_models, voting = 'soft')

evc.fit(X_train_pars, y_train)

y_pred = evc.predict(X_test_pars)
y_probs = evc.predict_proba(X_test_pars)

print()
print(f'Voting Classifier F1 score on TEST set: {metrics.f1_score(y_test, y_pred)*100:.4f} %')
print() 
print(f'Voting Classifier Accuracy on TEST set: {metrics.accuracy_score(y_test, y_pred)*100:.4f} %')
print()
print(f'Voting Classifier ROC AUC Score: {metrics.roc_auc_score(y_test, y_probs[:,1])*100:.4f} %')
print()
print(metrics.classification_report(y_test, y_pred))
skplt.metrics.plot_roc(y_test, y_probs);
skplt.metrics.plot_confusion_matrix(y_test,y_pred, text_fontsize = 'large', cmap='YlGn');

# Custom Functions

##### Try all models

In [None]:
#Function for selecting the right model

class classif_models:
    """Cross-validate a fixed catalogue of classifiers and rank them by F1.

    Non-tree models are wrapped in a ``Pipeline`` together with
    ``preprocessor``; tree/ensemble models are cross-validated on ``Xtrain``
    directly, without the preprocessor.

    Parameters
    ----------
    Xtrain, ytrain
        Training features and binary target handed to ``cross_validate``.
    preprocessor
        Transformer used as the first pipeline step for the non-tree models.
    cv
        Cross-validation strategy forwarded to ``cross_validate``.

    Notes
    -----
    Relies on the notebook-level ``seed`` and on the estimator classes
    imported in the import cell.
    """

    # Scorers requested from cross_validate; each result is read back through
    # its ``test_<name>`` key instead of by position in the result dict.
    _SCORING = ['accuracy', 'f1', 'roc_auc', 'precision', 'recall', 'balanced_accuracy']

    # Column order of the summary table.
    _COLUMNS = ['Models', 'Acc %', 'Acc STD %', 'f1 %', 'ROC_AUC %',
                'Precision %', 'Recall %', 'Balanced_Acc %']

    def __init__(self, Xtrain, ytrain, preprocessor, cv):
        self.Xtrain = Xtrain
        self.ytrain = ytrain
        self.preprocessor = preprocessor
        self.cv = cv

    def _summarise(self, name, estimator):
        """Cross-validate ``estimator`` and return one summary row (scores in %)."""
        scores = cross_validate(estimator,
                                self.Xtrain,
                                self.ytrain,
                                scoring = self._SCORING,
                                cv = self.cv,
                                n_jobs = -1)
        return {
            'Models': name,
            'Acc %': scores['test_accuracy'].mean()*100,
            'Acc STD %': scores['test_accuracy'].std()*100,
            'f1 %': scores['test_f1'].mean()*100,
            'ROC_AUC %': scores['test_roc_auc'].mean()*100,
            'Precision %': scores['test_precision'].mean()*100,
            'Recall %': scores['test_recall'].mean()*100,
            'Balanced_Acc %': scores['test_balanced_accuracy'].mean()*100,
        }

    def check_clf_models(self):
        """Run every model through cross-validation and return the ranked table.

        Returns
        -------
        pandas Styler
            One row per model, sorted by ``'f1 %'`` (best first) with the
            maximum highlighted in green. Also stored as ``self.df``.
        """
        models = [
            LogisticRegression(random_state = seed),
            LogisticRegressionCV(cv=10, random_state = seed),
            SGDClassifier(tol = 0.1, early_stopping = True, validation_fraction = 0.2, random_state = seed),
            Perceptron(tol = 0.1, early_stopping = True, validation_fraction = 0.2, random_state = seed),
            PassiveAggressiveClassifier(tol = 0.1, early_stopping = True, validation_fraction = 0.2, random_state = seed),
            RidgeClassifier(random_state = seed),
            RidgeClassifierCV(cv=10),
            LinearSVC(loss = 'hinge', random_state = seed),
            SVC(kernel = 'rbf', random_state = seed),
            NuSVC(random_state = seed),
            KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2),
            GaussianNB(),
            BernoulliNB(),
            MLPClassifier(tol=0.1, early_stopping = True, validation_fraction = 0.2, random_state = seed),
            GaussianProcessClassifier(random_state = seed)
        ]

        rows = []

        print('Training using Non-Tree based models...')
        for i, model in enumerate(models):
            name = type(model).__name__
            # Use the preprocessor passed to the constructor; the original code
            # silently picked up a notebook-level global named `preprocessor`.
            pipe = Pipeline(steps = [('preprocessor', self.preprocessor), (name, model)])
            rows.append(self._summarise(name, pipe))
            Progress(i, len(models))

        print()

        #Using Ensemble Models

        tree_models = [
            DecisionTreeClassifier(criterion = 'entropy', max_depth = 6, random_state = seed),
            RandomForestClassifier(criterion='entropy', max_depth=6, class_weight='balanced', n_jobs=-1, random_state = seed),
            XGBClassifier(use_label_encoder=False, eval_metric = 'error', seed = seed),
            CatBoostClassifier(verbose = False, loss_function='CrossEntropy', eval_metric='TotalF1', random_seed = seed),
            LGBMClassifier(random_state = seed),
            AdaBoostClassifier(random_state = seed),
            GradientBoostingClassifier(random_state = seed),
            BaggingClassifier(random_state = seed),
            ExtraTreesClassifier(criterion='entropy', max_depth=6, class_weight='balanced', n_jobs=-1, random_state = seed),
            HistGradientBoostingClassifier(random_state = seed),
            EasyEnsembleClassifier(random_state = seed),
            RUSBoostClassifier(random_state = seed),
            BalancedBaggingClassifier(random_state = seed),
            BalancedRandomForestClassifier(n_estimators = 100, criterion = 'entropy', max_depth = 6, random_state = seed),
            RGFClassifier(loss = 'Log', algorithm='RGF_Sib'),
            FastRGFClassifier(loss='LOGISTIC'),
            GPBoostClassifier(random_state = seed)
        ]

        print()
        print('Training Using Ensemble models...')
        for i, model in enumerate(tree_models):
            # Tree/ensemble models are evaluated without the preprocessor.
            rows.append(self._summarise(type(model).__name__, model))
            Progress(i, len(tree_models))

        print()
        self.df = (
            pd.DataFrame(rows, columns = self._COLUMNS)
            .sort_values('f1 %', ascending = False, ignore_index = True)
            .style.highlight_max(color = 'green')
        )

        return self.df
    
# classif_models(Xtrain, ytrain, preprocessor, cv)
#.check_clf_models()

### Validate Model with the right Threshold

In [None]:
#for evaluating model

class validate_on_test:
    """Cross-validate a binary classifier, score it on a test set and tune its
    probability threshold.

    Parameters
    ----------
    model
        Classifier (optionally a ``Pipeline`` or ``CalibratedClassifierCV``)
        exposing ``fit``, ``predict`` and ``predict_proba``. It is refitted in
        place.
    Xtrain, ytrain
        Training data; NumPy arrays or pandas objects.
    Xtest, ytest
        Hold-out data used for the final report and for threshold tuning.
    cv
        Splitter exposing ``split(X, y)``.
    beta
        ``'auto'`` or a numeric beta for the F-beta score.
    optimize
        Name of the ``score_df`` column to sort by: ``'fbeta'`` or
        ``'accuracy'``.

    Attributes set by ``evaluate_model``
    ------------------------------------
    y_probs : test-set output of ``predict_proba``.
    score_df : scores per threshold (or per beta), best row first.
    new_prob_threshold : the selected probability threshold.
    tuned_pred : test-set labels obtained with ``new_prob_threshold``.
    """

    def __init__(self, model, Xtrain, ytrain, Xtest, ytest, cv, beta, optimize):
        self.model = model
        self.Xtrain = Xtrain
        self.ytrain = ytrain
        self.Xtest = Xtest
        self.ytest = ytest
        self.cv = cv
        self.beta = beta
        self.optimize = optimize

    @staticmethod
    def _model_name(model):
        """Return a display name, unwrapping calibration wrappers and pipelines."""
        prefix = ''
        if type(model).__name__ == 'CalibratedClassifierCV':
            prefix = 'Cal_'
            # Newer scikit-learn calls the wrapped model `estimator`; older
            # releases call it `base_estimator`.
            inner = getattr(model, 'estimator', None)
            if inner is None:
                inner = getattr(model, 'base_estimator', None)
            if inner is not None:
                model = inner
        if hasattr(model, 'named_steps') and len(model.steps) > 0:
            # The final pipeline step is the estimator, whatever the pipeline length.
            model = model.steps[-1][1]
        return prefix + type(model).__name__

    @staticmethod
    def _take(data, index):
        """Select rows by position from a pandas object or an array."""
        return data.iloc[index] if hasattr(data, 'iloc') else data[index]

    def evaluate_model(self):
        """Run the CV report, the test-set report and the threshold search.

        Prints the per-fold and summary tables, draws the threshold-tuning,
        ROC, precision-recall and confusion-matrix plots, and stores the
        results on the instance (see the class docstring). Returns ``None``.

        NOTE(review): the probability threshold is selected with the test
        labels, so the tuned test scores are optimistic.
        """
        from texttable import Texttable
        table = Texttable()

        model_name = self._model_name(self.model)

        banner = '+' * len(f' {model_name} ')
        print(banner)
        print(f' {model_name} ')
        print(banner)
        print()

        print('Performing Cross-Validation...')
        print('-------------------------------------------------------------------------------')
        table.set_deco(table.VLINES)
        table.add_rows([['CV#', 'Accuracy %', 'f1 Score %', 'ROC-AUC %', 'Max Acc. Thresh %',
                         'Max F1 Thresh %']], header = False)
        print(table.draw())
        print('-------------------------------------------------------------------------------')
        table.reset()
        table.set_deco(table.HLINES)

        thresholds = np.arange(0, 1, 0.001)

        # apply threshold to positive probabilities to create labels
        def to_labels(pos_probs, threshold):
            return (pos_probs > threshold).astype('int64')

        accuracy = []
        f1_score = []
        roc_auc_score = []
        max_acc_thresh = []
        max_f1_thresh = []
        fold_no = 1

        fig = plt.figure(figsize = (11.5,5));
        ax1 = fig.add_subplot(121);
        ax2 = fig.add_subplot(122);

        classifier = self.model

        for train_index, test_index in self.cv.split(self.Xtrain,self.ytrain):
            X_train_kfold = self._take(self.Xtrain, train_index)
            X_val_kfold = self._take(self.Xtrain, test_index)
            y_train_kfold = self._take(self.ytrain, train_index)
            y_val_kfold = self._take(self.ytrain, test_index)

            classifier.fit(X_train_kfold, y_train_kfold)
            y_pred = classifier.predict(X_val_kfold)
            y_probs = classifier.predict_proba(X_val_kfold)
            val_pos_probs = y_probs[:,1]

            acc_thresh = [metrics.accuracy_score(y_val_kfold, to_labels(val_pos_probs, thr))
                          for thr in thresholds]

            f_thresh = [metrics.f1_score(y_val_kfold, to_labels(val_pos_probs, thr))
                        for thr in thresholds]

            accuracy.append(np.round(metrics.accuracy_score(y_val_kfold, y_pred)*100,3))
            f1_score.append(np.round(metrics.f1_score(y_val_kfold, y_pred)*100,3))
            roc_auc_score.append(np.round(metrics.roc_auc_score(y_val_kfold, val_pos_probs)*100,3))

            max_f1_thresh.append(np.round(thresholds[np.argmax(f_thresh)]*100,3))
            max_acc_thresh.append(np.round(thresholds[np.argmax(acc_thresh)]*100,3))

            table.set_cols_align(["c", "c", "c", "c", "c", "c"])
            table.add_row([fold_no, accuracy[-1], f1_score[-1], roc_auc_score[-1],
                           max_acc_thresh[-1], max_f1_thresh[-1]])
            print(table.draw())
            table.reset()

            sns.lineplot(x = thresholds, y = f_thresh, hue = 0.95, legend = False, ax = ax1).\
            set_title('Threshold for Max F-score');

            ax1.set_xlabel('Threshold')
            ax1.set_ylabel('F1')

            ax1.axvline(thresholds[np.argmax(f_thresh)], color = 'grey', linestyle = '--', alpha = 0.8,
                       lw = 1);


            sns.lineplot(x = thresholds, y = acc_thresh, hue = 0.95, legend = False, ax = ax2).\
            set_title('Threshold for Max Accuracy');

            ax2.set_xlabel('Threshold')
            ax2.set_ylabel('Accuracy')

            ax2.axvline(thresholds[np.argmax(acc_thresh)], color = 'grey', linestyle = '--', alpha = 0.8,
                       lw = 1);

            fold_no += 1

        # Refit on the complete training set. Without this the train curves and
        # the test report below would come from a model trained only on the
        # last fold's training subset.
        classifier.fit(self.Xtrain, self.ytrain)

        # predict_proba does not depend on the threshold: compute it once
        # instead of once per threshold.
        train_pos_probs = classifier.predict_proba(self.Xtrain)[:,1]

        all_f = [metrics.f1_score(self.ytrain, to_labels(train_pos_probs, thr))
                 for thr in thresholds]

        all_acc = [metrics.accuracy_score(self.ytrain, to_labels(train_pos_probs, thr))
                   for thr in thresholds]


        sns.lineplot(x = thresholds, y = all_f, ax = ax1, label = 'Train F1 scores')

        ax1.axvline(thresholds[np.argmax(all_f)], color = 'red', linestyle = '--',
           lw = 2, label = f'Train Thresh = {thresholds[np.argmax(all_f)]*100:.2f} %');

        ax1.axvline(np.mean(max_f1_thresh)/100, color = 'green', linestyle = '-.',
           lw = 2, label = f'Mean Thresh = {np.mean(max_f1_thresh):.2f} %');


        sns.lineplot(x = thresholds, y = all_acc, ax = ax2, label = 'Train Accuracy')

        ax2.axvline(thresholds[np.argmax(all_acc)], color = 'red', linestyle = '--',
           lw = 2, label = f'Train Thresh = {thresholds[np.argmax(all_acc)]*100:.2f} %');

        ax2.axvline(np.mean(max_acc_thresh)/100, color = 'green', linestyle = '-.',
           lw = 2, label = f'Mean Thresh = {np.mean(max_acc_thresh):.2f} %');

        plt.tight_layout();

        print()
        print(f'*** {model_name} Mean CV Scores ***')
        print('=' * len(f'*** {model_name} Mean CV Scores ***'))
        print(f'ROC AUC   : {np.mean(roc_auc_score):.3f} ± {np.std(roc_auc_score):.1f} %')
        print(f'f1        : {np.mean(f1_score):.3f} ± {np.std(f1_score):.1f} %')
        print(f'Accuracy  : {np.mean(accuracy):.3f} ± {np.std(accuracy):.1f} %')
        print()
        print(f'Max Accuracy Threshold : {np.mean(max_acc_thresh):.2f} ± {np.std(max_acc_thresh):.1f} %')
        print(f'Max f1 Threshold       : {np.mean(max_f1_thresh):.2f} ± {np.std(max_f1_thresh):.1f} %')
        print()
        print("---" * 40)

        y_pred = classifier.predict(self.Xtest)
        y_probs = classifier.predict_proba(self.Xtest)

        self.y_probs = y_probs

        print()
        print('====================================')
        print("Classification report on Test set:")
        print('====================================')
        print()
        print(metrics.classification_report(self.ytest, y_pred))
        print()
        print("---" * 40)
        print()

        print(f'*** {model_name} scores on TEST set ***')
        print('=' * len(f'*** {model_name} scores on TEST set ***'))
        print(f'ROC AUC   : {metrics.roc_auc_score(self.ytest, y_probs[:,1])*100:.4f} %')
        print(f'f1        : {metrics.f1_score(self.ytest, y_pred)*100:.4f} %')
        print(f'Accuracy  : {metrics.accuracy_score(self.ytest, y_pred)*100:.4f} %')
        print()

        print("---" * 40)

        test_pos_probs = self.y_probs[:,1]

        # Accuracy per threshold is independent of beta, so it is computed once
        # and shared by all branches below.
        acc = [metrics.accuracy_score(self.ytest, to_labels(test_pos_probs, thr))
               for thr in thresholds]

        def fbeta_curve(beta):
            # F-beta on the test set for every candidate threshold
            return [metrics.fbeta_score(self.ytest, to_labels(test_pos_probs, thr), beta = beta)
                    for thr in thresholds]

        if self.beta == 'auto':
            if self.optimize == 'fbeta':
                beta_array = np.arange(0.5,2.05,0.05)

                # for each beta keep the threshold that maximises F-beta
                beta_val_array = []
                best_f1 = []
                best_threshold = []
                best_acc = []
                print()
                print('Estimating New Probability Threshold...')

                for i, b in enumerate(beta_array):

                    beta_val_array.append(b)

                    fbeta_score = fbeta_curve(b)

                    ix = np.argmax(fbeta_score)

                    best_f1.append(fbeta_score[ix])
                    best_acc.append(acc[ix])
                    best_threshold.append(thresholds[ix])

                    Progress(i, len(beta_array))

                print()
                score_df = pd.DataFrame({
                    'threshold': best_threshold,
                    'Beta' : beta_val_array,
                    'fbeta': best_f1,
                    'accuracy' : best_acc
                }).sort_values(self.optimize, ascending = False, ignore_index = True)

                self.score_df = score_df

                print()
                print('Best F-beta, Accuracy, Beta & Threshold:')
                print('========================================')
                print(f'Beta      : {score_df.iloc[0, 1]:.2f}')
                print(f'Threshold : {score_df.iloc[0, 0]*100:.2f} %')
                print(f'F-beta    : {score_df.iloc[0, 2]*100:.4f} %')
                print(f'Accuracy  : {score_df.iloc[0, 3]*100:.4f} %')
                print()

                thresh_val = score_df.iloc[0,0]
                self.tuned_pred = np.where(test_pos_probs > thresh_val, 1, 0).astype('int64')
                self.new_prob_threshold = thresh_val

                # full threshold curve for the winning beta, used by the plots
                plot_df = pd.DataFrame({
                    'threshold' : thresholds,
                    'fbeta' : fbeta_curve(score_df.iloc[0, 1]),
                    'accuracy': acc
                })

            else:
                fbeta_score = fbeta_curve(1)

                score_df = pd.DataFrame({
                    'threshold': thresholds,
                    'fbeta': fbeta_score,
                    'accuracy' : acc
                }).sort_values(self.optimize, ascending = False, ignore_index = True)

                self.score_df = score_df
                plot_df = score_df

                print()
                print('Best F1, Accuracy, & Threshold:')
                print('===============================')
                print(f'Threshold : {score_df.iloc[0, 0]*100:.2f} %')
                print(f'F1        : {score_df.iloc[0, 1]*100:.4f} %')
                print(f'Accuracy  : {score_df.iloc[0, 2]*100:.4f} %')
                print()

                thresh_val = score_df.iloc[0, 0]
                self.tuned_pred = np.where(test_pos_probs > thresh_val, 1, 0).astype('int64')
                self.new_prob_threshold = thresh_val

        else:
            beta_val = self.beta

            # evaluate each threshold
            fbeta_score = fbeta_curve(beta_val)

            score_df = pd.DataFrame({
                'threshold': thresholds,
                'fbeta': fbeta_score,
                'accuracy' : acc
            }).sort_values(self.optimize, ascending = False, ignore_index = True)

            self.score_df = score_df
            plot_df = score_df

            print('Best F-beta, Accuracy, Beta & Threshold:')
            print('========================================')
            print(f'Beta      : {self.beta}')
            print(f'Threshold : {score_df.iloc[0,0]*100:.4f} %')
            print(f'F-beta    : {score_df.iloc[0,1]*100:.4f} %')
            print(f'Accuracy  : {score_df.iloc[0,2]*100:.4f} %')
            print()

            thresh_val = score_df.iloc[0,0]
            self.new_prob_threshold = thresh_val
            self.tuned_pred = np.where(test_pos_probs > thresh_val, 1, 0).astype('int64')

        print("---" * 40)

        print('=================================================================================')
        print(' Threshold-Tuning Curve, ROC-AUC Plot, Precision-Recall Curve & Confusion Matrix ')
        print('=================================================================================')

        f_df = plot_df.sort_values('fbeta', ascending = False, ignore_index = True)
        acc_df = plot_df.sort_values('accuracy', ascending = False, ignore_index = True)

        # ax1/ax2 still refer to the threshold-tuning figure created before the CV loop
        sns.lineplot(x = 'threshold', y = 'fbeta', data = plot_df,
                     label = 'Test F-score', ax = ax1).set_title('Threshold for Max F-score');

        ax1.axvline(f_df.iloc[0,0], color = 'black', linestyle = '--',
                   label = f' Test F-score Thresh = {f_df.iloc[0,0]*100:.2f} %');

        ax1.legend(loc='lower right');

        sns.lineplot(x = 'threshold', y = 'accuracy', data = plot_df,
                     label = 'Test Accuracy', ax = ax2).set_title('Threshold for Max Accuracy');

        ax2.axvline(acc_df.iloc[0,0], color = 'black', linestyle = '--',
                   label = f' Test Accuracy Thresh = {acc_df.iloc[0,0]*100:.2f} %');

        ax2.legend(loc='lower right');
        plt.tight_layout()

        # second figure: ROC and precision-recall curves
        fig = plt.figure(figsize = (13,4.5));
        ax1 = fig.add_subplot(121);
        ax2 = fig.add_subplot(122);

        skplt.metrics.plot_roc(self.ytest, y_probs, ax = ax1);

        skplt.metrics.plot_precision_recall(self.ytest, y_probs, ax = ax2);

        # third figure: confusion matrices for the default and the tuned threshold
        fig = plt.figure(figsize = (13,4));
        ax1 = fig.add_subplot(121);
        ax2 = fig.add_subplot(122);

        skplt.metrics.plot_confusion_matrix(self.ytest, y_pred, ax = ax1, text_fontsize = 'large',
                                            cmap='YlGn');


        skplt.metrics.plot_confusion_matrix(self.ytest, self.tuned_pred, ax = ax2, text_fontsize = 'large',
                                            cmap='YlGn');

        ax1.title.set_text('With Prob. Threshold = 50 %');
        ax2.title.set_text(f'With New Prob. Threshold = {thresh_val*100:.2f} %');
        
# validate_on_test(model, Xtrain, ytrain, Xtest, ytest, cv, beta = 'auto', optimize = 'fbeta')
#.evaluate_model()

# Test on Unseen Data

In [None]:
class predict_unseen_data:
    """Cross-validate a binary classifier, then label unseen data with a
    chosen probability threshold.

    Parameters
    ----------
    model
        Classifier (optionally a ``Pipeline`` or ``CalibratedClassifierCV``)
        exposing ``fit``, ``predict`` and ``predict_proba``. It is refitted in
        place.
    Xtrain, ytrain
        Training data; NumPy arrays or pandas objects.
    Xtest
        Unlabelled data to predict.
    cv
        Splitter exposing ``split(X, y)``.
    prob_threshold : float
        Positive-class probability above which a sample is labelled 1.

    Attributes set by ``prediction``
    --------------------------------
    y_probs : positive-class probabilities for ``Xtest``.
    final_pred : 0/1 labels obtained with ``prob_threshold``.
    """

    def __init__(self, model, Xtrain, ytrain, Xtest, cv, prob_threshold):
        self.model = model
        self.Xtrain = Xtrain
        self.ytrain = ytrain
        self.Xtest = Xtest
        self.cv = cv
        self.prob_threshold = prob_threshold

    @staticmethod
    def _model_name(model):
        """Return a display name, unwrapping calibration wrappers and pipelines."""
        prefix = ''
        if type(model).__name__ == 'CalibratedClassifierCV':
            prefix = 'Cal_'
            # Newer scikit-learn calls the wrapped model `estimator`; older
            # releases call it `base_estimator`.
            inner = getattr(model, 'estimator', None)
            if inner is None:
                inner = getattr(model, 'base_estimator', None)
            if inner is not None:
                model = inner
        if hasattr(model, 'named_steps') and len(model.steps) > 0:
            # The final pipeline step is the estimator, whatever the pipeline length.
            model = model.steps[-1][1]
        return prefix + type(model).__name__

    @staticmethod
    def _take(data, index):
        """Select rows by position from a pandas object or an array."""
        return data.iloc[index] if hasattr(data, 'iloc') else data[index]

    def prediction(self):
        """Print the CV report and store predictions for ``Xtest``.

        Results are written to ``self.y_probs`` and ``self.final_pred``;
        the method returns ``None``.
        """
        from texttable import Texttable
        table = Texttable()

        model_name = self._model_name(self.model)

        banner = '+' * len(f' {model_name} ')
        print(banner)
        print(f' {model_name} ')
        print(banner)
        print()

        print('Performing Cross-Validation...')
        print('-----------------------------------------')
        table.set_deco(table.VLINES)
        table.add_rows([['CV#', 'Accuracy %', 'f1 Score %', 'ROC-AUC %']], header = False)
        print(table.draw())
        print('-----------------------------------------')
        table.reset()
        table.set_deco(table.HLINES)

        accuracy = []
        f1_score = []
        roc_auc_score = []
        fold_no = 1

        classifier = self.model

        for train_index, test_index in self.cv.split(self.Xtrain,self.ytrain):
            X_train_kfold = self._take(self.Xtrain, train_index)
            X_val_kfold = self._take(self.Xtrain, test_index)
            y_train_kfold = self._take(self.ytrain, train_index)
            y_val_kfold = self._take(self.ytrain, test_index)

            classifier.fit(X_train_kfold, y_train_kfold)
            y_pred = classifier.predict(X_val_kfold)
            y_probs = classifier.predict_proba(X_val_kfold)

            accuracy.append(np.round(metrics.accuracy_score(y_val_kfold, y_pred)*100,3))
            f1_score.append(np.round(metrics.f1_score(y_val_kfold, y_pred)*100,3))
            roc_auc_score.append(np.round(metrics.roc_auc_score(y_val_kfold, y_probs[:,1])*100,3))

            table.set_cols_align(["c", "c", "c", "c"])
            table.add_row([fold_no, accuracy[-1], f1_score[-1], roc_auc_score[-1]])
            print(table.draw())
            table.reset()

            fold_no += 1

        print()
        print(f'*** {model_name} Mean CV Scores ***')
        print('=' * len(f'*** {model_name} Mean CV Scores ***'))
        print(f'ROC AUC   : {np.mean(roc_auc_score):.3f} ± {np.std(roc_auc_score):.1f} %')
        print(f'f1        : {np.mean(f1_score):.3f} ± {np.std(f1_score):.1f} %')
        print(f'Accuracy  : {np.mean(accuracy):.3f} ± {np.std(accuracy):.1f} %')
        print()
        print("---" * 40)

        print()
        print(f'Chosen Probability Threshold: {self.prob_threshold*100:.2f} %')

        # Refit on the complete training set. Without this the unseen-data
        # predictions would come from a model trained only on the last fold's
        # training subset.
        classifier.fit(self.Xtrain, self.ytrain)

        self.y_probs = classifier.predict_proba(self.Xtest)[:,1]
        self.final_pred =  np.where(self.y_probs > self.prob_threshold, 1, 0)

        print()
        print('Finish!')
        
#predict_unseen_data(model, Xtrain, ytrain, Xtest, cv, prob_threshold)
#.prediction()

# Calibration

In [None]:
class calibrate_model:
    """Compare a classifier's probabilities before and after calibration.

    Parameters
    ----------
    model
        Classifier exposing ``fit`` and ``predict_proba``; refitted in place.
    Xtrain, ytrain
        Training data, split 80/20 into a model-fitting part and a
        calibration part.
    Xtest, ytest
        Hold-out data on which Brier score, ROC-AUC and the reliability
        curves are computed.
    cv
        Cross-validation strategy forwarded to ``CalibratedClassifierCV``.
    cal_method : str
        Calibration method forwarded to ``CalibratedClassifierCV``
        (e.g. ``'isotonic'``).

    Attributes set by ``calibrate_probability``
    -------------------------------------------
    cal_model : the fitted ``CalibratedClassifierCV``.

    Notes
    -----
    Relies on the notebook-level ``seed`` for the train/calibration split.
    """

    def __init__(self, model, Xtrain, ytrain, Xtest, ytest, cv, cal_method):
        self.model = model
        self.Xtrain = Xtrain
        self.ytrain = ytrain
        self.Xtest = Xtest
        self.ytest = ytest
        self.cv = cv
        self.cal_method = cal_method

    def calibrate_probability(self):
        """Fit, calibrate, print the scores and draw the reliability diagram."""
        from sklearn.model_selection import train_test_split
        from sklearn.calibration import CalibratedClassifierCV
        from sklearn.calibration import calibration_curve

        train_X, val_X, train_y, val_y = train_test_split(self.Xtrain,
                                                          self.ytrain,
                                                          test_size = 0.2,
                                                          random_state = seed)

        # Uncalibrated model: one fit on the whole model-fitting split. The
        # previous per-fold loop refitted the model for every fold and kept
        # only the last fit, i.e. a model trained on a subset.
        self.model.fit(train_X, train_y)

        uc_probs = self.model.predict_proba(self.Xtest)[:, 1]
        # `normalize` is not passed: predict_proba already returns values in
        # [0, 1], min-max rescaling them distorts the curve, and the argument
        # was removed from newer scikit-learn releases.
        uc_fop, uc_mpv = calibration_curve(self.ytest, uc_probs, n_bins=10,
                                           strategy = 'quantile')

        print()
        print(f'Uncalibrated Brier Score: {metrics.brier_score_loss(self.ytest, uc_probs)}')
        print(f'Uncalibrated ROC-AUC: {np.round(metrics.roc_auc_score(self.ytest, uc_probs)*100,3)}')


        # Calibrating model
        # NOTE(review): with a splitter as `cv`, CalibratedClassifierCV refits
        # clones of the model on folds of the calibration split only; confirm
        # whether calibrating the already-fitted model was intended instead.
        self.cal_model = CalibratedClassifierCV(self.model, method=self.cal_method, cv=self.cv)
        self.cal_model.fit(val_X, val_y)

        # predict probabilities
        c_probs = self.cal_model.predict_proba(self.Xtest)[:, 1]

        print()
        print(f'Calibrated Brier Score: {metrics.brier_score_loss(self.ytest, c_probs)}')
        print(f'Calibrated ROC-AUC: {metrics.roc_auc_score(self.ytest, c_probs)*100:.3f} %')
        print()

        # reliability diagram
        c_fop, c_mpv = calibration_curve(self.ytest, c_probs, n_bins=10,
                                         strategy = 'quantile')

        # reference diagonal of a perfectly calibrated model
        plt.plot([0, 1], [0, 1], linestyle='--');

        # uncalibrated model reliability
        plt.plot(uc_mpv, uc_fop, marker='.', label = 'Uncalibrated');

        # calibrated model reliability
        plt.plot(c_mpv, c_fop, marker='.', label = 'Calibrated');

        plt.title(type(self.model).__name__ + ' ' + self.cal_method)
        plt.ylabel('Fraction of Positives (fop)')
        plt.xlabel('Mean Predicted Value (mpv)')
        plt.legend();
        plt.tight_layout()
        
#calibrate_model(model, Xtrain, ytrain, Xtest, ytest, cv, cal_method = 'isotonic')
#.calibrate_probability()

# Feature Selection

In [None]:
%%time
#BAckward Elimination

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import StratifiedKFold

back = SFS(#model, k_features=(1,4),forward=False, floating=False, scoring = 'f1', 
            cv=StratifiedKFold(n_splits=10, random_state=seed, shuffle = True))

back.fit(#X_train, y_train)

 
print() 
print(f'Best Score: {back.k_score_*100:.3f} %')
    
back.k_feature_names_

In [None]:
%%time
#Forward Selection

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

front = SFS(#model, k_features=(1,4), scoring = 'f1', forward=True, 
            floating=False, cv=StratifiedKFold(n_splits=10, random_state=seed, shuffle = True))

front.fit(#X_train, y_train)

 
print()    
print(f'Best Score: {front.k_score_*100:.3f} %')
    
front.k_feature_names_

In [None]:
%%time
#Bidirectional Elimination

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

bi = SFS(#model, k_features=(1,4), scoring = 'f1', forward=True, 
         floating=True, cv=StratifiedKFold(n_splits=10, random_state=seed, shuffle = True))

bi.fit(#X_train, y_train)

 
print() 
print(f'Best Score: {bi.k_score_*100:.3f} %')
    
bi.k_feature_names_

In [None]:
##########################################################################################################
# Exhaustive feature selection template (mlxtend ExhaustiveFeatureSelector).
# As written the cell does not parse: the `#model` and `#X_train, y_train` placeholders
# comment out the required arguments (and the closing parenthesis of `fit(`), so they
# must be restored before the cell can be used.
# `StratifiedKFold` and `seed` are not defined in this cell and must come from earlier cells.
# NOTE(review): IPython only recognises a cell magic on the first line of a cell, so the
# `%%time` below has to move to the very top if this template is ever enabled.
"""
TAKES TOO MUCH TIME!!! AVOID AT ALL COST!
"""
%%time
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

# Templated settings (inside the placeholder comment): subsets of 1 to 2 features, scored with F1.
efs = EFS(#model, min_features=1,max_features=2,scoring='f1',print_progress=True,
          cv=StratifiedKFold(n_splits=5, random_state=seed, shuffle = True))

efs.fit(#X_train, y_train)

 
print()     
print(f'Best Score: {efs.best_score_*100:.3f} %')    
       
# Last expression of the cell: displays the names of the best-scoring feature subset.
efs.best_feature_names_ 
    
###########################################################################################################

### Embedded Methods

#### Lasso Regularization

In [None]:
#Using regularization traceplots
%%time
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# #############################################################################
# Compute paths

n_alphas = 100
alphas = np.linspace(0.0,1,n_alphas)

coefs = []
for a in alphas:
    lasso = Pipeline(steps = [
        ('scaler', StandardScaler()), 
        ('lasso', linear_model.Lasso(alpha=a, fit_intercept=False))
    ])
    
    lasso.fit(#X_train_df, y_train_df)
    coefs.append(lasso.named_steps.lasso.coef_)

# #############################################################################
# Display results
label = #X_train_df.columns

plt.figure(figsize = (10,7))
ax = plt.gca()
ax.plot(alphas, coefs);
plt.axhline(y=0, color='black', linestyle='-')

plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Lasso coefficients as a function of the regularization')
plt.axis('tight')
plt.legend(label);

coef_df = pd.DataFrame({'Features': label, 'Score':np.mean(coefs,0)})

lasso_feat = [i for i in coef_df[coef_df['Score']>0]['Features']]
print(f'Features Selected from Lasso Regularization: {lasso_feat}')
coef_df.sort_values('Score', ascending = False)

#### Recursive Feature Elimination

In [None]:
%%time
from yellowbrick.model_selection import RFECV
from sklearn.ensemble import ExtraTreesClassifier

 
 

visualizer = RFECV(ExtraTreesClassifier(criterion = 'entropy', max_depth = 6), 
                   cv = StratifiedKFold(n_splits=5, random_state=seed, shuffle = True), 
                   scoring = 'f1')

visualizer.fit(X_train_df, y_train_df)        # Fit the data to the visualizer
visualizer.show();           # Finalize and render the figure

 
print()    
print(f'Feature Rankings: {visualizer.ranking_}')

rfe_feat = [ind for ind, x in enumerate(visualizer.ranking_) if x==1]

### BORUTA

In [None]:
%%time
from boruta import BorutaPy

model = ExtraTreesClassifier(criterion = 'entropy', max_depth = 6, class_weight='balanced', n_jobs = -1)

feat_selector = BorutaPy(model, n_estimators='auto', verbose=0, random_state=seed)

feat_selector.fit(X_train.values, y_train.Loan_Status.values)

print(f'Selected Features: {feat_selector.support_}')
print()
print(f'Feature Ranking: {feat_selector.ranking_}')
print()
boruta_feat = [X_train.columns[i] for i, feat in enumerate(feat_selector.ranking_) if feat==1]
print(boruta_feat)

## Dictionary of All Feature Selection Methods

In [None]:
# Map each feature-selection technique to the list of features it chose.
# Every right-hand name must already have been produced by its own cell above.
selected_features = dict([
    # filter methods
    ('mutual_information', mi_feat),
    ('chi_squared', chi2_feat),
    ('ANOVA', anova_feat),
    # wrapper methods (mlxtend returns tuples, so they are converted to lists)
    ('back_elimination', list(back.k_feature_names_)),
    ('front_elimination', list(front.k_feature_names_)),
    ('bidirectional_elimination', list(bi.k_feature_names_)),
    # embedded / model-based methods
    ('LASSO_regularization', lasso_feat),
    ('ExtraTrees_feature_imp', et_fi_feat),
    ('recursive_feature_elimination', rfe_feat),
    ('recursive_feature_addition', rfa_feat),
    ('select_by_shuffle', sel_shuff_feat),
    ('Boruta', boruta_feat),
])

### Try all feature selection models

In [None]:
def _mean_cv_scores(estimator, X, y, cv):
    """Cross-validate ``estimator`` and return each metric's mean test score in percent.

    The result is a dict keyed by scorer name ('accuracy', 'f1', 'roc_auc', 'precision',
    'recall', 'balanced_accuracy').
    """
    scoring = ['accuracy', 'f1', 'roc_auc', 'precision', 'recall', 'balanced_accuracy']
    scores = cross_validate(estimator, X, y, scoring = scoring, cv = cv, n_jobs = -1)

    # Read every metric by its 'test_<scorer>' key instead of by position in the result
    # dict, so the values cannot be mixed up if the dict gains or reorders entries.
    return {name: scores['test_' + name].mean()*100 for name in scoring}


def best_features(train_X, train_Y, feature_dict, preprocessor, cv):
    """Find, for every feature subset, the classifier with the best cross-validated F1.

    Parameters
    ----------
    train_X : pandas.DataFrame
        Training features; each subset is taken with ``train_X.loc[:, feat]``.
    train_Y : pandas.Series or single-column pandas.DataFrame
        Training target.
    feature_dict : dict
        Maps a feature-selection method name to the column labels it selected.
    preprocessor : transformer
        Placed in front of every non-tree model inside a ``Pipeline``; tree-based and
        ensemble models are evaluated without it.
    cv : int or cross-validation splitter
        Passed through to ``cross_validate``.

    Returns
    -------
    pandas.io.formats.style.Styler
        One row per feature subset (best model and its mean scores in percent), sorted
        by F1 in descending order, with the maximum of each column highlighted in green.

    Notes
    -----
    Relies on the notebook-level ``seed`` variable and ``Progress`` class.
    """

    models = [
        LogisticRegression(random_state = seed),
        LogisticRegressionCV(cv=10, random_state = seed),
        SGDClassifier(tol = 0.1, early_stopping = True, validation_fraction = 0.2, random_state = seed),
        Perceptron(tol = 0.1, early_stopping = True, validation_fraction = 0.2, random_state = seed),
        PassiveAggressiveClassifier(tol = 0.1, early_stopping = True, validation_fraction = 0.2, random_state = seed),
        RidgeClassifier(random_state = seed),
        RidgeClassifierCV(cv=10),
        LinearSVC(loss = 'hinge', random_state = seed),
        SVC(kernel = 'rbf', random_state = seed),
        NuSVC(random_state = seed),
        KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2),
        GaussianNB(),
        BernoulliNB(),
        MLPClassifier(tol=0.1, early_stopping = True, validation_fraction = 0.2, random_state = seed),
        GaussianProcessClassifier(random_state = seed)
    ]

    tree_models = [
        DecisionTreeClassifier(criterion = 'entropy', max_depth = 6, random_state = seed),
        RandomForestClassifier(criterion='entropy', max_depth=6, class_weight='balanced', n_jobs=-1, random_state = seed),
        XGBClassifier(use_label_encoder=False, eval_metric = 'error', seed = seed),
        CatBoostClassifier(verbose = False, loss_function='CrossEntropy', eval_metric='TotalF1', random_seed = seed),
        LGBMClassifier(random_state = seed),
        AdaBoostClassifier(random_state = seed),
        GradientBoostingClassifier(random_state = seed),
        BaggingClassifier(random_state = seed),
        ExtraTreesClassifier(criterion='entropy', max_depth=6, class_weight='balanced', n_jobs=-1, random_state = seed),
        HistGradientBoostingClassifier(random_state = seed),
        EasyEnsembleClassifier(random_state = seed),
        RUSBoostClassifier(random_state = seed),
        BalancedBaggingClassifier(random_state = seed),
        BalancedRandomForestClassifier(n_estimators = 100, criterion = 'entropy', max_depth = 6, random_state = seed, class_weight = 'balanced'),
        RGFClassifier(loss = 'Log', algorithm='RGF_Sib'),
        FastRGFClassifier(loss='LOGISTIC'),
        GPBoostClassifier(random_state = seed)
    ]

    # (display name, estimator to cross-validate): the first group is wrapped with the
    # preprocessor, the tree/ensemble group is evaluated as-is.
    candidates = [
        (type(model).__name__,
         Pipeline(steps = [('preprocessor', preprocessor), (type(model).__name__, model)]))
        for model in models
    ]
    candidates += [(type(model).__name__, model) for model in tree_models]

    # A single-column DataFrame target yields an (n, 1) array; flatten it to the 1-D
    # shape scikit-learn estimators expect. A Series target is unaffected.
    ytrain = np.ravel(train_Y.values)

    summary_columns = ['Feature_Selection', 'Model', 'Acc', 'f1', 'ROC_AUC', 'Prec', 'Recall', 'Bal_Acc']
    summary_rows = []

    for i, (name, feat) in enumerate(feature_dict.items()):

        Xtrain = train_X.loc[:,feat].values

        model_rows = []
        for model_name, estimator in candidates:
            cv_means = _mean_cv_scores(estimator, Xtrain, ytrain, cv)
            model_rows.append({
                'Models': model_name,
                'Acc %': cv_means['accuracy'],
                'f1 %': cv_means['f1'],
                'ROC_AUC %': cv_means['roc_auc'],
                'Precision %': cv_means['precision'],
                'Recall %': cv_means['recall'],
                'Balanced_Acc %': cv_means['balanced_accuracy']
            })

        # Rank the models for this feature subset; row 0 is the best by F1.
        temp_df = pd.DataFrame(model_rows).sort_values('f1 %', ascending = False, ignore_index = True)

        summary_rows.append({
            'Feature_Selection': name,
            'Model': temp_df.at[0, 'Models'],
            'Acc': temp_df.at[0, 'Acc %'],
            'f1': temp_df.at[0, 'f1 %'],
            'ROC_AUC': temp_df.at[0, 'ROC_AUC %'],
            'Prec': temp_df.at[0, 'Precision %'],
            'Recall': temp_df.at[0, 'Recall %'],
            'Bal_Acc': temp_df.at[0, 'Balanced_Acc %']
        })

        Progress(i, len(feature_dict.keys()))
    print()

    # Explicit columns keep the frame sortable even when `feature_dict` is empty.
    df = pd.DataFrame(summary_rows, columns = summary_columns)\
           .sort_values('f1', ascending = False, ignore_index = True)\
           .style.highlight_max(color = 'green')

    return df

# best_features(train_X, train_Y, feature_dict, preprocessor, cv)

# Variable Transformation

In [None]:
class num_transformer:
    """Plot a numeric column before and after common numeric transformations.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame holding the column. It is modified in place: every transformation that
        succeeds adds a ``<name>_<variable>`` column to it.
    variable : str
        Name of the column in ``x`` to inspect.
    """

    def __init__(self, x, variable):
        self.x = x
        self.variable = variable

    def _plot_distribution(self, column, title, xlabel=None):
        """Draw histogram + KDE, box plot and normal Q-Q plot of ``self.x[column]`` side by side."""
        import scipy.stats as stats

        fig, ax = plt.subplots(1,3, figsize = (15,5))

        hist_ax = sns.histplot(self.x[column], kde = True, ax = ax[0])
        # `x=` pins a horizontal box plot; a positional Series is interpreted differently
        # across seaborn versions, which would leave the x-axis label on the wrong axis.
        box_ax = sns.boxplot(x = self.x[column], ax = ax[1])

        if xlabel is not None:
            hist_ax.set(xlabel=xlabel)
            box_ax.set(xlabel=xlabel)

        # Target the third panel explicitly instead of relying on pyplot's current axes.
        stats.probplot(self.x[column], dist="norm", plot=ax[2])

        fig.suptitle(title, fontsize = 20)
        return fig

    def var_transform_plots(self):
        """Show the current distribution, then one figure per applicable transformation.

        Tries the log, reciprocal, power, Box-Cox and Yeo-Johnson transformers from
        ``feature_engine``. A transformer that raises ``ValueError`` is reported with a
        printed message and skipped. The fitted transformers stay available as
        ``self.log``, ``self.recipr``, ``self.exp``, ``self.boxcox`` and ``self.yeojohn``.
        """
        import feature_engine.transformation as vt

        self._plot_distribution(self.variable, 'Current Distribution')
        # Render now so the untransformed figure appears before any skip message below.
        plt.show()

        self.log = vt.LogTransformer()
        self.recipr = vt.ReciprocalTransformer()
        self.exp = vt.PowerTransformer()
        self.boxcox = vt.BoxCoxTransformer()
        self.yeojohn = vt.YeoJohnsonTransformer()

        # The keys become the prefixes of the new columns added to `self.x`.
        transformation_dict = dict(
            log = self.log,
            reciper = self.recipr,
            exp = self.exp,
            boxcox = self.boxcox,
            yeojohn = self.yeojohn
        )

        for name, trnfm in transformation_dict.items():
            new_column = name + '_' + self.variable
            try:
                self.x[new_column] = trnfm.fit_transform(self.x[self.variable].to_frame())

            except ValueError:
                print(f"""
                      Some variables contain zero or negative values, can't apply {type(trnfm).__name__}
                """)
                continue

            # Axes are labelled with the original variable name; the title names the transformer.
            self._plot_distribution(new_column, type(trnfm).__name__, xlabel=self.variable)
            plt.show()

In [1]:
# Scratch cell: prints a one-character heading followed by its underline.
print('m', '=', sep='\n')

m
=


In [2]:
# Scratch cell: prints a heading underlined with one '=' per character.
print('Soviet Union', "=" * len('Soviet Union'), sep='\n')

Soviet Union
