## Project Functions

### Description

Below are the different functions used in the analysis.

In [None]:
# Import libraries
from sklearn.metrics import *
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from scipy import stats
import math as ma
import statsmodels.api as sm

### Broad Classification Metrics Function

In [3]:
# Prints concordance, AUC and confusion-matrix metrics, plots the ROC and CAP curves, and returns the decile CAP table
def cap_auc(model, df, target, y, y_pred, y_score, X, length, width,
            proba_col='predicted_proba'):
    """Print binary-classification diagnostics, plot ROC/CAP, return a CAP table.

    The function prints, in order: pair counts and concordance statistics,
    the ROC curve and AUC, the confusion matrix with accuracy / precision /
    recall / F1, and the cumulative accuracy profile (CAP) plot.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba(X)``; column 1 of the
        result is used as the class-1 probability for the concordance stats.
    df : DataFrame containing the ``target`` column and the ``proba_col``
        column; used only for the decile CAP table. It is not modified.
    target : name of the 0/1 label column in ``df`` (and in ``y`` when ``y``
        is a DataFrame).
    y : true 0/1 labels. A DataFrame holding the ``target`` column, or any
        1-D array-like.
    y_pred : predicted 0/1 labels (array-like, castable to int).
    y_score : scores used for the ROC and CAP curves (array-like).
    X : features passed to ``model.predict_proba``; rows must be in the same
        order as ``y``.
    length, width : figure height and width passed to ``plt.figure``.
    proba_col : column of ``df`` to rank by when building the CAP table.

    Returns
    -------
    DataFrame with columns ``bin``, ``count_of_rows``, ``target``,
    ``model_percent``, ``random_percent``, ``model_cumm_percent``,
    ``random_cumm_percent`` and ``ks`` (model minus random cumulative
    percent), one row per bin plus an all-zero row for bin 0.

    Raises
    ------
    ValueError
        If ``model.predict_proba(X)`` and ``y`` have different lengths.
    ZeroDivisionError
        If ``y`` lacks either class, so no (0, 1) pair can be formed.
    """
    from sklearn.metrics import auc, confusion_matrix, roc_curve

    y_true = _to_1d(y, target)
    y_hat = _to_1d(y_pred).astype(int)
    scores = _to_1d(y_score)

    # Concordance and Discordance
    # Labels and probabilities are paired by position, not by index label,
    # so a y that kept its original (non-default) index is still aligned.
    prob_1 = np.asarray(model.predict_proba(X))[:, 1]
    if len(prob_1) != len(y_true):
        raise ValueError('model.predict_proba(X) and y must have the same length')

    pairs_tested, conc, disc, ties = _concordance_counts(y_true, prob_1)
    concordance = round(conc / pairs_tested, 2)
    discordance = round(disc / pairs_tested, 2)
    ties_perc = round(ties / pairs_tested, 2)
    Somers_D = round((conc - disc) / pairs_tested, 2)

    _print_results([('Pairs: ', pairs_tested),
                    ('Conc: ', conc),
                    ('Disc: ', disc),
                    ('Tied: ', ties)])

    _print_results([('Concordance: ', concordance),
                    ('Discordance: ', discordance),
                    ('Tied: ', ties_perc),
                    ('Somers D: ', Somers_D)])

    # ROC plot
    fpr, tpr, _ = roc_curve(y_true, scores)
    roc_auc = auc(fpr, tpr)

    print('\n')
    plt.figure(figsize = (width, length))
    plt.plot([0, 1], [0, 1], 'r--')
    plt.plot(fpr, tpr, c = 'g', label = 'classifier:' + ' {0:.2f}'.format(roc_auc),
             linewidth = 2)
    plt.xlabel('false positive rate')
    plt.ylabel('true positive rate')
    plt.title('receiver operating characteristic')
    plt.legend(loc = 'lower right')
    plt.show()

    # The area under the ROC curve computed above is the AUC; no second pass needed.
    _print_results([('AUC:', round(roc_auc, 2))])

    # General Classification metrics
    # sklearn lays the matrix out as [[tn, fp], [fn, tp]] (rows = actual,
    # columns = predicted, labels sorted ascending). labels=[0, 1] keeps it
    # 2x2 even when one class never occurs.
    tn, fp, fn, tp = (int(cell) for cell in
                      confusion_matrix(y_true.astype(int), y_hat, labels = [0, 1]).ravel())

    print('\n')
    # Displayed with rows = predicted class and columns = actual class.
    cm_df = pd.DataFrame([{'1': tp, '0': fp}, {'1': fn, '0': tn}])
    cm_df = cm_df.set_index([pd.Index([1, 0])])
    print('confussion matrix' + '\n')
    print(cm_df)

    accuracy = _safe_ratio(tp + tn, tp + fp + tn + fn)
    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    # Computed from raw counts so rounding of precision/recall does not compound.
    f1 = _safe_ratio(2 * tp, 2 * tp + fp + fn)

    _print_results([('Accuracy:', accuracy),
                    ('Precision:', precision),
                    ('Recall:', recall),
                    ('F1:', f1)])

    # Cap plot
    total = len(y_true)
    class_1_count = np.sum(y_true)

    # Rank observations by descending score (ties: positives first) and
    # accumulate the number of class-1 observations captured so far.
    ranked_labels = [label for _, label in sorted(zip(scores, y_true), reverse = True)]
    y_values = np.append([0], np.cumsum(ranked_labels))
    X_values = np.arange(0, total + 1)

    print('\n')
    sns.set(font_scale = 1, style = 'white')
    plt.figure(figsize = (width, length))
    plt.plot([0, total], [0, class_1_count], c = 'r', linestyle = '--', label = 'random model')
    plt.plot([0, class_1_count, total], [0, class_1_count, class_1_count], c = 'grey',
             linewidth = 2, label = 'perfect model')
    plt.plot(X_values, y_values, c = 'b', label = 'classifier', linewidth = 2)

    plt.xlabel('total observations')
    plt.ylabel('class 1 observations')
    plt.title('cumulative accuracy profile')
    plt.legend(loc = 'lower right')

    # Guide lines marking how many class-1 observations fall in the top 50%.
    index = int((50 * total / 100))
    plt.plot([index, index], [0, y_values[index]], c = 'g', linestyle = '--')
    plt.plot([0, index], [y_values[index], y_values[index]], c = 'g', linestyle = '--')
    plt.show()

    # Cap table
    return _build_cap_table(df, target, proba_col)


def _to_1d(values, column = None):
    """Return values as a flat NumPy array, picking `column` from a DataFrame if present."""
    if isinstance(values, pd.DataFrame) and column in values.columns:
        values = values[column]
    return np.asarray(values).ravel()


def _print_results(rows):
    """Print a blank separator followed by one dot-padded 'label value' line per row."""
    print('\n')
    for label, value in rows:
        print(f"{label:{35}} {value:.>{20}}")


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator rounded to 2 decimals, or NaN when undefined."""
    if denominator == 0:
        return float('nan')
    return round(float(numerator) / float(denominator), 2)


def _concordance_counts(y_true, prob_1):
    """Count (pairs, concordant, discordant, tied) over all (class 0, class 1) pairs."""
    zero_probs = np.sort(prob_1[y_true == 0])
    one_probs = prob_1[y_true == 1]
    # For each positive: zeros scored strictly lower are concordant,
    # zeros scored exactly equal are ties.
    conc = int(np.searchsorted(zero_probs, one_probs, side = 'left').sum())
    ties = int(np.searchsorted(zero_probs, one_probs, side = 'right').sum()) - conc
    pairs = len(zero_probs) * len(one_probs)
    disc = pairs - conc - ties
    return pairs, conc, disc, ties


def _build_cap_table(df, target, proba_col):
    """Build the per-bin CAP table from df ranked by descending `proba_col`."""
    rows_decile = round(len(df) / 10, 0)
    flag_count = df[target].sum()

    # sort_values returns a new frame, so the caller's df is left untouched.
    ranked = df.sort_values(by = proba_col, ascending = False).reset_index(drop = True)
    rank = pd.Series(np.arange(1, len(ranked) + 1), index = ranked.index)
    ranked['count_of_rows'] = 1
    # Rows left over after ten equal-sized bins are folded into the last bin.
    ranked['bin'] = np.ceil(rank / rows_decile).clip(upper = 10)

    # Aggregate only the needed columns so unrelated (e.g. text or date)
    # columns in df cannot break or slow the sum.
    cap_table = ranked.groupby(by = ['bin'])[['count_of_rows', target]].sum().reset_index()
    cap_table['model_percent'] = round((cap_table[target] / flag_count) * 100, 2)
    cap_table['random_percent'] = 10
    cap_table['model_cumm_percent'] = cap_table['model_percent'].cumsum()
    cap_table['random_cumm_percent'] = cap_table['random_percent'].cumsum()
    cap_table['ks'] = cap_table['model_cumm_percent'] - cap_table['random_cumm_percent']
    # Add an all-zero origin row (bin 0) and move it to the top.
    cap_table.loc[len(cap_table)] = 0
    cap_table = cap_table.sort_values(by = 'bin', ascending = True).reset_index(drop = True)
    return cap_table
    

### Logistic Regression Summary Function

In [None]:
# Fits a logistic regression (with an intercept) and prints its summary
def logit_summary(X, y):
    """Fit a statsmodels Logit of y on X plus a constant term and print summary2()."""
    # add_constant supplies the intercept column that sm.Logit does not add itself.
    design_matrix = sm.add_constant(X)
    fitted = sm.Logit(y, design_matrix).fit()
    print(fitted.summary2())