In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_fscore_support, f1_score, accuracy_score, roc_auc_score

In [2]:
def _indicator(series, value):
    """Map each entry to 1 if it equals `value`, else 0; missing entries stay missing."""
    return series.map(lambda x: x if pd.isnull(x) else int(x == value))


def _other_indicator(series, known):
    """Map each entry to 1 if it is not in `known`, else 0; missing entries stay missing."""
    return series.map(lambda x: x if pd.isnull(x) else int(x not in known))


def _email_provider_dummies(series, prefix):
    """Build the provider indicator columns (named providers + 'others') for an email-domain column."""
    providers = ['gmail', 'yahoo', 'hotmail', 'anonymous', 'aol', 'comcast', 'icloud', 'outlook', 'msn']
    # Keep only the part before the first dot, e.g. 'yahoo.co.jp' -> 'yahoo'.
    provider = series.reset_index(drop=True).map(lambda x: x.split('.')[0] if isinstance(x, str) else x)
    columns = {prefix + name: _indicator(provider, name) for name in providers}
    columns[prefix + 'others'] = _other_indicator(provider, set(providers))
    return pd.DataFrame(columns)


def _selected_dummies(series, categories):
    """One-hot encode `series` and keep only `categories`; a category absent from the data becomes an all-zero column."""
    return pd.get_dummies(series).reindex(columns=categories, fill_value=0)


def create_dummies(df, cols):
    """
    Create dummies for the categorical variables of the ieee-fraud-detection training set.
    
    The categorical variables to be turned into dummies are
    ProductCD, card3-6, addr2, P_emaildomain, R_emaildomain, M1,2,3,4,6,7,9, id_12,15,16,28,29,31,35,36,37,38
    and DeviceType.
    
    Note: These are not all categorical/indicator variables in the training set. 
          They were manually selected based on the correlation with the target.
    
    Note: The hand-built indicators (card3/5/6, addr2, email domains, M*, DeviceType) keep
          missing values as missing, including the 'others'/'neither' columns.
          The pd.get_dummies based columns (ProductCD, card4, id_*) encode a missing value as all zeros.
          The ProductCD and card4 columns depend on the categories present in df, so different
          splits can yield different column sets.
    
    args:
    df(pd.DataFrame): ieee-fraud-detection training/validation/test set
    cols: numeric columns that would like to be merged with the dummies
    
    returns:
    df(pd.DataFrame): dataframe with dummies and the input columns, re-indexed from 0.
    """

    product_dummies = pd.get_dummies(df.ProductCD)
    card4_dummies = pd.get_dummies(df.card4)

    card_flags = [
        _indicator(df.card3, 150.0),   # 88% has '150.0'
        _indicator(df.card5, 226.0),   # 50% has '226.0'
        _indicator(df.card6, 'debit'),
        _indicator(df.card6, 'credit'),
        _other_indicator(df.card6, {'debit', 'credit'}),
    ]
    card_dummies = pd.concat([flag.reset_index(drop=True) for flag in card_flags], axis=1)
    card_dummies.columns = ['card3_150', 'card5_226', 'card6_debit', 'card6_credit', 'card6_neither']

    addr2_dummy = _indicator(df.addr2, 87.0)  # 88% has '87'

    P_emaildomain_dummies = _email_provider_dummies(df.P_emaildomain, 'P_')
    R_emaildomain_dummies = _email_provider_dummies(df.R_emaildomain, 'R_')

    # M1/M2/M3/M6/M9 flag 'T', M7 flags 'F', and M4 flags the 'M2' category.
    m_flags = [_indicator(df[col], 'T') for col in ['M1', 'M2', 'M3', 'M6', 'M9']]
    m_flags.append(_indicator(df['M7'], 'F'))
    m_flags.append(_indicator(df['M4'], 'M2'))
    M_dummies = pd.concat([flag.reset_index(drop=True) for flag in m_flags], axis=1)
    M_dummies.columns = ['M1_T', 'M2_T', 'M3_T', 'M6_T', 'M9_T', 'M7_F', 'M4_M2']

    # Reduce id_31 to its first token before any '/', lower-cased (e.g. 'Chrome 63.0' -> 'chrome').
    browser = df.id_31.map(lambda x: x.split()[0].split('/')[0].lower() if isinstance(x, str) else x)
    id_parts = [
        _selected_dummies(df.id_12, ['NotFound']),      # Not Found (21%)
        _selected_dummies(df.id_15, ['Found', 'New']),  # Found, New (11%, 10%)
        _selected_dummies(df.id_16, ['Found']),         # Found (11%)
        _selected_dummies(df.id_28, ['Found']),         # Found (13%)
        _selected_dummies(df.id_29, ['Found']),         # Found (12%)
        _selected_dummies(browser, ['chrome']),         # chrome (13%)
        _selected_dummies(df.id_35, ['F']),             # 'F' (11%)
        _selected_dummies(df.id_36, ['F']),             # 'F' (22%)
        _selected_dummies(df.id_37, ['T']),             # 'T' (19%)
        _selected_dummies(df.id_38, ['F']),             # 'F' (13%)
    ]
    id_dummies = pd.concat([part.reset_index(drop=True) for part in id_parts], axis=1)
    id_dummies.columns = ['id_12_Not_Found', 'id_15_Found', 'id_15_New', 'id_16_Found', 'id_28_Found', 'id_29_Found', 'id_31_chrome',
                          'id_35_F', 'id_36_F', 'id_37_T', 'id_38_F']

    device_mobile = _indicator(df.DeviceType, "mobile")

    parts = [df.loc[:, cols], product_dummies, card4_dummies, card_dummies, addr2_dummy,
             P_emaildomain_dummies, R_emaildomain_dummies, M_dummies, id_dummies, device_mobile]
    # Every piece is re-indexed from 0 so the column-wise concat aligns by position.
    result = pd.concat([part.reset_index(drop=True) for part in parts], axis=1).copy()

    result.columns = (list(cols) + list(product_dummies.columns) + list(card4_dummies.columns)
                      + list(card_dummies.columns) + ['addr2_87']
                      + list(P_emaildomain_dummies.columns) + list(R_emaildomain_dummies.columns)
                      + list(M_dummies.columns) + list(id_dummies.columns) + ['device_mobile'])

    return result

In [3]:
def plot_fbeta_recall_precision_acc(model_val_probs, actual_y, thresh_ps = np.linspace(.0,.99,1000), beta = 1.5):
    """
    Plot Fbeta score, Recall, Precision, Accuracy across thresholds
    and print the threshold with the highest Fbeta score.
    Copied from the following notebook:
    sf20_ds19/curriculum/project-03/class-imbalance/class_imbalance_instacart.ipynb
    
    args:
    model_val_probs(np.array): predicted probabilities of the positive class
    actual_y: actual binary labels, aligned with model_val_probs
    thresh_ps(np.array): candidate thresholds; a sample is labelled positive when its probability >= threshold
    beta(float): number of times that recall is more important than precision
    
    returns:
    precs, recs, fbetas, acc_scores(tuple): one list per metric, with one entry per threshold
    """

    # The caller-supplied thresh_ps is honoured; it is no longer overwritten with the default grid.
    precs, recs, fbetas, acc_scores = [], [], [], []
    for p in thresh_ps:
        model_val_labels = model_val_probs >= p
        prec, rec, fbeta, _ = precision_recall_fscore_support(actual_y, model_val_labels, beta = beta, average = 'binary')
        precs.append(prec)
        recs.append(rec)
        fbetas.append(fbeta)
        acc_scores.append(accuracy_score(actual_y, model_val_labels))

    # Plot order must match the legend order below.
    plt.plot(thresh_ps, fbetas)
    plt.plot(thresh_ps, precs)
    plt.plot(thresh_ps, recs)
    plt.plot(thresh_ps, acc_scores)

    plt.title('Metric Scores vs. Positive Class Decision Probability Threshold')
    plt.legend(['Fbeta','Precision','Recall','Accuracy'])
    plt.xlabel('P threshold')
    plt.ylabel('Metric score')
    plt.ylim(0, 1)

    best_fbeta_score = np.max(fbetas)
    best_thresh_p = thresh_ps[np.argmax(fbetas)]

    print(f"Best Fbeta({beta}) score {best_fbeta_score:8.5f} at prob decision threshold >= {best_thresh_p:8.5f}")
    
    return precs, recs, fbetas, acc_scores

In [4]:
def plot_metric_min_samples_split(min_samples_split_list, scores):
    """
    Plots the Precision, Recall, and Fbeta scores across multiple values of min_samples_split
    and prints the value with the highest Fbeta score.
    
    args:
    min_samples_split_list(list): list of min_samples_split(integer)
    scores(list): list of tuples with precision, recall, and fbeta score for each min_samples_split value,
                  in the same order as min_samples_split_list
    
    returns:
    None
    """
    precs = [score[0] for score in scores]
    recs = [score[1] for score in scores]
    fbetas = [score[2] for score in scores]

    # Plot order must match the legend order below.
    plt.plot(min_samples_split_list, fbetas)
    plt.plot(min_samples_split_list, precs)
    plt.plot(min_samples_split_list, recs)

    # The x axis is min_samples_split, not a probability threshold.
    plt.title('Metric Scores vs. Min Samples Split')
    plt.legend(['Fbeta','Precision','Recall'])
    plt.xlabel('Min samples split')
    plt.ylabel('Metric score')
    plt.ylim(0, 1.1)

    best_fbeta_score = np.max(fbetas)
    best_min = min_samples_split_list[np.argmax(fbetas)]

    print(f"Best Fbeta(1.5) score {best_fbeta_score:8.5f} at min samples split = {best_min}")

In [5]:
def plot_metric_max_features(max_features_list, scores):
    """
    Plots the Precision, Recall, and Fbeta scores across multiple values of max_features
    and prints the value with the highest Fbeta score.
    
    args:
    max_features_list(list): list of max_features(integer)
    scores(list): list of tuples with precision, recall, and fbeta score for each max_features value,
                  in the same order as max_features_list
    
    returns:
    None
    """
    precs = [score[0] for score in scores]
    recs = [score[1] for score in scores]
    fbetas = [score[2] for score in scores]

    # Plot order must match the legend order below.
    plt.plot(max_features_list, fbetas)
    plt.plot(max_features_list, precs)
    plt.plot(max_features_list, recs)

    # The x axis is max_features, not a probability threshold.
    plt.title('Metric Scores vs. Max Features')
    plt.legend(['Fbeta','Precision','Recall'])
    plt.xlabel('Max features')
    plt.ylabel('Metric score')
    plt.ylim(0, 1.1)

    best_fbeta_score = np.max(fbetas)
    best_max_features = max_features_list[np.argmax(fbetas)]

    print(f"Best Fbeta(1.5) score {best_fbeta_score:8.5f} at max features = {best_max_features}")

In [6]:
def plot_metric_n_estimators(n_estimators_list, scores):
    """
    Plots the Precision, Recall, and Fbeta scores across multiple values of n_estimators(number of trees)
    and prints the value with the highest Fbeta score.
    
    args:
    n_estimators_list(list): list of n_estimators(integer)
    scores(list): list of tuples with precision, recall, and fbeta score for each n_estimators value,
                  in the same order as n_estimators_list
    
    returns:
    None
    """
    precs = [score[0] for score in scores]
    recs = [score[1] for score in scores]
    fbetas = [score[2] for score in scores]

    # Plot order must match the legend order below.
    plt.plot(n_estimators_list, fbetas)
    plt.plot(n_estimators_list, precs)
    plt.plot(n_estimators_list, recs)

    # The x axis is the number of trees, not a probability threshold.
    plt.title('Metric Scores vs. Number of Trees')
    plt.legend(['Fbeta','Precision','Recall'])
    plt.xlabel('Number of trees')
    plt.ylabel('Metric score')
    plt.ylim(0, 1.1)

    best_fbeta_score = np.max(fbetas)
    best_n_estimators = n_estimators_list[np.argmax(fbetas)]

    print(f"Best Fbeta(1.5) score {best_fbeta_score:8.5f} at n_estimators = {best_n_estimators}")

In [7]:
def randomforest_result(clf, X_train, y_train, X_val, y_val, scores_train=None, scores_val=None):
    """
    
    Fits the classifier, then prints Precision, Recall, and F beta (beta = 1.5) scores of clf.predict
    and appends them to the input lists for training set and validation set.
    
    args:
    clf(classifier): un-trained classifier with only hyperparameters set; it is fitted in place
    X_train, y_train, X_val, y_val(pd.DataFrame): training set and validation set
    scores_train(list): list to append the (precision, recall, fbeta) tuple of the training set to;
                        a fresh list is created when omitted
    scores_val(list): list to append the (precision, recall, fbeta) tuple of the validation set to;
                      a fresh list is created when omitted
    
    returns:
    clf, scores_train, scores_val (tuple)
    
    """
    # Create the lists per call: a mutable default would be shared by every call that omits them.
    if scores_train is None:
        scores_train = []
    if scores_val is None:
        scores_val = []

    clf.fit(X_train, y_train)

    prec, rec, fbeta, _ = precision_recall_fscore_support(y_train, clf.predict(X_train), beta = 1.5, average = 'binary')
    scores_train.append((prec, rec, fbeta))
    print(f"Train set: Precision = {round(prec,5)}, Recall = {round(rec,5)}, F_1.5 = {round(fbeta,5)}")

    prec, rec, fbeta, _ = precision_recall_fscore_support(y_val, clf.predict(X_val), beta = 1.5, average = 'binary')
    scores_val.append((prec, rec, fbeta))
    print(f"Val set: Precision = {round(prec,5)}, Recall = {round(rec,5)}, F_1.5 = {round(fbeta,5)}\n")
    
    return clf, scores_train, scores_val

In [8]:
def extra_trees_result(clf, X_train, y_train, X_val, y_val, scores_train=None, scores_val=None):
    """
    
    Fits the classifier, then prints Precision, Recall, and F beta (beta = 1.5) scores of clf.predict
    and appends them to the input lists for training set and validation set.
    
    args:
    clf(classifier): un-trained classifier with only hyperparameters set; it is fitted in place
    X_train, y_train, X_val, y_val(pd.DataFrame): training set and validation set
    scores_train(list): list to append the (precision, recall, fbeta) tuple of the training set to;
                        a fresh list is created when omitted
    scores_val(list): list to append the (precision, recall, fbeta) tuple of the validation set to;
                      a fresh list is created when omitted
    
    returns:
    clf, scores_train, scores_val (tuple)
    
    """
    # Create the lists per call: a mutable default would be shared by every call that omits them.
    if scores_train is None:
        scores_train = []
    if scores_val is None:
        scores_val = []

    clf.fit(X_train, y_train)

    prec, rec, fbeta, _ = precision_recall_fscore_support(y_train, clf.predict(X_train), beta = 1.5, average = 'binary')
    scores_train.append((prec, rec, fbeta))
    print(f"Train set: Precision = {round(prec,5)}, Recall = {round(rec,5)}, F_1.5 = {round(fbeta,5)}")

    prec, rec, fbeta, _ = precision_recall_fscore_support(y_val, clf.predict(X_val), beta = 1.5, average = 'binary')
    scores_val.append((prec, rec, fbeta))
    print(f"Val set: Precision = {round(prec,5)}, Recall = {round(rec,5)}, F_1.5 = {round(fbeta,5)}\n")
    
    return clf, scores_train, scores_val

In [9]:
def check_multicollinearity(df):
    """
    Ranks every ordered pair of distinct features by the absolute value of their correlation.
    
    args:
    df(pd.DataFrame): Given X matrix
    
    returns:
    x_cor(pd.DataFrame): Dataframe with three columns: var1, var2, and correlation (absolute value),
                         sorted from the most to the least correlated pair. Each pair appears twice,
                         once as (a, b) and once as (b, a).
    """
    abs_corr = df.corr().abs()

    # Flatten the square matrix into one (var1, var2) -> correlation entry per cell.
    flat = abs_corr.unstack().sort_values(kind="quicksort")
    pairs = flat.to_frame().reset_index()
    pairs.columns = ['var1', 'var2', 'correlation']

    # Drop the diagonal (a feature paired with itself).
    is_distinct = pairs['var1'] != pairs['var2']
    ranked = pairs[is_distinct].sort_values(by='correlation', ascending=False)

    return ranked.reset_index(drop=True)