In [1]:
#importing my libraries
import matplotlib.pyplot as plt                      
import pandas as pd                                  
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import roc_auc_score       
from sklearn.metrics import confusion_matrix  # predictive modeling with nice outputs 
import statsmodels.formula.api as smf # regression modeling
import numpy as np

# CART model packages
from sklearn.tree import DecisionTreeClassifier   
from sklearn.tree import export_graphviz   
from io import StringIO
from IPython.display import Image                  
import pydotplus 

# new packages
from sklearn.model_selection import RandomizedSearchCV    
from sklearn.metrics import make_scorer  

# Random Forest
from sklearn.ensemble import RandomForestClassifier     # random forest
from sklearn.ensemble import GradientBoostingClassifier # gbm

# setting pandas print options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Data Preparation

In [2]:
# specifying file name
file = 'Datasets/Apprentice_Chef_Dataset.xlsx'

# reading the file into Python
data_df = pd.read_excel(file)

## Missing Value Analysis and Imputation

In [3]:
data_df.isnull().sum(axis = 0)

REVENUE                         0
CROSS_SELL_SUCCESS              0
NAME                            0
EMAIL                           0
FIRST_NAME                      0
FAMILY_NAME                    47
TOTAL_MEALS_ORDERED             0
UNIQUE_MEALS_PURCH              0
CONTACTS_W_CUSTOMER_SERVICE     0
PRODUCT_CATEGORIES_VIEWED       0
AVG_TIME_PER_SITE_VISIT         0
MOBILE_NUMBER                   0
CANCELLATIONS_BEFORE_NOON       0
CANCELLATIONS_AFTER_NOON        0
TASTES_AND_PREFERENCES          0
PC_LOGINS                       0
MOBILE_LOGINS                   0
WEEKLY_PLAN                     0
EARLY_DELIVERIES                0
LATE_DELIVERIES                 0
PACKAGE_LOCKER                  0
REFRIGERATED_LOCKER             0
AVG_PREP_VID_TIME               0
LARGEST_ORDER_SIZE              0
MASTER_CLASSES_ATTENDED         0
MEDIAN_MEAL_RATING              0
AVG_CLICKS_PER_VISIT            0
TOTAL_PHOTOS_VIEWED             0
dtype: int64

In [4]:
# creating a dropped dataset to visualize 'FAMILY_NAME'
df_dropped = data_df.dropna()

#check if there is any missing values in the dropped dataset
df_dropped.isnull().sum()

REVENUE                        0
CROSS_SELL_SUCCESS             0
NAME                           0
EMAIL                          0
FIRST_NAME                     0
FAMILY_NAME                    0
TOTAL_MEALS_ORDERED            0
UNIQUE_MEALS_PURCH             0
CONTACTS_W_CUSTOMER_SERVICE    0
PRODUCT_CATEGORIES_VIEWED      0
AVG_TIME_PER_SITE_VISIT        0
MOBILE_NUMBER                  0
CANCELLATIONS_BEFORE_NOON      0
CANCELLATIONS_AFTER_NOON       0
TASTES_AND_PREFERENCES         0
PC_LOGINS                      0
MOBILE_LOGINS                  0
WEEKLY_PLAN                    0
EARLY_DELIVERIES               0
LATE_DELIVERIES                0
PACKAGE_LOCKER                 0
REFRIGERATED_LOCKER            0
AVG_PREP_VID_TIME              0
LARGEST_ORDER_SIZE             0
MASTER_CLASSES_ATTENDED        0
MEDIAN_MEAL_RATING             0
AVG_CLICKS_PER_VISIT           0
TOTAL_PHOTOS_VIEWED            0
dtype: int64

In [5]:
# creating an imputation value
fill = 'Unknown'

# imputing 'FAMILY_NAME'
data_df['FAMILY_NAME'] = data_df['FAMILY_NAME'].fillna(fill)

# Feature Engineering

In [6]:
# Creating New Calculated Variables
data_df['MOBILE_LOGINS_RATE'] = data_df['MOBILE_LOGINS'] / data_df['TOTAL_MEALS_ORDERED']
data_df['CANCELLATION_RATE'] = (data_df['CANCELLATIONS_BEFORE_NOON'] + data_df['CANCELLATIONS_AFTER_NOON'])/ data_df['TOTAL_MEALS_ORDERED']
data_df['CATVIEWS_CLICKS_RATIO'] = data_df['PRODUCT_CATEGORIES_VIEWED'] / data_df['AVG_CLICKS_PER_VISIT']
data_df['CATVIEWS_VISIT_RATIO'] = data_df['PRODUCT_CATEGORIES_VIEWED'] / data_df['AVG_TIME_PER_SITE_VISIT']


# checking result
data_df.loc[ : , ['MOBILE_LOGINS_RATE','CANCELLATION_RATE',
                 'CATVIEWS_CLICKS_RATIO',
                 'CATVIEWS_VISIT_RATIO']].head()


Unnamed: 0,MOBILE_LOGINS_RATE,CANCELLATION_RATE,CATVIEWS_CLICKS_RATIO,CATVIEWS_VISIT_RATIO
0,0.142857,0.285714,0.588235,0.208333
1,0.011494,0.0,0.615385,0.198265
2,0.066667,0.2,0.3125,0.252908
3,0.076923,0.153846,0.357143,0.055556
4,0.021277,0.0,0.833333,0.247647


## Continuous Variables: Trend Based Features

In [7]:
# Log transformation on the continuos variables
data_df['log_REVENUE'] = np.log10(data_df['REVENUE'])
data_df['log_AVG_TIME_PER_SITE_VISIT'] = np.log10(data_df['AVG_TIME_PER_SITE_VISIT'])
data_df['log_AVG_PREP_VID_TIME'] = np.log10(data_df['AVG_PREP_VID_TIME'])
data_df['log_TOTAL_MEALS_ORDERED'] = np.log10(data_df['TOTAL_MEALS_ORDERED'])

In [8]:
# dummy variable for the 2 features
data_df['has_TOTAL_PHOTOS_VIEWED'] = 0
data_df['has_WEEKLY_PLAN'] = 0


for index, value in data_df.iterrows():
    
    # TOTAL_PHOTOS_VIEWED
    if data_df.loc[index, 'TOTAL_PHOTOS_VIEWED'] > 0:
        data_df.loc[index, 'has_TOTAL_PHOTOS_VIEWED'] = 1


    # Second_Flr_SF
    if data_df.loc[index, 'WEEKLY_PLAN'] > 0:
        data_df.loc[index, 'has_WEEKLY_PLAN'] = 1
        
        
# checking results
data_df['has_TOTAL_PHOTOS_VIEWED'].value_counts(normalize = False).sort_index()
data_df['has_WEEKLY_PLAN'].value_counts(normalize = False).sort_index()

data_df[['has_TOTAL_PHOTOS_VIEWED', 'has_WEEKLY_PLAN']].head(10)


Unnamed: 0,has_TOTAL_PHOTOS_VIEWED,has_WEEKLY_PLAN
0,0,0
1,1,1
2,0,1
3,0,1
4,1,1
5,0,0
6,1,1
7,0,1
8,0,1
9,1,1


## Interval / Count Variables

In [9]:
# Categorizing the Order size into 3 (Small, Medium, Large)

data_df['ORDER_SIZE_SMALL'] = 0
data_df['ORDER_SIZE_MEDIUM'] = 0
data_df['ORDER_SIZE_LARGE'] = 0

for index, value in data_df.iterrows():
    if data_df.loc[index, 'LARGEST_ORDER_SIZE'] >= 8.00:
        data_df.loc[index, 'ORDER_SIZE_LARGE'] = 1
    elif data_df.loc[index, 'LARGEST_ORDER_SIZE'] >= 7.00:
        data_df.loc[index, 'ORDER_SIZE_MEDIUM'] = 1
    elif data_df.loc[index, 'LARGEST_ORDER_SIZE'] >= 4.00:
        data_df.loc[index, 'ORDER_SIZE_SMALL'] = 1
    else:
        continue

In [10]:
# Creating new columns for dummy variable
data_df['has_CANCELLATIONS_BEFORE_NOON']  = 0
data_df['has_CANCELLATIONS_AFTER_NOON']   = 0
data_df['has_MASTER_CLASSES_ATTENDED']    = 0
data_df['has_EARLY_DELIVERIES']           = 0
data_df['has_LATE_DELIVERIES']             = 0
data_df['no_LATE_DELIVERIES']              = 0

        
# for loop to declare 0 and 1 
for index, value in data_df.iterrows():
    
    # cancellations_before_noon
    if data_df.loc[index,'CANCELLATIONS_BEFORE_NOON'] > 0:
        data_df.loc[index, 'has_CANCELLATIONS_BEFORE_NOON'] = 1

    # cancellations_after_noon
    if data_df.loc[index, 'CANCELLATIONS_AFTER_NOON'] > 0:
        data_df.loc[index, 'has_CANCELLATIONS_AFTER_NOON'] = 1
        
    # master_classes_attended
    if data_df.loc[index, 'MASTER_CLASSES_ATTENDED'] > 0:
        data_df.loc[index, 'has_MASTER_CLASSES_ATTENDED'] = 1
        
    # early_deliveries
    if data_df.loc[index, 'EARLY_DELIVERIES'] > 0:
        data_df.loc[index, 'has_EARLY_DELIVERIES'] = 1
    
    # late_deliveries
    if data_df.loc[index, 'LATE_DELIVERIES'] > 0:
        data_df.loc[index, 'has_LATE_DELIVERIES'] = 1

    # no_late_deliveries
    if data_df.loc[index,'LATE_DELIVERIES'] == 0:
        data_df.loc[index, 'no_LATE_DELIVERIES'] = 1

## Categorial Data

In [11]:
# Splitting emails

# placeholder list
placeholder_lst = []

# looping over each email address
for index, col in data_df.iterrows():
    
    # splitting email domain at '@'
    split_email = data_df.loc[index, 'EMAIL'].split(sep = '@')
    
    # appending placeholder_lst with the results
    placeholder_lst.append(split_email)
    

# converting placeholder_lst into a DataFrame 
emails = pd.DataFrame(placeholder_lst)

In [12]:
# renaming column to concatenate
emails.columns = ['0' , 'EMAIL_domain']


# concatenating personal_email_domain with friends DataFrame
data_df = pd.concat([data_df, emails['EMAIL_domain']],
                     axis = 1)


# printing value counts of personal_email_domain
data_df.loc[: ,'EMAIL_domain'].value_counts()

gmail.com           303
protonmail.com      284
yahoo.com           274
msn.com              72
aol.com              69
passport.com         64
hotmail.com          63
live.com             62
me.com               59
amex.com             30
jnj.com              28
merck.com            28
cocacola.com         28
mcdonalds.com        28
apple.com            27
nike.com             27
ge.org               26
ibm.com              26
dupont.com           26
microsoft.com        25
chevron.com          25
unitedhealth.com     24
exxon.com            24
travelers.com        24
boeing.com           23
pg.com               22
verizon.com          22
caterpillar.com      22
mmm.com              22
disney.com           21
walmart.com          21
pfizer.com           20
visa.com             20
jpmorgan.com         19
unitedtech.com       18
cisco.com            18
goldmansacs.com      18
homedepot.com        17
intel.com            17
Name: EMAIL_domain, dtype: int64

In [13]:
#Creating domain types 

# email domain types
professional_emails = ['@amex.com','@jnj.com', '@merck.com', '@cocacola.com','@mcdonalds.com', '@apple.com',
                              '@nike.com','@ge.org','@dupont.com','@ibm.com','@chevron.com','@microsoft.com','@exxon.com','@unitedhealth.com',
                              '@travelers.com','@boeing.com','@mmm.com','@caterpillar.com','@verizon.com','@pg.com',
                              '@walmart.com','@disney.com','@pfizer.com','@visa.com','@jpmorgan.com','@cisco.com',
                              '@unitedtech.com','@goldmansacs.com','@homedepot.com','@intel.com']

personal_emails  = ['@gmail.com', '@yahoo.com', '@protonmail.com']


junk_email_domains       = ['@me.com',
                            '@aol.com',
                            '@hotmail.com',
                            '@live.com',
                            '@msn.com',
                            '@passport.com']


# placeholder list
placeholder_lst = []


# looping to group observations by domain type
for domain in data_df['EMAIL_domain']:
    
    if '@' + domain in professional_emails:
        placeholder_lst.append('professional')
        

    elif '@' + domain in personal_emails:
        placeholder_lst.append('personal')
        
    elif '@' + domain in junk_email_domains:
        placeholder_lst.append('junk')
    
    else:
            print('Unknown')


# concatenating with original DataFrame
data_df['domain_group'] = pd.Series(placeholder_lst)


# checking results
data_df['domain_group'].value_counts()

personal        861
professional    696
junk            389
Name: domain_group, dtype: int64

In [14]:
# one hot encoding for email domain categorial variable
one_hot_domain       = pd.get_dummies(data_df['domain_group'])


# dropping categorical variables after they've been encoded
data_df = data_df.drop('EMAIL', axis = 1)
data_df = data_df.drop('domain_group', axis = 1)
data_df = data_df.drop('EMAIL_domain', axis = 1)

# joining codings together
data_df = data_df.join([one_hot_domain])


# saving new columns
new_columns = data_df.columns

In [15]:
# Counting the number of names

def mv_flagger(df):
    """
Flags all columns that have missing values with 'm-COLUMN_NAME'.

PARAMETERS
----------
df : DataFrame to flag missing values


RETURNS
-------
DataFrame with missing value flags."""


    for col in df:

        if df[col].isnull().astype(int).sum() > 0:
            df['m_'+col] = df[col].isnull().astype(int)
            
    return df



#########################
# text_split_feature
#########################
def text_split_feature(col, df, sep=' ', new_col_name='NUM_OF_NAMES'):
    """
Splits values in a string Series (as part of a DataFrame) and sums the number
of resulting items. Automatically appends summed column to original DataFrame.

PARAMETERS
----------
col          : column to split
df           : DataFrame where column is located
sep          : string sequence to split by, default ' '
new_col_name : name of new column after summing split, default
               'number_of_names'
"""
    
    df[new_col_name] = 0
    
    
    for index, val in df.iterrows():
        df.loc[index, new_col_name] = len(df.loc[index, col].split(sep = ' '))

In [16]:
# calling text_split_feature
text_split_feature(col = 'NAME',
                   df  = data_df)


# checking results
data_df['NUM_OF_NAMES'].value_counts().sort_index()

1     591
2    1201
3      98
4       9
5      35
6      12
Name: NUM_OF_NAMES, dtype: int64

# USER DEFINED FUNCTIONS

In [17]:
# optimal_neighbors

def optimal_neighbors(x_data,
                      y_data,
                      standardize = True,
                      pct_test=0.25,
                      seed=219,
                      response_type='reg',
                      max_neighbors=20,
                      show_viz=True):
    """
Exhaustively compute training and testing results for KNN across
[1, max_neighbors]. Outputs the maximum test score and (by default) a
visualization of the results.
PARAMETERS
----------
x_data        : explanatory variable data
y_data        : response variable
standardize   : whether or not to standardize the X data, default True
pct_test      : test size for training and validation from (0,1), default 0.25
seed          : random seed to be used in algorithm, default 219
response_type : type of neighbors algorithm to use, default 'reg'
    Use 'reg' for regression (KNeighborsRegressor)
    Use 'class' for classification (KNeighborsClassifier)
max_neighbors : maximum number of neighbors in exhaustive search, default 20
show_viz      : display or surpress k-neigbors visualization, default True
"""    
    
    
    if standardize == True:
        # optionally standardizing x_data
        scaler             = StandardScaler()
        scaler.fit(x_data)
        x_scaled           = scaler.transform(x_data)
        x_scaled_df        = pd.DataFrame(x_scaled)
        x_data             = x_scaled_df



    # train-test split
    x_train, x_test, y_train, y_test = train_test_split(x_data,
                                                        y_data,
                                                        test_size = pct_test,
                                                        random_state = seed)


    # creating lists for training set accuracy and test set accuracy
    training_accuracy = []
    test_accuracy = []
    
    
    # setting neighbor range
    neighbors_settings = range(1, max_neighbors + 1)


    for n_neighbors in neighbors_settings:
        # building the model based on response variable type
        if response_type == 'reg':
            clf = KNeighborsRegressor(n_neighbors = n_neighbors)
            clf.fit(x_train, y_train)
            
        elif response_type == 'class':
            clf = KNeighborsClassifier(n_neighbors = n_neighbors)
            clf.fit(x_train, y_train)            
            
        else:
            print("Error: response_type must be 'reg' or 'class'")
        
        
        # recording the training set accuracy
        training_accuracy.append(clf.score(x_train, y_train))
    
        # recording the generalization accuracy
        test_accuracy.append(clf.score(x_test, y_test))


    # optionally displaying visualization
    if show_viz == True:
        # plotting the visualization
        fig, ax = plt.subplots(figsize=(12,8))
        plt.plot(neighbors_settings, training_accuracy, label = "training accuracy")
        plt.plot(neighbors_settings, test_accuracy, label = "test accuracy")
        plt.ylabel("Accuracy")
        plt.xlabel("n_neighbors")
        plt.legend()
        plt.show()
    
    
    # returning optimal number of neighbors
    print(f"The optimal number of neighbors is: {test_accuracy.index(max(test_accuracy))+1}")
    return test_accuracy.index(max(test_accuracy))+1



# visual_cm

def visual_cm(true_y, pred_y, labels = None):
    """
Creates a visualization of a confusion matrix.

PARAMETERS
----------
true_y : true values for the response variable
pred_y : predicted values for the response variable
labels : , default None
    """
    # visualizing the confusion matrix

    # setting labels
    lbls = labels
    

    # declaring a confusion matrix object
    cm = confusion_matrix(y_true = true_y,
                          y_pred = pred_y)


    # heatmap
    sns.heatmap(cm,
                annot       = True,
                xticklabels = lbls,
                yticklabels = lbls,
                cmap        = 'Blues',
                fmt         = 'g')


    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix of the Classifier')
    plt.show()

In [18]:
# Checking correlations with Y variable
display(data_df.loc[:, "CROSS_SELL_SUCCESS"].value_counts())

corr_scores = data_df.corr()

corr_scores.loc[:, "CROSS_SELL_SUCCESS"].sort_values(ascending = False)

1    1321
0     625
Name: CROSS_SELL_SUCCESS, dtype: int64

CROSS_SELL_SUCCESS               1.000000
professional                     0.194102
CANCELLATIONS_BEFORE_NOON        0.163442
NUM_OF_NAMES                     0.156373
has_CANCELLATIONS_BEFORE_NOON    0.138611
MOBILE_NUMBER                    0.102657
CANCELLATION_RATE                0.095703
TASTES_AND_PREFERENCES           0.081438
REFRIGERATED_LOCKER              0.068321
has_MASTER_CLASSES_ATTENDED      0.049939
PC_LOGINS                        0.044462
PACKAGE_LOCKER                   0.043534
personal                         0.038841
MASTER_CLASSES_ATTENDED          0.037213
CONTACTS_W_CUSTOMER_SERVICE      0.036541
log_AVG_PREP_VID_TIME            0.034529
AVG_PREP_VID_TIME                0.032115
MEDIAN_MEAL_RATING               0.031798
ORDER_SIZE_SMALL                 0.026679
log_TOTAL_MEALS_ORDERED          0.023570
LARGEST_ORDER_SIZE               0.022247
CATVIEWS_CLICKS_RATIO            0.019348
log_AVG_TIME_PER_SITE_VISIT      0.015990
EARLY_DELIVERIES                 0

# TRAIN - TEST SPLIT

In [19]:
# declaring explanatory variables
data_df_drop = ['CROSS_SELL_SUCCESS','NAME','FIRST_NAME', 'FAMILY_NAME']

data_df_x = data_df.drop(data_df_drop, axis = 1)

# declaring response variable
data_df_y = data_df.loc[ : , 'CROSS_SELL_SUCCESS']

In [20]:
# train-test split with stratification
x_train, x_test, y_train, y_test = train_test_split(
            data_df_x,
            data_df_y,
            test_size    = 0.25,
            random_state = 219,
            stratify     = data_df_y)


# merging training data for statsmodels
data_df_train = pd.concat([x_train, y_train], axis = 1)

In [21]:
for val in data_df_x:
    print(f" {val} + ")

 REVENUE + 
 TOTAL_MEALS_ORDERED + 
 UNIQUE_MEALS_PURCH + 
 CONTACTS_W_CUSTOMER_SERVICE + 
 PRODUCT_CATEGORIES_VIEWED + 
 AVG_TIME_PER_SITE_VISIT + 
 MOBILE_NUMBER + 
 CANCELLATIONS_BEFORE_NOON + 
 CANCELLATIONS_AFTER_NOON + 
 TASTES_AND_PREFERENCES + 
 PC_LOGINS + 
 MOBILE_LOGINS + 
 WEEKLY_PLAN + 
 EARLY_DELIVERIES + 
 LATE_DELIVERIES + 
 PACKAGE_LOCKER + 
 REFRIGERATED_LOCKER + 
 AVG_PREP_VID_TIME + 
 LARGEST_ORDER_SIZE + 
 MASTER_CLASSES_ATTENDED + 
 MEDIAN_MEAL_RATING + 
 AVG_CLICKS_PER_VISIT + 
 TOTAL_PHOTOS_VIEWED + 
 MOBILE_LOGINS_RATE + 
 CANCELLATION_RATE + 
 CATVIEWS_CLICKS_RATIO + 
 CATVIEWS_VISIT_RATIO + 
 log_REVENUE + 
 log_AVG_TIME_PER_SITE_VISIT + 
 log_AVG_PREP_VID_TIME + 
 log_TOTAL_MEALS_ORDERED + 
 has_TOTAL_PHOTOS_VIEWED + 
 has_WEEKLY_PLAN + 
 ORDER_SIZE_SMALL + 
 ORDER_SIZE_MEDIUM + 
 ORDER_SIZE_LARGE + 
 has_CANCELLATIONS_BEFORE_NOON + 
 has_CANCELLATIONS_AFTER_NOON + 
 has_MASTER_CLASSES_ATTENDED + 
 has_EARLY_DELIVERIES + 
 has_LATE_DELIVERIES + 
 no_LATE_DEL

In [22]:
# instantiating a logistic regression model object
logit_full = smf.logit(formula = """ CROSS_SELL_SUCCESS ~  REVENUE + 
 TOTAL_MEALS_ORDERED + 
 UNIQUE_MEALS_PURCH + 
 CONTACTS_W_CUSTOMER_SERVICE + 
 PRODUCT_CATEGORIES_VIEWED + 
 AVG_TIME_PER_SITE_VISIT + 
 MOBILE_NUMBER + 
 CANCELLATIONS_BEFORE_NOON + 
 CANCELLATIONS_AFTER_NOON + 
 TASTES_AND_PREFERENCES + 
 PC_LOGINS + 
 MOBILE_LOGINS + 
 WEEKLY_PLAN + 
 EARLY_DELIVERIES + 
 LATE_DELIVERIES + 
 PACKAGE_LOCKER + 
 REFRIGERATED_LOCKER + 
 AVG_PREP_VID_TIME + 
 LARGEST_ORDER_SIZE + 
 MASTER_CLASSES_ATTENDED + 
 MEDIAN_MEAL_RATING + 
 AVG_CLICKS_PER_VISIT + 
 TOTAL_PHOTOS_VIEWED + 
 MOBILE_LOGINS_RATE + 
 CANCELLATION_RATE + 
 CATVIEWS_CLICKS_RATIO + 
 CATVIEWS_VISIT_RATIO + 
 log_REVENUE + 
 log_AVG_TIME_PER_SITE_VISIT + 
 log_AVG_PREP_VID_TIME + 
 log_TOTAL_MEALS_ORDERED + 
 has_TOTAL_PHOTOS_VIEWED + 
 has_WEEKLY_PLAN + 
 ORDER_SIZE_SMALL + 
 ORDER_SIZE_MEDIUM + 
 ORDER_SIZE_LARGE + 
 has_CANCELLATIONS_BEFORE_NOON + 
 has_CANCELLATIONS_AFTER_NOON + 
 has_MASTER_CLASSES_ATTENDED + 
 has_EARLY_DELIVERIES + 
 has_LATE_DELIVERIES + 
 no_LATE_DELIVERIES + 
 junk + 
 personal + 
 professional + 
 NUM_OF_NAMES
                                     """,
 data    = data_df_train)


# fitting the model object
logit_full = logit_full.fit()


# checking the results SUMMARY
logit_full.summary()

Optimization terminated successfully.
         Current function value: 0.526777
         Iterations 19


0,1,2,3
Dep. Variable:,CROSS_SELL_SUCCESS,No. Observations:,1459.0
Model:,Logit,Df Residuals:,1414.0
Method:,MLE,Df Model:,44.0
Date:,"Sun, 14 Feb 2021",Pseudo R-squ.:,0.1611
Time:,23:12:56,Log-Likelihood:,-768.57
converged:,True,LL-Null:,-916.19
Covariance Type:,nonrobust,LLR p-value:,6.267e-39

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-2.5357,4.44e+06,-5.71e-07,1.000,-8.71e+06,8.71e+06
REVENUE,-0.0002,0.000,-1.171,0.241,-0.001,0.000
TOTAL_MEALS_ORDERED,-0.0109,0.003,-3.171,0.002,-0.018,-0.004
UNIQUE_MEALS_PURCH,0.0003,0.027,0.010,0.992,-0.053,0.053
CONTACTS_W_CUSTOMER_SERVICE,0.0930,0.039,2.403,0.016,0.017,0.169
PRODUCT_CATEGORIES_VIEWED,-0.0319,0.112,-0.284,0.776,-0.252,0.188
AVG_TIME_PER_SITE_VISIT,0.0011,0.002,0.534,0.593,-0.003,0.005
MOBILE_NUMBER,0.9109,0.183,4.973,0.000,0.552,1.270
CANCELLATIONS_BEFORE_NOON,0.1780,0.081,2.199,0.028,0.019,0.337


# CANDIDATE DICTIONARY

In [23]:
# creating a dictionary to store candidate models
# I tried different variables for different models and picked which set will perform better

candidate_dict = {

 # full model
 'logit_full'   : ['REVENUE',
'TOTAL_MEALS_ORDERED',
'UNIQUE_MEALS_PURCH',
'CONTACTS_W_CUSTOMER_SERVICE',
'PRODUCT_CATEGORIES_VIEWED',
'AVG_TIME_PER_SITE_VISIT',
'MOBILE_NUMBER',
'CANCELLATIONS_BEFORE_NOON',
'CANCELLATIONS_AFTER_NOON',
'TASTES_AND_PREFERENCES',
'PC_LOGINS',
'MOBILE_LOGINS',
'WEEKLY_PLAN',
'EARLY_DELIVERIES',
'LATE_DELIVERIES',
'PACKAGE_LOCKER',
'REFRIGERATED_LOCKER',
'AVG_PREP_VID_TIME',
'LARGEST_ORDER_SIZE',
'MASTER_CLASSES_ATTENDED',
'MEDIAN_MEAL_RATING',
'AVG_CLICKS_PER_VISIT',
'TOTAL_PHOTOS_VIEWED',
'MOBILE_LOGINS_RATE',
'CANCELLATION_RATE',
'has_CANCELLATIONS_BEFORE_NOON',
'has_CANCELLATIONS_AFTER_NOON',
'has_WEEKLY_PLAN',
'has_MASTER_CLASSES_ATTENDED',
'has_EARLY_DELIVERIES',
'has_LATE_DELIVERIES',
'no_LATE_DELIVERIES',
'professional',
'personal',
'junk',
'MOBILE_TO_PC_RATIO',
'CATVIEWS_CLICKS_RATIO',
'CATVIEWS_VISIT_RATIO'],
 

            
# significant variables only (set 1)
 'logit_sig'    : ['CONTACTS_W_CUSTOMER_SERVICE' ,
                   'MOBILE_NUMBER' ,
                   'TASTES_AND_PREFERENCES' ,
                   'CANCELLATIONS_BEFORE_NOON' ,
                   'PC_LOGINS' , 
                   'EARLY_DELIVERIES' ,
                   'REFRIGERATED_LOCKER' ,
                   'NUM_OF_NAMES',
                   'junk'],


# significant variables only (set 1)
 'logit_sig2'    : ['TOTAL_MEALS_ORDERED', 'MOBILE_LOGINS',
                    'WEEKLY_PLAN', 'has_MASTER_CLASSES_ATTENDED','PRODUCT_CATEGORIES_VIEWED', 
                    'CONTACTS_W_CUSTOMER_SERVICE',
                   'MOBILE_NUMBER', 'TASTES_AND_PREFERENCES', 'CANCELLATIONS_BEFORE_NOON',
                   'PC_LOGINS',  'EARLY_DELIVERIES','REFRIGERATED_LOCKER',
                   'NUM_OF_NAMES','junk'],
        
    
           
# significant variables only (set 1)
 'logit_sig3'    : ['TOTAL_MEALS_ORDERED', 'MOBILE_LOGINS','log_REVENUE',
                    'WEEKLY_PLAN', 'has_MASTER_CLASSES_ATTENDED','PRODUCT_CATEGORIES_VIEWED', 
                    'CONTACTS_W_CUSTOMER_SERVICE', 'MOBILE_NUMBER',
                   'TASTES_AND_PREFERENCES','CANCELLATIONS_BEFORE_NOON',
                   'PC_LOGINS', 'EARLY_DELIVERIES','REFRIGERATED_LOCKER','has_EARLY_DELIVERIES',
                   'NUM_OF_NAMES', 'junk','log_AVG_PREP_VID_TIME', 'ORDER_SIZE_SMALL','ORDER_SIZE_MEDIUM']

}

# Logistic Regression

In [24]:
# train/test split with the full model
data_df_x   =  data_df.loc[ : , candidate_dict['logit_sig']]
data_df_y =  data_df.loc[ : , 'CROSS_SELL_SUCCESS']


# this is the exact code we were using before
X_train, X_test, y_train, y_test = train_test_split(
            data_df_x,
            data_df_y,
            random_state = 219,
            test_size    = 0.25,
            stratify     = data_df_y)


# INSTANTIATING a logistic regression model
logreg = LogisticRegression(solver = 'lbfgs',
                            C = 1,
                            random_state = 219)


# FITTING the training data
logreg_fit = logreg.fit(X_train, y_train)


# PREDICTING based on the testing set
logreg_pred = logreg_fit.predict(X_test)


# SCORING the results
print('LogReg Training ACCURACY:', logreg_fit.score(X_train, y_train).round(4))
print('LogReg Testing  ACCURACY:', logreg_fit.score(X_test, y_test).round(4))

# saving scoring data for future use
logreg_train_score = logreg_fit.score(X_train, y_train).round(4) # accuracy
logreg_test_score  = logreg_fit.score(X_test, y_test).round(4)   # accuracy


# displaying and saving the gap between training and testing
print('LogReg Train-Test Gap   :', abs(logreg_train_score - logreg_test_score).round(4))
logreg_test_gap = abs(logreg_train_score - logreg_test_score).round(4)

LogReg Training ACCURACY: 0.7354
LogReg Testing  ACCURACY: 0.729
LogReg Train-Test Gap   : 0.0064


In [25]:
# unpacking the confusion matrix
logreg_tn, \
logreg_fp, \
logreg_fn, \
logreg_tp = confusion_matrix(y_true = y_test, y_pred = logreg_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {logreg_tn}
False Positives: {logreg_fp}
False Negatives: {logreg_fn}
True Positives : {logreg_tp}
""")


True Negatives : 53
False Positives: 103
False Negatives: 29
True Positives : 302



In [26]:
# area under the roc curve (auc)
print(roc_auc_score(y_true  = y_test,
                    y_score = logreg_pred).round(decimals = 4))


# saving AUC score for future use
logreg_auc_score = roc_auc_score(y_true  = y_test,
                                 y_score = logreg_pred).round(decimals = 4)

0.6261


In [27]:
# zipping each feature name to its coefficient
logreg_model_values = zip(data_df[candidate_dict['logit_sig']].columns,
                          logreg_fit.coef_.ravel().round(decimals = 2))


# setting up a placeholder list to store model features
logreg_model_lst = [('intercept', logreg_fit.intercept_[0].round(decimals = 2))]


# printing out each feature-coefficient pair one by one
for val in logreg_model_values:
    logreg_model_lst.append(val)
    

# checking the results
for pair in logreg_model_lst:
    print(pair)

('intercept', -2.77)
('CONTACTS_W_CUSTOMER_SERVICE', 0.05)
('MOBILE_NUMBER', 0.81)
('TASTES_AND_PREFERENCES', 0.37)
('CANCELLATIONS_BEFORE_NOON', 0.28)
('PC_LOGINS', 0.21)
('EARLY_DELIVERIES', 0.06)
('REFRIGERATED_LOCKER', 0.47)
('NUM_OF_NAMES', 0.53)
('junk', -1.54)


# Classification: Full Trees

In [28]:
# INSTANTIATING a classification tree object
full_tree = DecisionTreeClassifier()


# FITTING the training data
full_tree_fit = full_tree.fit(x_train, y_train)


# PREDICTING on new data
full_tree_pred = full_tree_fit.predict(x_test)


# SCORING the model
print('Full Tree Training ACCURACY:', full_tree_fit.score(x_train,
                                                    y_train).round(4))

print('Full Tree Testing ACCURACY :', full_tree_fit.score(x_test,
                                                    y_test).round(4))

print('Full Tree AUC Score:', roc_auc_score(y_true  = y_test,
                                            y_score = full_tree_pred).round(4))


# saving scoring data for future use
full_tree_train_score = full_tree_fit.score(x_train, y_train).round(4) # accuracy
full_tree_test_score  = full_tree_fit.score(x_test, y_test).round(4)   # accuracy


# saving AUC
full_tree_auc_score   = roc_auc_score(y_true  = y_test,
                                      y_score = full_tree_pred).round(4) # auc

Full Tree Training ACCURACY: 1.0
Full Tree Testing ACCURACY : 0.6468
Full Tree AUC Score: 0.6029


In [29]:
# unpacking the confusion matrix
full_tree_tn, \
full_tree_fp, \
full_tree_fn, \
full_tree_tp = confusion_matrix(y_true = y_test, y_pred = full_tree_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {full_tree_tn}
False Positives: {full_tree_fp}
False Negatives: {full_tree_fn}
True Positives : {full_tree_tp}
""")


True Negatives : 75
False Positives: 81
False Negatives: 91
True Positives : 240



# Pruned 

In [30]:
# INSTANTIATING a classification tree object
pruned_tree = DecisionTreeClassifier(max_depth = 4,
                                     min_samples_leaf = 25,
                                     random_state = 219)


# FITTING the training data
pruned_tree_fit  = pruned_tree.fit(X_train, y_train)


# PREDICTING on new data
pruned_tree_pred = pruned_tree_fit.predict(X_test)


# SCORING the model
print('Training ACCURACY:', pruned_tree_fit.score(X_train, y_train).round(4))
print('Testing  ACCURACY:', pruned_tree_fit.score(X_test, y_test).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = pruned_tree_pred).round(4))


# saving scoring data for future use
pruned_tree_train_score = pruned_tree_fit.score(X_train, y_train).round(4) # accuracy
pruned_tree_test_score  = pruned_tree_fit.score(X_test, y_test).round(4)   # accuracy


# saving auc score
pruned_tree_auc_score   = roc_auc_score(y_true  = y_test,
                                        y_score = pruned_tree_pred).round(4) # auc

Training ACCURACY: 0.7505
Testing  ACCURACY: 0.7598
AUC Score        : 0.686


In [31]:
# unpacking the confusion matrix
pruned_tree_tn, \
pruned_tree_fp, \
pruned_tree_fn, \
pruned_tree_tp = confusion_matrix(y_true = y_test, y_pred = pruned_tree_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {pruned_tree_tn}
False Positives: {pruned_tree_fp}
False Negatives: {pruned_tree_fn}
True Positives : {pruned_tree_tp}
""")


True Negatives : 75
False Positives: 81
False Negatives: 36
True Positives : 295



# RESULTS (WITHOUT TUNING)

In [32]:
# This is NOT my final results

# comparing results
print(f"""
Model         AUC Score      TN, FP, FN, TP
-----         ---------      --------------
Logistic      {logreg_auc_score}         {logreg_tn, logreg_fp, logreg_fn, logreg_tp}
Full Tree     {full_tree_auc_score}         {full_tree_tn, full_tree_fp, full_tree_fn, full_tree_tp}
Pruned Tree   {pruned_tree_auc_score}         {pruned_tree_tn, pruned_tree_fp, pruned_tree_fn, pruned_tree_tp}
""")


# creating a dictionary for model results
model_performance = {
    
    'Model Name'    : ['Logistic', 'Full Tree', 'Pruned Tree'],
           
    'AUC Score' : [logreg_auc_score, full_tree_auc_score, pruned_tree_auc_score],
    
    'Training Accuracy' : [logreg_train_score, full_tree_train_score,
                           pruned_tree_train_score],
           
    'Testing Accuracy'  : [logreg_test_score, full_tree_test_score,
                           pruned_tree_test_score],

    'Confusion Matrix'  : [(logreg_tn, logreg_fp, logreg_fn, logreg_tp),
                           (full_tree_tn, full_tree_fp, full_tree_fn, full_tree_tp),
                           (pruned_tree_tn, pruned_tree_fp, pruned_tree_fn, pruned_tree_tp)]}


Model         AUC Score      TN, FP, FN, TP
-----         ---------      --------------
Logistic      0.6261         (53, 103, 29, 302)
Full Tree     0.6029         (75, 81, 91, 240)
Pruned Tree   0.686         (75, 81, 36, 295)



# Hyperparameter Tuning with RandomizedSearchCV

In [33]:
# RandomizedSearchCV
# declaring a hyperparameter space
C_space          = pd.np.arange(0.1, 5.0, 0.1)
warm_start_space = [True, False]
solver_space     = ['newton-cg', 'sag', 'lbfgs']


# creating a hyperparameter grid
param_grid = {'C'          : C_space,
              'warm_start' : warm_start_space,
              'solver'     : solver_space}


# INSTANTIATING the model object without hyperparameters
lr_tuned = LogisticRegression(random_state = 219,
                              max_iter     = 1000)


# GridSearchCV object
lr_tuned_cv = RandomizedSearchCV(estimator           = lr_tuned,   # the model object
                                 param_distributions = param_grid, # parameters to tune
                                 cv                  = 3,          # how many folds in cross-validation
                                 n_iter              = 250,        # number of combinations of hyperparameters to try
                                 random_state        = 219,        # starting point for random sequence
                                 scoring = make_scorer(
                                           roc_auc_score,
                                           needs_threshold = False)) # scoring criteria (AUC)


# FITTING to the FULL DATASET (due to cross-validation)
lr_tuned_cv.fit(data_df_x, data_df_y)


# PREDICT step is not needed


# printing the optimal parameters and best score
print("Tuned Parameters  :", lr_tuned_cv.best_params_)
print("Tuned CV AUC      :", lr_tuned_cv.best_score_.round(4))

  C_space          = pd.np.arange(0.1, 5.0, 0.1)


Tuned Parameters  : {'warm_start': True, 'solver': 'newton-cg', 'C': 3.0000000000000004}
Tuned CV AUC      : 0.6365


In [34]:
# building a model based on hyperparameter tuning results

# INSTANTIATING a logistic regression model with tuned values
lr_tuned = lr_tuned_cv.best_estimator_


# FIT step is not needed


# PREDICTING based on the testing set
lr_tuned_pred = lr_tuned.predict(X_test)


# SCORING the results
print('LR Tuned Training ACCURACY:', lr_tuned.score(X_train, y_train).round(4))
print('LR Tuned Testing  ACCURACY:', lr_tuned.score(X_test, y_test).round(4))
print('LR Tuned AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = lr_tuned_pred).round(4))


# saving scoring data for future use
lr_tuned_train_score = lr_tuned.score(X_train, y_train).round(4) # accuracy
lr_tuned_test_score  = lr_tuned.score(X_test, y_test).round(4)   # accuracy


# saving the AUC score
lr_tuned_auc         = roc_auc_score(y_true  = y_test,
                                     y_score = lr_tuned_pred).round(4) # auc

LR Tuned Training ACCURACY: 0.7375
LR Tuned Testing  ACCURACY: 0.7351
LR Tuned AUC Score        : 0.6357


# Hyperparameter Tuning on Classification Trees

In [35]:
# declaring a hyperparameter space
criterion_space = ['gini', 'entropy']
splitter_space  = ['best', 'random']
depth_space     = pd.np.arange(1, 25, 1)
leaf_space      = pd.np.arange(1, 100, 1)


# creating a hyperparameter grid
param_grid = {'criterion'        : criterion_space,
              'splitter'         : splitter_space,
              'max_depth'        : depth_space,
              'min_samples_leaf' : leaf_space}


# INSTANTIATING the model object without hyperparameters
tuned_tree = DecisionTreeClassifier(random_state = 219)


# RandomizedSearchCV object
tuned_tree_cv = RandomizedSearchCV(estimator             = tuned_tree,
                                   param_distributions   = param_grid,
                                   cv                    = 3,
                                   n_iter                = 1000,
                                   random_state          = 219,
                                   scoring = make_scorer(roc_auc_score,
                                             needs_threshold = False))


# FITTING to the FULL DATASET (due to cross-validation)
tuned_tree_cv.fit(data_df_x, data_df_y)


# PREDICT step is not needed


# printing the optimal parameters and best score
print("Tuned Parameters  :", tuned_tree_cv.best_params_)
print("Tuned Training AUC:", tuned_tree_cv.best_score_.round(4))

  depth_space     = pd.np.arange(1, 25, 1)
  leaf_space      = pd.np.arange(1, 100, 1)


Tuned Parameters  : {'splitter': 'best', 'min_samples_leaf': 16, 'max_depth': 3, 'criterion': 'gini'}
Tuned Training AUC: 0.7032


In [36]:
# building a model based on hyperparameter tuning results

# INSTANTIATING a logistic regression model with tuned values
tree_tuned = tuned_tree_cv.best_estimator_


# FIT step is not needed


# PREDICTING based on the testing set
tree_tuned_pred = tree_tuned.predict(X_test)


# SCORING the results
print('Training ACCURACY:', tree_tuned.score(X_train, y_train).round(4))
print('Testing  ACCURACY:', tree_tuned.score(X_test, y_test).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = tree_tuned_pred).round(4))


# saving scoring data for future use
tree_tuned_train_score = tree_tuned.score(X_train, y_train).round(4) # accuracy
tree_tuned_test_score  = tree_tuned.score(X_test, y_test).round(4)   # accuracy

# displaying and saving the gap between training and testing
print('Train-Test Gap   :', abs(tree_tuned_train_score - \
                                       tree_tuned_test_score).round(4))

tree_tuned_test_gap = abs(tree_tuned_train_score - tree_tuned_test_score).round(4)


# saving the AUC score
tree_tuned_auc         = roc_auc_score(y_true  = y_test,
                                     y_score = tree_tuned_pred).round(4) # auc

Training ACCURACY: 0.7402
Testing  ACCURACY: 0.7762
AUC Score        : 0.732
Train-Test Gap   : 0.036


In [37]:
# unpacking the confusion matrix
tuned_tree_tn, \
tuned_tree_fp, \
tuned_tree_fn, \
tuned_tree_tp = confusion_matrix(y_true = y_test, y_pred = tree_tuned_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {tuned_tree_tn}
False Positives: {tuned_tree_fp}
False Negatives: {tuned_tree_fn}
True Positives : {tuned_tree_tp}
""")


True Negatives : 95
False Positives: 61
False Negatives: 48
True Positives : 283



# Random Forest

In [38]:
# train/test split with the full model
data_df_x   =  data_df.loc[ : , candidate_dict['logit_sig3']]
data_df_y =  data_df.loc[ : , 'CROSS_SELL_SUCCESS']

# this is the exact code we were using before
X_train, X_test, y_train, y_test = train_test_split(
            data_df_x,
            data_df_y,
            random_state = 219,
            test_size    = 0.25,
            stratify     = data_df_y)

In [39]:
# building a model based on hyperparameter tuning results

# copy/pasting in the best_estimator_ results
# to avoid running another RandomizedSearch
forest_tuned = RandomForestClassifier(bootstrap=False, max_depth=8, max_features='sqrt',
                        min_samples_split=4, n_estimators=350, random_state=219)


# FITTING the model object
forest_tuned_fit = forest_tuned.fit(data_df_x, data_df_y)


# PREDICTING based on the testing set
forest_tuned_pred = forest_tuned_fit.predict(X_test)


# SCORING the results
print('Forest Tuned Training ACCURACY:', forest_tuned.score(X_train, y_train).round(4))
print('Forest Tuned Testing  ACCURACY:', forest_tuned.score(X_test, y_test).round(4))



# saving scoring data for future use
forest_tuned_train_score = forest_tuned.score(X_train, y_train).round(4) # accuracy
forest_tuned_test_score  = forest_tuned.score(X_test, y_test).round(4)   # accuracy


# displaying and saving the gap between training and testing
print('Forest Train-Test Gap   :', abs(forest_tuned_train_score - \
                                       forest_tuned_test_score).round(4))

forest_tuned_gap = abs(forest_tuned_train_score - forest_tuned_test_score).round(4)

# unpacking the confusion matrix
rand_forest_tn, \
rand_forest_fp, \
rand_forest_fn, \
rand_forest_tp = confusion_matrix(y_true = y_test, y_pred = forest_tuned_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {rand_forest_tn}
False Positives: {rand_forest_fp}
False Negatives: {rand_forest_fn}
True Positives : {rand_forest_tp}
""")


# saving the AUC score
forest_tuned_auc = roc_auc_score(y_true  = y_test,
                                 y_score = forest_tuned_pred).round(4) # auc


print('Forest Tuned AUC Score        :', roc_auc_score(y_true  = y_test,
                                                       y_score = forest_tuned_pred).round(4))

Forest Tuned Training ACCURACY: 0.865
Forest Tuned Testing  ACCURACY: 0.8953
Forest Train-Test Gap   : 0.0303

True Negatives : 108
False Positives: 48
False Negatives: 3
True Positives : 328

Forest Tuned AUC Score        : 0.8416


# Gradient Boosted Machines

In [40]:
# train/test split with the full model
data_df_x   =  data_df.loc[ : , candidate_dict['logit_sig3']]
data_df_y =  data_df.loc[ : , 'CROSS_SELL_SUCCESS']

# this is the exact code we were using before
X_train, X_test, y_train, y_test = train_test_split(
            data_df_x,
            data_df_y,
            random_state = 219,
            test_size    = 0.25,
            stratify     = data_df_y)

In [41]:
# INSTANTIATING the model object without hyperparameters
gbm_tuned =  GradientBoostingClassifier(criterion='mse', loss='exponential', max_depth=2,
                            max_features='sqrt', n_estimators=200,
                            random_state=219)

# FIT step is needed as we are not using .best_estimator
gbm_tuned_fit = gbm_tuned.fit(data_df_x, data_df_y)

# PREDICTING based on the testing set
gbm_tuned_pred = gbm_tuned_fit.predict(X_test)

# SCORING the results
print('Training ACCURACY:', gbm_tuned_fit.score(X_train, y_train).round(4))
print('Testing ACCURACY :', gbm_tuned_fit.score(X_test, y_test).round(4))


# saving scoring data for future use
full_gbm_train_score = gbm_tuned_fit.score(X_train, y_train).round(4) # accuracy
full_gbm_test_score  = gbm_tuned_fit.score(X_test, y_test).round(4)   # accuracy

# displaying and saving the gap between training and testing
print('Forest Train-Test Gap   :', abs(full_gbm_train_score - \
                                       full_gbm_test_score).round(4))

full_gbm_gap = abs(full_gbm_train_score - full_gbm_test_score).round(4)

# unpacking the confusion matrix
gbm_tuned_tn, \
gbm_tuned_fp, \
gbm_tuned_fn, \
gbm_tuned_tp = confusion_matrix(y_true = y_test, 
                                y_pred = gbm_tuned_pred).ravel()

# printing each result one-by-one
print(f"""
True Negatives : {gbm_tuned_tn}
False Positives: {gbm_tuned_fp}
False Negatives: {gbm_tuned_fn}
True Positives : {gbm_tuned_tp}
""")

print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                  y_score = gbm_tuned_pred).round(4))

# saving the AUC score
gbm_tuned_auc         = roc_auc_score(y_true  = y_test,
                                     y_score = gbm_tuned_pred).round(4) # auce

Training ACCURACY: 0.7855
Testing ACCURACY : 0.7844
Forest Train-Test Gap   : 0.0011

True Negatives : 79
False Positives: 77
False Negatives: 28
True Positives : 303

AUC Score        : 0.7109


# FINAL RESULTS

In [42]:
# creating a dictionary for model results
model_performance = {
    
    'Model Name'        : ['Logistic Regression',
                           'Classification Trees', 
                           'Random Forest [FINAL]',
                           'Gradient Boosted Models'],
           
    'AUC Score'         : [logreg_auc_score, 
                           tree_tuned_auc, 
                           forest_tuned_auc,
                           gbm_tuned_auc],
    
    'Training Accuracy' : [logreg_train_score, 
                           tree_tuned_train_score,
                           forest_tuned_train_score,
                           full_gbm_train_score],
           
    'Testing Accuracy'  : [logreg_test_score, 
                           tree_tuned_test_score,
                           forest_tuned_test_score,
                           full_gbm_test_score],
    
    'Train-Test Gap'    : [logreg_test_gap,
                           tree_tuned_test_gap,
                           forest_tuned_gap,
                           full_gbm_gap],

    'Confusion Matrix (TN, FP, FN, TP)'  : [(logreg_tn, logreg_fp, logreg_fn, logreg_tp),
                         (full_tree_tn, full_tree_fp, full_tree_fn, full_tree_tp),
                           (rand_forest_tn, rand_forest_fp, rand_forest_fn, rand_forest_tp),                 
                           (gbm_tuned_tn, gbm_tuned_fp, gbm_tuned_fn, gbm_tuned_tp)]}

                       
# converting model_performance into a DataFrame
model_performance = pd.DataFrame(model_performance)

model_performance

Unnamed: 0,Model Name,AUC Score,Training Accuracy,Testing Accuracy,Train-Test Gap,"Confusion Matrix (TN, FP, FN, TP)"
0,Logistic Regression,0.6261,0.7354,0.729,0.0064,"(53, 103, 29, 302)"
1,Classification Trees,0.732,0.7402,0.7762,0.036,"(75, 81, 91, 240)"
2,Random Forest [FINAL],0.8416,0.865,0.8953,0.0303,"(108, 48, 3, 328)"
3,Gradient Boosted Models,0.7109,0.7855,0.7844,0.0011,"(79, 77, 28, 303)"


**For this project. I will be selecting Random Forest as my best model with AUC of 0.8416**