In [1]:
# to time the program
import time
start_time = time.time()

In [2]:
# importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import statsmodels.formula.api as smf                
from sklearn.metrics import confusion_matrix        
from sklearn.metrics import roc_auc_score            
from sklearn.neighbors import KNeighborsClassifier   
from sklearn.neighbors import KNeighborsRegressor    
from sklearn.preprocessing import StandardScaler     
from sklearn.tree import DecisionTreeClassifier      
from sklearn.tree import export_graphviz            
from six import StringIO          
from IPython.display import Image                    
import pydotplus                                    
from sklearn.model_selection import RandomizedSearchCV    
from sklearn.metrics import make_scorer              
from sklearn.ensemble import RandomForestClassifier    
from sklearn.ensemble import GradientBoostingClassifier

In [3]:
# storing file path as file
file = './Apprentice_Chef_Dataset.xlsx'

# creating dataset using the file path
ac_dataset = pd.read_excel(io = file)

# setting pandas print options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [4]:
# changing all column names to lowercase
ac_dataset.columns = ac_dataset.columns.str.lower()

# # checking information of all columns
# ac_dataset.info()

In [5]:
# # viewing first five rows of dataset
# ac_dataset.head(n = 5)

In [6]:
# Continuous variables
# -------------------------------
# revenue                         - log transform
# avg_prep_vid_time               - log transform, compute: total time on site
# avg_time_per_site_visit         - log transform

# Categorical variables
# -------------------------------
# email                           - one-hot code, drop original
# package_locker                  - make categorical
# refrigerated_locker             - make categorical
# tastes_and_preferences          - make categorical
# mobile_number                   - make categorical

# Count variables
# -------------------------------
# avg_clicks_per_visit            - keep
# total_photos_viewed             - keep
# master_classes_attended         - yes or no?
# median_meal_rating              - keep
# largest_order_size              - change name to avg_order_size 
# pc_logins                       - delete
# mobile_logins                   - delete
# weekly_plan                     - yes or no?
# early_deliveries                - add to late
# late_deliveries                 - add to early 
# cancellations_before_noon       - add to cancellations_after_noon
# cancellations_after_noon        - add to cancellations_before_noon
# total_meals_ordered             - compute : revenue / total meals ordered
# unique_meals_purch              - compute : unique / total meals ordered
# contacts_w_customer_service     - keep
# product_categories_viewed       - keep

In [7]:
# # checking if any columns have missing values 
# ac_dataset.isnull().sum()

In [8]:
def text_split_feature(col, df, sep = ' ', new_col_name = 'NUM_OF_NAMES'):
    """
Splits values in a string Series (as part of a DataFrame) and sums the number
of resulting items. Automatically appends summed column to original DataFrame.

PARAMETERS
----------
col          : column to split
df           : DataFrame where column is located
sep          : string sequence to split by, default ' '
new_col_name : name of new column after summing split, default
               'number_of_names'
"""
    
    df[new_col_name] = 0
    
    
    for index, val in df.iterrows():
        df.loc[index, new_col_name] = len(df.loc[index, col].split(sep = ' '))

In [9]:
# creating new column for number of names each customer has
text_split_feature(col = 'name', df = ac_dataset, 
                   new_col_name = 'number_of_names')

In [10]:
# dropping columns name, first_name, and family_name
ac_dataset = ac_dataset.drop(['name', 'first_name', 'family_name'], axis = 1)

In [11]:
# # checking for any null values
# print(ac_dataset.isnull().any().any())

**For continuous variables**

In [12]:
# To make revenue normally distributed, we do a log transform

# log transformation of revenue and saving as new column
ac_dataset['log_revenue'] = np.log10(ac_dataset['revenue'])

In [13]:
# # setting figure size
# fig, ax = plt.subplots(figsize = (15, 10))

# # ECDF of revenue
# plt.subplot(2, 2, 1)
# x = np.sort(ac_dataset['revenue'])
# y = np.arange(1, len(x) + 1) / len(x)
# _ = plt.plot(x, y, linestyle = '-', label = 'revenue')

# # ECDF of normally distributed array
# x = np.sort(np.random.normal(loc = np.mean(ac_dataset['revenue']), 
#                              scale = np.std(ac_dataset['revenue']),
#                              size = 1000))
# y = np.arange(1, len(x) + 1) / len(x)
# _ = plt.plot(x, y, linestyle = '-', label = 'normal ECDF')

# _ = plt.legend(loc = "lower right")
# _ = plt.xlabel('Revenue')
# _ = plt.ylabel('ECDF')

# plt.margins(0.02) # Keeps data off plot edges plt.show()



# # ECDF of log_revenue
# plt.subplot(2, 2, 2)
# x = np.sort(ac_dataset['log_revenue'])
# y = np.arange(1, len(x) + 1) / len(x)
# _ = plt.plot(x, y, linestyle = '-', label = 'log_revenue')

# # ECDF of normally distributed array
# x = np.sort(np.random.normal(loc = np.mean(ac_dataset['log_revenue']), 
#                              scale = np.std(ac_dataset['log_revenue']),
#                              size = 1000))
# y = np.arange(1, len(x) + 1) / len(x)
# _ = plt.plot(x, y, linestyle = '-', label = 'normal ECDF')

# _ = plt.legend(loc = "lower right")
# _ = plt.xlabel('Log Revenue')
# _ = plt.ylabel('ECDF')

# # adding vertical lines to mark trend changes
# _ = plt.axvline(x = 2.85)
# _ = plt.axvline(x = 3.15)
# _ = plt.axvline(x = 3.40)

# plt.margins(0.02) # Keeps data off plot edges plt.show()

# # cleaning up the layout, saving the figures, and displaying the results
# _ = plt.tight_layout()
# _ = plt.show

In [14]:
ac_dataset['log_revenue_one'] = 0
ac_dataset['log_revenue_two'] = 0
ac_dataset['log_revenue_three'] = 0
for index, value in ac_dataset.iterrows():
    if ac_dataset.loc[index, 'log_revenue'] >= 3.40:
        ac_dataset.loc[index, 'log_revenue_three'] = 1
    elif ac_dataset.loc[index, 'log_revenue'] >= 3.15:
        ac_dataset.loc[index, 'log_revenue_two'] = 1
    elif ac_dataset.loc[index, 'log_revenue'] >= 2.85:
        ac_dataset.loc[index, 'log_revenue_one'] = 1
    else:
        continue

In [15]:
# To make avg_prep_vid_time normally distributed, we do a log transform

# log transformation of avg_prep_vid_time and saving as new column
ac_dataset['log_avg_prep_vid_time'] = np.log10(ac_dataset['avg_prep_vid_time'])

In [16]:
# # setting figure size
# fig, ax = plt.subplots(figsize = (15, 10))

# # ECDF of avg_prep_vid_time
# plt.subplot(2, 2, 1)
# x = np.sort(ac_dataset['avg_prep_vid_time'])
# y = np.arange(1, len(x) + 1) / len(x)
# _ = plt.plot(x, y, linestyle = '-', label = 'avg_prep_vid_time')

# # ECDF of normally distributed array
# x = np.sort(np.random.normal(loc = np.mean(ac_dataset['avg_prep_vid_time']), 
#                              scale = np.std(ac_dataset['avg_prep_vid_time']),
#                              size = 1000))
# y = np.arange(1, len(x) + 1) / len(x)
# _ = plt.plot(x, y, linestyle = '-', label = 'normal ECDF')

# _ = plt.legend(loc = "lower right")
# _ = plt.xlabel('Avg. Time Video Played (Seconds)')
# _ = plt.ylabel('ECDF')



# # ECDF of log_avg_prep_vid_time
# plt.subplot(2, 2, 2)
# x = np.sort(ac_dataset['log_avg_prep_vid_time'])
# y = np.arange(1, len(x) + 1) / len(x)
# _ = plt.plot(x, y, linestyle = '-', label = 'log_avg_prep_vid_time')

# # ECDF of normally distributed array
# x = np.sort(np.random.normal(loc = np.mean(ac_dataset['log_avg_prep_vid_time']), 
#                              scale = np.std(ac_dataset['log_avg_prep_vid_time']),
#                              size = 1000))
# y = np.arange(1, len(x) + 1) / len(x)
# _ = plt.plot(x, y, linestyle = '-', label = 'normal ECDF')

# _ = plt.legend(loc = "lower right")
# _ = plt.xlabel('Log Avg. Time Video Played (Seconds)')
# _ = plt.ylabel('ECDF')

# plt.margins(0.02) # Keeps data off plot edges plt.show()

# # adding vertical lines to mark trend changes
# _ = plt.axvline(x = 1.77)
# _ = plt.axvline(x = 2.55)


# plt.margins(0.02) # Keeps data off plot edges plt.show()

# # cleaning up the layout, saving the figures, and displaying the results
# _ = plt.tight_layout()
# _ = plt.show

In [17]:
ac_dataset['vid_prep_one'] = 0
ac_dataset['vid_prep_two'] = 0
for index, value in ac_dataset.iterrows():
    if ac_dataset.loc[index, 'log_avg_prep_vid_time'] >= 2.60:
        ac_dataset.loc[index, 'vid_prep_one'] = 1
    elif ac_dataset.loc[index, 'log_avg_prep_vid_time'] >= 1.75:
        ac_dataset.loc[index, 'vid_prep_two'] = 1
    else:
        continue

In [18]:
# To make avg_time_per_site_visit normally distributed, we do a log transform

# log transformation of avg_time_per_site_visit and saving as new column
ac_dataset['log_avg_time_per_site_visit'] = np.log10(ac_dataset['avg_time_per_site_visit'])

In [19]:
# # setting figure size
# fig, ax = plt.subplots(figsize = (15, 10))

# # ECDF of avg_time_per_site_visit
# plt.subplot(2, 2, 1)
# x = np.sort(ac_dataset['avg_time_per_site_visit'])
# y = np.arange(1, len(x) + 1) / len(x)
# _ = plt.plot(x, y, linestyle = '-', label = 'avg_time_per_site_visit')

# # ECDF of normally distributed array
# x = np.sort(np.random.normal(loc = np.mean(ac_dataset['avg_time_per_site_visit']), 
#                              scale = np.std(ac_dataset['avg_time_per_site_visit']),
#                              size = 1000))
# y = np.arange(1, len(x) + 1) / len(x)
# _ = plt.plot(x, y, linestyle = '-', label = 'normal ECDF')

# _ = plt.legend(loc = "lower right")
# _ = plt.xlabel('Avg. Time Customer Spent per Web/Mobile Visit')
# _ = plt.ylabel('ECDF')

# plt.margins(0.02) # Keeps data off plot edges plt.show()


# # ECDF of log_avg_time_per_site_visit
# plt.subplot(2, 2, 2)
# x = np.sort(ac_dataset['log_avg_time_per_site_visit'])
# y = np.arange(1, len(x) + 1) / len(x)
# _ = plt.plot(x, y, linestyle = '-', label = 'log_avg_time_per_site_visit')

# # ECDF of normally distributed array
# x = np.sort(np.random.normal(loc = np.mean(ac_dataset['log_avg_time_per_site_visit']), 
#                              scale = np.std(ac_dataset['log_avg_time_per_site_visit']),
#                              size = 1000))
# y = np.arange(1, len(x) + 1) / len(x)
# _ = plt.plot(x, y, linestyle = '-', label = 'normal ECDF')


# # adding vertical lines to mark trend changes
# _ = plt.axvline(x = 1.75)
# _ = plt.axvline(x = 2.00)
# _ = plt.axvline(x = 2.50)

# _ = plt.legend(loc = "lower right")
# _ = plt.xlabel('Log Avg. Time Customer Spent per Web/Mobile Visit')
# _ = plt.ylabel('ECDF')

# plt.margins(0.02) # Keeps data off plot edges plt.show()

# # cleaning up the layout, saving the figures, and displaying the results
# _ = plt.tight_layout()
# _ = plt.show

In [20]:
ac_dataset['visit_time_one'] = 0
ac_dataset['visit_time_two'] = 0
ac_dataset['visit_time_three'] = 0
for index, value in ac_dataset.iterrows():
    if ac_dataset.loc[index, 'log_avg_prep_vid_time'] >= 2.50:
        ac_dataset.loc[index, 'visit_time_three'] = 1
    elif ac_dataset.loc[index, 'log_avg_prep_vid_time'] >= 2.00:
        ac_dataset.loc[index, 'visit_time_two'] = 1
    elif ac_dataset.loc[index, 'log_avg_prep_vid_time'] >= 1.75:
        ac_dataset.loc[index, 'visit_time_one'] = 1
    else:
        continue

In [21]:
# engineering feature for total time spent on website
ac_dataset['total_avg_time_on_site'] = ac_dataset['avg_prep_vid_time'] +\
                                   ac_dataset['avg_time_per_site_visit']

In [22]:
# To make total_time_on_site normally distributed, we do a log transform

# log transformation of total_avg_time_on_site and saving as new column
ac_dataset['log_total_avg_time_on_site'] = np.log10(ac_dataset['total_avg_time_on_site'])

In [23]:
# # setting figure size
# fig, ax = plt.subplots(figsize = (15, 10))

# # ECDF of avg_time_per_site_visit
# plt.subplot(2, 2, 1)
# x = np.sort(ac_dataset['total_avg_time_on_site'])
# y = np.arange(1, len(x) + 1) / len(x)
# _ = plt.plot(x, y, linestyle = '-', label = 'total_avg_time_on_site')

# # ECDF of normally distributed array
# x = np.sort(np.random.normal(loc = np.mean(ac_dataset['total_avg_time_on_site']), 
#                              scale = np.std(ac_dataset['total_avg_time_on_site']),
#                              size = 1000))
# y = np.arange(1, len(x) + 1) / len(x)
# _ = plt.plot(x, y, linestyle = '-', label = 'normal ECDF')

# _ = plt.legend(loc = "lower right")
# _ = plt.xlabel('Total Avg. Time Customer Spent on Site')
# _ = plt.ylabel('ECDF')


# # ECDF of log_avg_time_per_site_visit
# plt.subplot(2, 2, 2)
# x = np.sort(ac_dataset['log_total_avg_time_on_site'])
# y = np.arange(1, len(x) + 1) / len(x)
# _ = plt.plot(x, y, linestyle = '-', label = 'log_total_avg_time_on_site')

# # ECDF of normally distributed array
# x = np.sort(np.random.normal(loc = np.mean(ac_dataset['log_total_avg_time_on_site']), 
#                              scale = np.std(ac_dataset['log_total_avg_time_on_site']),
#                              size = 1000))
# y = np.arange(1, len(x) + 1) / len(x)
# _ = plt.plot(x, y, linestyle = '-', label = 'normal ECDF')


# # adding vertical lines to mark trend changes
# _ = plt.axvline(x = 2.80)

# _ = plt.legend(loc = "lower right")
# _ = plt.xlabel('Log Total Avg. Time Customer Spent on Site')
# _ = plt.ylabel('ECDF')

# plt.margins(0.02) # Keeps data off plot edges plt.show()


# # cleaning up the layout, saving the figures, and displaying the results
# _ = plt.tight_layout()
# _ = plt.show

In [24]:
ac_dataset['time_on_site_one'] = 0
for index, value in ac_dataset.iterrows():
    if ac_dataset.loc[index, 'log_avg_prep_vid_time'] >= 2.50:
        ac_dataset.loc[index, 'time_on_site_one'] = 1

**For Categorical Variables**

In [25]:
# the marketing team is very adamant about email classification so emails need
# to be classsified as requested

# creating lists for different categories
prof_email = ['mmm', 'amex', 'apple', 'boeing', 'caterpillar', 'chevron',
             'cisco', 'cocacola', 'disney', 'dupont', 'exxon', 'ge', 'walmart',
             'goldmansacs', 'homedepot', 'ibm', 'intel', 'jnj', 'jpmorgan',
             'mcdonalds', 'merck', 'microsoft', 'nike', 'pfizer', 'pg',
             'travelers', 'unitedtech', 'unitedhealth', 'verizon', 'visa']
per_email = ['gmail', 'yahoo', 'protonmail']
junk_email = ['me', 'aol', 'hotmail', 'live', 'msn', 'passport']

# creating a column email_category with just zeros
ac_dataset['email_category'] = '0'

# for loop to check the domain name and classify it based on grouping from
# marketing team
for index, email in ac_dataset[['email']].iterrows():
    domain_name = re.findall('@+\S+[.com|.org]', email[0])[0]
    for pattern in ['@', '.com', '.org']:
        domain_name = domain_name.replace(pattern, '')
    if domain_name in prof_email:
        ac_dataset.loc[index, 'email_category'] = 'Professional'
    elif domain_name in per_email:
        ac_dataset.loc[index, 'email_category'] = 'Personal'
    elif domain_name in junk_email:
        ac_dataset.loc[index, 'email_category'] = 'Junk'
    else:
        ac_dataset.loc[index, 'email_category'] = 'Undefined'

In [26]:
# ac_dataset['email_category'].value_counts()

In [27]:
# one hot encoding email_category
one_hot_email = pd.get_dummies(ac_dataset['email_category'], 
                               prefix = 'email')

# joining codings together
ac_dataset = ac_dataset.join(one_hot_email)

# changing all column names to lowercase
ac_dataset.columns = ac_dataset.columns.str.lower()

In [28]:
ac_dataset['domain_name'] = 0

for index, email in ac_dataset[['email']].iterrows():
    domain_name = re.findall('@+\S+[.com|.org]', email[0])[0]
    for pattern in ['@', '.com', '.org']:
        domain_name = domain_name.replace(pattern, '')
    ac_dataset.loc[index, 'domain_name'] = domain_name

In [29]:
# a list of all email domains highly correlated with cross_sell_success
email_list_one = ['aol', 'hotmail', 'msn', 'live']
email_list_two = ['microsoft', 'merck', 'jpmorgan', 'pg', 'amex', 'intel', 
                  'passport', 'me', 'caterpillar', 'unitedhealth']

ac_dataset['email_categories'] = 0

for index, domain in ac_dataset[['domain_name']].iterrows():
    if domain[0] in email_list_one:
        ac_dataset.loc[index, 'email_categories'] = 'group_one'
    elif domain[0] in email_list_two:
        ac_dataset.loc[index, 'email_categories'] = 'group_two'
    else:
        ac_dataset.loc[index, 'email_categories'] = 'group_three'

In [30]:
# one hot encoding email_category
one_hot_email_groups = pd.get_dummies(ac_dataset['email_categories'], 
                               prefix = 'email')

# joining codings together
ac_dataset = ac_dataset.join(one_hot_email_groups)

In [31]:
# # dropping categorical variable email_category & email_Personal after encoding
# ac_dataset = ac_dataset.drop(['email_category', 'email_personal', 'email'],
#                              axis = 1)


ac_dataset = ac_dataset.drop(['email_category', 'email_personal', 'email',
                              'email_categories', 'email_group_three', 
                              'domain_name'],
                             axis = 1)

In [32]:
# ac_dataset.columns

**For Count Variables**

In [33]:
# variables to split into 0s and 1s
variables_to_zero = ['master_classes_attended', 'weekly_plan']

# for loop to create column with 0s and 1s
for variable in variables_to_zero:
    ac_dataset['has_' + variable] = 0
    for index, value in ac_dataset.iterrows():
        if ac_dataset.loc[index, (variable)] > 0:
            ac_dataset.loc[index, ('has_' + variable)] = 1

In [34]:
# the dataset documentation has informed us that a column was mislabeled
# the column largest_order_size is meant to be average number of meals ordered

# changing the column name
ac_dataset = ac_dataset.rename(columns = {'largest_order_size': 'avg_order_size'})

In [35]:
# creating new feature for total logins 
ac_dataset['total_logins'] = ac_dataset['pc_logins'] + ac_dataset['mobile_logins']

In [36]:
# if early_delivries and late_deliveries both equal to zero, then customer
# got deliveries on-time every time 

# creating empty column
ac_dataset['delivery_time'] = 0

# for loop to create column with delivery description
for index, value in ac_dataset.iterrows():
        #if early and late equals to zero
        if ac_dataset.loc[index, 'early_deliveries'] == 0\
        and ac_dataset.loc[index, 'late_deliveries'] == 0:
            ac_dataset.loc[index, 'delivery_time'] = 'on_time'
        #if early and late greater than zero
        elif ac_dataset.loc[index, 'early_deliveries'] > 0\
        and ac_dataset.loc[index, 'late_deliveries'] > 0: 
            ac_dataset.loc[index, 'delivery_time'] = 'mixed'
        #if early greater than zero
        elif ac_dataset.loc[index, 'early_deliveries'] > 0: 
            ac_dataset.loc[index, 'delivery_time'] = 'early' 
        #if late greater than zero
        elif ac_dataset.loc[index, 'late_deliveries'] > 0: 
            ac_dataset.loc[index, 'delivery_time'] = 'late'
        # for handling errors
        else:
            ac_dataset.loc[index, 'delivery_time'] = 'unknown' 

In [37]:
# one hot encoding delivery_time
one_hot_email = pd.get_dummies(ac_dataset['delivery_time'], 
                               prefix = 'delivery_time')

# joining codings togetherz
ac_dataset = ac_dataset.join(one_hot_email)

# changing all column names to lowercase
ac_dataset.columns = ac_dataset.columns.str.lower()

In [38]:
# dropping categorical variables after encoding
ac_dataset = ac_dataset.drop(['delivery_time', 'delivery_time_on_time'],
                             axis = 1)

In [39]:
# if cancellations_before_noon and cancellations_after_noon both equal 
# to zero, then customer never cancelled 

# creating empty column
ac_dataset['cancellations'] = 0

# for loop to create column with cancellation description
for index, value in ac_dataset.iterrows():
        #if before_noon and after_noon equals to zero
        if ac_dataset.loc[index, 'cancellations_before_noon'] == 0\
        and ac_dataset.loc[index, 'cancellations_after_noon'] == 0:
            ac_dataset.loc[index, 'cancellations'] = 'none'
        #if before_noon greater than zero
        elif ac_dataset.loc[index, 'cancellations_before_noon'] > 0: 
            ac_dataset.loc[index, 'cancellations'] = 'before_noon' 
        #if after_noon greater than zero
        elif ac_dataset.loc[index, 'cancellations_after_noon'] > 0: 
            ac_dataset.loc[index, 'cancellations'] = 'after_noon'
        # for handling errors
        else:
            ac_dataset.loc[index, 'cancellations'] = 'unknown' 

In [40]:
# one hot encoding cancellations
one_hot_email = pd.get_dummies(ac_dataset['cancellations'], 
                               prefix = 'any_cancellations')

# joining codings together
ac_dataset = ac_dataset.join(one_hot_email)

# changing all column names to lowercase
ac_dataset.columns = ac_dataset.columns.str.lower()

In [41]:
# dropping categorical variables after encoding
ac_dataset = ac_dataset.drop(['cancellations', 'any_cancellations_none'],
                             axis = 1)

In [42]:
# total revenue from each customer divided by total meals ordered should
# give the revenue per meal

# creating empty column
ac_dataset['revenue_per_meal'] = ac_dataset['revenue'] / \
                                 ac_dataset['total_meals_ordered']

In [43]:
# total unique meals purchased by each customer divided by total meals ordered 
# should give a ratio of unique meals to total orders

# creating empty column
ac_dataset['unique_meal_ratio'] = ac_dataset['unique_meals_purch'] / \
                                 ac_dataset['total_meals_ordered']

In [44]:
# To make revenue_per_meal normally distributed, we do a log transform

# log transformation of revenue_per_meal and saving as new column
ac_dataset['log_revenue_per_meal'] = np.log10(ac_dataset['revenue_per_meal'])

In [45]:
# # setting figure size
# fig, ax = plt.subplots(figsize = (15, 10))

# # ECDF of revenue_per_meal
# plt.subplot(2, 2, 1)
# x = np.sort(ac_dataset['revenue_per_meal'])
# y = np.arange(1, len(x) + 1) / len(x)
# _ = plt.plot(x, y, linestyle = '-', label = 'revenue_per_meal')

# # ECDF of normally distributed array
# x = np.sort(np.random.normal(loc = np.mean(ac_dataset['revenue_per_meal']), 
#                              scale = np.std(ac_dataset['revenue_per_meal']),
#                              size = 1000))
# y = np.arange(1, len(x) + 1) / len(x)
# _ = plt.plot(x, y, linestyle = '-', label = 'normal ECDF')

# _ = plt.legend(loc = "lower right")
# _ = plt.xlabel('Revenue per Meal')
# _ = plt.ylabel('ECDF')


# # ECDF of log_revenue_per_meal
# plt.subplot(2, 2, 2)
# x = np.sort(ac_dataset['log_revenue_per_meal'])
# y = np.arange(1, len(x) + 1) / len(x)
# _ = plt.plot(x, y, linestyle = '-', label = 'log_revenue_per_meal')

# # ECDF of normally distributed array
# x = np.sort(np.random.normal(loc = np.mean(ac_dataset['log_revenue_per_meal']), 
#                              scale = np.std(ac_dataset['log_revenue_per_meal']),
#                              size = 1000))
# y = np.arange(1, len(x) + 1) / len(x)
# _ = plt.plot(x, y, linestyle = '-', label = 'normal ECDF')

# _ = plt.legend(loc = "lower right")
# _ = plt.xlabel('Log Revenue per Meal')
# _ = plt.ylabel('ECDF')

# plt.margins(0.02) # Keeps data off plot edges plt.show()

# # cleaning up the layout, saving the figures, and displaying the results
# _ = plt.tight_layout()
# _ = plt.show

In [46]:
# To make revenue_per_meal normally distributed, we do a log transform

# log transformation of revenue_per_meal and saving as new column
ac_dataset['log_unique_meal_ratio'] = np.log10(ac_dataset['unique_meal_ratio'])

In [47]:
# # setting figure size
# fig, ax = plt.subplots(figsize = (15, 10))

# # ECDF of unique_meal_ratio
# plt.subplot(2, 2, 1)
# x = np.sort(ac_dataset['unique_meal_ratio'])
# y = np.arange(1, len(x) + 1) / len(x)
# _ = plt.plot(x, y, linestyle = '-', label = 'unique_meal_ratio')

# # ECDF of normally distributed array
# x = np.sort(np.random.normal(loc = np.mean(ac_dataset['unique_meal_ratio']), 
#                              scale = np.std(ac_dataset['unique_meal_ratio']),
#                              size = 1000))
# y = np.arange(1, len(x) + 1) / len(x)
# _ = plt.plot(x, y, linestyle = '-', label = 'normal ECDF')

# _ = plt.legend(loc = "lower right")
# _ = plt.xlabel('Unique Meal Ratio')
# _ = plt.ylabel('ECDF')


# # ECDF of log_unique_meal_ratio
# plt.subplot(2, 2, 2)
# x = np.sort(ac_dataset['log_unique_meal_ratio'])
# y = np.arange(1, len(x) + 1) / len(x)
# _ = plt.plot(x, y, linestyle = '-', label = 'log_unique_meal_ratio')

# # ECDF of normally distributed array
# x = np.sort(np.random.normal(loc = np.mean(ac_dataset['log_unique_meal_ratio']), 
#                              scale = np.std(ac_dataset['log_unique_meal_ratio']),
#                              size = 1000))
# y = np.arange(1, len(x) + 1) / len(x)
# _ = plt.plot(x, y, linestyle = '-', label = 'normal ECDF')

# _ = plt.legend(loc = "lower right")
# _ = plt.xlabel('Log Unique Meal Ratio')
# _ = plt.ylabel('ECDF')

# plt.margins(0.02) # Keeps data off plot edges plt.show()

# # cleaning up the layout, saving the figures, and displaying the results
# _ = plt.tight_layout()
# _ = plt.show

In [48]:
# ac_dataset.columns

In [49]:
ac_dataset['procastinator'] = 0 # customers that have junk mail and spend more time on site
ac_dataset['working'] = 0 # custoemrs that have pro email and spend less time on site
ac_dataset['active_pc_user'] = 0 # more than mean time on site and have low mobile logins
ac_dataset['common_user'] = 0 # part of 75% that pays
ac_dataset['weekend_fighter'] = 0 # orders less than usual client but pays more

for index, value in ac_dataset.iterrows():
    
    if ac_dataset.loc[index, 'email_junk'] == 1 and \
    ac_dataset.loc[index, 'avg_time_per_site_visit'] >= 99.6:
        ac_dataset.loc[index, 'procastinator'] = 1
        
    if ac_dataset.loc[index, 'email_professional'] == 1 and \
    ac_dataset.loc[index, 'avg_time_per_site_visit'] <= 99.6:
        ac_dataset.loc[index, 'working'] = 1
        
    if ac_dataset.loc[index, 'pc_logins'] > 5 and \
    ac_dataset.loc[index, 'avg_time_per_site_visit'] >= 99.6 and \
    ac_dataset.loc[index, 'mobile_logins']:
        ac_dataset.loc[index, 'active_pc_user'] = 1
    
    if ac_dataset.loc[index, 'revenue_per_meal'] >= 25:
        ac_dataset.loc[index, 'common_user'] = 1
        
    if ac_dataset.loc[index, 'total_meals_ordered'] <= 60 and \
    ac_dataset.loc[index, 'revenue_per_meal'] > 34:
        ac_dataset.loc[index, 'weekend_fighter'] =1

In [50]:
# dropping all unused continuous features
ac_dataset = ac_dataset.drop(['revenue', 
                              'unique_meal_ratio',
                              'revenue_per_meal', 
                              'avg_prep_vid_time', 
                              'avg_time_per_site_visit',
                              'total_avg_time_on_site'], axis = 1)

In [51]:
# creating variable for count varibale columns
count_variables = ['avg_clicks_per_visit', 'median_meal_rating', 
                   'pc_logins', 'unique_meals_purch', 
                   'contacts_w_customer_service', 'product_categories_viewed']


# performing a log transform on all count variables 
for variable in count_variables:
    ac_dataset['log_' + variable] = np.log10(ac_dataset[variable])

In [52]:
# dropping all unused continuous features
ac_dataset = ac_dataset.drop(count_variables, axis = 1)

In [53]:
# variables to split into 0s and 1s
variables_to_zero = ['cancellations_before_noon', 'cancellations_after_noon',
                     'early_deliveries', 'late_deliveries']

# for loop to create column with 0s and 1s
for variable in variables_to_zero:
    ac_dataset['has_' + variable] = 0
    for index, value in ac_dataset.iterrows():
        if ac_dataset.loc[index, (variable)] > 0:
            ac_dataset.loc[index, ('has_' + variable)] = 1

In [54]:
# ac_dataset.columns

In [55]:
# df_corr = ac_dataset.corr(method = 'pearson').round(2)

# df_corr['cross_sell_success'].sort_values(ascending = False)

In [56]:
# # correlation heatmap

# # setting figure size
# fig, ax = plt.subplots(figsize=(50, 50))

# # visualizing the correlation matrix
# sns.heatmap(df_corr,
#             cmap = 'coolwarm',
#             square = True,
#             annot = True,
#             linecolor = 'black',
#             linewidths = 0.5,
#             cbar = False)

# # saving and displaying the correlation matrix
# plt.tight_layout()
# plt.show()

In [57]:
# # declaring explanatory variables
# ac_data = ac_dataset.drop('cross_sell_success', axis = 1)

# # declaring response variable
# ac_target = ac_dataset.loc[ : , 'cross_sell_success']

In [58]:
# # train-test split with stratification
# X_train, X_test, y_train, y_test = train_test_split(
#             ac_data,
#             ac_target,
#             test_size    = 0.25,
#             random_state = 219,
#             stratify     = ac_target)

# # merging training data for statsmodels
# ac_train = pd.concat([X_train, y_train], axis = 1)

In [59]:
# for val in ac_data:
#     print(f" {val} + ")

In [60]:
# # instantiating a logistic regression model object
# logistic_full = smf.logit(formula = """  cross_sell_success ~ 
#                                                  email_group_one +
#                                                  email_professional +
#                                                  number_of_names +
#                                                  any_cancellations_before_noon +
#                                                  cancellations_after_noon +
#                                                  procastinator
#                                                  """,
#                                          data    = ac_train)

# # fitting the model object
# results_full = logistic_full.fit()

# # checking the results SUMMARY
# results_full.summary()

In [61]:
# creating a dictionary to store candidate models

candidate_dict = { 

 # full model (set 1)
 'logit_full'   : ['email_professional', 'cancellations_before_noon',
                   'has_cancellations_before_noon', 'mobile_number',
                   'tastes_and_preferences', 'refrigerated_locker',
                   'log_pc_logins', 'log_revenue_two', 'has_master_classes_attended',
                   'log_contacts_w_customer_service', 'package_locker',
                   'has_cancellations_after_noon', 'mobile_logins',
                   'log_revenue_one', 'visit_time_one', 'log_avg_clicks_per_visit',
                   'log_revenue_per_meal', 'has_weekly_plan', 'has_early_deliveries',
                   'weekly_plan', 'log_revenue_three', 'log_unique_meal_ratio',
                   'avg_order_size', 'log_avg_time_per_site_visit',
                   'visit_time_three', 'vid_prep_two', 'log_median_meal_rating',
                   'log_unique_meals_purch', 'vid_prep_one', 'has_late_deliveries',
                   'total_logins', 'total_photos_viewed', 'total_meals_ordered',
                   'log_product_categories_viewed', 'late_deliveries',
                   'early_deliveries', 'visit_time_two', 'log_total_avg_time_on_site',
                   'procastinator', 'working', 'active_pc_user', 'common_user', 
                   'weekend_fighter', 'email_group_one', 'number_of_names'], 

 # full model (set 2)                  
 'logit_full_2'   : ['email_professional', 'cancellations_before_noon',
                   'has_cancellations_before_noon', 'mobile_number',
                   'tastes_and_preferences', 'refrigerated_locker',
                   'log_pc_logins', 'log_revenue_two', 'master_classes_attended',                   
                   'log_contacts_w_customer_service', 'package_locker',
                   'cancellations_after_noon', 'email_junk', 'mobile_logins', 
                   'log_revenue_one', 'visit_time_one', 'log_avg_clicks_per_visit',
                   'has_weekly_plan', 'has_early_deliveries',  
                   'weekly_plan', 'log_revenue', 'log_unique_meal_ratio',
                   'total_logins', 'log_avg_prep_vid_time',  
                   'time_on_site_one', 'vid_prep_two', 'log_median_meal_rating',
                   'log_unique_meals_purch', 'vid_prep_one', 'has_late_deliveries',
                   'total_logins', 'total_photos_viewed', 'total_meals_ordered',
                   'log_product_categories_viewed', 'late_deliveries',
                   'early_deliveries', 'visit_time_two', 'log_total_avg_time_on_site',
                   'procastinator', 'working', 'active_pc_user', 'common_user', 
                   'weekend_fighter', 'email_group_one', 'number_of_names'],
                      
 # significant variables only (set 1)
 'logit_sig_1'    : ['total_meals_ordered' , 'log_contacts_w_customer_service',
                   'mobile_number', 'tastes_and_preferences', 'email_junk',
                   'has_master_classes_attended', 'any_cancellations_before_noon',
                   'log_revenue_per_meal', 'email_professional', 
                   'refrigerated_locker'],
    
 # significant variables only (set 2)
 'logit_sig_2'  : ['total_meals_ordered', 'log_unique_meals_purch', 'email_junk',
                   'email_professional', 'log_unique_meal_ratio', 'mobile_number',
                   'log_contacts_w_customer_service', 'tastes_and_preferences',
                   'refrigerated_locker', 'log_revenue', 
                   'log_total_avg_time_on_site', 'has_master_classes_attended',
                   'any_cancellations_before_noon', 'number_of_names'],
    
 # significant variables only (set 3)
 'logit_sig_3'  : ['total_meals_ordered', 'mobile_number', 'refrigerated_locker',
                   'log_contacts_w_customer_service', 'mobile_logins', 'vid_prep_two',
                   'has_master_classes_attended', 'tastes_and_preferences',
                   'time_on_site_one', 'email_junk', 'email_professional',
                   'log_total_avg_time_on_site', 'has_master_classes_attended',
                   'total_logins', 'any_cancellations_before_noon',
                   'log_revenue_per_meal'],  
    
 # significant variables only (set 4)   
 'logit_sig_4'  : ['email_junk', 'email_professional', 'any_cancellations_before_noon',
                   'log_total_avg_time_on_site', 'log_avg_time_per_site_visit',
                   'log_revenue_per_meal', 'log_unique_meal_ratio', 'log_revenue',
                   'total_meals_ordered'],
    
 # significant variables only (set 5)   
 'logit_sig_5'  : ['total_meals_ordered', 'mobile_number', 'tastes_and_preferences',
                   'refrigerated_locker', 'log_revenue_one', 'vid_prep_two',
                   'visit_time_three', 'email_junk', 'email_professional',
                   'total_logins', 'any_cancellations_before_noon',
                   'log_revenue_per_meal', 'log_pc_logins',
                   'log_contacts_w_customer_service'],  

 # significant variables only (set 6)   
 'logit_sig_6'  : ['total_meals_ordered', 'log_contacts_w_customer_service',
                   'mobile_number', 'early_deliveries', 'cancellations_before_noon',
                   'tastes_and_preferences', 'mobile_logins', 'log_avg_prep_vid_time',
                   'log_pc_logins', 'has_early_deliveries', 'email_junk',
                   'email_professional', 'log_product_categories_viewed',
                   'log_avg_time_per_site_visit'],

 # significant variables only (set 7)   
 'logit_sig_7'  : ['email_junk', 'email_professional', 'mobile_number',
                   'early_deliveries', 'log_pc_logins', 'refrigerated_locker',
                   'cancellations_before_noon', 'tastes_and_preferences',
                   'has_master_classes_attended'],
  
 # significant variables only (set 8)    
 'logit_sig_8'  : ['log_contacts_w_customer_service', 'mobile_number',  
                   'cancellations_before_noon', 'email_professional',
                   'tastes_and_preferences', 'mobile_logins', 'weekly_plan',
                   'log_avg_prep_vid_time', 'email_junk', 'log_product_categories_viewed',
                   'has_master_classes_attended', 'log_pc_logins', 'procastinator',
                   'working', 'active_pc_user', 'common_user', 'weekend_fighter'],
    
 # significant variables only (set 9)    
 'logit_sig_9'  : ['log_contacts_w_customer_service', 'mobile_number',  
                   'cancellations_before_noon', 'email_professional',
                   'tastes_and_preferences', 'mobile_logins', 'weekly_plan',
                   'log_avg_prep_vid_time', 'email_group_one', 'log_product_categories_viewed',
                   'has_master_classes_attended', 'log_pc_logins', 'procastinator',
                   'working', 'active_pc_user', 'common_user', 'weekend_fighter', 
                   'number_of_names'],
    
 # significant variables only (set 10)   
 'logit_sig_10'  : ['total_meals_ordered', 'log_contacts_w_customer_service',
                   'mobile_number', 'cancellations_before_noon',
                   'tastes_and_preferences', 'mobile_logins', 'weekly_plan',
                   'log_avg_prep_vid_time', 'has_master_classes_attended',
                   'log_pc_logins', 'email_junk', 'procastinator',
                   'email_professional', 'log_product_categories_viewed',
                   'working', 'active_pc_user', 'common_user',
                   'weekend_fighter', 'number_of_names']

}                 

**Logistic Regression**

In [62]:
# train/test split with the full model
ac_data   =  ac_dataset.loc[ : , candidate_dict['logit_full_2']]
ac_target =  ac_dataset.loc[ : , 'cross_sell_success']


# this is the exact code we were using before
X_train, X_test, y_train, y_test = train_test_split(
            ac_data,
            ac_target,
            random_state = 219,
            test_size    = 0.25,
            stratify     = ac_target)

In [63]:
# ########################################
# # RandomizedSearchCV
# ########################################

# # declaring a hyperparameter space
# C_space          = np.arange(3.0, 10.0, 0.5)
# warm_start_space = [True, False]
# solver_space     = ['newton-cg', 'sag', 'lbfgs']
# tol_space        = np.arange(0.001, 0.01, 0.02)
# multi_class_space = ['auto', 'ovr', 'multinomial']


# # creating a hyperparameter grid
# param_grid = {'C'            : C_space,
#               'warm_start'   : warm_start_space,
#               'solver'       : solver_space,
#               'tol'          : tol_space,
#               'multi_class'  : multi_class_space}

# # INSTANTIATING the model object without hyperparameters
# lr_tuned = LogisticRegression(random_state = 219,
#                               max_iter     = 8000)

# # GridSearchCV object
# lr_tuned_cv = RandomizedSearchCV(estimator           = lr_tuned,   # the model object
#                                  param_distributions = param_grid, # parameters to tune
#                                  cv                  = 3,          # how many folds in cross-validation
#                                  n_iter              = 250,        # number of combinations of hyperparameters to try
#                                  random_state        = 219,        # starting point for random sequence
#                                  scoring = make_scorer(
#                                            roc_auc_score,
#                                            needs_threshold = False)) # scoring criteria (AUC)

# # FITTING to the FULL DATASET (due to cross-validation)
# lr_tuned_cv.fit(ac_data, ac_target)

# # PREDICT step is not needed

# # printing the optimal parameters and best score
# print("Tuned Parameters  :", lr_tuned_cv.best_params_)
# print("Tuned CV AUC      :", lr_tuned_cv.best_score_.round(4))

In [64]:
# INSTANTIATING a logistic regression model
logreg = LogisticRegression(C = 4.5, max_iter = 8000, random_state = 219,
                            tol = 0.001, warm_start = True)

# FITTING the training data
logreg_fit = logreg.fit(ac_data, ac_target)

# PREDICTING based on the testing set
logreg_pred = logreg_fit.predict(X_test)

# SCORING the results
print('LogReg Training ACCURACY:', logreg_fit.score(X_train, y_train).round(4))
print('LogReg Testing  ACCURACY:', logreg_fit.score(X_test, y_test).round(4))

# saving scoring data for future use
logreg_train_score = logreg_fit.score(X_train, y_train).round(4) # accuracy
logreg_test_score  = logreg_fit.score(X_test, y_test).round(4)   # accuracy

# displaying and saving the gap between training and testing
print('LogReg Train-Test Gap   :', abs(logreg_train_score - logreg_test_score).round(4))
logreg_test_gap = abs(logreg_train_score - logreg_test_score).round(4)


# unpacking the confusion matrix
logreg_tn, \
logreg_fp, \
logreg_fn, \
logreg_tp = confusion_matrix(y_true = y_test, y_pred = logreg_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {logreg_tn}
False Positives: {logreg_fp}
False Negatives: {logreg_fn}
True Positives : {logreg_tp}
""")

# area under the roc curve (auc)
print(f'''AUC Score: {roc_auc_score(y_true  = y_test,
                    y_score = logreg_pred).round(decimals = 4)}''')

# saving AUC score for future use
logreg_auc_score = roc_auc_score(y_true  = y_test,
                                 y_score = logreg_pred).round(decimals = 4)

LogReg Training ACCURACY: 0.7539
LogReg Testing  ACCURACY: 0.7577
LogReg Train-Test Gap   : 0.0038

True Negatives : 71
False Positives: 85
False Negatives: 33
True Positives : 298

AUC Score: 0.6777


In [65]:
# Full One 66%
# LogisticRegression(C=8.0, max_iter=8000, multi_class='multinomial',
#                    random_state=219, tol=0.001, warm_start=True)

# Full Two 67.77%
# LogisticRegression(C=4.5, max_iter=8000, random_state=219, tol=0.001,
#                   warm_start=True)

# Sig 1 62.49%
# LogisticRegression(C=3.5, max_iter=8000, random_state=219, solver='newton-cg',
#                    tol=0.001, warm_start=True)

# Sig 2 66.79%
# LogisticRegression(C=4.0, max_iter=8000, random_state=219, tol=0.001,
#                    warm_start=True)

# Sig 3 63.93%
# LogisticRegression(C=6.5, max_iter=8000, random_state=219, tol=0.001,
#                    warm_start=True)

# Sig 4 60.79%
# LogisticRegression(C=5.5, max_iter=8000, random_state=219, tol=0.001,
#                    warm_start=True)

# Sig 5 62.66%
# LogisticRegression(C=8.5, max_iter=8000, random_state=219, tol=0.001,
#                   warm_start=True)

# Sig 6 62.51%
# LogisticRegression(C=4.5, max_iter=8000, random_state=219, tol=0.001,
#                    warm_start=True)

# Sig 7 62.78%
# LogisticRegression(C=4.5, max_iter=8000, random_state=219, tol=0.001,
#                    warm_start=True)

# Sig 8 62%
# LogisticRegression(C=7.5, max_iter=8000, multi_class='multinomial',
#                    random_state=219, solver='newton-cg', tol=0.001,
#                    warm_start=True)

# Sig 9 63%
# LogisticRegression(C=3.5, max_iter=8000, random_state=219, solver='newton-cg',
#                    tol=0.001, warm_start=True)

# Sig 10 66.13%
# LogisticRegression(C=8.5, max_iter=8000, multi_class='multinomial',
#                    random_state=219, solver='newton-cg', tol=0.001,
#                    warm_start=True)

**Classification Trees**

In [66]:
# train/test split with the full model
ac_data   =  ac_dataset.loc[ : , candidate_dict['logit_full_2']]
ac_target =  ac_dataset.loc[ : , 'cross_sell_success']


# this is the exact code we were using before
X_train, X_test, y_train, y_test = train_test_split(
            ac_data,
            ac_target,
            random_state = 219,
            test_size    = 0.25,
            stratify     = ac_target)

In [67]:
# # declaring a hyperparameter space
# criterion_space = ['gini', 'entropy']
# splitter_space  = ['best', 'random']
# depth_space     = np.arange(1, 9, 1)
# sample_split_space = np.arange(10, 30, 1)
# leaf_space      = np.arange(20, 50, 1)
# feature_space   = ['auto', 'sqrt', 'log2', None]
# split_space     = np.arange(1, 50, 1)
# leaf_node_space = np.arange(5, 50, 1)



# # creating a hyperparameter grid
# param_grid = {'criterion'        : criterion_space,
#               'splitter'         : splitter_space,
#               'max_depth'        : depth_space,
#               'min_samples_leaf' : leaf_space,
#               'min_samples_split' : split_space,
#               'max_features'     : feature_space,
#               'min_samples_split' : sample_split_space,
#               'max_leaf_nodes': leaf_node_space}

# # INSTANTIATING the model object without hyperparameters
# tuned_tree = DecisionTreeClassifier(random_state = 219)

# # RandomizedSearchCV object
# tuned_tree_cv = RandomizedSearchCV(estimator             = tuned_tree,
#                                    param_distributions   = param_grid,
#                                    cv                    = 3,
#                                    n_iter                = 1000,
#                                    random_state          = 219,
#                                    scoring = make_scorer(roc_auc_score,
#                                              needs_threshold = False))

# # FITTING to the FULL DATASET (due to cross-validation)
# tuned_tree_cv.fit(ac_data, ac_target)

# # PREDICT step is not needed

# # printing the optimal parameters and best score
# print("Tuned Parameters  :", tuned_tree_cv.best_params_)
# print("Tuned Training AUC:", tuned_tree_cv.best_score_.round(4))

In [68]:
# INSTANTIATING a logistic regression model with tuned values
tree_tuned = DecisionTreeClassifier(max_depth=3, max_leaf_nodes=28, min_samples_leaf=22,
                       min_samples_split=19, random_state=219)

# FIT step is not needed
tree_tuned_fit = tree_tuned.fit(ac_data, ac_target)

# PREDICTING based on the testing set
tree_tuned_pred = tree_tuned_fit.predict(X_test)

# SCORING the results
print('Training ACCURACY:', tree_tuned.score(X_train, y_train).round(4))
print('Testing  ACCURACY:', tree_tuned.score(X_test, y_test).round(4))

# saving scoring data for future use
tree_tuned_train_score = tree_tuned.score(X_train, y_train).round(4) # accuracy
tree_tuned_test_score  = tree_tuned.score(X_test, y_test).round(4)   # accuracy

# displaying and saving the gap between training and testing
print('LogReg Train-Test Gap   :', abs(tree_tuned_train_score - \
                                       tree_tuned_test_score).round(4))

tree_tuned_test_gap = abs(tree_tuned_train_score - tree_tuned_test_score).round(4)

# unpacking the confusion matrix
tree_tuned_tn, \
tree_tuned_fp, \
tree_tuned_fn, \
tree_tuned_tp = confusion_matrix(y_true = y_test, y_pred = tree_tuned_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {tree_tuned_tn}
False Positives: {tree_tuned_fp}
False Negatives: {tree_tuned_fn}
True Positives : {tree_tuned_tp}
""")

print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                  y_score = tree_tuned_pred).round(4))

# saving the AUC score
tree_tuned_auc         = roc_auc_score(y_true  = y_test,
                                     y_score = tree_tuned_pred).round(4) # auc

Training ACCURACY: 0.7402
Testing  ACCURACY: 0.7762
LogReg Train-Test Gap   : 0.036

True Negatives : 95
False Positives: 61
False Negatives: 48
True Positives : 283

AUC Score        : 0.732


In [69]:
# Full One 71.22%
# DecisionTreeClassifier(criterion='entropy', max_depth=5, max_leaf_nodes=5,
#                        min_samples_leaf=43, min_samples_split=26,
#                        random_state=219

# Full Two 73.20%
# DecisionTreeClassifier(max_depth=3, max_leaf_nodes=28, min_samples_leaf=22,
#                        min_samples_split=19, random_state=219)

# Sig 1 64.30%
# DecisionTreeClassifier(max_depth=5, max_leaf_nodes=10, min_samples_leaf=21,
#                        min_samples_split=10, random_state=219,
#                        splitter='random')

# Sig 2 62.87%
# DecisionTreeClassifier(criterion='entropy', max_depth=5, max_leaf_nodes=5,
#                        min_samples_leaf=43, min_samples_split=26,
#                        random_state=219)

# Sig 3 64.30%
# DecisionTreeClassifier(max_depth=5, max_leaf_nodes=10, min_samples_leaf=21,
#                        min_samples_split=10, random_state=219,
#                        splitter='random')

# Sig 4 62.09%
# DecisionTreeClassifier(criterion='entropy', max_depth=5, max_leaf_nodes=5,
#                        min_samples_leaf=43, min_samples_split=26,
#                        random_state=219)

# Sig 5 64.30%
# DecisionTreeClassifier(max_depth=4, max_leaf_nodes=38, min_samples_leaf=21,
#                        min_samples_split=10, random_state=219,
#                        splitter='random')

# Sig 6 62.09%
# DecisionTreeClassifier(criterion='entropy', max_depth=1, max_leaf_nodes=46,
#                        min_samples_leaf=30, min_samples_split=12,
#                        random_state=219, splitter='random')

# Sig 7 61.86%
# DecisionTreeClassifier(max_depth=5, max_leaf_nodes=10, min_samples_leaf=21,
#                        min_samples_split=10, random_state=219,
#                        splitter='random')

# Sig 8 64.85%
# DecisionTreeClassifier(max_depth=4, max_leaf_nodes=34, min_samples_leaf=21,
#                        min_samples_split=13, random_state=219)

# Sig 9 71.41%
# DecisionTreeClassifier(max_depth=3, max_leaf_nodes=48, min_samples_leaf=41,
#                        min_samples_split=28, random_state=219)

# Sig 10 73.20%
# DecisionTreeClassifier(max_depth=3, max_leaf_nodes=28, min_samples_leaf=22,
#                        min_samples_split=19, random_state=219)

**KNN**

In [70]:
# train/test split with the full model
ac_data   =  ac_dataset.loc[ : , candidate_dict['logit_sig_7']]
ac_target =  ac_dataset.loc[ : , 'cross_sell_success']

# this is the exact code we were using before
X_train, X_test, y_train, y_test = train_test_split(
            ac_data,
            ac_target,
            random_state = 219,
            test_size    = 0.25,
            stratify     = ac_target)

In [71]:
# INSTANTIATING StandardScaler()
scaler = StandardScaler()

# FITTING the data
scaler.fit(ac_data)

# TRANSFORMING the data
X_scaled     = scaler.transform(ac_data)

# converting to a DataFrame
X_scaled_df  = pd.DataFrame(X_scaled) 

# train-test split with the scaled data
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(
            X_scaled_df,
            ac_target,
            random_state = 219,
            test_size = 0.25,
            stratify = ac_target)

In [72]:
# # declaring a hyperparameter space
# neighbor_space  = np.arange(1, 20, 1)
# weight_space  = ['uniform', 'distance']
# algoritm_space       = ['auto', 'ball_tree', 'kd_tree', 'brute']
# leaf_space  = np.arange(20, 80, 5)
# p_space = np.arange(1, 10, 1)


# # creating a hyperparameter grid
# param_grid = {'n_neighbors'     : neighbor_space,
#               'weights' : weight_space,
#               'algorithm'        : algoritm_space,
#               'leaf_size'        : leaf_space,
#               'p'       : p_space}

# # INSTANTIATING a KNN classification model with optimal neighbors
# knn_opt = KNeighborsClassifier()

# # GridSearchCV object
# knn_cv = RandomizedSearchCV(estimator           = knn_opt,
#                                param_distributions = param_grid,
#                                cv         = 3,
#                                n_iter     = 1000,
#                                scoring    = make_scorer(roc_auc_score,
#                                             needs_threshold = False))

# # FITTING to the FULL DATASET (due to cross-validation)
# knn_cv.fit(X_scaled_df, ac_target)

# # PREDICT step is not needed

# # printing the optimal parameters and best score
# print("Tuned Parameters  :", knn_cv.best_params_)
# print("Tuned Training AUC:", knn_cv.best_score_.round(4))

In [73]:
# building a model based on hyperparameter tuning results

# copy/pasting in the best_estimator_ results
# to avoid running another RandomizedSearch
knn_tuned = KNeighborsClassifier(algorithm = 'brute', leaf_size = 20,
                                 n_neighbors = 10, p = 8)


# FITTING the model object
knn_tuned_fit = knn_tuned.fit(X_train, y_train)


# PREDICTING based on the testing set
knn_tuned_pred = knn_tuned_fit.predict(X_test)


# SCORING the results
print('KNN Tuned Training ACCURACY:', knn_tuned.score(X_train, y_train).round(4))
print('KNN Tuned Testing  ACCURACY:', knn_tuned.score(X_test, y_test).round(4))

# saving scoring data for future use
knn_tuned_train_score = knn_tuned.score(X_train, y_train).round(4) # accuracy
knn_tuned_test_score  = knn_tuned.score(X_test, y_test).round(4)   # accuracy

# displaying and saving the gap between training and testing
print('KNN Train-Test Gap   :', abs(knn_tuned_train_score - \
                                       knn_tuned_test_score).round(4))

knn_tuned_gap = abs(knn_tuned_train_score - knn_tuned_test_score).round(4)

# unpacking the confusion matrix
knn_tn, \
knn_fp, \
knn_fn, \
knn_tp = confusion_matrix(y_true = y_test, y_pred = knn_tuned_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {knn_tn}
False Positives: {knn_fp}
False Negatives: {knn_fn}
True Positives : {knn_tp}
""")

print('KNN Tuned AUC Score :', roc_auc_score(y_true  = y_test,
                                             y_score = knn_tuned_pred).round(4))

knn_auc_score = roc_auc_score(y_true  = y_test, y_score = knn_tuned_pred).round(4)

KNN Tuned Training ACCURACY: 0.7423
KNN Tuned Testing  ACCURACY: 0.7002
KNN Train-Test Gap   : 0.0421

True Negatives : 62
False Positives: 94
False Negatives: 52
True Positives : 279

KNN Tuned AUC Score : 0.6202


In [74]:
# Full One 48.31%
# KNeighborsClassifier(leaf_size=35, n_neighbors=16, p=1)

# Full Two 47.92%
# KNeighborsClassifier(algorithm='ball_tree', leaf_size=45, n_neighbors=6, p=1)

# Sig 1 54.86%
# KNeighborsClassifier(algorithm='brute', leaf_size=25, n_neighbors=4, p=4)

# Sig 2 60.37%
# KNeighborsClassifier(algorithm='kd_tree', leaf_size=40, n_neighbors=6, p=1)

# Sig 3 55.23% - significant
# KNeighborsClassifier(n_neighbors=14, p=1)

# Sig 4 56.84%
# KNeighborsClassifier(algorithm='kd_tree', leaf_size=40, n_neighbors=4, p=4)

# Sig 5 51.01%
# KNeighborsClassifier(leaf_size=50, n_neighbors=4, p=1)

# Sig 6 56.42%
# KNeighborsClassifier(leaf_size=65, n_neighbors=4, p=1)

# Sig 7 62.02% 
# KNeighborsClassifier(algorithm='brute', leaf_size=20, n_neighbors=10, p=8)

# Sig 8 56.77%
# KNeighborsClassifier(algorithm='brute', leaf_size=45, n_neighbors=6, p=1)

# Sig 9 59.19%
# KNeighborsClassifier(algorithm='kd_tree', leaf_size=50, n_neighbors=8, p=1)

# Sig 10 52.59%
# KNeighborsClassifier(algorithm='kd_tree', leaf_size=70, n_neighbors=4, p=1,
#                      weights='distance')

**Random Forest**

In [75]:
# train/test split with the full model
ac_data   =  ac_dataset.loc[ : , candidate_dict['logit_sig_10']]
ac_target =  ac_dataset.loc[ : , 'cross_sell_success']

# this is the exact code we were using before
X_train, X_test, y_train, y_test = train_test_split(
            ac_data,
            ac_target,
            random_state = 219,
            test_size    = 0.25,
            stratify     = ac_target)

In [76]:
# # declaring a hyperparameter space
# estimator_space  = np.arange(200, 500, 50)
# criterion_space  = ['gini', 'entropy']
# depth_space = np.arange(1, 9, 1)
# sample_split_space = np.arange(2, 10, 2)
# leaf_space       = np.arange(1, 10, 1)
# features_space   = ["auto", "sqrt", "log2"]
# bootstrap_space  = [True, False]
# warm_start_space = [True, False]


# # creating a hyperparameter grid
# param_grid = {'n_estimators'     : estimator_space,
#               'min_samples_leaf' : leaf_space,
#               'criterion'        : criterion_space,
#               'bootstrap'        : bootstrap_space,
#               'min_samples_split': sample_split_space,
#               'warm_start'       : warm_start_space,
#               'max_depth'        : depth_space,
#               'max_features'     : features_space}


# # INSTANTIATING the model object without hyperparameters
# forest_grid = RandomForestClassifier(random_state = 219)


# # GridSearchCV object
# forest_cv = RandomizedSearchCV(estimator           = forest_grid,
#                                param_distributions = param_grid,
#                                cv         = 3,
#                                n_iter     = 500,
#                                scoring    = make_scorer(roc_auc_score,
#                                             needs_threshold = False))


# # FITTING to the FULL DATASET (due to cross-validation)
# forest_cv.fit(ac_data, ac_target)


# # PREDICT step is not needed


# # printing the optimal parameters and best score
# print("Tuned Parameters  :", forest_cv.best_params_)
# print("Tuned Training AUC:", forest_cv.best_score_.round(4))

In [77]:
# building a model based on hyperparameter tuning results

# copy/pasting in the best_estimator_ results
# to avoid running another RandomizedSearch
forest_tuned = RandomForestClassifier(bootstrap=False, max_depth=8, max_features='sqrt',
                        min_samples_split=4, n_estimators=350, random_state=219)


# FITTING the model object
forest_tuned_fit = forest_tuned.fit(ac_data, ac_target)


# PREDICTING based on the testing set
forest_tuned_pred = forest_tuned_fit.predict(X_test)


# SCORING the results
print('Forest Tuned Training ACCURACY:', forest_tuned.score(X_train, y_train).round(4))
print('Forest Tuned Testing  ACCURACY:', forest_tuned.score(X_test, y_test).round(4))



# saving scoring data for future use
forest_tuned_train_score = forest_tuned.score(X_train, y_train).round(4) # accuracy
forest_tuned_test_score  = forest_tuned.score(X_test, y_test).round(4)   # accuracy


# displaying and saving the gap between training and testing
print('Forest Train-Test Gap   :', abs(forest_tuned_train_score - \
                                       forest_tuned_test_score).round(4))

forest_tuned_gap = abs(forest_tuned_train_score - forest_tuned_test_score).round(4)

# unpacking the confusion matrix
rand_forest_tn, \
rand_forest_fp, \
rand_forest_fn, \
rand_forest_tp = confusion_matrix(y_true = y_test, y_pred = forest_tuned_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {rand_forest_tn}
False Positives: {rand_forest_fp}
False Negatives: {rand_forest_fn}
True Positives : {rand_forest_tp}
""")


# saving the AUC score
forest_tuned_auc = roc_auc_score(y_true  = y_test,
                                 y_score = forest_tuned_pred).round(4) # auc


print('Forest Tuned AUC Score        :', roc_auc_score(y_true  = y_test,
                                                       y_score = forest_tuned_pred).round(4))

Forest Tuned Training ACCURACY: 0.8547
Forest Tuned Testing  ACCURACY: 0.8727
Forest Train-Test Gap   : 0.018

True Negatives : 99
False Positives: 57
False Negatives: 5
True Positives : 326

Forest Tuned AUC Score        : 0.8098


In [78]:
# Full One 77.26% - significant
# RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=8,
#                        max_features='sqrt', min_samples_leaf=3,
#                        min_samples_split=8, n_estimators=450, random_state=219)

# Full Two 76.64% - significant
# RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=8,
#                        min_samples_leaf=6, n_estimators=250, random_state=219)

# Sig 1 71.45%
# RandomForestClassifier(bootstrap=False, max_depth=8, max_features='sqrt',
#                        min_samples_leaf=5, min_samples_split=8,
#                        n_estimators=480, random_state=219, warm_start=True)

# Sig 2 73.32% significant
# RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=8,
#                        max_features='log2', min_samples_leaf=7,
#                        min_samples_split=4, n_estimators=300, random_state=219)

# Sig 3 73.83% - significant
# RandomForestClassifier(bootstrap=False, max_depth=8, min_samples_leaf=4,
#                        n_estimators=300, random_state=219)

# Sig 4 68.74% significant
# RandomForestClassifier(max_depth=8, max_features='sqrt', min_samples_leaf=6,
#                        min_samples_split=6, n_estimators=200, random_state=219)

# Sig 5 66.68% significant
# RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=8,
#                        max_features='sqrt', min_samples_leaf=9,
#                        min_samples_split=4, n_estimators=500, random_state=219)

# Sig 6 66.91% significant
# RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=8,
#                        max_features='sqrt', min_samples_leaf=8,
#                        min_samples_split=8, n_estimators=600, random_state=219)

# Sig 7 66.49% - significant
# RandomForestClassifier(criterion='entropy', max_depth=7, max_features='log2',
#                        n_estimators=750, random_state=219)

# Sig 8 72.19% significant
# RandomForestClassifier(bootstrap=False, max_depth=8, max_features='sqrt',
#                        min_samples_leaf=2, n_estimators=500, random_state=219)

# Sig 9 67.08% significant
# RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=7,
#                        max_features='sqrt', min_samples_leaf=3,
#                        min_samples_split=8, n_estimators=1300,
#                        random_state=219)

# Sig 10 80.98% significant
# RandomForestClassifier(bootstrap=False, max_depth=8, max_features='sqrt',
#                        min_samples_split=4, n_estimators=350, random_state=219)

In [79]:
# ########################################
# # plot_feature_importances
# ########################################
# def plot_feature_importances(model, train, export = False):
#     """
#     Plots the importance of features from a CART model.
    
#     PARAMETERS
#     ----------
#     model  : CART model
#     train  : explanatory variable training data
#     export : whether or not to export as a .png image, default False
#     """
    
#     # declaring the number
#     n_features = train.shape[1]
    
#     # setting plot window
#     fig, ax = plt.subplots(figsize=(12,9))
    
#     plt.barh(range(n_features), model.feature_importances_, align='center')
#     plt.yticks(pd.np.arange(n_features), train.columns)
#     plt.xlabel("Feature importance")
#     plt.ylabel("Feature")
    
#     if export == True:
#         plt.savefig('./analysis_images/Feature_Importance.png')

In [80]:
# # plotting feature importances
# plot_feature_importances(forest_tuned_fit,
#                          train = X_train,
#                          export = False)

**Gradient Boosted Machines**

In [81]:
# train/test split with the full model
ac_data   =  ac_dataset.loc[ : , candidate_dict['logit_sig_10']]
ac_target =  ac_dataset.loc[ : , 'cross_sell_success']

# this is the exact code we were using before
X_train, X_test, y_train, y_test = train_test_split(
            ac_data,
            ac_target,
            random_state = 219,
            test_size    = 0.25,
            stratify     = ac_target)

In [82]:
# # declaring a hyperparameter space
# estimator_space     = np.arange(100, 500, 100)
# depth_space         = np.arange(1, 9, 1)
# max_features_space  = ['auto', 'sqrt', 'log2']
# loss_space          = ['deviance', 'exponential']
# criterion_space     = ['friedman_mse', 'mse', 'mae']
# min_split_space     = np.arange(2, 500, 100)
# warm_start_space    = [True, False]
# learn_space         = np.arange(0.1, 2.0, 0.1)

# # creating a hyperparameter grid
# param_grid = {'max_depth'     : depth_space,
#               'n_estimators'  : estimator_space,
#               'loss'          : loss_space,
#               'criterion'     : criterion_space,
#               'max_features'  : max_features_space,
#               'warm_start'    : warm_start_space,
#               'learning_rate' : learn_space}

# # INSTANTIATING the model object without hyperparameters
# full_gbm_grid = GradientBoostingClassifier(random_state = 219)


# # GridSearchCV object
# full_gbm_cv = RandomizedSearchCV(estimator     = full_gbm_grid,
#                            param_distributions = param_grid,
#                            cv                  = 3,
#                            n_iter              = 200,
#                            random_state        = 219,
#                            scoring             = make_scorer(roc_auc_score,
#                                                  needs_threshold = False))

# # FITTING to the FULL DATASET (due to cross-validation)
# full_gbm_cv.fit(ac_data, ac_target)

# # PREDICT step is not needed

# # printing the optimal parameters and best score
# print("Tuned Parameters  :", full_gbm_cv.best_params_)
# print("Tuned Training AUC:", full_gbm_cv.best_score_.round(4))

In [83]:
# INSTANTIATING the model object without hyperparameters
gbm_tuned =  GradientBoostingClassifier(criterion='mse', loss='exponential', max_depth=2,
                            max_features='sqrt', n_estimators=200,
                            random_state=219)

# FIT step is needed as we are not using .best_estimator
gbm_tuned_fit = gbm_tuned.fit(ac_data, ac_target)

# PREDICTING based on the testing set
gbm_tuned_pred = gbm_tuned_fit.predict(X_test)

# SCORING the results
print('Training ACCURACY:', gbm_tuned_fit.score(X_train, y_train).round(4))
print('Testing ACCURACY :', gbm_tuned_fit.score(X_test, y_test).round(4))


# saving scoring data for future use
full_gbm_train_score = gbm_tuned_fit.score(X_train, y_train).round(4) # accuracy
full_gbm_test_score  = gbm_tuned_fit.score(X_test, y_test).round(4)   # accuracy

# displaying and saving the gap between training and testing
print('Forest Train-Test Gap   :', abs(full_gbm_train_score - \
                                       full_gbm_test_score).round(4))

full_gbm_gap = abs(full_gbm_train_score - full_gbm_test_score).round(4)

# unpacking the confusion matrix
gbm_tuned_tn, \
gbm_tuned_fp, \
gbm_tuned_fn, \
gbm_tuned_tp = confusion_matrix(y_true = y_test, 
                                y_pred = gbm_tuned_pred).ravel()

# printing each result one-by-one
print(f"""
True Negatives : {gbm_tuned_tn}
False Positives: {gbm_tuned_fp}
False Negatives: {gbm_tuned_fn}
True Positives : {gbm_tuned_tp}
""")

print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                  y_score = gbm_tuned_pred).round(4))

# saving the AUC score
gbm_tuned_auc         = roc_auc_score(y_true  = y_test,
                                     y_score = gbm_tuned_pred).round(4) # auce

Training ACCURACY: 0.7779
Testing ACCURACY : 0.8008
Forest Train-Test Gap   : 0.0229

True Negatives : 84
False Positives: 72
False Negatives: 25
True Positives : 306

AUC Score        : 0.7315


In [84]:
# Full One 66.6% - significant
# GradientBoostingClassifier(loss='exponential', max_depth=1, max_features='auto',
#                            n_estimators=300, random_state=219, warm_start=True)

# Full Two 67.89% - significant
# GradientBoostingClassifier(loss='exponential', max_depth=1, max_features='auto',
#                            n_estimators=300, random_state=219, warm_start=True)

# Sig 1 62.09% - significant
# GradientBoostingClassifier(criterion='mae', learning_rate=1.2000000000000002,
#                            loss='exponential', max_depth=1, max_features='auto',
#                            random_state=219, warm_start=True)

# Sig 2 68.22% - significant
# GradientBoostingClassifier(criterion='mse', learning_rate=0.5,
#                            loss='exponential', max_depth=1, max_features='sqrt',
#                            random_state=219, warm_start=True)

# Sig 3 62.09% - significant
# GradientBoostingClassifier(criterion='mae', learning_rate=1.2000000000000002,
#                            loss='exponential', max_depth=1, max_features='auto',
#                            random_state=219, warm_start=True)

# Sig 4 62.09% - significant
# GradientBoostingClassifier(criterion='mae', learning_rate=1.2000000000000002,
#                            loss='exponential', max_depth=1, max_features='auto',
#                            random_state=219, warm_start=True)

# Sig 5 66.64% - significant
# GradientBoostingClassifier(learning_rate=0.30000000000000004,
#                            loss='exponential', max_depth=1, max_features='log2',
#                            n_estimators=400, random_state=219, warm_start=True)

# Sig 6 65.41% - significant
# GradientBoostingClassifier(learning_rate=1.9000000000000001, loss='exponential',
#                            max_depth=1, max_features='auto', random_state=219)

# Sig 7 63.38% - significant
# GradientBoostingClassifier(criterion='mse', learning_rate=0.5,
#                            loss='exponential', max_depth=1, max_features='sqrt',
#                            random_state=219, warm_start=True)

# Sig 8 64.02% - significant
# GradientBoostingClassifier(criterion='mse', learning_rate=0.2, max_depth=1,
#                            max_features='auto', n_estimators=400,
#                            random_state=219)

# Sig 9 70.43% - significant
# GradientBoostingClassifier(criterion='mse', loss='exponential', max_depth=2,
#                            max_features='sqrt', n_estimators=200,
#                            random_state=219)

# Sig 10 73.15% - significant
# GradientBoostingClassifier(criterion='mse', loss='exponential', max_depth=2,
#                            max_features='sqrt', n_estimators=200,
#                            random_state=219)

In [85]:
# creating a dictionary for model results
model_performance = {
    
    'Model Name'        : ['Logistic Regression',
                           'Classification Trees', 
                           'K-Nearest Neighbors Classification',
                           'Random Forest [CHOSEN]',
                           'Gradient Boosted Models'],
           
    'AUC Score'         : [logreg_auc_score, 
                           tree_tuned_auc, 
                           knn_auc_score,
                           forest_tuned_auc,
                           gbm_tuned_auc],
    
    'Training Accuracy' : [logreg_train_score, 
                           tree_tuned_train_score,
                           knn_tuned_train_score,
                           forest_tuned_train_score,
                           full_gbm_train_score],
           
    'Testing Accuracy'  : [logreg_test_score, 
                           tree_tuned_test_score,
                           knn_tuned_test_score,
                           forest_tuned_test_score,
                           full_gbm_test_score],
    
    'Train-Test Gap'    : [logreg_test_gap,
                           tree_tuned_test_gap,
                           knn_tuned_gap,
                           forest_tuned_gap,
                           full_gbm_gap],

    'Confusion Matrix (TN, FP, FN, TP)'  : [(logreg_tn, logreg_fp, logreg_fn, logreg_tp),
                           (tree_tuned_tn, tree_tuned_fp, tree_tuned_fn, tree_tuned_tp),
                           (knn_tn, knn_fp, knn_fn, knn_tp),
                           (rand_forest_tn, rand_forest_fp, rand_forest_fn, rand_forest_tp),                 
                           (gbm_tuned_tn, gbm_tuned_fp, gbm_tuned_fn, gbm_tuned_tp)]}


# converting model_performance into a DataFrame
model_performance = pd.DataFrame(model_performance)

model_performance

Unnamed: 0,Model Name,AUC Score,Training Accuracy,Testing Accuracy,Train-Test Gap,"Confusion Matrix (TN, FP, FN, TP)"
0,Logistic Regression,0.6777,0.7539,0.7577,0.0038,"(71, 85, 33, 298)"
1,Classification Trees,0.732,0.7402,0.7762,0.036,"(95, 61, 48, 283)"
2,K-Nearest Neighbors Classification,0.6202,0.7423,0.7002,0.0421,"(62, 94, 52, 279)"
3,Random Forest [CHOSEN],0.8098,0.8547,0.8727,0.018,"(99, 57, 5, 326)"
4,Gradient Boosted Models,0.7315,0.7779,0.8008,0.0229,"(84, 72, 25, 306)"


In [86]:
print('The chosen model is Random Forest (Classification) with variables:')
print(candidate_dict['logit_sig_10'])

The chosen model is Random Forest (Classification) with variables:
['total_meals_ordered', 'log_contacts_w_customer_service', 'mobile_number', 'cancellations_before_noon', 'tastes_and_preferences', 'mobile_logins', 'weekly_plan', 'log_avg_prep_vid_time', 'has_master_classes_attended', 'log_pc_logins', 'email_junk', 'procastinator', 'email_professional', 'log_product_categories_viewed', 'working', 'active_pc_user', 'common_user', 'weekend_fighter', 'number_of_names']


In [87]:
print("___ %s seconds ___" % (time.time() - start_time))

___ 12.565959930419922 seconds ___
