In [1]:
# importing libraries
import random            as rand                     # random number gen
import pandas            as pd                       # data science essentials
import matplotlib.pyplot as plt                      # data visualization
import seaborn           as sns                      # enhanced data viz
from sklearn.model_selection import train_test_split # train-test split
from sklearn.linear_model import LogisticRegression  # logistic regression
import statsmodels.formula.api as smf                # logistic regression
from sklearn.metrics import confusion_matrix         # confusion matrix
from sklearn.metrics import roc_auc_score            # auc score
from sklearn.neighbors import KNeighborsClassifier   # KNN for classification
from sklearn.neighbors import KNeighborsRegressor    # KNN for regression
from sklearn.preprocessing import StandardScaler     # standard scaler
# libraries for classification trees
from sklearn.tree import DecisionTreeClassifier      # classification trees
from sklearn.tree import export_graphviz             # exports graphics
from six import StringIO           # saves objects in memory
from IPython.display import Image                    # displays on frontend
import pydotplus                                     # interprets dot objects
from sklearn.model_selection import RandomizedSearchCV     # hyperparameter tuning
from sklearn.metrics import make_scorer 
from sklearn.ensemble import RandomForestClassifier     # random forest
from sklearn.ensemble import GradientBoostingClassifier

# loading data
file = './datasets/Apprentice_Chef_Dataset.xlsx'

chef = pd.read_excel(file)

# setting pandas print options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 100)


# displaying the head of the dataset
chef.head(n = 5)

Unnamed: 0,REVENUE,CROSS_SELL_SUCCESS,NAME,EMAIL,FIRST_NAME,FAMILY_NAME,TOTAL_MEALS_ORDERED,UNIQUE_MEALS_PURCH,CONTACTS_W_CUSTOMER_SERVICE,PRODUCT_CATEGORIES_VIEWED,AVG_TIME_PER_SITE_VISIT,MOBILE_NUMBER,CANCELLATIONS_BEFORE_NOON,CANCELLATIONS_AFTER_NOON,TASTES_AND_PREFERENCES,PC_LOGINS,MOBILE_LOGINS,WEEKLY_PLAN,EARLY_DELIVERIES,LATE_DELIVERIES,PACKAGE_LOCKER,REFRIGERATED_LOCKER,AVG_PREP_VID_TIME,LARGEST_ORDER_SIZE,MASTER_CLASSES_ATTENDED,MEDIAN_MEAL_RATING,AVG_CLICKS_PER_VISIT,TOTAL_PHOTOS_VIEWED
0,393.0,1,Saathos,saathos@unitedhealth.com,Saathos,Saathos,14,6,12,10,48.0,1,3,1,1,5,2,0,0,2,0,0,33.4,1,0,1,17,0
1,1365.0,1,Alysanne Osgrey,alysanne.osgrey@ge.org,Alysanne,Osgrey,87,3,8,8,40.35,1,0,0,1,5,1,12,0,2,0,0,84.8,1,0,3,13,170
2,800.0,1,Edwyd Fossoway,edwyd.fossoway@jnj.com,Edwyd,Fossoway,15,7,11,5,19.77,1,3,0,1,6,1,1,0,1,0,0,63.0,1,0,2,16,0
3,600.0,1,Eleyna Westerling,eleyna.westerling@ge.org,Eleyna,Westerling,13,6,11,5,90.0,1,2,0,1,6,1,14,0,3,0,0,43.8,1,0,2,14,0
4,1490.0,1,Elyn Norridge,elyn.norridge@jnj.com,Elyn,Norridge,47,8,6,10,40.38,1,0,0,0,5,1,5,0,8,0,0,84.8,1,1,3,12,205


In [2]:
#looping to detect features with missing values
for col in chef:

    # creating columns with 1s if missing and 0 if not
    if chef[col].isnull().astype(int).sum() > 0:
        chef['m_'+col] = chef[col].isnull().astype(int)


#summing the missing value flags to check the results of the loop above
chef[['m_FAMILY_NAME']].sum(axis = 0)

m_FAMILY_NAME    47
dtype: int64

In [3]:
# instantiating an imputation value
fill = "none"

# imputing 'FAMILY_NAME'
chef['FAMILY_NAME'] = chef['FAMILY_NAME'].fillna(fill)

In [4]:
# checking if all missing values have been taken care of
chef.isnull().any(axis = 0).any(axis = 0)

False

In [5]:
#splitting emails

# placeholder list
placeholder_lst = []

# looping over each email address
for index, col in chef.iterrows():
    
    # splitting email domain at '@'
    split_email = chef.loc[index, 'EMAIL'].split(sep = '@')
    
    # appending placeholder_lst with the results
    placeholder_lst.append(split_email)
    

# converting placeholder_lst into a DataFrame 
email_df = pd.DataFrame(placeholder_lst)


# displaying the results
email_df

Unnamed: 0,0,1
0,saathos,unitedhealth.com
1,alysanne.osgrey,ge.org
2,edwyd.fossoway,jnj.com
3,eleyna.westerling,ge.org
4,elyn.norridge,jnj.com
...,...,...
1941,obara.sand,yahoo.com
1942,quentyn.blackwood,yahoo.com
1943,rhonda.rowan,gmail.com
1944,turnip,yahoo.com


In [6]:
#concatenating with original DataFrame

# safety measure in case of multiple concatenations
chef = pd.read_excel(file)


# renaming column to concatenate
email_df.columns = ['0' , 'email_domain']


# concatenating email_domain with chef DataFrame
chef = pd.concat([chef, email_df['email_domain']],
                     axis = 1)


# printing value counts of email_domain
chef.loc[: ,'email_domain'].value_counts()

gmail.com           303
protonmail.com      284
yahoo.com           274
msn.com              72
aol.com              69
passport.com         64
hotmail.com          63
live.com             62
me.com               59
amex.com             30
jnj.com              28
merck.com            28
mcdonalds.com        28
cocacola.com         28
apple.com            27
nike.com             27
ge.org               26
dupont.com           26
ibm.com              26
microsoft.com        25
chevron.com          25
travelers.com        24
unitedhealth.com     24
exxon.com            24
boeing.com           23
caterpillar.com      22
pg.com               22
verizon.com          22
mmm.com              22
walmart.com          21
disney.com           21
visa.com             20
pfizer.com           20
jpmorgan.com         19
goldmansacs.com      18
cisco.com            18
unitedtech.com       18
homedepot.com        17
intel.com            17
Name: email_domain, dtype: int64

In [7]:
# email domain types
personal_email_domain = ['@gmail.com', '@yahoo.com']
company_email_domain  = ['@protonmail.com','@amex.com','@jnj.com','@mcdonalds.com',
                        '@merck.com','@cocacola.com','@nike.com','@apple.com','@ge.org','@dupont.com','@ibm.com',
                        '@chevron.com','@microsoft.com','@exxon.com','@travelers.com','@unitedhealth.com',
                        '@boeing.com','@mmm.com','@pg.com','@caterpillar.com','@verizon.com','@disney.com',
                        '@walmart.com','@visa.com','@pfizer.com','@jpmorgan.com','@unitedtech.com','@goldmansacs.com',
                        '@cisco.com','@intel.com','@homedepot.com']
junk_email_domain = ['@me.com','@aol.com','@hotmail.com','@live.com','@msn.com','@passport.com']


# placeholder list, creating an empty list
placeholder_lst = []


#looping to group observations by domain type
for domain in chef['email_domain']:
    
    if '@' + domain in personal_email_domain:
        placeholder_lst.append('personal')
        

    elif '@' + domain in company_email_domain:
        placeholder_lst.append('company')
        
    elif '@' + domain in junk_email_domain:
        placeholder_lst.append('junk')

    else:
            print('Unknown')


# concatenating with original DataFrame
chef['domain_group'] = pd.Series(placeholder_lst)


# checking results
chef['domain_group'].value_counts()

company     980
personal    577
junk        389
Name: domain_group, dtype: int64

In [8]:
#creating dummies for the domain_group and putting the data into a new dataframe with the created dummies
chef_data = pd.get_dummies(chef, columns = ['domain_group'])
chef_data.head() #checking if the columns are created using head()

Unnamed: 0,REVENUE,CROSS_SELL_SUCCESS,NAME,EMAIL,FIRST_NAME,FAMILY_NAME,TOTAL_MEALS_ORDERED,UNIQUE_MEALS_PURCH,CONTACTS_W_CUSTOMER_SERVICE,PRODUCT_CATEGORIES_VIEWED,AVG_TIME_PER_SITE_VISIT,MOBILE_NUMBER,CANCELLATIONS_BEFORE_NOON,CANCELLATIONS_AFTER_NOON,TASTES_AND_PREFERENCES,PC_LOGINS,MOBILE_LOGINS,WEEKLY_PLAN,EARLY_DELIVERIES,LATE_DELIVERIES,PACKAGE_LOCKER,REFRIGERATED_LOCKER,AVG_PREP_VID_TIME,LARGEST_ORDER_SIZE,MASTER_CLASSES_ATTENDED,MEDIAN_MEAL_RATING,AVG_CLICKS_PER_VISIT,TOTAL_PHOTOS_VIEWED,email_domain,domain_group_company,domain_group_junk,domain_group_personal
0,393.0,1,Saathos,saathos@unitedhealth.com,Saathos,Saathos,14,6,12,10,48.0,1,3,1,1,5,2,0,0,2,0,0,33.4,1,0,1,17,0,unitedhealth.com,1,0,0
1,1365.0,1,Alysanne Osgrey,alysanne.osgrey@ge.org,Alysanne,Osgrey,87,3,8,8,40.35,1,0,0,1,5,1,12,0,2,0,0,84.8,1,0,3,13,170,ge.org,1,0,0
2,800.0,1,Edwyd Fossoway,edwyd.fossoway@jnj.com,Edwyd,Fossoway,15,7,11,5,19.77,1,3,0,1,6,1,1,0,1,0,0,63.0,1,0,2,16,0,jnj.com,1,0,0
3,600.0,1,Eleyna Westerling,eleyna.westerling@ge.org,Eleyna,Westerling,13,6,11,5,90.0,1,2,0,1,6,1,14,0,3,0,0,43.8,1,0,2,14,0,ge.org,1,0,0
4,1490.0,1,Elyn Norridge,elyn.norridge@jnj.com,Elyn,Norridge,47,8,6,10,40.38,1,0,0,0,5,1,5,0,8,0,0,84.8,1,1,3,12,205,jnj.com,1,0,0


In [9]:
#########################
# text_split_feature
#########################
#creating a new variable in the dataframe called number of names
def text_split_feature(col, df, sep=' ', new_col_name='number_of_names'):
    """
Splits values in a string Series (as part of a DataFrame) and sums the number
of resulting items. Automatically appends summed column to original DataFrame.

PARAMETERS
----------
col          : column to split
df           : DataFrame where column is located
sep          : string sequence to split by, default ' '
new_col_name : name of new column after summing split, default
               'number_of_names'
"""
    
    df[new_col_name] = 0
    
    
    for index, val in df.iterrows():
        df.loc[index, new_col_name] = len(df.loc[index, col].split(sep = ' '))

In [10]:
text_split_feature(col = 'NAME',
                   df  = chef_data)


# checking results
chef_data['number_of_names'].value_counts().sort_index()

1     591
2    1201
3      98
4       9
5      35
6      12
Name: number_of_names, dtype: int64

In [11]:
chef_data.head()

Unnamed: 0,REVENUE,CROSS_SELL_SUCCESS,NAME,EMAIL,FIRST_NAME,FAMILY_NAME,TOTAL_MEALS_ORDERED,UNIQUE_MEALS_PURCH,CONTACTS_W_CUSTOMER_SERVICE,PRODUCT_CATEGORIES_VIEWED,AVG_TIME_PER_SITE_VISIT,MOBILE_NUMBER,CANCELLATIONS_BEFORE_NOON,CANCELLATIONS_AFTER_NOON,TASTES_AND_PREFERENCES,PC_LOGINS,MOBILE_LOGINS,WEEKLY_PLAN,EARLY_DELIVERIES,LATE_DELIVERIES,PACKAGE_LOCKER,REFRIGERATED_LOCKER,AVG_PREP_VID_TIME,LARGEST_ORDER_SIZE,MASTER_CLASSES_ATTENDED,MEDIAN_MEAL_RATING,AVG_CLICKS_PER_VISIT,TOTAL_PHOTOS_VIEWED,email_domain,domain_group_company,domain_group_junk,domain_group_personal,number_of_names
0,393.0,1,Saathos,saathos@unitedhealth.com,Saathos,Saathos,14,6,12,10,48.0,1,3,1,1,5,2,0,0,2,0,0,33.4,1,0,1,17,0,unitedhealth.com,1,0,0,1
1,1365.0,1,Alysanne Osgrey,alysanne.osgrey@ge.org,Alysanne,Osgrey,87,3,8,8,40.35,1,0,0,1,5,1,12,0,2,0,0,84.8,1,0,3,13,170,ge.org,1,0,0,2
2,800.0,1,Edwyd Fossoway,edwyd.fossoway@jnj.com,Edwyd,Fossoway,15,7,11,5,19.77,1,3,0,1,6,1,1,0,1,0,0,63.0,1,0,2,16,0,jnj.com,1,0,0,2
3,600.0,1,Eleyna Westerling,eleyna.westerling@ge.org,Eleyna,Westerling,13,6,11,5,90.0,1,2,0,1,6,1,14,0,3,0,0,43.8,1,0,2,14,0,ge.org,1,0,0,2
4,1490.0,1,Elyn Norridge,elyn.norridge@jnj.com,Elyn,Norridge,47,8,6,10,40.38,1,0,0,0,5,1,5,0,8,0,0,84.8,1,1,3,12,205,jnj.com,1,0,0,2


In [12]:
# declaring explanatory variables
chef_df = chef_data.drop(['NAME','EMAIL',
                        'FIRST_NAME',
                        'FAMILY_NAME','email_domain','CROSS_SELL_SUCCESS'],
                         axis = 1)

# declaring response variable
chef_target = chef.loc[ : , 'CROSS_SELL_SUCCESS']

In [13]:
# train-test split with stratification
x_train, x_test, y_train, y_test = train_test_split(
            chef_df,
            chef_target,
            test_size    = 0.25,
            random_state = 219,
            stratify     = chef_target)


# merging training data for statsmodels
chef_train = pd.concat([x_train, y_train], axis = 1)

In [14]:
print(f"""

Response Variable Proportions (Training Set)
--------------------------------------------
{y_train.value_counts(normalize = True).round(decimals = 2)}



Response Variable Proportions (Testing Set)
--------------------------------------------
{y_test.value_counts(normalize = True).round(decimals = 2)}
""")





Response Variable Proportions (Training Set)
--------------------------------------------
1    0.68
0    0.32
Name: CROSS_SELL_SUCCESS, dtype: float64



Response Variable Proportions (Testing Set)
--------------------------------------------
1    0.68
0    0.32
Name: CROSS_SELL_SUCCESS, dtype: float64



In [15]:
# formatting each explanatory variable
for val in chef_df:
    print(f" {val} + ")

 REVENUE + 
 TOTAL_MEALS_ORDERED + 
 UNIQUE_MEALS_PURCH + 
 CONTACTS_W_CUSTOMER_SERVICE + 
 PRODUCT_CATEGORIES_VIEWED + 
 AVG_TIME_PER_SITE_VISIT + 
 MOBILE_NUMBER + 
 CANCELLATIONS_BEFORE_NOON + 
 CANCELLATIONS_AFTER_NOON + 
 TASTES_AND_PREFERENCES + 
 PC_LOGINS + 
 MOBILE_LOGINS + 
 WEEKLY_PLAN + 
 EARLY_DELIVERIES + 
 LATE_DELIVERIES + 
 PACKAGE_LOCKER + 
 REFRIGERATED_LOCKER + 
 AVG_PREP_VID_TIME + 
 LARGEST_ORDER_SIZE + 
 MASTER_CLASSES_ATTENDED + 
 MEDIAN_MEAL_RATING + 
 AVG_CLICKS_PER_VISIT + 
 TOTAL_PHOTOS_VIEWED + 
 domain_group_company + 
 domain_group_junk + 
 domain_group_personal + 
 number_of_names + 


In [16]:
# instantiating a logistic regression model object
logit_full = smf.logit(formula = """ CROSS_SELL_SUCCESS ~     
                                      REVENUE +  
                                     CONTACTS_W_CUSTOMER_SERVICE +   
                                     MOBILE_NUMBER + 
                                     CANCELLATIONS_BEFORE_NOON +  
                                     TASTES_AND_PREFERENCES + 
                                     PC_LOGINS +   
                                     EARLY_DELIVERIES +  
                                     REFRIGERATED_LOCKER +  
                                     MASTER_CLASSES_ATTENDED + 
                                     MEDIAN_MEAL_RATING +  
                                     domain_group_company + 
                                     domain_group_junk + 
                                     number_of_names""",
                                     data    = chef_train)


# fitting the model object
logit_full = logit_full.fit()


# checking the results SUMMARY
logit_full.summary()

Optimization terminated successfully.
         Current function value: 0.541700
         Iterations 6


0,1,2,3
Dep. Variable:,CROSS_SELL_SUCCESS,No. Observations:,1459.0
Model:,Logit,Df Residuals:,1445.0
Method:,MLE,Df Model:,13.0
Date:,"Tue, 26 Jan 2021",Pseudo R-squ.:,0.1374
Time:,22:19:22,Log-Likelihood:,-790.34
converged:,True,LL-Null:,-916.19
Covariance Type:,nonrobust,LLR p-value:,2.83e-46

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-3.6668,0.727,-5.042,0.000,-5.092,-2.241
REVENUE,-0.0001,7.28e-05,-1.780,0.075,-0.000,1.31e-05
CONTACTS_W_CUSTOMER_SERVICE,0.0523,0.027,1.901,0.057,-0.002,0.106
MOBILE_NUMBER,0.8556,0.176,4.858,0.000,0.510,1.201
CANCELLATIONS_BEFORE_NOON,0.2777,0.047,5.970,0.000,0.187,0.369
TASTES_AND_PREFERENCES,0.3688,0.135,2.731,0.006,0.104,0.633
PC_LOGINS,0.2236,0.107,2.091,0.037,0.014,0.433
EARLY_DELIVERIES,0.0622,0.028,2.246,0.025,0.008,0.117
REFRIGERATED_LOCKER,0.4905,0.209,2.341,0.019,0.080,0.901


In [17]:
# creating a dictionary to store candidate models

candidate_dict = {

    #all variables (set)
 'logit_full'    : ['REVENUE', 'TOTAL_MEALS_ORDERED', 'UNIQUE_MEALS_PURCH', 'CONTACTS_W_CUSTOMER_SERVICE', 
 'PRODUCT_CATEGORIES_VIEWED', 'AVG_TIME_PER_SITE_VISIT', 'MOBILE_NUMBER', 'CANCELLATIONS_BEFORE_NOON',
 'CANCELLATIONS_AFTER_NOON', 'TASTES_AND_PREFERENCES', 'PC_LOGINS', 'MOBILE_LOGINS', 'WEEKLY_PLAN', 
 'EARLY_DELIVERIES', 'LATE_DELIVERIES', 'PACKAGE_LOCKER', 'REFRIGERATED_LOCKER','AVG_PREP_VID_TIME', 
 'LARGEST_ORDER_SIZE','MASTER_CLASSES_ATTENDED','MEDIAN_MEAL_RATING','AVG_CLICKS_PER_VISIT','TOTAL_PHOTOS_VIEWED', 
 'domain_group_company','domain_group_junk','domain_group_personal', 'number_of_names'],
    
 # significant variables only (set 1)
 'logit_sig'    : ['REVENUE','CONTACTS_W_CUSTOMER_SERVICE',   
                                     'MOBILE_NUMBER',
                                     'CANCELLATIONS_BEFORE_NOON',  
                                     'TASTES_AND_PREFERENCES',
                                     'PC_LOGINS',
                                     'EARLY_DELIVERIES',  
                                     'REFRIGERATED_LOCKER',  
                                     'MASTER_CLASSES_ATTENDED', 
                                     'MEDIAN_MEAL_RATING',
                                     'domain_group_company', 
                                     'domain_group_junk',
                                     'number_of_names']
  
}

In [18]:
#using the logit_sig (significant variables)
chef_df   =  chef_data.loc[ : , candidate_dict['logit_sig']]
chef_target =  chef_data.loc[ : , 'CROSS_SELL_SUCCESS']


# train/test split with the full model
x_train, x_test, y_train, y_test = train_test_split(
            chef_df,
            chef_target,
            random_state = 219,
            test_size    = 0.25,
            stratify     = chef_target)


# INSTANTIATING a logistic regression model
logreg = LogisticRegression(solver = 'lbfgs',
                            C = 1,
                            random_state = 219)


# FITTING the training data
logreg_fit = logreg.fit(x_train, y_train)


# PREDICTING based on the testing set
logreg_pred = logreg_fit.predict(x_test)


# SCORING the results
print('LogReg Training ACCURACY:', logreg_fit.score(x_train, y_train).round(4))
print('LogReg Testing  ACCURACY:', logreg_fit.score(x_test, y_test).round(4))

# saving scoring data for future use
logreg_train_score = logreg_fit.score(x_train, y_train).round(4) # accuracy
logreg_test_score  = logreg_fit.score(x_test, y_test).round(4)   # accuracy


# displaying and saving the gap between training and testing
print('LogReg Train-Test Gap   :', abs(logreg_train_score - logreg_test_score).round(4))
logreg_test_gap = abs(logreg_train_score - logreg_test_score).round(4)

LogReg Training ACCURACY: 0.7231
LogReg Testing  ACCURACY: 0.7207
LogReg Train-Test Gap   : 0.0024


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
# creating a confusion matrix
print(confusion_matrix(y_true = y_test,
                       y_pred = logreg_pred))

[[ 51 105]
 [ 31 300]]


In [20]:
# unpacking the confusion matrix
logreg_tn, \
logreg_fp, \
logreg_fn, \
logreg_tp = confusion_matrix(y_true = y_test, y_pred = logreg_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {logreg_tn}
False Positives: {logreg_fp}
False Negatives: {logreg_fn}
True Positives : {logreg_tp}
""")


True Negatives : 51
False Positives: 105
False Negatives: 31
True Positives : 300



In [21]:
# area under the roc curve (auc)
print(roc_auc_score(y_true  = y_test,
                    y_score = logreg_pred).round(decimals = 4))


# saving AUC score for future use
logreg_auc_score = roc_auc_score(y_true  = y_test,
                                 y_score = logreg_pred).round(decimals = 4)

0.6166


In [22]:
# zipping each feature name to its coefficient
logreg_model_values = zip(chef_df[candidate_dict['logit_sig']].columns,
                          logreg_fit.coef_.ravel().round(decimals = 2))


# setting up a placeholder list to store model features
logreg_model_lst = [('intercept', logreg_fit.intercept_[0].round(decimals = 2))]


# printing out each feature-coefficient pair one by one
for val in logreg_model_values:
    logreg_model_lst.append(val)
    

# checking the results
for pair in logreg_model_lst:
    print(pair)

('intercept', -0.24)
('REVENUE', -0.0)
('CONTACTS_W_CUSTOMER_SERVICE', 0.02)
('MOBILE_NUMBER', 0.54)
('CANCELLATIONS_BEFORE_NOON', 0.26)
('TASTES_AND_PREFERENCES', 0.28)
('PC_LOGINS', -0.15)
('EARLY_DELIVERIES', 0.05)
('REFRIGERATED_LOCKER', 0.29)
('MASTER_CLASSES_ATTENDED', 0.24)
('MEDIAN_MEAL_RATING', 0.01)
('domain_group_company', 0.56)
('domain_group_junk', -1.21)
('number_of_names', 0.36)


In [23]:
########################################
# display_tree
########################################
#creating a user defined function
def display_tree(tree, feature_df, height = 500, width = 800):
    """
    PARAMETERS
    ----------
    tree       : fitted tree model object
        fitted CART model to visualized
    feature_df : DataFrame
        DataFrame of explanatory features (used to generate labels)
    height     : int, default 500
        height in pixels to which to constrain image in html
    width      : int, default 800
        width in pixels to which to constrain image in html
    """

    # visualizing the tree
    dot_data = StringIO()

    
    # exporting tree to graphviz
    export_graphviz(decision_tree      = tree,
                    out_file           = dot_data,
                    filled             = True,
                    rounded            = True,
                    special_characters = True,
                    feature_names      = feature_df.columns)


    # declaring a graph object
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())


    # creating image
    img = Image(graph.create_png(),
                height = height,
                width  = width)
    
    return img

########################################
# plot_feature_importances
########################################
def plot_feature_importances(model, train, export = False):
    """
    Plots the importance of features from a CART model.
    
    PARAMETERS
    ----------
    model  : CART model
    train  : explanatory variable training data
    export : whether or not to export as a .png image, default False
    """
    
    # declaring the number
    n_features = x_train.shape[1]
    
    # setting plot window
    fig, ax = plt.subplots(figsize=(12,9))
    
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(pd.np.arange(n_features), train.columns)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    
    if export == True:
        plt.savefig('Tree_Leaf_50_Feature_Importance.png')

In [24]:
#Full Tree Model
# INSTANTIATING a classification tree object
full_tree = DecisionTreeClassifier()


# FITTING the training data
full_tree_fit = full_tree.fit(x_train, y_train)


# PREDICTING on new data
full_tree_pred = full_tree_fit.predict(x_test)


# SCORING the model
print('Full Tree Training ACCURACY:', full_tree_fit.score(x_train,
                                                     y_train).round(4))

print('Full Tree Testing ACCURACY :', full_tree_fit.score(x_test,
                                                     y_test).round(4))

print('Full Tree AUC Score:', roc_auc_score(y_true  = y_test,
                                            y_score = full_tree_pred).round(4))


# saving scoring data for future use
full_tree_train_score = full_tree_fit.score(x_train, y_train).round(4) # accuracy
full_tree_test_score  = full_tree_fit.score(x_test, y_test).round(4)   # accuracy


# saving AUC
full_tree_auc_score   = roc_auc_score(y_true  = y_test,
                                      y_score = full_tree_pred).round(4) # auc

Full Tree Training ACCURACY: 1.0
Full Tree Testing ACCURACY : 0.6283
Full Tree AUC Score: 0.5707


In [25]:
# unpacking the confusion matrix
full_tree_tn, \
full_tree_fp, \
full_tree_fn, \
full_tree_tp = confusion_matrix(y_true = y_test, y_pred = full_tree_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {full_tree_tn}
False Positives: {full_tree_fp}
False Negatives: {full_tree_fn}
True Positives : {full_tree_tp}
""")


True Negatives : 64
False Positives: 92
False Negatives: 89
True Positives : 242



In [26]:
#Pruned Tree Model
# INSTANTIATING a classification tree object
pruned_tree = DecisionTreeClassifier(max_depth = 4,
                                     min_samples_leaf = 25,
                                     random_state = 219)


# FITTING the training data
pruned_tree_fit  = pruned_tree.fit(x_train, y_train)


# PREDICTING on new data
pruned_tree_pred = pruned_tree_fit.predict(x_test)


# SCORING the model
print('Training ACCURACY:', pruned_tree_fit.score(x_train, y_train).round(4))
print('Testing  ACCURACY:', pruned_tree_fit.score(x_test, y_test).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = pruned_tree_pred).round(4))


# saving scoring data for future use
pruned_tree_train_score = pruned_tree_fit.score(x_train, y_train).round(4) # accuracy
pruned_tree_test_score  = pruned_tree_fit.score(x_test, y_test).round(4)   # accuracy


# saving auc score
pruned_tree_auc_score   = roc_auc_score(y_true  = y_test,
                                        y_score = pruned_tree_pred).round(4) # auc

Training ACCURACY: 0.7505
Testing  ACCURACY: 0.7598
AUC Score        : 0.686


In [27]:
# unpacking the confusion matrix
pruned_tree_tn, \
pruned_tree_fp, \
pruned_tree_fn, \
pruned_tree_tp = confusion_matrix(y_true = y_test, y_pred = pruned_tree_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {pruned_tree_tn}
False Positives: {pruned_tree_fp}
False Negatives: {pruned_tree_fn}
True Positives : {pruned_tree_tp}
""")


True Negatives : 75
False Positives: 81
False Negatives: 36
True Positives : 295



In [28]:
# printing and comparing results
print(f"""
Model         AUC Score      TN, FP, FN, TP
-----         ---------      --------------
Logistic      {logreg_auc_score}         {logreg_tn, logreg_fp, logreg_fn, logreg_tp}
Full Tree     {full_tree_auc_score}         {full_tree_tn, full_tree_fp, full_tree_fn, full_tree_tp}
Pruned Tree   {pruned_tree_auc_score}         {pruned_tree_tn, pruned_tree_fp, pruned_tree_fn, pruned_tree_tp}
""")


# creating a dictionary for model results
model_performance = {
    
    'Model Name'    : ['Logistic', 'Full Tree', 'Pruned Tree'],
           
    'AUC Score' : [logreg_auc_score, full_tree_auc_score, pruned_tree_auc_score],
    
    'Training Accuracy' : [logreg_train_score, full_tree_train_score,
                           pruned_tree_train_score],
           
    'Testing Accuracy'  : [logreg_test_score, full_tree_test_score,
                           pruned_tree_test_score],

    'Confusion Matrix'  : [(logreg_tn, logreg_fp, logreg_fn, logreg_tp),
                           (full_tree_tn, full_tree_fp, full_tree_fn, full_tree_tp),
                           (pruned_tree_tn, pruned_tree_fp, pruned_tree_fn, pruned_tree_tp)]}


# converting model_performance into a DataFrame
model_performance = pd.DataFrame(model_performance)


# sending model results to Excel
model_performance.to_excel('./model_results/classification_model_performance.xlsx',
                           index = False)


Model         AUC Score      TN, FP, FN, TP
-----         ---------      --------------
Logistic      0.6166         (51, 105, 31, 300)
Full Tree     0.5707         (64, 92, 89, 242)
Pruned Tree   0.686         (75, 81, 36, 295)



In [29]:
########################################
# display_tree
########################################
def display_tree(tree, feature_df, height = 500, width = 800, export = False):
    """
    PARAMETERS
    ----------
    tree       : fitted tree model object
        fitted CART model to visualized
    feature_df : DataFrame
        DataFrame of explanatory features (used to generate labels)
    height     : int, default 500
        height in pixels to which to constrain image in html
    width      : int, default 800
        width in pixels to which to constrain image in html
    export     : bool, defalut False
        whether or not to export the tree as a .png file
    """

    # visualizing the tree
    dot_data = StringIO()

    
    # exporting tree to graphviz
    export_graphviz(decision_tree      = tree,
                    out_file           = dot_data,
                    filled             = True,
                    rounded            = True,
                    special_characters = True,
                    feature_names      = feature_df.columns)


    # declaring a graph object
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())


    # creating image
    img = Image(graph.create_png(),
                height = height,
                width  = width,
                unconfined = True)


    return img

In [30]:
# train/test split with the logit_sig variables (set 1)
chef_df   =  chef_data.loc[ : , candidate_dict['logit_sig']]
chef_target =  chef_data.loc[ : , 'CROSS_SELL_SUCCESS']


# train/test split
x_train, x_test, y_train, y_test = train_test_split(
            chef_df,
            chef_target,
            random_state = 219,
            test_size    = 0.25,
            stratify     = chef_target)

In [31]:
# INSTANTIATING a logistic regression model with default values
lr_default = LogisticRegression(solver = 'lbfgs',
                                C = 1.0,
                                warm_start = False,
                                random_state = 219)

In [32]:
# FITTING the training data
lr_default_fit = lr_default.fit(x_train, y_train)


# PREDICTING based on the testing set
lr_default_pred = lr_default_fit.predict(x_test)


# SCORING the results
print('Training ACCURACY:', lr_default_fit.score(x_train, y_train).round(4))
print('Testing  ACCURACY:', lr_default_fit.score(x_test, y_test).round(4))


# SCORING with AUC
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = lr_default_pred).round(4))


# saving scoring data for future use
logreg_train_score = lr_default_fit.score(x_train, y_train).round(4) # accuracy
logreg_test_score  = lr_default_fit.score(x_test, y_test).round(4)   # accuracy


# saving AUC score
logreg_auc_score = roc_auc_score(y_true  = y_test,
                                 y_score = lr_default_pred).round(4)

Training ACCURACY: 0.7231
Testing  ACCURACY: 0.7207
AUC Score        : 0.6166


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [33]:
########################################
# RandomizedSearchCV
########################################

# declaring a hyperparameter space
#C_space          = pd.np.arange(0.1, 5.0, 0.1)
#warm_start_space = [True, False]
#solver_space     = ['newton-cg', 'sag', 'lbfgs']


# creating a hyperparameter grid
#param_grid = {'C'          : C_space,
 #             'warm_start' : warm_start_space,
  #            'solver'     : solver_space}


# INSTANTIATING the model object without hyperparameters
#lr_tuned = LogisticRegression(random_state = 219,
 #                             max_iter     = 1000)


# GridSearchCV object
#lr_tuned_cv = RandomizedSearchCV(estimator           = lr_tuned,   # the model object
 #                                param_distributions = param_grid, # parameters to tune
  #                               cv                  = 3,          # how many folds in cross-validation
   #                              n_iter              = 250,        # number of combinations of hyperparameters to try
    #                             random_state        = 219,        # starting point for random sequence
     #                            scoring = make_scorer(
      #                                     roc_auc_score,
       #                                    needs_threshold = False)) # scoring criteria (AUC)


# FITTING to the FULL DATASET (due to cross-validation)
#lr_tuned_cv.fit(chef_df, chef_target)


# printing the optimal parameters and best score
#print("Tuned Parameters  :", lr_tuned_cv.best_params_)
#print("Tuned CV AUC      :", lr_tuned_cv.best_score_.round(4))

In [34]:
# checking the best estimator for the model
#lr_tuned_cv.best_estimator_ 

In [35]:
#Tuned Logistic Regression
# building a model based on hyperparameter tuning results

# copy/pasting in the best_estimator_ results
# to avoid running another RandomizedSearch

# INSTANTIATING a logistic regression model with tuned values
lr_tuned = LogisticRegression(C=1.2000000000000002, max_iter=1000, random_state=219,
                   solver='newton-cg', warm_start=True)


# FITTING the model object
lr_tuned_fit = lr_tuned.fit(chef_df, chef_target)


# PREDICTING based on the testing set
lr_tuned_pred = lr_tuned.predict(x_test)


# SCORING the results
print('LR Tuned Training ACCURACY:', lr_tuned.score(x_train, y_train).round(4))
print('LR Tuned Testing  ACCURACY:', lr_tuned.score(x_test, y_test).round(4))
print('LR Tuned AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = lr_tuned_pred).round(4))


# saving scoring data for future use
lr_tuned_train_score = lr_tuned.score(x_train, y_train).round(4) # accuracy
lr_tuned_test_score  = lr_tuned.score(x_test, y_test).round(4)   # accuracy


# saving the AUC score
lr_tuned_auc         = roc_auc_score(y_true  = y_test,
                                     y_score = lr_tuned_pred).round(4) # auc

LR Tuned Training ACCURACY: 0.7457
LR Tuned Testing  ACCURACY: 0.7392
LR Tuned AUC Score        : 0.6421


In [36]:
# unpacking the confusion matrix
lr_tuned_tn, \
lr_tuned_fp, \
lr_tuned_fn, \
lr_tuned_tp = confusion_matrix(y_true = y_test, y_pred = lr_tuned_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {lr_tuned_tn}
False Positives: {lr_tuned_fp}
False Negatives: {lr_tuned_fn}
True Positives : {lr_tuned_tp}
""")


True Negatives : 58
False Positives: 98
False Negatives: 29
True Positives : 302



In [37]:
# loading model performance
model_performance = pd.read_excel('./model_results/classification_model_performance.xlsx')


# declaring model performance objects
lr_train_acc = lr_tuned.score(x_train, y_train).round(4)
lr_test_acc  = lr_tuned.score(x_test, y_test).round(4)
lr_auc       = roc_auc_score(y_true  = y_test,
                             y_score = lr_tuned_pred).round(4)


# appending to model_performance
model_performance = model_performance.append(
                          {'Model Name'        : 'Tuned LR',
                           'Training Accuracy' : lr_train_acc,
                           'Testing Accuracy'  : lr_test_acc,
                           'AUC Score'         : lr_auc,
                           'Confusion Matrix'  : (lr_tuned_tn,
                                                  lr_tuned_fp,
                                                  lr_tuned_fn,
                                                  lr_tuned_tp)},
                           ignore_index = True)


# checking the results
model_performance

Unnamed: 0,Model Name,AUC Score,Training Accuracy,Testing Accuracy,Confusion Matrix
0,Logistic,0.6166,0.7231,0.7207,"(51, 105, 31, 300)"
1,Full Tree,0.5707,1.0,0.6283,"(64, 92, 89, 242)"
2,Pruned Tree,0.686,0.7505,0.7598,"(75, 81, 36, 295)"
3,Tuned LR,0.6421,0.7457,0.7392,"(58, 98, 29, 302)"


In [38]:
# declaring a hyperparameter space
#criterion_space = ['gini', 'entropy']
#splitter_space  = ['best', 'random']
#depth_space     = pd.np.arange(1, 25, 1)
#leaf_space      = pd.np.arange(1, 100, 1)


# creating a hyperparameter grid
#param_grid = {'criterion'        : criterion_space,
 #             'splitter'         : splitter_space,
  #            'max_depth'        : depth_space,
   #           'min_samples_leaf' : leaf_space}


# INSTANTIATING the model object without hyperparameters
#tuned_tree = DecisionTreeClassifier(random_state = 219)


# RandomizedSearchCV object
#tuned_tree_cv = RandomizedSearchCV(estimator             = tuned_tree,
 #                                  param_distributions   = param_grid,
  #                                 cv                    = 3,
   #                                n_iter                = 1000,
    #                               random_state          = 219,
     #                              scoring = make_scorer(roc_auc_score,
      #                                       needs_threshold = False))


# FITTING to the FULL DATASET (due to cross-validation)
#tuned_tree_cv.fit(chef_df, chef_target)

# printing the optimal parameters and best score
#print("Tuned Parameters  :", tuned_tree_cv.best_params_)
#print("Tuned Training AUC:", tuned_tree_cv.best_score_.round(4))

In [151]:
# checking the best estimator for the model
#tuned_tree_cv.best_estimator_

DecisionTreeClassifier(max_depth=3, min_samples_leaf=16, random_state=219)

In [39]:
#Tuned Tree Model
# building a model based on hyperparameter tuning results

# copy/pasting in the best_estimator_ results
# to avoid running another RandomizedSearch

# INSTANTIATING a logistic regression model with tuned values
tree_tuned = DecisionTreeClassifier(max_depth=3, min_samples_leaf=16, random_state=219)


# FITTING the model object
tree_tuned_fit = tree_tuned.fit(chef_df, chef_target)


# PREDICTING based on the testing set
tree_tuned_pred = tree_tuned.predict(x_test)


# SCORING the results
print('Training ACCURACY:', tree_tuned.score(x_train, y_train).round(4))
print('Testing  ACCURACY:', tree_tuned.score(x_test, y_test).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = tree_tuned_pred).round(4))


# saving scoring data for future use
tree_tuned_train_score = tree_tuned.score(x_train, y_train).round(4) # accuracy
tree_tuned_test_score  = tree_tuned.score(x_test, y_test).round(4)   # accuracy


# saving the AUC score
tree_tuned_auc         = roc_auc_score(y_true  = y_test,
                                     y_score = tree_tuned_pred).round(4) # auc

Training ACCURACY: 0.7402
Testing  ACCURACY: 0.7762
AUC Score        : 0.732


In [40]:
# unpacking the confusion matrix
tuned_tree_tn, \
tuned_tree_fp, \
tuned_tree_fn, \
tuned_tree_tp = confusion_matrix(y_true = y_test, y_pred = tree_tuned_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {tuned_tree_tn}
False Positives: {tuned_tree_fp}
False Negatives: {tuned_tree_fn}
True Positives : {tuned_tree_tp}
""")


True Negatives : 95
False Positives: 61
False Negatives: 48
True Positives : 283



In [41]:
# declaring model performance objects
tree_train_acc = tree_tuned.score(x_train, y_train).round(4)
tree_test_acc  = tree_tuned.score(x_test, y_test).round(4)
tree_auc       = roc_auc_score(y_true  = y_test,
                              y_score = tree_tuned_pred).round(4)


# appending to model_performance
model_performance = model_performance.append(
                          {'Model Name'        : 'Tuned Tree',
                           'Training Accuracy' : tree_train_acc,
                           'Testing Accuracy'  : tree_test_acc,
                           'AUC Score'         : tree_auc,
                           'Confusion Matrix'  : (tuned_tree_tn,
                                                  tuned_tree_fp,
                                                  tuned_tree_fn,
                                                  tuned_tree_tp)},
                           ignore_index = True)


# checking the results
model_performance

Unnamed: 0,Model Name,AUC Score,Training Accuracy,Testing Accuracy,Confusion Matrix
0,Logistic,0.6166,0.7231,0.7207,"(51, 105, 31, 300)"
1,Full Tree,0.5707,1.0,0.6283,"(64, 92, 89, 242)"
2,Pruned Tree,0.686,0.7505,0.7598,"(75, 81, 36, 295)"
3,Tuned LR,0.6421,0.7457,0.7392,"(58, 98, 29, 302)"
4,Tuned Tree,0.732,0.7402,0.7762,"(95, 61, 48, 283)"


In [42]:
########################################
# plot_feature_importances
########################################
def plot_feature_importances(model, train, export = False):
    """
    Plots the importance of features from a CART model.
    
    PARAMETERS
    ----------
    model  : CART model
    train  : explanatory variable training data
    export : whether or not to export as a .png image, default False
    """
    
    # declaring the number
    n_features = train.shape[1]
    
    # setting plot window
    fig, ax = plt.subplots(figsize=(12,9))
    
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(pd.np.arange(n_features), train.columns)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    
    if export == True:
        plt.savefig('./analysis_images/Feature_Importance.png')

In [43]:
# train/test split with the logit_full variables (set)
chef_df   =  chef_data.loc[ : , candidate_dict['logit_full']]
chef_target =  chef_data.loc[ : , 'CROSS_SELL_SUCCESS']


# train/test split
x_train, x_test, y_train, y_test = train_test_split(
            chef_df,
            chef_target,
            random_state = 219,
            test_size    = 0.25,
            stratify     = chef_target)

In [44]:
#Random Forest Model
# INSTANTIATING a random forest model with default values
rf_default = RandomForestClassifier(n_estimators     = 100,
                                    criterion        = 'gini',
                                    max_depth        = None,
                                    min_samples_leaf = 1,
                                    bootstrap        = True,
                                    warm_start       = False,
                                    random_state     = 219)

In [45]:
# FITTING the training data
rf_default_fit = rf_default.fit(x_train, y_train)


# PREDICTING based on the testing set
rf_default_fit_pred = rf_default_fit.predict(x_test)


# SCORING the results
print('Training ACCURACY:', rf_default_fit.score(x_train, y_train).round(4))
print('Testing  ACCURACY:', rf_default_fit.score(x_test, y_test).round(4))


# saving AUC score
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = rf_default_fit_pred).round(4))

Training ACCURACY: 1.0
Testing  ACCURACY: 0.7392
AUC Score        : 0.6455


In [46]:
# unpacking the confusion matrix
rf_tn, \
rf_fp, \
rf_fn, \
rf_tp = confusion_matrix(y_true = y_test, y_pred = rf_default_fit_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {rf_tn}
False Positives: {rf_fp}
False Negatives: {rf_fn}
True Positives : {rf_tp}
""")


True Negatives : 60
False Positives: 96
False Negatives: 31
True Positives : 300



In [47]:
# declaring model performance objects
rf_train_acc = rf_default_fit.score(x_train, y_train).round(4)
rf_test_acc  = rf_default_fit.score(x_test, y_test).round(4)
rf_auc       = roc_auc_score(y_true  = y_test,
                             y_score = rf_default_fit_pred).round(4)


# appending to model_performance
model_performance = model_performance.append(
                          {'Model Name'         : 'Random Forest (Full)',
                           'Training Accuracy'  : rf_train_acc,
                           'Testing Accuracy'   : rf_test_acc,
                           'AUC Score'          : rf_auc,
                           'Confusion Matrix'   : (rf_tn,
                                                   rf_fp,
                                                   rf_fn,
                                                   rf_tp)},
                          ignore_index = True)


# checking the results
model_performance

Unnamed: 0,Model Name,AUC Score,Training Accuracy,Testing Accuracy,Confusion Matrix
0,Logistic,0.6166,0.7231,0.7207,"(51, 105, 31, 300)"
1,Full Tree,0.5707,1.0,0.6283,"(64, 92, 89, 242)"
2,Pruned Tree,0.686,0.7505,0.7598,"(75, 81, 36, 295)"
3,Tuned LR,0.6421,0.7457,0.7392,"(58, 98, 29, 302)"
4,Tuned Tree,0.732,0.7402,0.7762,"(95, 61, 48, 283)"
5,Random Forest (Full),0.6455,1.0,0.7392,"(60, 96, 31, 300)"


In [83]:
# FITTING the training data
#rf_default_fit = rf_default.fit(x_train, y_train)


# PREDICTING based on the testing set
#rf_default_fit_pred = rf_default_fit.predict(x_test)


# declaring a hyperparameter space
#estimator_space  = pd.np.arange(600, 1100, 250)
#leaf_space       = pd.np.arange(1, 30, 1)
#depth_space      = pd.np.arange(1, 8, 1)
#criterion_space  = ['gini', 'entropy']
#bootstrap_space  = [True, False]
#warm_start_space = [True, False]


# creating a hyperparameter grid
#param_grid = {'n_estimators'     : estimator_space,
 #             'min_samples_leaf' : leaf_space,
  #            'max_depth'        : depth_space,
   #           'criterion'        : criterion_space,
    #          'bootstrap'        : bootstrap_space,
     #         'warm_start'       : warm_start_space}


# INSTANTIATING the model object without hyperparameters
#forest_grid = RandomForestClassifier(random_state = 219)


# GridSearchCV object
#forest_cv = RandomizedSearchCV(estimator           = forest_grid,
 #                              param_distributions = param_grid,
  #                             cv         = 3,
   #                            n_iter     = 1000,
    #                           scoring    = make_scorer(roc_auc_score,
     #                                       needs_threshold = False))


# FITTING to the FULL DATASET (due to cross-validation)
#forest_cv.fit(chef_df, chef_target)

# printing the optimal parameters and best score
#print("Tuned Parameters  :", forest_cv.best_params_)
#print("Tuned Training AUC:", forest_cv.best_score_.round(4))

  estimator_space  = pd.np.arange(600, 1100, 250)
  leaf_space       = pd.np.arange(1, 30, 1)
  depth_space      = pd.np.arange(1, 8, 1)


Tuned Parameters  : {'warm_start': True, 'n_estimators': 850, 'min_samples_leaf': 3, 'max_depth': 7, 'criterion': 'entropy', 'bootstrap': False}
Tuned Training AUC: 0.63


In [138]:
# checking the best estimator for the model
#forest_cv.best_estimator_

RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=7,
                       min_samples_leaf=3, n_estimators=850, random_state=219,
                       warm_start=True)

In [48]:
#Tuned Random Forest
# building a model based on hyperparameter tuning results

# copy/pasting in the best_estimator_ results
# to avoid running another RandomizedSearch
forest_tuned = RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=7,
                       min_samples_leaf=3, n_estimators=850, random_state=219,
                       warm_start=True)

# FITTING the model object
forest_tuned_fit = forest_tuned.fit(chef_df, chef_target)


# PREDICTING based on the testing set
forest_tuned_pred = forest_tuned_fit.predict(x_test)


# SCORING the results
print('Forest Tuned Training ACCURACY:', forest_tuned.score(x_train, y_train).round(4))
print('Forest Tuned Testing  ACCURACY:', forest_tuned.score(x_test, y_test).round(4))
print('Forest Tuned AUC Score        :', roc_auc_score(y_true  = y_test,
                                                   y_score = forest_tuned_pred).round(4))


# saving scoring data for future use
forest_tuned_train_score = forest_tuned.score(x_train, y_train).round(4) # accuracy
forest_tuned_test_score  = forest_tuned.score(x_test, y_test).round(4)   # accuracy


# saving the AUC score
forest_tuned_auc = roc_auc_score(y_true  = y_test,
                                 y_score = forest_tuned_pred).round(4) # auc

Forest Tuned Training ACCURACY: 0.8307
Forest Tuned Testing  ACCURACY: 0.8337
Forest Tuned AUC Score        : 0.7506


In [49]:
# unpacking the confusion matrix
tuned_rf_tn, \
tuned_rf_fp, \
tuned_rf_fn, \
tuned_rf_tp = confusion_matrix(y_true = y_test, y_pred = forest_tuned_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {tuned_rf_tn}
False Positives: {tuned_rf_fp}
False Negatives: {tuned_rf_fn}
True Positives : {tuned_rf_tp}
""")


True Negatives : 81
False Positives: 75
False Negatives: 6
True Positives : 325



In [50]:
# declaring model performance objects
tuned_rf_train_acc = forest_tuned_fit.score(x_train, y_train).round(4)
tuned_rf_test_acc  = forest_tuned_fit.score(x_test, y_test).round(4)
tuned_rf_auc       = roc_auc_score(y_true  = y_test,
                                   y_score = forest_tuned_pred).round(4)


# appending to model_performance
model_performance = model_performance.append(
                          {'Model Name'         : 'Tuned Random Forest (Full)',
                           'Training Accuracy'  : tuned_rf_train_acc,
                           'Testing Accuracy'   : tuned_rf_test_acc,
                           'AUC Score'          : tuned_rf_auc,
                           'Confusion Matrix'   : (tuned_rf_tn,
                                                   tuned_rf_fp,
                                                   tuned_rf_fn,
                                                   tuned_rf_tp)},
                          ignore_index = True)


# checking the results
model_performance

Unnamed: 0,Model Name,AUC Score,Training Accuracy,Testing Accuracy,Confusion Matrix
0,Logistic,0.6166,0.7231,0.7207,"(51, 105, 31, 300)"
1,Full Tree,0.5707,1.0,0.6283,"(64, 92, 89, 242)"
2,Pruned Tree,0.686,0.7505,0.7598,"(75, 81, 36, 295)"
3,Tuned LR,0.6421,0.7457,0.7392,"(58, 98, 29, 302)"
4,Tuned Tree,0.732,0.7402,0.7762,"(95, 61, 48, 283)"
5,Random Forest (Full),0.6455,1.0,0.7392,"(60, 96, 31, 300)"
6,Tuned Random Forest (Full),0.7506,0.8307,0.8337,"(81, 75, 6, 325)"


In [51]:
print("Tuned Random Forest (Full) is the best model with the AUC score of 0.7506 and the train-test gap of 0.003 ")
#sorting basis AUC score so that the highest score is at the top
model_performance.sort_values(by = 'AUC Score',
                              ascending = False)

Tuned Random Forest (Full) is the best model with the AUC score of 0.7506 and the train-test gap of 0.003 


Unnamed: 0,Model Name,AUC Score,Training Accuracy,Testing Accuracy,Confusion Matrix
6,Tuned Random Forest (Full),0.7506,0.8307,0.8337,"(81, 75, 6, 325)"
4,Tuned Tree,0.732,0.7402,0.7762,"(95, 61, 48, 283)"
2,Pruned Tree,0.686,0.7505,0.7598,"(75, 81, 36, 295)"
5,Random Forest (Full),0.6455,1.0,0.7392,"(60, 96, 31, 300)"
3,Tuned LR,0.6421,0.7457,0.7392,"(58, 98, 29, 302)"
0,Logistic,0.6166,0.7231,0.7207,"(51, 105, 31, 300)"
1,Full Tree,0.5707,1.0,0.6283,"(64, 92, 89, 242)"
