***
***
***


<br><strong>Assignment 1 | Regression-Based Analysis</strong><br>
<strong>FINAL MODEL</strong>
<br>
Machine Learning | Cohort 3
<br>
Linh Le<br>
<br>

***
***
***

<br>
The purpose of this model is to predict the expected revenue from each Apprentice Chef customer within their first year of orders. <br><br>

***

In [2]:
# importing necessary libraries
import pandas as pd                                     # data science essentials
import numpy as np              
import seaborn as sns                                   # essential graphical output
import matplotlib.pyplot as plt                         # enhanced graphical output
import statsmodels.formula.api as smf                   # regression modeling
from sklearn.model_selection import train_test_split    # train/test split
import sklearn.linear_model                             # linear models (scikit-learn)
from sklearn.ensemble import GradientBoostingRegressor  # Gradient Boost Regressor model


# setting pandas print options (wide enough to show every row and column)
display_options = {'display.max_rows'    : 500,
                   'display.max_columns' : 500,
                   'display.width'       : 1000}

for option, value in display_options.items():
    pd.set_option(option, value)


# specifying file name
file = "Apprentice_Chef_Dataset.xlsx"


# reading the Excel workbook into a DataFrame
chef = pd.read_excel(io = file)

##############################################################################
##### Imputing Missing Values #####
##############################################################################

# Flagging missing values: every column that contains at least one null
# gets a companion 'm_<column>' column holding 1 where the value is
# missing and 0 where it is present
null_mask = chef.isnull()

for col in null_mask.columns[null_mask.any()]:
    chef['m_' + col] = null_mask[col].astype(int)


# Imputation value (soft-coded so it can be changed in one place)
fill = "Unknown"

# Imputing 'FAMILY_NAME'
chef['FAMILY_NAME'] = chef['FAMILY_NAME'].fillna(value = fill)


##############################################################################
##### Outlier Thresholds #####
##############################################################################

# Outlier thresholds, chosen by inspecting the histogram of each variable.
# Variables with both bounds are assigned as a (lo, hi) pair.

# upper bound only
total_meals_hi      = 180      # data starts to become skewed after this point
avg_site_time_hi    = 175      # few customers spend over 175 seconds on the website
canc_before_noon_hi = 5        # data skews after 5
weekly_plan_hi      = 14       # sharp drop after 14
early_del_hi        = 4        # sharp drop after 4
late_del_hi         = 7        # data skewed after this point
master_class_hi     = 1        # more than 1 class is an outlier

# lower and upper bound
unique_meals_lo,    unique_meals_hi    = 1.5, 9    # nothing below 1.5; sharp drop at 10
contacts_cust_lo,   contacts_cust_hi   = 3, 10     # drop below 3; uncharacteristic increase after 10
prod_viewed_lo,     prod_viewed_hi     = 1, 10     # no data points below 1 or above 10
canc_after_noon_lo, canc_after_noon_hi = 1, 2      # very few data points below 1 or after 2
pc_logins_lo,       pc_logins_hi       = 5, 6      # very few data points below 5 or above 6
mobile_logins_lo,   mobile_logins_hi   = 1, 2      # very few data points below 1 or above 2
avg_prep_time_lo,   avg_prep_time_hi   = 80, 230   # few points below 80; data skews after 230
largest_order_lo,   largest_order_hi   = 2, 7      # sharp drop below 2 and after 7
median_rating_lo,   median_rating_hi   = 2, 4      # few points below 2; very few after 4
avg_clicks_lo,      avg_clicks_hi      = 8, 17.5   # few points below 8 or above 17.5

# response variable
revenue_lo, revenue_hi = 500, 2500                 # few values below 500; small rise after 2500


# Developing features (columns) for outliers based on previously-defined thresholds

def _flag_outliers(values, lo = None, hi = None):
    """
    Return a 0/1 integer Series marking the observations outside the bounds.

    Parameters
    ----------
    values : pandas Series of observations to check
    lo     : lower threshold; values strictly below it are flagged
             (None means there is no lower bound)
    hi     : upper threshold; values strictly above it are flagged
             (None means there is no upper bound)

    Returns
    -------
    pandas Series of integers on the same index as `values`:
    1 for an outlier, 0 otherwise. Missing values compare as False and
    are therefore never flagged.
    """
    flagged = pd.Series(False, index = values.index)

    if hi is not None:
        flagged = flagged | (values > hi)

    if lo is not None:
        flagged = flagged | (values < lo)

    return flagged.astype(int)


# One rule per engineered column:
# (flag column, source column, lower threshold, upper threshold)
outlier_rules = [
    ('out_total_meals',       'TOTAL_MEALS_ORDERED',         None,               total_meals_hi),
    ('out_unique_meals',      'UNIQUE_MEALS_PURCH',          unique_meals_lo,    unique_meals_hi),
    ('out_contacts_cust',     'CONTACTS_W_CUSTOMER_SERVICE', contacts_cust_lo,   contacts_cust_hi),
    ('out_prod_viewed',       'PRODUCT_CATEGORIES_VIEWED',   prod_viewed_lo,     prod_viewed_hi),
    ('out_avg_site_time',     'AVG_TIME_PER_SITE_VISIT',     None,               avg_site_time_hi),
    ('out_canc_before_noon',  'CANCELLATIONS_BEFORE_NOON',   None,               canc_before_noon_hi),

    # the lower bound is canc_after_noon_lo (not the upper threshold)
    ('out_canc_after_noon',   'CANCELLATIONS_AFTER_NOON',    canc_after_noon_lo, canc_after_noon_hi),

    ('out_pc_logins',         'PC_LOGINS',                   pc_logins_lo,       pc_logins_hi),
    ('out_mobile_logins',     'MOBILE_LOGINS',               mobile_logins_lo,   mobile_logins_hi),
    ('out_weekly_plan',       'WEEKLY_PLAN',                 None,               weekly_plan_hi),
    ('out_early_deliveries',  'EARLY_DELIVERIES',            None,               early_del_hi),
    ('out_late_deliveries',   'LATE_DELIVERIES',             None,               late_del_hi),
    ('out_avg_prep_vid_time', 'AVG_PREP_VID_TIME',           avg_prep_time_lo,   avg_prep_time_hi),
    ('out_largest_order',     'LARGEST_ORDER_SIZE',          largest_order_lo,   largest_order_hi),
    ('out_master_classes',    'MASTER_CLASSES_ATTENDED',     None,               master_class_hi),
    ('out_median_rating',     'MEDIAN_MEAL_RATING',          median_rating_lo,   median_rating_hi),
    ('out_avg_clicks',        'AVG_CLICKS_PER_VISIT',        avg_clicks_lo,      avg_clicks_hi),

    #####

    # Revenue (response variable)
    ('out_revenue',           'REVENUE',                     revenue_lo,         revenue_hi),
]


# Building each flag directly from the comparison, so the result does not
# depend on in-place replacement through a chained column selection
for flag_col, source_col, lo, hi in outlier_rules:
    chef[flag_col] = _flag_outliers(chef[source_col], lo = lo, hi = hi)


##############################################################################
##### Trend-Based Features #####
##############################################################################

# Thresholds at the points where each variable's trend changes

# zero-inflated variables: the change point is at zero
unique_meals_purch_change = canc_before_noon_change = canc_after_noon_change = 0
weekly_plan_change = early_deliveries_change = late_deliveries_change = 0
followed_recommendations_change = master_classes_attended_change = 0
total_photos_viewed_change = 0

# variables whose values scatter more beyond the threshold
total_meals_ordered_change = 150
avg_time_site_visit_change = 200
avg_prep_vid_time_change   = 280          # scatter begins after a steady increase

# variables with a sudden drop or jump at the threshold
contacts_cust_change             = 10     # sudden, significant drop after this point
product_categories_viewed_change = 5      # sudden increase at 6
pc_logins_change                 = 6      # sudden drop after this point
median_meal_rating_change        = 4      # sudden drop here

# variables whose upward trend reverses at the threshold
largest_order_size_change   = 5
avg_clicks_per_visit_change = 10

# mobile logins: no trend threshold is active (disabled in the original analysis)
#mobile_logins_change = 1


# Developing trend-based features
#
# Each 'change_*' column is 1 where the source variable is strictly above
# its trend-change threshold and 0 otherwise. The flags are computed
# directly from the comparison instead of replacing values in place through
# a chained column selection.

# (flag column, source column, threshold)
trend_rules = [
    ('change_Total_Meals_Ordered',       'TOTAL_MEALS_ORDERED',          total_meals_ordered_change),
    ('change_Unique_Meals_Purch',        'UNIQUE_MEALS_PURCH',           unique_meals_purch_change),
    ('change_Contacts_Customer_Service', 'CONTACTS_W_CUSTOMER_SERVICE',  contacts_cust_change),
    ('change_Product_Categories_Viewed', 'PRODUCT_CATEGORIES_VIEWED',    product_categories_viewed_change),
    ('change_Avg_Time_Site_Visit',       'AVG_TIME_PER_SITE_VISIT',      avg_time_site_visit_change),
    ('change_Canc_Before_Noon',          'CANCELLATIONS_BEFORE_NOON',    canc_before_noon_change),
    ('change_Canc_After_Noon',           'CANCELLATIONS_AFTER_NOON',     canc_after_noon_change),
    ('change_PC_Logins',                 'PC_LOGINS',                    pc_logins_change),

    # no 'change_Mobile_Logins' feature: it is intentionally left disabled

    ('change_Weekly_Plan',               'WEEKLY_PLAN',                  weekly_plan_change),
    ('change_Early_Deliveries',          'EARLY_DELIVERIES',             early_deliveries_change),
    ('change_Late_Deliveries',           'LATE_DELIVERIES',              late_deliveries_change),
    ('change_Followed_Recommendations',  'FOLLOWED_RECOMMENDATIONS_PCT', followed_recommendations_change),
    ('change_Avg_Prep_Vid_Time',         'AVG_PREP_VID_TIME',            avg_prep_vid_time_change),
    ('change_Largest_Order_Size',        'LARGEST_ORDER_SIZE',           largest_order_size_change),
    ('change_Master_Classes_Attended',   'MASTER_CLASSES_ATTENDED',      master_classes_attended_change),
    ('change_Median_Meal_Rating',        'MEDIAN_MEAL_RATING',           median_meal_rating_change),
    ('change_Avg_Clicks_Per_Visit',      'AVG_CLICKS_PER_VISIT',         avg_clicks_per_visit_change),
    ('change_Total_Photos_Viewed',       'TOTAL_PHOTOS_VIEWED',          total_photos_viewed_change),
]

for flag_col, source_col, threshold in trend_rules:

    # missing values compare as False and therefore stay 0
    chef[flag_col] = (chef[source_col] > threshold).astype(int)


##############################################################################
##### One-Hot Encoding Categorical Variables #####
##############################################################################

# Splitting emails

# Splitting every address at the first '@' into two columns
# (n = 1 guarantees exactly one split, so an address that contains a second
#  '@' cannot produce a third column and break the renaming below)
email_df = chef['EMAIL'].str.split('@', n = 1, expand = True)

# Renaming columns 0 and 1 to "name" and "email_domain"
email_df.columns = ['name', 'email_domain']


# Adding email_domain to the chef DataFrame
# email_df shares chef's index, so the values line up row by row, and
# re-running this step overwrites the column instead of duplicating it
chef['email_domain'] = email_df['email_domain']


# Aggregating domains into higher-level categories

# Email domain types
professional_email_domains = ['@mmm.com',
                              '@amex.com',
                              '@apple.com',
                              '@boeing.com',
                              '@caterpillar.com',
                              '@chevron.com',
                              '@cisco.com',
                              '@cocacola.com',
                              '@disney.com',
                              '@dupont.com',
                              '@exxon.com',
                              '@ge.org',
                              '@goldmansacs.com',
                              '@homedepot.com',
                              '@ibm.com',
                              '@intel.com',
                              '@jnj.com',
                              '@jpmorgan.com',
                              '@mcdonalds.com',
                              '@merck.com',
                              '@microsoft.com',
                              '@nike.com',
                              '@pfizer.com',
                              '@pg.com',
                              '@travelers.com',
                              '@unitedtech.com',
                              '@unitedhealth.com',
                              '@verizon.com',
                              '@visa.com',
                              '@walmart.com']


personal_email_domains = ['@gmail.com',
                          '@yahoo.com',
                          '@protonmail.com']


junk_email_domains  = ['@me.com',
                       '@aol.com',
                       '@hotmail.com',
                       '@live.com',
                       '@msn.com',
                       '@passport.com']


# Lookup table from '@domain' to its group
# setdefault keeps the first group a domain is listed under, which matches
# the professional -> personal -> junk order of precedence
domain_to_group = {}

for group_name, group_domains in [('professional', professional_email_domains),
                                  ('personal',     personal_email_domains),
                                  ('junk',         junk_email_domains)]:

    for listed_domain in group_domains:
        domain_to_group.setdefault(listed_domain, group_name)


# Grouping observations by domain type
# map() returns exactly one label per row; a domain that is in none of the
# lists becomes a missing value on its own row, so the remaining labels
# cannot shift onto the wrong customers
chef['domain_group'] = ('@' + chef['email_domain']).map(domain_to_group)


# Reporting domains that could not be classified
unknown_domains = chef.loc[chef['domain_group'].isnull(), 'email_domain']

if not unknown_domains.empty:
    print('Unknown email domain(s):', unknown_domains.unique().tolist())


# One hot encoding emails: one indicator column per domain group
one_hot_domain_group = pd.get_dummies(data = chef['domain_group'])

# Appending the indicator columns to the dataset (matched on the row index)
chef = chef.join(one_hot_domain_group)

# Copy of the dataset without the categorical email columns that have
# now been encoded
chef_dropped = chef.drop(columns = ['EMAIL', 'email_domain', 'domain_group'])


##############################################################################
##### OLS Regression Model #####
##############################################################################

# Log transformation on Revenue
# (natural log; the OLS model below is fit on this transformed response)
chef['log_revenue'] = np.log(chef['REVENUE'])

# Instantiating a model object
# The formula regresses log_revenue on 25 terms: 9 original variables,
# 4 outlier flags, 9 trend flags and the 3 email-domain indicator columns.
# Each term is written as a chef['...'] expression, so these expressions
# are what appear as labels in the printed summary.
# NOTE(review): the summary output saved with this notebook reports
# "Df Model: 23" for these 25 terms, which suggests some predictors are
# linearly redundant. Likely candidates are the three domain indicators
# entered together alongside an intercept - TODO confirm and consider
# dropping one of them.
lm_best = smf.ols(formula = """chef['log_revenue'] ~ chef['TOTAL_MEALS_ORDERED'] + 
                                               chef['UNIQUE_MEALS_PURCH'] +
                                               chef['CONTACTS_W_CUSTOMER_SERVICE'] + 
                                               chef['AVG_TIME_PER_SITE_VISIT'] + 
                                               chef['REFRIGERATED_LOCKER'] + 
                                               chef['AVG_PREP_VID_TIME'] + 
                                               chef['LARGEST_ORDER_SIZE'] + 
                                               chef['MASTER_CLASSES_ATTENDED'] +
                                               chef['MEDIAN_MEAL_RATING'] + 
                                               chef['out_unique_meals'] +
                                               chef['out_largest_order'] +
                                               chef['out_master_classes'] +
                                               chef['out_median_rating'] +
                                               chef['change_Unique_Meals_Purch'] +
                                               chef['change_Contacts_Customer_Service'] +
                                               chef['change_Canc_After_Noon'] +
                                               chef['change_Weekly_Plan'] +
                                               chef['change_Avg_Prep_Vid_Time'] +
                                               chef['change_Master_Classes_Attended'] +
                                               chef['change_Median_Meal_Rating'] +
                                               chef['change_Avg_Clicks_Per_Visit'] +
                                               chef['change_Total_Photos_Viewed'] +
                                               chef['professional'] +
                                               chef['personal'] +
                                               chef['junk']""",
                                               data = chef)


# Fitting the model based on the data
# (uses every row of chef; no train/test split is applied to this model)
results_lm = lm_best.fit()


# Printing the results
print(results_lm.summary())


##############################################################################
##### FINAL MODEL: Gradient Boost Regressor #####
##############################################################################

# Preparing explanatory variable data
# Every column of chef that is not listed below is used as a predictor
chef_explanatory   = chef.drop(['REVENUE',                    # Response variable
                               'log_revenue',                 # Transformed response variable
                               'out_revenue',                 # Revenue outliers - this was engineered
                               'NAME',                        # Dropping string objects in the dataset
                               'EMAIL',
                               'email_domain', 
                               'domain_group',
                               'FIRST_NAME',
                               'FAMILY_NAME',
                               'TOTAL_PHOTOS_VIEWED',         #  Dropping insignificant variables in the dataset according to p-value 
                               'MOBILE_LOGINS',
                               'out_pc_logins',
                               'change_PC_Logins',
                               'change_Avg_Time_Site_Visit',
                               'change_Late_Deliveries',
                               'out_canc_before_noon',
                               'CANCELLATIONS_BEFORE_NOON',
                               'PRODUCT_CATEGORIES_VIEWED',
                               'out_avg_clicks',
                               'change_Canc_Before_Noon',
                               'PACKAGE_LOCKER',
                               'PC_LOGINS',
                               'change_Total_Meals_Ordered',
                               'out_prod_viewed',
                               'out_early_deliveries',
                               'change_Early_Deliveries',
                               'change_Largest_Order_Size',
                               'TASTES_AND_PREFERENCES',
                               'WEEKLY_PLAN',
                               'MOBILE_NUMBER',
                               'LATE_DELIVERIES',
                               'out_mobile_logins',
                               'out_late_deliveries',
                               'out_avg_prep_vid_time',
                               'EARLY_DELIVERIES',
                               'out_avg_site_time',
                               'AVG_CLICKS_PER_VISIT',
                               'change_Followed_Recommendations',
                               'FOLLOWED_RECOMMENDATIONS_PCT',
                               'out_weekly_plan',
                               'change_Product_Categories_Viewed',
                               'out_contacts_cust',
                               'CROSS_SELL_SUCCESS',
                               'out_total_meals'],  
                                axis = 1) 

# Preparing response variable data (log-transformed revenue)
chef_target = chef.loc[:, 'log_revenue']


# Preparing training and testing sets (75% / 25%, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
            chef_explanatory,
            chef_target,
            test_size = 0.25,
            random_state = 222)


# Training set 
print(X_train.shape)
print(y_train.shape)

# Testing set
print(X_test.shape)
print(y_test.shape)



# INSTANTIATING a model object
# Using the class imported at the top of the file; the sklearn.ensemble
# submodule itself is never imported by name
gradient_model = GradientBoostingRegressor(n_estimators = 100,
                                           min_samples_leaf = 123,
                                           max_depth = 3,
                                           random_state = 222)

# FITTING the training data
gradient_fit = gradient_model.fit(X_train, y_train)


# PREDICTING on new data (predictions are on the log_revenue scale)
gradient_pred = gradient_fit.predict(X_test)


# Scoring once and saving the results for future use
# (the scores are computed against log_revenue, not raw revenue)
gradient_train_score = gradient_model.score(X_train, y_train).round(4)
gradient_test_score  = gradient_model.score(X_test, y_test).round(4)

print('Training Score:', gradient_train_score)
print('Testing Score:',  gradient_test_score)

                             OLS Regression Results                            
Dep. Variable:     chef['log_revenue']   R-squared:                       0.806
Model:                             OLS   Adj. R-squared:                  0.804
Method:                  Least Squares   F-statistic:                     348.0
Date:                 Sat, 07 Mar 2020   Prob (F-statistic):               0.00
Time:                         17:58:48   Log-Likelihood:                 234.18
No. Observations:                 1946   AIC:                            -420.4
Df Residuals:                     1922   BIC:                            -286.6
Df Model:                           23                                         
Covariance Type:             nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------