In [None]:
# recording the wall-clock start time so the total run time of the notebook
# can be printed by the final cell
import time
start_time = time.time()

In [None]:
# importing libraries
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.linear_model
import statsmodels.formula.api as smf
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler

In [None]:
# relative location of the Excel workbook that holds the raw data
file = './Apprentice_Chef_Dataset.xlsx'

# reading the workbook into a DataFrame
ac_dataset = pd.read_excel(io = file)

# widening the pandas display limits so large frames print in full
for option_name, option_value in [('display.max_rows', 500),
                                  ('display.max_columns', 500),
                                  ('display.width', 1000)]:
    pd.set_option(option_name, option_value)

In [None]:
# lower-casing every column label so later cells can refer to columns
# without worrying about capitalization
lowercase_labels = ac_dataset.columns.str.lower()
ac_dataset.columns = lowercase_labels

# checking information of all columns
#ac_dataset.info()

In [None]:
# viewing first five rows of dataset
#ac_dataset.head(n = 5)

In [None]:
#ac_dataset.describe(include = 'all')

In [None]:
# checking if any columns have missing values 
#ac_dataset.isnull().sum()

In [None]:
# showing the rows with null values in column family_name
#ac_dataset[ac_dataset['family_name'].isnull()].head(n = 5)

# The family_name probably didn't get taken because they are in brackets
# Nothing interesting of note can be found and family name does not affect our
# analysis

In [None]:
# removing the three customer-name columns (name, first_name, family_name)
name_columns = ['name', 'first_name', 'family_name']
ac_dataset = ac_dataset.drop(columns = name_columns)

In [None]:
# checking new dataset column names
#ac_dataset.columns

In [None]:
# checking for any null values
#print(ac_dataset.isnull().any().any())

**Looking into the continuous variables**

In [None]:
# displaying the distribution of 'revenue'
#sns.displot(x = 'revenue',
#            data = ac_dataset,
#            height = 5,
#            aspect = 2)


# title and labels
#plt.title('Distribution of Revenue')

# displaying the plot
#plt.show()

Our response variable is skewed and has outliers. To fix this, we should perform a log transformation to bring the values closer together.

In [None]:
# base-10 log of revenue, stored next to the original as a new column
revenue_log10 = np.log10(ac_dataset['revenue'])
ac_dataset['log_revenue'] = revenue_log10

In [None]:
# displaying the distribution of 'log_revenue'
#sns.displot(x = 'log_revenue',
#            data = ac_dataset,
#            height = 5,
#            aspect = 2)


# title and labels
#plt.title('Distribution of Log of Revenue')

# displaying the plot
#plt.show()

In [None]:
########################
# Visual EDA (Scatterplots)
########################

# setting figure size
#fig, ax = plt.subplots(figsize = (10, 8))


# developing a scatterplot
#plt.subplot(2, 2, 1)
#sns.scatterplot(x = ac_dataset['avg_prep_vid_time'],
#                y = ac_dataset['revenue'],
#                color = 'b')

# adding labels but not adding title
# plt.xlabel(xlabel = 'Average Time Prep Instruction Video Played (Seconds)')
# plt.ylabel(ylabel = 'Revenue')


########################

# developing a scatterplot
# plt.subplot(2, 2, 2)
# sns.scatterplot(x = ac_dataset['avg_time_per_site_visit'],
#                 y = ac_dataset['revenue'],
#                 color = 'r')

# adding labels but not adding title
# plt.xlabel(xlabel = 'Average Time Spent Per Web/Mobile Visit')
# plt.ylabel(ylabel = 'Revenue')


########################

# cleaning up the layout, saving the figures, and displaying the results
# plt.tight_layout()
# plt.show()

In [None]:
# transforming the two continuous time variables to base-10 logs; each
# result is stored in a new column named 'log_' + the original column name
for time_column in ['avg_time_per_site_visit', 'avg_prep_vid_time']:
    ac_dataset['log_' + time_column] = np.log10(ac_dataset[time_column])

In [None]:
########################
# Visual EDA (Scatterplots)
########################

# setting figure size
# fig, ax = plt.subplots(figsize = (10, 8))


# developing a scatterplot
# plt.subplot(2, 2, 1)
# sns.scatterplot(x = ac_dataset['log_avg_prep_vid_time'],
#                 y = ac_dataset['log_revenue'],
#                 color = 'b')

# adding labels but not adding title
# plt.xlabel(xlabel = 'Log Average Time Prep Instruction Video Played (Seconds)')
# plt.ylabel(ylabel = 'Log Revenue')


########################

# developing a scatterplot
# plt.subplot(2, 2, 2)
# sns.scatterplot(x = ac_dataset['log_avg_time_per_site_visit'],
#                 y = ac_dataset['log_revenue'],
#                 color = 'r')

# adding labels but not adding title
# plt.xlabel(xlabel = 'Log Average Time Spent Per Web/Mobile Visit')
# plt.ylabel(ylabel = 'Log Revenue')


########################

# cleaning up the layout, saving the figures, and displaying the results
# plt.tight_layout()
# plt.show()

**Looking into the categorical variables**

In [None]:
# the marketing team is very adamant about email classification, so emails
# need to be classified exactly as requested

# domain groupings supplied by the marketing team
prof_email = ['mmm', 'amex', 'apple', 'boeing', 'caterpillar', 'chevron',
             'cisco', 'cocacola', 'disney', 'dupont', 'exxon', 'ge', 'walmart',
             'goldmansacs', 'homedepot', 'ibm', 'intel', 'jnj', 'jpmorgan',
             'mcdonalds', 'merck', 'microsoft', 'nike', 'pfizer', 'pg',
             'travelers', 'unitedtech', 'unitedhealth', 'verizon', 'visa']
per_email = ['gmail', 'yahoo', 'protonmail']
junk_email = ['me', 'aol', 'hotmail', 'live', 'msn', 'passport']

# lookup table from bare domain name to marketing category
# 'Professional' is written last so it takes precedence if a domain were ever
# listed in more than one group (same priority as Professional > Personal >
# Junk in an if/elif chain)
email_category_lookup = {}
for category, domains in [('Junk', junk_email),
                          ('Personal', per_email),
                          ('Professional', prof_email)]:
    for domain in domains:
        email_category_lookup[domain] = category

# pulling out the label between '@' and a '.com' / '.org' suffix for every
# row at once; the suffixes are a real alternation here (a character class
# such as [.com|.org] only matches a single character)
# rows with a missing email or no matching domain yield NaN instead of
# raising an error
email_domain = ac_dataset['email'].str.extract(r'@([^@\s.]+)\.(?:com|org)\b',
                                               expand = False)

# translating domains to categories; anything not listed by the marketing
# team (including rows without a usable domain) is labelled 'Undefined'
ac_dataset['email_category'] = (email_domain
                                .map(email_category_lookup)
                                .fillna('Undefined'))

In [None]:
# ac_dataset['email_category'].value_counts()

In [None]:
# looking at all the other categorical variables

# printing columns
# print(f"""
# Success of Cross Sell Promotion
# --------------------------------------------------------
# {ac_dataset['cross_sell_success'].value_counts()}


# Customer's Residence has Package Room
# --------------------------------------------------------
# {ac_dataset['package_locker'].value_counts()}


# Email Categories
# --------------------------------------------------------
# {ac_dataset['email_category'].value_counts()}


# Customer's Residence has Package Room has Refrigerator 
# --------------------------------------------------------
# {ac_dataset['refrigerated_locker'].value_counts()}


# Customer Specified Tastes and Preference
# --------------------------------------------------------
# {ac_dataset['tastes_and_preferences'].value_counts()}


# Customer's Registered Number Mobile (1) or Landline (0)
# --------------------------------------------------------
# {ac_dataset['mobile_number'].value_counts()}
# """)

In [None]:
# defining a function for categorical boxplots
# def categorical_boxplots(response, cat_var, data):
#     """
# 	This function is designed to generate a boxplot for  can be used for categorical variables.
#     Make sure matplotlib.pyplot and seaborn have been imported (as plt and sns).

#     PARAMETERS
# 	----------
# 	response : str, response variable
# 	cat_var  : str, categorical variable
# 	data     : DataFrame of the response and categorical variables
# 	"""

#     fig, ax = plt.subplots(figsize = (8, 8))
    
#     sns.boxplot(x    = cat_var,
#                 y    = response,
#                 data = data)
    
#     plt.suptitle("")
#     plt.show()

In [None]:
# calling the function for each categorical variable
# categorical_boxplots(response = 'revenue',
# 					 cat_var  = 'cross_sell_success',
# 					 data     = ac_dataset)

# categorical_boxplots(response = 'revenue',
# 					 cat_var  = 'package_locker',
# 					 data     = ac_dataset)

# categorical_boxplots(response = 'revenue',
# 					 cat_var  = 'email_category',
# 					 data     = ac_dataset)

# categorical_boxplots(response = 'revenue',
# 					 cat_var  = 'refrigerated_locker',
# 					 data     = ac_dataset)

# categorical_boxplots(response = 'revenue',
# 					 cat_var  = 'tastes_and_preferences',
# 					 data     = ac_dataset)

# categorical_boxplots(response = 'revenue',
# 					 cat_var  = 'mobile_number',
# 					 data     = ac_dataset)

In [None]:
# building indicator columns for email_category; drop_first leaves the first
# category out so it acts as the baseline
one_hot_email = pd.get_dummies(ac_dataset['email_category'], drop_first = True)

# swapping the original text column for its indicator columns
ac_dataset = ac_dataset.drop(columns = 'email_category').join(one_hot_email)

In [None]:
# lower-casing the column labels again so the newly joined indicator columns
# follow the same naming as the rest of the dataset
lowercase_labels = ac_dataset.columns.str.lower()
ac_dataset.columns = lowercase_labels

**Looking into the count variables**

In [None]:
# per the dataset documentation, largest_order_size is mislabeled: it really
# holds the average number of meals ordered

# giving the column a name that matches its contents
corrected_labels = {'largest_order_size': 'avg_order_size'}
ac_dataset = ac_dataset.rename(corrected_labels, axis = 'columns')

In [None]:
# checking if column name was changed successfully 
# print('largest_order_size' in ac_dataset.columns)

# print('avg_order_size'in ac_dataset.columns)

In [None]:
# count variable columns (log-transformed later in the notebook)
count_variables = [
    'avg_clicks_per_visit',
    'median_meal_rating',
    'avg_order_size',
    'pc_logins',
    'unique_meals_purch',
    'contacts_w_customer_service',
    'product_categories_viewed',
]

# count variable columns that contain zeros (flagged with 'has_' dummy
# columns later in the notebook)
count_variables_zero = [
    'total_photos_viewed',
    'master_classes_attended',
    'mobile_logins',
    'weekly_plan',
    'early_deliveries',
    'late_deliveries',
    'cancellations_before_noon',
    'cancellations_after_noon',
]

In [None]:
# def scatterplot(response, var, data):
#     """
# 	This function is designed to generate a scatterplot that can be used for variables.
#     Make sure matplotlib.pyplot and seaborn have been imported (as plt and sns).

#     PARAMETERS
# 	----------
# 	response : str, response variable
# 	var  : str, variable
# 	data     : DataFrame of the response and categorical variables
# 	"""

#     fig, ax = plt.subplots(figsize = (12, 8))

#     sns.scatterplot(x = data[var],
#                     y = data[response],
#                     color = 'b')
    
#     plt.suptitle("")
#     plt.show()

In [None]:
# defining a function for count boxplots
# def count_boxplots(response, count_var, data):
#     """
# 	This function is designed to generate a boxplot for  can be used for count variables.
#     Make sure matplotlib.pyplot and seaborn have been imported (as plt and sns).

#     PARAMETERS
# 	----------
# 	response : str, response variable
# 	cat_var  : str, categorical variable
# 	data     : DataFrame of the response and categorical variables
# 	"""
    
#     fig, ax = plt.subplots(figsize = (12, 8))
    
#     sns.boxplot(x    = count_var,
#                 y    = response,
#                 data = data)
    
#     plt.suptitle("")
#     plt.show()

In [None]:
# calling the function for each count variable column with zeros
# for variable in count_variables_zero:
#     scatterplot(response = 'revenue', 
#                 var = variable, 
#                 data = ac_dataset)

In [None]:
# dummy variable for each variable in count_variables_zero

# for every count column that contains zeros, add a 'has_<column>' flag that
# is 1 when the count is greater than zero and 0 otherwise
# the comparison is applied to the whole column at once; the previous
# row-by-row iterrows() loop walked the entire frame once per variable and
# produced the same flags far more slowly
for variable in count_variables_zero:
    ac_dataset['has_' + variable] = (ac_dataset[variable] > 0).astype(int)

In [None]:
# checking results
# ac_dataset.head(n = 5)

In [None]:
# looking at all the other categorical variables

# printing columns
# print(f"""
# Average Clicks Per Visit
# --------------------------------------------------------
# {ac_dataset['avg_clicks_per_visit'].value_counts()}


# Median Meal Rating
# --------------------------------------------------------
# {ac_dataset['median_meal_rating'].value_counts()}


# Average Order Size
# --------------------------------------------------------
# {ac_dataset['avg_order_size'].value_counts()}


# PC Logins
# --------------------------------------------------------
# {ac_dataset['pc_logins'].value_counts()}


# Unique Meals Purchased
# --------------------------------------------------------
# {ac_dataset['unique_meals_purch'].value_counts()}


# Contacts with Customer Service
# --------------------------------------------------------
# {ac_dataset['contacts_w_customer_service'].value_counts()}


# Product Categories Viewed
# --------------------------------------------------------
# {ac_dataset['product_categories_viewed'].value_counts()}
# """)

In [None]:
# calling the function for each count variable
# for variable in count_variables:
#     count_boxplots(response = 'revenue',
#                    count_var  = variable,
#                    data     = ac_dataset)

In [None]:
# adding a base-10 log version of every count variable, stored in a new
# column named 'log_' + the original column name
for count_column in count_variables:
    log_column = 'log_' + count_column
    ac_dataset[log_column] = np.log10(ac_dataset[count_column])

In [None]:
# looking at total_meals_ordered

# setting figure size
# fig, ax = plt.subplots(figsize = (10, 8))

# developing a scatterplot
# sns.scatterplot(x = ac_dataset['total_meals_ordered'],
#                 y = ac_dataset['log_revenue'],
#                 color = 'r')

# adding labels but not adding title
# plt.xlabel(xlabel = 'Total Meals Ordered')
# plt.ylabel(ylabel = 'Log Revenue')

In [None]:
# base-10 log of total_meals_ordered, stored as a new column
total_meals_log10 = np.log10(ac_dataset['total_meals_ordered'])
ac_dataset['log_total_meals_ordered'] = total_meals_log10

In [None]:
# setting figure size
# fig, ax = plt.subplots(figsize = (10, 8))

# developing a scatterplot
# sns.scatterplot(x = ac_dataset['log_total_meals_ordered'],
#                 y = ac_dataset['log_revenue'],
#                 color = 'r')

# adding labels but not adding title
# plt.xlabel(xlabel = 'Log Total Meals Ordered')
# plt.ylabel(ylabel = 'Log Revenue')

**Checking for correlation**

In [None]:
# ac_dataset.columns

In [None]:
# corr_one = ['revenue', 'cross_sell_success', 'email', 'total_meals_ordered', 
# 'unique_meals_purch', 'contacts_w_customer_service', 
# 'product_categories_viewed', 'avg_time_per_site_visit', 'mobile_number', 
# 'cancellations_before_noon']


# corr_two = ['cancellations_after_noon', 'tastes_and_preferences', 
# 'pc_logins', 'mobile_logins', 'weekly_plan', 'early_deliveries',
# 'late_deliveries', 'package_locker', 'refrigerated_locker', 
# 'avg_prep_vid_time', 'avg_order_size','master_classes_attended', 
# 'median_meal_rating']


# corr_three = ['avg_clicks_per_visit', 'total_photos_viewed',
# 'personal', 'professional', 'has_total_photos_viewed', 
# 'has_master_classes_attended', 'has_mobile_logins', 
# 'has_weekly_plan', 'has_early_deliveries', 'has_late_deliveries', 
# 'has_cancellations_before_noon', 'has_cancellations_after_noon']

In [None]:
# creating correlation set with some variables 
# corr_set = ac_dataset[corr_one + corr_two]

# instantiating a correlation matrix
# df_corr = corr_set.corr().round(2)

# setting figure size
# fig, ax = plt.subplots(figsize=(15,15))

# visualizing the correlation matrix
# sns.heatmap(df_corr,
#             cmap = 'coolwarm',
#             square = True,
#             annot = True,
#             linecolor = 'black',
#             linewidths = 0.5)

# plt.tight_layout()
# plt.show()

In [None]:
# creating correlation set with some variables 
# corr_set = ac_dataset[corr_one + corr_three]

# instantiating a correlation matrix
# df_corr = corr_set.corr().round(2)

# setting figure size
# fig, ax = plt.subplots(figsize=(15,15))

# visualizing the correlation matrix
# sns.heatmap(df_corr,
#             cmap = 'coolwarm',
#             square = True,
#             annot = True,
#             linecolor = 'black',
#             linewidths = 0.5)

# plt.tight_layout()
# plt.show()

In [None]:
# creating correlation set with some variables 
# corr_set = ac_dataset[corr_two + corr_three]

# instantiating a correlation matrix
# df_corr = corr_set.corr().round(2)

# setting figure size
# fig, ax = plt.subplots(figsize=(15,15))

# visualizing the correlation matrix
# sns.heatmap(df_corr,
#             cmap = 'coolwarm',
#             square = True,
#             annot = True,
#             linecolor = 'black',
#             linewidths = 0.5)

# plt.tight_layout()
# plt.show()

**OLS REGRESSION**

In [None]:
# making a copy of the dataset
# ac_dataset_explanatory = ac_dataset.copy()

# dropping revenue and log_revenue from the explanatory variable set
# ac_dataset_explanatory = ac_dataset_explanatory.drop(['revenue',
#                                                 'log_revenue'], axis = 1)

# formatting each explanatory variable for statsmodels
# for val in ac_dataset_explanatory:
#     print(f"{val} +")

In [None]:
# creating a (Pearson) correlation matrix
# df_corr = ac_dataset.corr().round(2)

# printing (Pearson) correlations with revenue
# print(df_corr.loc['revenue'].sort_values(ascending = False))

In [None]:
# explanatory variables: everything except the two response columns and the
# raw email text
ac_data = ac_dataset.drop(columns = ['revenue', 'log_revenue', 'email'])

# response variables: revenue and its base-10 log
ac_target     = ac_dataset['revenue']
log_ac_target = ac_dataset['log_revenue']

In [None]:
# preparing training and testing sets for revenue
# X_train, X_test, y_train, y_test = train_test_split(
#             ac_data,
#             ac_target,
#             test_size = 0.25,
#             random_state = 219)

# merging X_train and y_train so that they can be used in statsmodels
# ac_train = pd.concat([X_train, y_train], axis = 1)

# build a model
# lm_best = smf.ols(formula =  """revenue ~ cross_sell_success +
#                                         total_meals_ordered +
#                                         unique_meals_purch +
#                                         contacts_w_customer_service +
#                                         product_categories_viewed +
#                                         avg_time_per_site_visit +
#                                         mobile_number +
#                                         cancellations_before_noon +
#                                         cancellations_after_noon +
#                                         tastes_and_preferences +
#                                         pc_logins +
#                                         mobile_logins +
#                                         weekly_plan +
#                                         early_deliveries +
#                                         late_deliveries +
#                                         package_locker +
#                                         refrigerated_locker +
#                                         avg_prep_vid_time +
#                                         avg_order_size +
#                                         master_classes_attended +
#                                         median_meal_rating +
#                                         avg_clicks_per_visit +
#                                         total_photos_viewed +
#                                         log_avg_time_per_site_visit +
#                                         log_avg_prep_vid_time +
#                                         personal +
#                                         professional +
#                                         has_total_photos_viewed +
#                                         has_master_classes_attended +
#                                         has_mobile_logins +
#                                         has_weekly_plan +
#                                         has_early_deliveries +
#                                         has_late_deliveries +
#                                         has_cancellations_before_noon +
#                                         has_cancellations_after_noon +
#                                         log_avg_clicks_per_visit +
#                                         log_median_meal_rating +
#                                         log_avg_order_size +
#                                         log_pc_logins +
#                                         log_unique_meals_purch +
#                                         log_contacts_w_customer_service +
#                                         log_product_categories_viewed +
#                                         log_total_meals_ordered""",
#                                 data = ac_train)

# fit the model based on the data
# results = lm_best.fit()

# analyze the summary output
# print(results.summary())

In [None]:
# preparing training and testing sets for log_revenue
# log_X_train, log_X_test, log_y_train, log_y_test = train_test_split(
#             ac_data,
#             log_ac_target,
#             test_size = 0.25,
#             random_state = 219)

# merging X_train and y_train so that they can be used in statsmodels
# ac_train = pd.concat([log_X_train, log_y_train], axis = 1)

# build a model
# lm_best = smf.ols(formula =  """log_revenue ~ cross_sell_success +
#                                         total_meals_ordered +
#                                         unique_meals_purch +
#                                         contacts_w_customer_service +
#                                         product_categories_viewed +
#                                         avg_time_per_site_visit +
#                                         mobile_number +
#                                         cancellations_before_noon +
#                                         cancellations_after_noon +
#                                         tastes_and_preferences +
#                                         pc_logins +
#                                         mobile_logins +
#                                         weekly_plan +
#                                         early_deliveries +
#                                         late_deliveries +
#                                         package_locker +
#                                         refrigerated_locker +
#                                         avg_prep_vid_time +
#                                         avg_order_size +
#                                         master_classes_attended +
#                                         median_meal_rating +
#                                         avg_clicks_per_visit +
#                                         total_photos_viewed +
#                                         log_avg_time_per_site_visit +
#                                         log_avg_prep_vid_time +
#                                         personal +
#                                         professional +
#                                         has_total_photos_viewed +
#                                         has_master_classes_attended +
#                                         has_mobile_logins +
#                                         has_weekly_plan +
#                                         has_early_deliveries +
#                                         has_late_deliveries +
#                                         has_cancellations_before_noon +
#                                         has_cancellations_after_noon +
#                                         log_avg_clicks_per_visit +
#                                         log_median_meal_rating +
#                                         log_avg_order_size +
#                                         log_pc_logins +
#                                         log_unique_meals_purch +
#                                         log_contacts_w_customer_service +
#                                         log_product_categories_viewed +
#                                         log_total_meals_ordered""",
#                                 data = ac_train)

# fit the model based on the data
# results = lm_best.fit()

# analyze the summary output
# print(results.summary())

In [None]:
# splitting the full explanatory set and log_revenue into training and
# testing sets (25% held out for testing, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(ac_data,
                                                    log_ac_target,
                                                    test_size    = 0.25,
                                                    random_state = 219)

In [None]:
# merging X_train and y_train so that they can be used in statsmodels
# (the formula interface looks up both the response and the explanatory
# variables by column name in a single DataFrame)
ac_train = pd.concat([X_train, y_train], axis = 1)

# build a model
# reduced OLS specification: log_revenue explained by thirteen variables
lm_best = smf.ols(formula =  """log_revenue ~ cross_sell_success +
                                        total_meals_ordered +
                                        unique_meals_purch +
                                        product_categories_viewed +
                                        avg_order_size +
                                        median_meal_rating +
                                        total_photos_viewed +
                                        log_avg_prep_vid_time +
                                        log_median_meal_rating +
                                        log_unique_meals_purch +
                                        log_contacts_w_customer_service +
                                        log_product_categories_viewed +
                                        log_total_meals_ordered""",
                                data = ac_train)

# fit the model based on the data
results = lm_best.fit()

# analyze the summary output
print(results.summary())

In [None]:
# explanatory variables used in the reduced statsmodels OLS model
x_variables = ['cross_sell_success', 'total_meals_ordered', 'unique_meals_purch',
               'product_categories_viewed', 'avg_order_size', 'median_meal_rating',
               'total_photos_viewed', 'log_avg_prep_vid_time',
               'log_median_meal_rating', 'log_unique_meals_purch',
               'log_contacts_w_customer_service', 'log_product_categories_viewed',
               'log_total_meals_ordered']

# preparing x-variables from the OLS model
ols_data = ac_dataset[x_variables]

# preparing training and testing sets for log_revenue using only the
# x-variables listed above
X_train_ols, X_test_ols, y_train_ols, y_test_ols = train_test_split(
                                                        ols_data,
                                                        log_ac_target,
                                                        test_size = 0.25,
                                                        random_state = 219)

# instantiating model object
lr = sklearn.linear_model.LinearRegression()

# fitting to training data
lr_fit = lr.fit(X_train_ols, y_train_ols)

# predicting on new data
lr_pred = lr_fit.predict(X_test_ols)

# scoring the results once each (R-square)
# built-in round() is used instead of the .round() method because score()
# may return a plain Python float, which has no .round() method
lr_train_score = round(lr.score(X_train_ols, y_train_ols), 4)
lr_test_score  = round(lr.score(X_test_ols, y_test_ols), 4)

# saving the gap between training and testing
lr_test_gap = round(abs(lr_train_score - lr_test_score), 4)

# displaying the scores and the gap
print('OLS Training Score :', lr_train_score)
print('OLS Testing Score  :', lr_test_score)
print('OLS Train-Test Gap :', lr_test_gap)

In [None]:
# pairing each OLS feature name with its coefficient (rounded to 4 decimals)
lr_model_values = zip(ac_dataset[x_variables].columns,
                      lr_fit.coef_.round(decimals = 4))

# the list starts with the intercept and is then extended with every
# (feature, coefficient) pair
lr_model_lst = [('intercept', lr_fit.intercept_.round(decimals = 4))]
lr_model_lst.extend(lr_model_values)

**LASSO REGRESSION**

In [None]:
# re-creating the training and testing sets for log_revenue from the full
# explanatory set (25% held out for testing, fixed seed)
lasso_split = train_test_split(ac_data,
                               log_ac_target,
                               test_size    = 0.25,
                               random_state = 219)
X_train, X_test, y_train, y_test = lasso_split

In [None]:
# using hyperparameter tuning to search for the Lasso penalty strength
# (GridSearchCV is imported with the other libraries at the top of the
# notebook)

# candidate alpha values from 0.1 to 1.0 in steps of 0.1
param_grid = {'alpha': np.arange(0.1, 1.1, 0.1)}
lasso = sklearn.linear_model.Lasso()

# 10-fold cross-validated grid search on the training data
lasso_cv = GridSearchCV(lasso, param_grid, cv = 10)
lasso_cv.fit(X_train, y_train)
print(lasso_cv.best_params_)
print(lasso_cv.best_score_)

# saving the best alpha as a variable
lass_best_param = lasso_cv.best_params_['alpha']

In [None]:
# instantiating a model object
# alpha is fixed at 0.4; pass alpha = lass_best_param instead to use the
# value found by the grid search
# the `normalize` argument is no longer passed: False was already its
# default, and the argument was removed from Lasso in scikit-learn 1.2, so
# passing it raises a TypeError on current versions
lasso = sklearn.linear_model.Lasso(alpha = 0.4)

# fitting to training data
lasso_fit = lasso.fit(X_train, y_train)

# predicting on new data
lasso_pred = lasso_fit.predict(X_test)

# scoring the results once each (R-square)
# built-in round() is used instead of the .round() method because score()
# may return a plain Python float, which has no .round() method
lasso_train_score = round(lasso.score(X_train, y_train), 4)
lasso_test_score  = round(lasso.score(X_test, y_test), 4)

# saving the gap between training and testing
lasso_test_gap = round(abs(lasso_train_score - lasso_test_score), 4)

# displaying the scores and the gap
print('Lasso Training Score :', lasso_train_score)
print('Lasso Testing Score  :', lasso_test_score)
print('Lasso Train-Test Gap :', lasso_test_gap)

In [None]:
# zipping each feature name to its coefficient
# the names come from ac_data, the frame the Lasso model was trained on;
# ac_dataset also contains revenue, log_revenue and email, so its columns do
# not line up with the coefficients
lasso_model_values = zip(ac_data.columns, lasso_fit.coef_.round(decimals = 4))


# setting up a placeholder list to store model features
lasso_model_lst = [('intercept', lasso_fit.intercept_.round(decimals = 4))]


# adding each feature-coefficient pair to the list
for val in lasso_model_values:
    lasso_model_lst.append(val)

In [None]:
# dropping coefficients that are equal to zero
# the list is rebuilt in one pass; calling remove() while looping over the
# same list skips the element after each removal, leaving zeros behind
lasso_model_lst[:] = [(feature, coefficient)
                      for feature, coefficient in lasso_model_lst
                      if not coefficient == 0]

**ARD**

In [None]:
# re-creating the training and testing sets for log_revenue from the full
# explanatory set before fitting the ARD model
split_options = {'test_size': 0.25, 'random_state': 219}
X_train, X_test, y_train, y_test = train_test_split(ac_data,
                                                    log_ac_target,
                                                    **split_options)

In [None]:
# instantiating a model object
ard = sklearn.linear_model.ARDRegression()

# fitting to training data
ard_fit = ard.fit(X_train, y_train)

# predicting on new data
ard_pred = ard_fit.predict(X_test)

# scoring the results once each (R-square)
# built-in round() is used instead of the .round() method because score()
# may return a plain Python float, which has no .round() method
ard_train_score = round(ard.score(X_train, y_train), 4)
ard_test_score  = round(ard.score(X_test, y_test), 4)

# saving the gap between training and testing
ard_test_gap = round(abs(ard_train_score - ard_test_score), 4)

# displaying the scores and the gap
print('ARD Training Score :', ard_train_score)
print('ARD Testing Score  :', ard_test_score)
print('ARD Train-Test Gap :', ard_test_gap)

In [None]:
# zipping each feature name to its coefficient
# the names come from ac_data, the frame the ARD model was trained on;
# ac_dataset also contains revenue, log_revenue and email, so its columns do
# not line up with the coefficients
ard_model_values = zip(ac_data.columns, ard_fit.coef_.round(decimals = 4))


# setting up a placeholder list to store model features
ard_model_lst = [('intercept', ard_fit.intercept_.round(decimals = 4))]


# adding each feature-coefficient pair to the list
for val in ard_model_values:
    ard_model_lst.append(val)

In [None]:
# dropping coefficients that are equal to zero
# the list is rebuilt in one pass; calling remove() while looping over the
# same list skips the element after each removal, leaving zeros behind
ard_model_lst[:] = [(feature, coefficient)
                    for feature, coefficient in ard_model_lst
                    if not coefficient == 0]

**KNN**

In [None]:
# columns left out of the KNN explanatory set: the two response columns,
# the raw email text, and every log-transformed explanatory column
knn_excluded_columns = ['revenue', 'log_revenue', 'email',
                        'log_avg_time_per_site_visit',
                        'log_avg_prep_vid_time', 'log_avg_clicks_per_visit',
                        'log_median_meal_rating', 'log_avg_order_size',
                        'log_pc_logins', 'log_unique_meals_purch',
                        'log_contacts_w_customer_service',
                        'log_product_categories_viewed',
                        'log_total_meals_ordered']

# preparing explanatory variable data
ac_data_two = ac_dataset.drop(columns = knn_excluded_columns)

# preparing response variable
log_ac_target = ac_dataset['log_revenue']

In [None]:
# instantiating a StandardScaler() object
scaler = StandardScaler()

# fitting the scaler to the data and transforming it in a single step
X_scaled = scaler.fit_transform(ac_data_two)

# wrapping the scaled array in a DataFrame labelled with the original
# column names
X_scaled_df = pd.DataFrame(X_scaled, columns = ac_data_two.columns)

In [None]:
# preparing training and testing sets for log_revenue without the log
# explanatory columns
# the standardized features (X_scaled_df) are split here so the distance-based
# KNN model is actually trained on scaled data; splitting the unscaled
# ac_data_two left the scaled frame unused
X_train, X_test, y_train, y_test = train_test_split(
            X_scaled_df,
            log_ac_target,
            test_size = 0.25,
            random_state = 219)

In [None]:
# using hyperparameter tuning to search for the number of neighbors
# (GridSearchCV is imported with the other libraries at the top of the
# notebook)

# candidate neighbor counts from 1 to 49
param_grid = {'n_neighbors': np.arange(1, 50),
              'algorithm': ['auto']}
knn = KNeighborsRegressor()

# 5-fold cross-validated grid search on the training data
knn_cv = GridSearchCV(knn, param_grid, cv = 5)
knn_cv.fit(X_train, y_train)

# saving best parameter as variable
opt_neighbors = knn_cv.best_params_['n_neighbors']

print(knn_cv.best_params_)
print(knn_cv.best_score_)

In [None]:
# instantiating a model object
# n_neighbors is fixed at 16; pass n_neighbors = opt_neighbors instead to
# use the value found by the grid search
knn = KNeighborsRegressor(algorithm = 'auto',
                          n_neighbors = 16)

# fitting to training data
knn_fit = knn.fit(X_train, y_train)

# predicting on new data
knn_pred = knn_fit.predict(X_test)

# scoring the results once each (R-square)
# built-in round() is used instead of the .round() method because score()
# may return a plain Python float, which has no .round() method
knn_train_score = round(knn.score(X_train, y_train), 4)
knn_test_score  = round(knn.score(X_test, y_test), 4)

# saving the gap between training and testing
knn_test_gap = round(abs(knn_train_score - knn_test_score), 4)

# displaying the scores and the gap
print('KNN Training Score :', knn_train_score)
print('KNN Testing Score  :', knn_test_score)
print('KNN Train-Test Gap :', knn_test_gap)

In [None]:
# collecting the results of all four models in one DataFrame, one row per
# model; KNN has no coefficients, so its size and coefficient cells are 'N/A'
model_performance = pd.DataFrame({
    'Model Type'     : ['OLS (Chosen Model)', 'Lasso', 'ARD', 'KNN'],
    'Training Score' : [lr_train_score, lasso_train_score,
                        ard_train_score, knn_train_score],
    'Testing Score'  : [lr_test_score, lasso_test_score,
                        ard_test_score, knn_test_score],
    'Train-Test Gap' : [lr_test_gap, lasso_test_gap,
                        ard_test_gap, knn_test_gap],
    'Model Size'     : [len(lr_model_lst), len(lasso_model_lst),
                        len(ard_model_lst), 'N/A'],
    'Model Coeffs and Variables' : [lr_model_lst, lasso_model_lst,
                                    ard_model_lst, 'N/A'],
})

# displaying the comparison table
model_performance

In [None]:
# reporting the seconds elapsed since start_time was recorded in the first cell
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)