In [1]:
# standard library:
#------------------------------
import json
import os
import pickle  # used by the prediction functions to load the persisted models
import warnings
from datetime import timedelta
from random import random
from random import randrange

warnings.filterwarnings('ignore')

# basic libraries for reading data and plotting:
#------------------------------
import pandas as pd
import dask.dataframe as dd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from pandas.io.json import json_normalize #package for flattening json in pandas df

# models,scoring, hyperparameter tuning libraries:
# --------------------------------------
from sklearn import linear_model
from sklearn.linear_model import SGDRegressor
import lightgbm as lgb # Light GBM model
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

from mlxtend.classifier import StackingClassifier

from sklearn import preprocessing
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV


#feature importance:
#from yellowbrick.model_selection import RFECV

#for results table:
from prettytable import PrettyTable
from IPython.display import Image

### Function-1 (with LGB models) :

In [40]:
def final_fun_1(Query_point):
    """Score raw session rows with the pickled LightGBM classifier/regressor pair.

    The rows are rebuilt into a DataFrame, cleaned (boolean cast, NaN -> 0 on the
    numeric columns, label encoding from the saved ``<feature>.npy`` class
    arrays), aggregated to one row per ``fullVisitorId`` and handed to both
    models. The two model outputs are multiplied element-wise.

    Parameters
    ----------
    Query_point : array-like, shape (n_sessions, 29)
        Session rows whose columns follow the order of ``data_columns`` below.

    Returns
    -------
    array
        ``classifier.predict(...) * regressor.predict(...)``, one entry per row
        of the aggregated frame (i.e. one per distinct ``fullVisitorId``).

    Raises
    ------
    FileNotFoundError
        If an encoder ``.npy`` file or a model ``.pkl`` file is missing from the
        working directory.
    ValueError
        Presumably raised by ``LabelEncoder.transform`` when a categorical value
        was not present in the saved classes -- verify against sklearn's docs.
    """

    # reading test_point:
    #----------------------------------------
    data_columns = ['channelGrouping', 'date', 'fullVisitorId', 'visitId', 'visitNumber',
                    'visitStartTime', 'device.browser', 'device.operatingSystem',
                    'device.isMobile', 'device.deviceCategory', 'geoNetwork.continent',
                    'geoNetwork.subContinent', 'geoNetwork.country', 'geoNetwork.region',
                    'geoNetwork.metro', 'geoNetwork.city', 'geoNetwork.networkDomain',
                    'totals.hits', 'totals.pageviews', 'totals.timeOnSite',
                    'totals.sessionQualityDim', 'totals.transactions',
                    'totals.transactionRevenue', 'trafficSource.referralPath',
                    'trafficSource.campaign', 'trafficSource.source',
                    'trafficSource.medium', 'trafficSource.keyword',
                    'trafficSource.adContent']

    test_data = pd.DataFrame(data=Query_point, columns=data_columns)

    # Handling the boolean feature:
    #----------------------------------------
    test_data['device.isMobile'] = test_data['device.isMobile'].astype(bool)
    print("Boolean feature preprocessing done..!")

    # Handling the numerical feature:
    #----------------------------------------
    numeric_feat = ['visitNumber', 'visitStartTime', 'totals.hits', 'totals.pageviews',
                    'totals.timeOnSite', 'totals.transactions']

    for col in numeric_feat:
        # Assign the result back instead of calling fillna(inplace=True) on a
        # column selection: the in-place form is not guaranteed to reach the
        # parent frame (it is a no-op under pandas Copy-on-Write).
        test_data[col] = test_data[col].fillna(0).astype('float')

    print("Numerical feature preprocessing done..!")

    # Handling the categorical features:
    #----------------------------------------
    categorical_feat = ['channelGrouping', 'device.browser', 'device.operatingSystem', 'device.deviceCategory',
                        'geoNetwork.continent', 'geoNetwork.subContinent', 'geoNetwork.country', 'geoNetwork.region',
                        'geoNetwork.metro', 'geoNetwork.city', 'geoNetwork.networkDomain', 'totals.sessionQualityDim',
                        'trafficSource.campaign', 'trafficSource.source', 'trafficSource.medium', 'trafficSource.keyword',
                        'trafficSource.referralPath', 'trafficSource.adContent']

    for feature in categorical_feat:
        # Rebuild the encoder from the class array saved as '<feature>.npy'.
        label_encoder = preprocessing.LabelEncoder()
        label_encoder.classes_ = np.load(feature + '.npy')
        test_data[feature] = label_encoder.transform(list(test_data[feature].values.astype('str')))

    print("categorical feature preprocessing done..!")

    # Featurization of query data point:
    #----------------------------------------
    # Series.max()/min() skip missing values, unlike the builtins.
    period_end = test_data['date'].max()
    period_start = test_data['date'].min()

    # Per-visitor reducers; each drops missing values before reducing.
    def grp_max(values):
        return values.dropna().max()

    def grp_min(values):
        return values.dropna().min()

    def grp_sum(values):
        return values.dropna().sum()

    def grp_mean(values):
        return values.dropna().mean()

    def grp_count(values):
        return values.dropna().count()

    # (source column, output feature) pairs reduced with a plain max.
    # The insertion order of ``aggregations`` fixes the column order of the
    # feature frame passed to the models, so it must not be rearranged.
    max_features = [
        ('geoNetwork.networkDomain', 'networkDomain'),
        ('geoNetwork.city', 'city'),
        ('device.operatingSystem', 'operatingSystem'),
        ('geoNetwork.metro', 'metro'),
        ('geoNetwork.region', 'region'),
        ('channelGrouping', 'channelGrouping'),
        ('trafficSource.referralPath', 'referralPath'),
        ('geoNetwork.country', 'country'),
        ('trafficSource.source', 'source'),
        ('trafficSource.medium', 'medium'),
        ('trafficSource.keyword', 'keyword'),
        ('device.browser', 'browser'),
        ('device.deviceCategory', 'deviceCategory'),
        ('geoNetwork.continent', 'continent'),
        ('geoNetwork.subContinent', 'subcontinent'),
    ]
    aggregations = {column: [(name, grp_max)] for column, name in max_features}

    # sum / min / max / mean of the per-session activity counters.
    for column, prefix in [('totals.timeOnSite', 'timeOnSite'),
                           ('totals.pageviews', 'pageviews'),
                           ('totals.hits', 'hits')]:
        aggregations[column] = [(prefix + '_sum', grp_sum),
                                (prefix + '_min', grp_min),
                                (prefix + '_max', grp_max),
                                (prefix + '_mean', grp_mean)]

    aggregations['visitStartTime'] = [('visitStartTime_counts', grp_count)]   # number of sessions
    aggregations['totals.sessionQualityDim'] = [('sessionQualityDim', grp_max)]
    aggregations['device.isMobile'] = [('isMobile', grp_max)]
    aggregations['visitNumber'] = [('visitNumber_max', grp_max)]
    aggregations['totals.transactions'] = [('transactions', grp_sum)]

    # NOTE(review): 'date' is used exactly as supplied (no datetime parsing), so
    # these differences are plain arithmetic on that representation -- confirm
    # the unit matches what the models were trained on.
    aggregations['date'] = [
        # visitor's first session minus the earliest date in this batch
        ('first_ses_from_the_period_start', lambda x: x.dropna().min() - period_start),
        # latest date in this batch minus the visitor's last session
        ('last_ses_from_the_period_end', lambda x: period_end - x.dropna().max()),
        # visitor's last session minus visitor's first session
        ('interval_dates', lambda x: x.dropna().max() - x.dropna().min()),
        # number of distinct dates visited ('unqiue' spelling is kept: it is the
        # feature name produced by the original pipeline)
        ('unqiue_date_num', lambda x: len(set(x.dropna()))),
    ]

    test_data_featurized = test_data.groupby('fullVisitorId').agg(aggregations)

    # Drop the parent level of features, e.g. keep 'networkDomain' rather than
    # ('geoNetwork.networkDomain', 'networkDomain').
    test_data_featurized.columns = test_data_featurized.columns.droplevel()
    test_data_featurized = test_data_featurized.reset_index()

    print("feature engineering process done..!")

    # passing Query point to the pretrained models:
    #----------------------------------------
    # The visitor id is an identifier, not a feature.
    model_input = test_data_featurized.drop('fullVisitorId', axis=1)

    # SECURITY: pickle.load executes arbitrary code from the file; only load
    # model files that come from a trusted source.
    with open("lgb_classification_model.pkl", 'rb') as file:
        lgb_classification_model = pickle.load(file)
    classification_pred = lgb_classification_model.predict(model_input)

    with open("lgb_regression_model.pkl", 'rb') as file:
        lgb_regression_model = pickle.load(file)
    regression_pred = lgb_regression_model.predict(model_input)

    final_prediction = classification_pred * regression_pred

    print("prediction for query point done..!")

    # returning the model_predictions:
    #----------------------------------------
    return final_prediction

In [41]:
# Load the preprocessed test sessions (unzipped file); fullVisitorId is read as a string.
#--------------------------------------
test_df = (
    pd.read_csv(
        'case study data/preprocessed_test_df.csv',
        dtype={'fullVisitorId': 'str'},
        index_col=0,
    )
    .reset_index()
)
#--------------------------------------


# Score the first 9 rows of the test data (positions 0-8), passed as an nd-array:

predictions = final_fun_1(test_df.head(9).to_numpy())

Boolean feature preprocessing done..!
Numerical feature preprocessing done..!
categorical feature preprocessing done..!
feature engineering process done..!
prediction for query point done..!


In [34]:
# Display the value returned by final_fun_1 for the sampled test rows.
predictions

array([0.00223092, 0.10557366, 0.0056328 , 0.01775645, 0.04549532,
       0.2376245 , 0.01432871, 0.0045344 , 0.08579153])

__________________________

__________________________________________

### Function-2 (using ensemble models):

In [13]:
# NOTE: this cell reuses the name ``final_fun_1``; once executed it replaces the
# LGB-only function of the same name defined earlier in the notebook.
def final_fun_1(Query_point):
    """Score raw session rows with the averaged regression ensemble.

    The rows are rebuilt into a DataFrame, cleaned (boolean cast, NaN -> 0 on the
    numeric columns, label encoding from the saved ``<feature>.npy`` class
    arrays) and aggregated to one row per ``fullVisitorId``. Three pickled
    regressors under ``Ensemble_models/`` each predict on the aggregated frame;
    their predictions are averaged per visitor and negative averages are set
    to 0.

    Parameters
    ----------
    Query_point : array-like, shape (n_sessions, 29)
        Session rows whose columns follow the order of ``data_columns`` below.

    Returns
    -------
    pandas.DataFrame
        Columns ``fullVisitorId`` and ``PredictedLogRevenue``, one row per
        distinct visitor, with a default integer index.

    Raises
    ------
    FileNotFoundError
        If an encoder ``.npy`` file or a model ``.pkl`` file is missing.
    ValueError
        Presumably raised by ``LabelEncoder.transform`` when a categorical value
        was not present in the saved classes -- verify against sklearn's docs.
    """

    # reading test_point:
    #----------------------------------------
    data_columns = ['channelGrouping', 'date', 'fullVisitorId', 'visitId', 'visitNumber',
                    'visitStartTime', 'device.browser', 'device.operatingSystem',
                    'device.isMobile', 'device.deviceCategory', 'geoNetwork.continent',
                    'geoNetwork.subContinent', 'geoNetwork.country', 'geoNetwork.region',
                    'geoNetwork.metro', 'geoNetwork.city', 'geoNetwork.networkDomain',
                    'totals.hits', 'totals.pageviews', 'totals.timeOnSite',
                    'totals.sessionQualityDim', 'totals.transactions',
                    'totals.transactionRevenue', 'trafficSource.referralPath',
                    'trafficSource.campaign', 'trafficSource.source',
                    'trafficSource.medium', 'trafficSource.keyword',
                    'trafficSource.adContent']

    test_data = pd.DataFrame(data=Query_point, columns=data_columns)

    # Handling the boolean feature:
    #----------------------------------------
    test_data['device.isMobile'] = test_data['device.isMobile'].astype(bool)
    print("Boolean feature preprocessing done..!")

    # Handling the numerical feature:
    #----------------------------------------
    numeric_feat = ['visitNumber', 'visitStartTime', 'totals.hits', 'totals.pageviews',
                    'totals.timeOnSite', 'totals.transactions']

    for col in numeric_feat:
        # Assign the result back instead of calling fillna(inplace=True) on a
        # column selection: the in-place form is not guaranteed to reach the
        # parent frame (it is a no-op under pandas Copy-on-Write).
        test_data[col] = test_data[col].fillna(0).astype('float')

    print("Numerical feature preprocessing done..!")

    # Handling the categorical features:
    #----------------------------------------
    categorical_feat = ['channelGrouping', 'device.browser', 'device.operatingSystem', 'device.deviceCategory',
                        'geoNetwork.continent', 'geoNetwork.subContinent', 'geoNetwork.country', 'geoNetwork.region',
                        'geoNetwork.metro', 'geoNetwork.city', 'geoNetwork.networkDomain', 'totals.sessionQualityDim',
                        'trafficSource.campaign', 'trafficSource.source', 'trafficSource.medium', 'trafficSource.keyword',
                        'trafficSource.referralPath', 'trafficSource.adContent']

    for feature in categorical_feat:
        # Rebuild the encoder from the class array saved as '<feature>.npy'.
        label_encoder = preprocessing.LabelEncoder()
        label_encoder.classes_ = np.load(feature + '.npy')
        test_data[feature] = label_encoder.transform(list(test_data[feature].values.astype('str')))

    print("categorical feature preprocessing done..!")

    # Featurization of query data point:
    #----------------------------------------
    # Series.max()/min() skip missing values, unlike the builtins.
    period_end = test_data['date'].max()
    period_start = test_data['date'].min()

    # Per-visitor reducers; each drops missing values before reducing.
    def grp_max(values):
        return values.dropna().max()

    def grp_min(values):
        return values.dropna().min()

    def grp_sum(values):
        return values.dropna().sum()

    def grp_mean(values):
        return values.dropna().mean()

    def grp_count(values):
        return values.dropna().count()

    # (source column, output feature) pairs reduced with a plain max.
    # The insertion order of ``aggregations`` fixes the column order of the
    # feature frame passed to the models, so it must not be rearranged.
    max_features = [
        ('geoNetwork.networkDomain', 'networkDomain'),
        ('geoNetwork.city', 'city'),
        ('device.operatingSystem', 'operatingSystem'),
        ('geoNetwork.metro', 'metro'),
        ('geoNetwork.region', 'region'),
        ('channelGrouping', 'channelGrouping'),
        ('trafficSource.referralPath', 'referralPath'),
        ('geoNetwork.country', 'country'),
        ('trafficSource.source', 'source'),
        ('trafficSource.medium', 'medium'),
        ('trafficSource.keyword', 'keyword'),
        ('device.browser', 'browser'),
        ('device.deviceCategory', 'deviceCategory'),
        ('geoNetwork.continent', 'continent'),
        ('geoNetwork.subContinent', 'subcontinent'),
    ]
    aggregations = {column: [(name, grp_max)] for column, name in max_features}

    # sum / min / max / mean of the per-session activity counters.
    for column, prefix in [('totals.timeOnSite', 'timeOnSite'),
                           ('totals.pageviews', 'pageviews'),
                           ('totals.hits', 'hits')]:
        aggregations[column] = [(prefix + '_sum', grp_sum),
                                (prefix + '_min', grp_min),
                                (prefix + '_max', grp_max),
                                (prefix + '_mean', grp_mean)]

    aggregations['visitStartTime'] = [('visitStartTime_counts', grp_count)]   # number of sessions
    aggregations['totals.sessionQualityDim'] = [('sessionQualityDim', grp_max)]
    aggregations['device.isMobile'] = [('isMobile', grp_max)]
    aggregations['visitNumber'] = [('visitNumber_max', grp_max)]
    aggregations['totals.transactions'] = [('transactions', grp_sum)]

    # NOTE(review): 'date' is used exactly as supplied (no datetime parsing), so
    # these differences are plain arithmetic on that representation -- confirm
    # the unit matches what the models were trained on.
    aggregations['date'] = [
        # visitor's first session minus the earliest date in this batch
        ('first_ses_from_the_period_start', lambda x: x.dropna().min() - period_start),
        # latest date in this batch minus the visitor's last session
        ('last_ses_from_the_period_end', lambda x: period_end - x.dropna().max()),
        # visitor's last session minus visitor's first session
        ('interval_dates', lambda x: x.dropna().max() - x.dropna().min()),
        # number of distinct dates visited ('unqiue' spelling is kept: it is the
        # feature name produced by the original pipeline)
        ('unqiue_date_num', lambda x: len(set(x.dropna()))),
    ]

    test_data_featurized = test_data.groupby('fullVisitorId').agg(aggregations)

    # Drop the parent level of features, e.g. keep 'networkDomain' rather than
    # ('geoNetwork.networkDomain', 'networkDomain').
    test_data_featurized.columns = test_data_featurized.columns.droplevel()
    test_data_featurized = test_data_featurized.reset_index()

    print("feature engineering process done..!")

    # passing Query point to the pretrained ensemble members:
    #----------------------------------------
    # The visitor id is an identifier, not a feature.
    model_input = test_data_featurized.drop('fullVisitorId', axis=1)

    # NOTE(review): an earlier revision of this cell also unpickled
    # "lgb_classification_model.pkl" and ran it, but never used the result, so
    # that dead load is gone. If the ensemble output is meant to be multiplied
    # by the classifier prediction (as the LGB-only pipeline does), add it here.
    ensemble_model_files = ["Ensemble_models/Pickle_Rf_Model.pkl",    # 1st ensemble model
                            "Ensemble_models/Pickle_Lgb_Model.pkl",   # 2nd ensemble model
                            "Ensemble_models/Pickle_Xgb_Model.pkl"]   # 3rd ensemble model

    member_predictions = {}
    for Pkl_Filename in ensemble_model_files:
        # SECURITY: pickle.load executes arbitrary code from the file; only load
        # model files that come from a trusted source.
        with open(Pkl_Filename, 'rb') as file:
            regression_model = pickle.load(file)
        member_predictions[Pkl_Filename] = regression_model.predict(model_input)

    # Average only the prediction columns. Taking a row-wise mean over a frame
    # that still holds the string visitor id fails on pandas >= 2.0.
    ensemble_mean = pd.DataFrame(member_predictions).mean(axis=1)

    # making zero if mean is negative (mask avoids chained assignment):
    ensemble_mean = ensemble_mean.mask(ensemble_mean < 0, 0)

    # Each visitor id occurs once after the groupby, so predictions line up
    # with the ids positionally.
    final_ensemble_pred = pd.DataFrame({
        'fullVisitorId': test_data_featurized['fullVisitorId'].values,
        'PredictedLogRevenue': ensemble_mean.values,
    })

    print("prediction for query point done..!")

    # returning the model_predictions:
    #----------------------------------------
    return final_ensemble_pred

In [14]:
# Load the preprocessed test sessions (unzipped file); fullVisitorId is read as a string.
#--------------------------------------
test_df = (
    pd.read_csv(
        'case study data/preprocessed_test_df.csv',
        dtype={'fullVisitorId': 'str'},
        index_col=0,
    )
    .reset_index()
)
#--------------------------------------


# Score the first 9 rows of the test data (positions 0-8), passed as an nd-array:

predictions = final_fun_1(test_df.head(9).to_numpy())

Boolean feature preprocessing done..!
Numerical feature preprocessing done..!
categorical feature preprocessing done..!
feature engineering process done..!
prediction for query point done..!


In [15]:
# Display the per-visitor prediction table returned by final_fun_1 above.
predictions

Unnamed: 0,fullVisitorId,PredictedLogRevenue
0,459669224143241747,0.0
1,1303090465617023038,0.0
2,2235365487897339889,0.056965
3,2866297766347322467,0.0
4,3461808543879602873,0.029014
5,460252456180441002,0.297482
6,7460955084541987166,0.340299
7,8381672768065729990,0.067387
8,975129477712150630,0.82901
