# 1. SETTINGS

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import scipy.stats

from pandas.io.json import json_normalize
import json
import os

In [2]:
# pandas options: None removes the cap on how many columns are displayed
pd.options.display.max_columns = None

In [3]:
# ignore warnings: installs a blanket "ignore" filter (no category, message or module restriction),
# so every Python warning raised after this cell is suppressed
# NOTE(review): this also hides deprecation warnings from the libraries used below - confirm that is intended
import warnings
warnings.filterwarnings("ignore")

In [4]:
# garbage collection: make sure automatic collection is switched on;
# the modeling loop further down also calls gc.collect() explicitly after each fold
import gc
gc.enable()

# 2. FUNCTIONS

In [5]:
##### FUNCTION 1: ENCODING FACTORS
def encode_factors(df, method = "label"):
    """Encode the object-dtype columns of a data frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to encode.
    method : str
        "label" - every column whose dtype is object is overwritten, in place,
                  with the integer codes returned by pd.factorize.
        "dummy" - the result of pd.get_dummies(df, drop_first = True) is returned
                  (a new frame; the input is not modified).
        Any other value leaves the data untouched.

    Returns
    -------
    pandas.DataFrame
        The encoded data (the same object as `df` unless method is "dummy").
    """
    if method == "label":
        # walk over a snapshot of the column labels and recode the object ones
        for col_name in list(df.columns):
            if df[col_name].dtype == "object":
                codes, _uniques = pd.factorize(df[col_name])
                df[col_name] = codes
    elif method == "dummy":
        df = pd.get_dummies(df, drop_first = True)

    return df

In [6]:
##### FUNCTION 2: AGGREGATIONS
def _most_frequent(values):
    """Return the most frequent non-missing value of a Series, or NaN if there is none."""
    modes = values.mode()
    # taking position 0 gives one fixed answer when several values tie
    return modes.iloc[0] if len(modes) > 0 else np.nan


def aggregate_data(df, group_var, num_stats = ['min', 'max', 'mean', 'median', 'std', 'var', 'skew'], 
                   label = None, sd_zeros = False):
    """Aggregate every column of `df` to one row per value of `group_var`.

    Columns with object dtype ("factors") are summarised by their most frequent
    value (`<col>_mode`) and their number of distinct values (`<col>_nunique`).
    All other columns ("numerics") are summarised with each statistic listed in
    `num_stats` (`<col>_<stat>`). The grouping column itself is never aggregated,
    whatever its dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    group_var : str
        Name of the column to group by.
    num_stats : list of str
        Aggregations applied to the non-object columns.
    label : str or None
        Optional prefix; when given, every output column becomes `<label>_<name>`.
    sd_zeros : bool
        When true, missing values in output columns whose name contains "_std"
        are replaced with 0.

    Returns
    -------
    pandas.DataFrame
        Indexed by the sorted values of `group_var`; numeric aggregates first,
        then factor aggregates, each in the column order of `df`.

    Raises
    ------
    ValueError
        If `df` has no column other than `group_var`.
    """

    ### SEPARATE FEATURES

    # display info
    print("- Preparing the dataset...")

    # split the non-key columns by dtype, keeping the original column order
    fac_cols = [f for f in df.columns if f != group_var and df[f].dtype == "object"]
    num_cols = [f for f in df.columns if f != group_var and df[f].dtype != "object"]

    # display info
    num_facs = len(fac_cols)
    num_nums = len(num_cols)
    print("- Extracted %.0f factors and %.0f numerics..." % (num_facs, num_nums))

    if num_facs == 0 and num_nums == 0:
        raise ValueError("aggregate_data: no columns to aggregate besides '%s'" % group_var)


    ##### AGGREGATION

    parts = []

    # aggregate numerics
    if num_nums > 0:
        print("- Aggregating numeric features...")
        num_df = df[[group_var] + num_cols].groupby([group_var]).agg(list(num_stats))
        num_df.columns = ["_".join(col).strip() for col in num_df.columns.values]
        parts.append(num_df.sort_index())

    # aggregate factors
    if num_facs > 0:
        print("- Aggregating factor features...")
        fac_df = df[[group_var] + fac_cols].groupby([group_var]).agg([("mode",    _most_frequent),
                                                                      ("nunique", "nunique")])
        fac_df.columns = ["_".join(col).strip() for col in fac_df.columns.values]
        parts.append(fac_df.sort_index())


    ##### MERGER

    # numerics and factors side by side, or whichever of the two exists
    agg_df = pd.concat(parts, axis = 1) if len(parts) > 1 else parts[0]


    ##### LAST STEPS

    # update labels
    if label is not None:
        agg_df.columns = [label + "_" + str(col) for col in agg_df.columns]

    # impute zeros for SD (assign back instead of filling a column selection in place,
    # which is not guaranteed to write through to agg_df)
    if sd_zeros:
        stdevs = list(agg_df.filter(like = "_std").columns)
        if stdevs:
            agg_df[stdevs] = agg_df[stdevs].fillna(0)

    # display info
    print("- Final dimensions:", agg_df.shape)

    # dataset
    return agg_df

In [7]:
##### FUNCTION 3: RECENCY AND FREQUENCY
def add_rf_vars(df, group_var):
    """Placeholder for recency/frequency features: returns `df` unchanged.

    `group_var` is accepted but not used yet.

    NOTE(review): the name `add_rf_vars` is bound again in the FUNCTION 5 (TARGETS)
    cell further down, so after running all cells this stub is no longer the
    active definition.
    """
    
    # dataset
    return df

In [8]:
##### FUNCTION 4: DATE FEATURES
def encode_date(df, date):
    """Add calendar features derived from the column `date` to `df`.

    For each attribute below a column named `<date>_<Attr>` is added
    (e.g. `date_Year` when `date == 'date'`). The source column itself is left
    as it is; it may hold datetimes or anything pd.to_datetime can parse.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to extend; modified in place.
    date : str
        Name of the date column.

    Returns
    -------
    pandas.DataFrame
        The same `df` object with the new columns.
    """
    attrs = ['Year', 'Month', 'Week', 'Day', 
             'Dayofweek', 'Dayofyear',
             'Is_month_end', 'Is_month_start', 
             'Is_quarter_end', 'Is_quarter_start', 
             'Is_year_end', 'Is_year_start']

    # the .dt accessor only exists on datetime-like data, so parse once up front
    dates = pd.to_datetime(df[date])

    for attr in attrs:
        if attr == 'Week':
            # Series.dt.week no longer exists in pandas 2; the ISO calendar provides the week number.
            # Cast away the nullable dtype it returns: int64 normally, float64 if dates are missing.
            week = dates.dt.isocalendar().week
            values = week.astype('float64') if week.isna().any() else week.astype('int64')
        else:
            values = getattr(dates.dt, attr.lower())
        df[date + '_' + attr] = values

    return df

In [9]:
##### FUNCTION 5: TARGETS
def add_targets(df, group_var, revenue_var = 'totals_TransactionRevenue'):
    """Add a per-group revenue total as the column 'target' and drop the raw revenue column.

    Every row receives the sum of `revenue_var` over all rows sharing its
    `group_var` value. `revenue_var` is then deleted from `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to extend; modified in place.
    group_var : str
        Column to group by.
    revenue_var : str
        Revenue column to sum. The default keeps the name this function has always
        used; note that elsewhere in this notebook the revenue column is spelled
        'totals_transactionRevenue' (lower-case "t") and is later copied to 'sales',
        so pass the matching name explicitly.

    Returns
    -------
    pandas.DataFrame
        The same `df` object.
    """
    df['target'] = df.groupby([group_var])[revenue_var].transform('sum')
    del df[revenue_var]

    # dataset
    return df


# Backward-compatible alias: this cell has always bound the name `add_rf_vars`
# (replacing the FUNCTION 3 stub). New code should call `add_targets`.
add_rf_vars = add_targets

# 3. DATA PARTITIONING

In [15]:
# import CSV: gzip-compressed file; fullVisitorId is read as str rather than type-inferred
read_opts = {"compression": "gzip", "dtype": {"fullVisitorId": "str"}}
df = pd.read_csv("../data/data_v1.csv.gz", **read_opts)
print(df.shape)

(2109926, 40)


In [16]:
# copy the per-row transaction revenue into a column named 'sales'
df['sales'] = df['totals_transactionRevenue']

In [17]:
# remove both raw revenue columns from the frame
df = df.drop(columns = ['totals_totalTransactionRevenue', 'totals_transactionRevenue'])

In [18]:
# keep only the rows with positive sales; .copy() makes the result an independent frame,
# so column assignments in later cells do not target a selection of the full data
df = df[df.sales > 0].copy()
df.shape

(23108, 39)

In [22]:
# columns stored as integers (each name listed once)
int_vars = ["trafficSource_adwordsClickInfo.page", "totals_bounces", "totals_hits", "totals_newVisits", "totals_pageviews", "totals_sessionQualityDim", "totals_timeOnSite", "totals_transactions",
            "visitId", "visitNumber", "visitStartTime", "sales", "customDimensions_index"]
int_dtypes = {var: 'int32' for var in int_vars}
# 'sales' holds values such as 149990000 in the sample rows - presumably revenue in millionths
# of a currency unit (TODO confirm). int32 tops out at 2,147,483,647, so a larger sale would
# overflow; store this one column as int64.
int_dtypes['sales'] = 'int64'
df = df.astype(int_dtypes)

In [23]:
# list the dtype of every column
df.dtypes

channelGrouping                                 object
date                                            object
device_browser                                  object
device_deviceCategory                           object
device_isMobile                                   bool
device_operatingSystem                          object
fullVisitorId                                   object
geoNetwork_city                                 object
geoNetwork_continent                            object
geoNetwork_country                              object
geoNetwork_metro                                object
geoNetwork_networkDomain                        object
geoNetwork_region                               object
geoNetwork_subContinent                         object
totals_bounces                                   int32
totals_hits                                      int32
totals_newVisits                                 int32
totals_pageviews                                 int32
totals_ses

In [24]:
# write the current frame to 'google.csv' in the working directory, without the index
df.to_csv('google.csv', index = False)

In [25]:
# check data: preview the first rows of the frame
df.head()

Unnamed: 0,channelGrouping,date,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,fullVisitorId,geoNetwork_city,geoNetwork_continent,geoNetwork_country,geoNetwork_metro,geoNetwork_networkDomain,geoNetwork_region,geoNetwork_subContinent,totals_bounces,totals_hits,totals_newVisits,totals_pageviews,totals_sessionQualityDim,totals_timeOnSite,totals_transactions,trafficSource_adContent,trafficSource_adwordsClickInfo.adNetworkType,trafficSource_adwordsClickInfo.gclId,trafficSource_adwordsClickInfo.isVideoAd,trafficSource_adwordsClickInfo.page,trafficSource_adwordsClickInfo.slot,trafficSource_campaign,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source,visitId,visitNumber,visitStartTime,customDimensions_index,customDimensions_value,sales
537,Direct,2018-05-11,Chrome,desktop,False,Chrome OS,567632072087918290,,Americas,United States,,rr.com,,Northern America,0,11,1,10,53,302,1,(not set),,,True,0,,,True,,(none),(not set),(direct),1526043913,1,1526043913,4,EMEA,21990000
707,Organic Search,2018-05-11,Chrome,desktop,False,Macintosh,34941760139500427,Mountain View,Americas,United States,San Francisco-Oakland-San Jose CA,,California,Northern America,0,15,1,11,10,169,1,(not set),,,True,0,,,False,,organic,(not set),google,1526090105,1,1526090105,4,North America,149990000
739,Direct,2018-05-11,Chrome,desktop,False,Windows,7417113168654151999,,Americas,United States,,,,Northern America,0,16,1,14,28,213,1,(not set),,,True,0,,,True,,(none),(not set),(direct),1526052977,1,1526052977,4,North America,7730000
763,Referral,2018-05-11,Chrome,desktop,False,Macintosh,5963345090934142475,Austin,Americas,United States,Austin TX,,Texas,Northern America,0,17,0,14,75,2016,1,(not set),,,True,0,,,True,,(none),/,(direct),1526051906,3,1526051906,4,North America,76620000
783,Organic Search,2018-05-11,Chrome,mobile,True,Android,6525673967695077450,San Francisco,Americas,United States,San Francisco-Oakland-San Jose CA,comcast.net,California,Northern America,0,18,1,16,16,1355,1,(not set),,,True,0,,,False,,organic,(not set),google,1526098702,1,1526098702,4,North America,43980000


In [233]:
# convert dates
# (no infer_datetime_format argument: it is deprecated since pandas 2.0, where
#  inferring the format from the data is already the default behaviour)
df['date'] = pd.to_datetime(df['date'])
print(df.date.min())
print(df.date.max())

2016-08-01 00:00:00
2018-10-15 00:00:00


In [234]:
# window lengths, used below as day offsets
train_per = 168
valid_per = 62
valid_gap = 46

# number of folds
num_folds = 5

# one list of row labels per fold
trn_idx = []
val_idx = []

# anchor for all folds: 15 days before the latest date in the data
last_v_end = df['date'].max() - pd.DateOffset(days = 15)
rule = '---------------------------------------------'

# partitioning loop
for i in range(num_folds):

    # validation dates: the first fold ends at the anchor; fold i steps back 2*i months, plus one day
    v_end = last_v_end
    if i != 0:
        v_end = last_v_end - pd.DateOffset(months = i*2) + pd.DateOffset(days = 1)
    v_start = v_end - pd.DateOffset(days = valid_per)

    # training dates: the window ends valid_gap days before validation starts
    t_end   = v_start - pd.DateOffset(days = valid_gap)
    t_start = t_end - pd.DateOffset(days = train_per)

    # extract index (both bounds of each window are inclusive)
    in_train = (df.date >= t_start) & (df.date <= t_end)
    in_valid = (df.date >= v_start) & (df.date <= v_end)
    trn_idx.append(list(df[in_train].index))
    val_idx.append(list(df[in_valid].index))

    # print information
    train_line = ''.join(['- train: ', str(t_start)[0:10], ' - ', str(t_end)[0:10], ' (n = ', str(len(trn_idx[i])), ')'])
    valid_line = ''.join(['- valid: ', str(v_start)[0:10], ' - ', str(v_end)[0:10], ' (n = ', str(len(val_idx[i])), ')'])
    print(rule)
    print(''.join(["FOLD ", str(i + 1), '/', str(num_folds)]))
    print(rule)
    print(train_line)
    print(valid_line)
    print(rule)
    print('')

---------------------------------------------
FOLD 1/5
---------------------------------------------
- train: 2017-12-28 - 2018-06-14 (n = 475600)
- valid: 2018-07-30 - 2018-09-30 (n = 139647)
---------------------------------------------

---------------------------------------------
FOLD 2/5
---------------------------------------------
- train: 2017-10-28 - 2018-04-14 (n = 501124)
- valid: 2018-05-30 - 2018-07-31 (n = 149143)
---------------------------------------------

---------------------------------------------
FOLD 3/5
---------------------------------------------
- train: 2017-08-28 - 2018-02-12 (n = 508489)
- valid: 2018-03-30 - 2018-05-31 (n = 173836)
---------------------------------------------

---------------------------------------------
FOLD 4/5
---------------------------------------------
- train: 2017-06-28 - 2017-12-13 (n = 497524)
- valid: 2018-01-28 - 2018-03-31 (n = 188391)
---------------------------------------------

----------------------------------------

# 4. FEATURE ENGINEERING

In [None]:
### AGGREGATIONS

In [1]:
### DATE FEATURES

In [2]:
### RECENCY

In [3]:
### FREQUENCY

# 5. MODELING

In [None]:
# drop bad features: keep every column of df that is not on the exclusion list
# NOTE(review): 'sales' and 'date' are not excluded and therefore stay in `features` - confirm that is intended
excluded_feats = ['fullVisitorId', 'visitId', 'visitStartTime', 'totals_totalTransactionRevenue']
keep_mask = ~df.columns.isin(excluded_feats)
features = list(df.columns[keep_mask])
df[features].shape

In [47]:
### PARAMETERS

# LGB parameters, passed as keyword arguments to the model constructor in the modeling loop
# NOTE(review): per the LightGBM docs row subsampling only takes effect when a bagging
# frequency is also set - confirm whether 'subsample' is doing anything here
lgb_params = dict(
    boosting_type    = 'gbdt',
    objective        = 'regression',
    metric           = 'rmse',
    subsample        = 0.9,
    feature_fraction = 0.7,
    lambda_l1        = 0.03,
    lambda_l2        = 0.03,
    min_split_gain   = 0.01,
    min_child_weight = 5,
    silent           = True,
    verbosity        = -1,
    learning_rate    = 0.03,
    max_depth        = 3,
    n_estimators     = 5000,
    nthread          = 16,
)

In [None]:
### CROSS-VALIDATION LOOP
#
# Trains one LightGBM model per fold, stores out-of-fold predictions and collects
# per-fold feature importances.
#
# NOTE(review): this cell cannot run on a fresh kernel as the notebook stands:
#   - `time` and `lgb` are used but never imported in the visible cells;
#   - `full_train`, `classes`, `folds`, `y`, `weights` and `multi_weighted_logloss`
#     are not defined in the visible cells;
#   - it builds an LGBMClassifier and calls predict_proba, while `lgb_params` sets
#     objective 'regression' and metric 'rmse';
#   - the printed labels say "RMSE" but the values come from `multi_weighted_logloss`;
#   - the `trn_idx` / `val_idx` lists built in the partitioning section are not used here.
# It looks like it was carried over from a multi-class workflow - confirm and adapt.

# create objects
clfs = []
importances = pd.DataFrame()
# one row per training observation, one column per class
oof_preds = np.zeros((len(full_train), len(classes)))

# modeling loop
start  = time.time()
for fold_, (trn_, val_) in enumerate(folds.split(y, y)):
    
    # data partitioning (trn_ / val_ are used as positional indices via .iloc)
    trn_x, trn_y = full_train[features].iloc[trn_], y.iloc[trn_]
    val_x, val_y = full_train[features].iloc[val_], y.iloc[val_]
    
    # train the model; both the training and the validation part are passed as eval sets
    clf = lgb.LGBMClassifier(**lgb_params) 
    clf.fit(
        trn_x, trn_y,
        eval_set              = [(trn_x, trn_y), (val_x, val_y)],
        eval_metric           = multi_weighted_logloss,
        verbose               = 100,
        early_stopping_rounds = 100,
        sample_weight         = trn_y.map(weights)
    )
    clfs.append(clf)

    # predictions for the held-out rows, using the best iteration found during fitting
    oof_preds[val_, :] =  clf.predict_proba(val_x, num_iteration = clf.best_iteration_)
    
    # feedback (element [1] of the metric's return value is printed; the prediction is recomputed here)
    print('-------------------------------------')
    print('Fold ' + str(fold_ + 1) + ': RMSE = ' + str(round(multi_weighted_logloss(val_y, clf.predict_proba(val_x, num_iteration = clf.best_iteration_))[1], 5)))  
    print('-------------------------------------')
    print('')

    # variable importance: one row per feature for this fold, appended to the running table
    # NOTE(review): the column is called 'gain' but holds clf.feature_importances_ as configured - confirm the importance type
    imp_df = pd.DataFrame()
    imp_df['feature'] = features
    imp_df['gain'] = clf.feature_importances_
    imp_df['fold'] = fold_ + 1
    importances = pd.concat([importances, imp_df], axis = 0, sort = False)
    
    # clean up
    gc.collect()
    
# print performance over all out-of-fold predictions, plus elapsed wall-clock time
print('')
print('AVERAGE RMSE: %.5f ' % multi_weighted_logloss(y_true = y, y_preds = oof_preds)[1])
print('Done in %5.1f minutes' % ((time.time() - start) / 60))

In [None]:
##### VARIABLE IMPORTANCE

# keep the top_feats features, ranked by their mean "gain" over the folds
top_feats = 50
mean_gain = importances[["gain", "feature"]].groupby("feature").mean()
cols = mean_gain.sort_values(by = "gain", ascending = False)[0:top_feats].index
importance = importances.loc[importances.feature.isin(cols)].sort_values(by = "gain", ascending = False)

# plot variable importance on an explicit figure/axes pair and save it as PDF
fig, ax = plt.subplots(figsize = (10, 10))
sns.barplot(x = "gain", y = "feature", data = importance, ax = ax)
fig.tight_layout()
fig.savefig('../var_importance.pdf')

# 6. PREDICTING

# 7. SUBMISSION

In [None]:
# file name: <model>_<perf>
# NOTE(review): cv_perf is not assigned anywhere in the visible cells - confirm where it comes from
model = 'lgb_v10_single'
# characters 2..6 of the rounded score's text, i.e. the first two characters are dropped
perf  = str(round(cv_perf, 5))[2:7]
name  = '_'.join([model, perf])

In [None]:
# export submission to ../submissions/<name>.csv (no index column) and show its shape
sub_path = '../submissions/' + str(name) + '.csv'
sub.to_csv(sub_path, index = False)
sub.shape

In [None]:
# export OOF preds to ../preds/<name>.csv (no index column) and show their shape
oof_path = '../preds/' + str(name) + '.csv'
oof_preds_df.to_csv(oof_path, index = False)
oof_preds_df.shape