# 1. SETTINGS

In [99]:
# libraries
import pandas as pd
import numpy as np
import scipy.stats
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import LabelEncoder

In [100]:
# pandas display: lift the column cap so wide frames are shown in full
pd.options.display.max_columns = None

In [101]:
# silence all warnings for the remainder of the session
import warnings
warnings.simplefilter("ignore")

In [102]:
# garbage collection: make sure the automatic collector is switched on
# NOTE(review): presumably a safeguard only — confirm it is needed here
import gc
gc.enable()

# 2. FUNCTIONS

In [104]:
##### FUNCTION FOR COUNTING MISSINGS
def count_missings(data):
    """Summarise missing values per column of `data`.

    Returns a DataFrame indexed by column name with two columns:
    "Total" (number of null cells) and "Percent" (nulls as a share of all
    rows, scaled to 0-100). Only columns with at least one null are kept.
    """
    null_mask = data.isnull()
    n_missing = null_mask.sum()
    pct_missing = n_missing / null_mask.count() * 100

    # both pieces are sorted from most to fewest nulls before being combined
    summary = pd.concat(
        [n_missing.sort_values(ascending = False),
         pct_missing.sort_values(ascending = False)],
        axis = 1,
        keys = ["Total", "Percent"],
    )
    return summary.loc[summary["Total"] > 0]

In [105]:
##### FUNCTION FOR CREATING FLAGS FOR MISSINGS
def create_null_flags(data, features = None):
    """Add 0/1 indicator columns marking missing values.

    For every column in `features` that contains at least one null, a new
    column named "ISNULL_<column>" is added to `data` holding 1 where the
    value is null and 0 elsewhere.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect. It is modified in place.
    features : iterable of column labels, optional
        Columns to check. Defaults to all columns of `data`. Lists, pandas
        Index objects and numpy arrays are all accepted.

    Returns
    -------
    pandas.DataFrame
        The same `data` object, with the indicator columns appended.
    """
    # identity test on purpose: `features == None` is evaluated element-wise
    # for an Index/array and then fails inside `if` with an ambiguous truth value
    if features is None:
        features = data.columns

    # snapshot the labels so that adding columns cannot affect the iteration
    for var in list(features):
        null_flag = data[var].isnull().astype("int64")
        if null_flag.sum() > 0:
            data["ISNULL_" + str(var)] = null_flag
    return data

In [106]:
##### FUNCTION FOR TREATING FACTORS
def treat_factors(data, method = "label"):
    """Encode the object-typed columns of `data`.

    method = "label": every object column is overwritten, inside `data`,
                      with the integer codes produced by pd.factorize.
    method = "dummy": the result of pd.get_dummies(data, drop_first = True)
                      is returned instead of `data`.
    Any other value leaves `data` untouched.

    Returns the encoded DataFrame.
    """
    if method == "label":
        # integer codes, one object column at a time
        for col in data.columns:
            if data[col].dtype == "object":
                data[col] = pd.factorize(data[col])[0]
    elif method == "dummy":
        # one-hot columns, first level of each factor dropped
        data = pd.get_dummies(data, drop_first = True)

    return data

In [107]:
##### FUNCTION FOR AGGREGATING DATA
def aggregate_data(data, id_var, num_stats = ["mean", "min", "max", "std"],
                   label = None, sd_zeros = False):
    """Aggregate `data` to one row per distinct value of `id_var`.

    Columns with dtype "object" are treated as factors and summarised by
    their mode and number of unique values; all other columns are treated as
    numerics and summarised with `num_stats`. `id_var` itself is never
    aggregated, whatever its dtype.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame containing `id_var` and the feature columns.
    id_var : str
        Name of the grouping column.
    num_stats : list
        Aggregations passed to groupby().agg() for the numeric columns.
    label : str, optional
        Prefix added (as "<label>_") to every output column name.
    sd_zeros : bool
        If true, missing values in columns ending in "_std" are set to 0.

    Returns
    -------
    pandas.DataFrame
        Indexed by `id_var` (sorted), columns named "<feature>_<stat>".
        Numeric features come first, then factors, each in the column order
        of `data`.

    Raises
    ------
    ValueError
        If `data` has no columns besides `id_var`.
    """

    def first_mode(values):
        # most frequent non-null value; Series.mode() returns its modes in
        # sorted order, so ties resolve to the first one. NaN if all null.
        modes = values.mode()
        return modes.iloc[0] if len(modes) > 0 else np.nan


    ### SEPARATE FEATURES

    # display info
    print("- Preparing the dataset...")

    # split the feature columns (everything except the id) by dtype; list
    # comprehensions keep the original column order, so output is deterministic
    feature_cols = [f for f in data.columns if f != id_var]
    fac_cols = [f for f in feature_cols if data[f].dtype == "object"]
    fac_set = set(fac_cols)
    num_cols = [f for f in feature_cols if f not in fac_set]

    # display info
    num_facs = len(fac_cols)
    num_nums = len(num_cols)
    print("- Extracted %.0f factors and %.0f numerics..." % (num_facs, num_nums))

    # nothing to aggregate: fail with a clear message instead of an unbound name
    if num_facs == 0 and num_nums == 0:
        raise ValueError("aggregate_data: no feature columns besides '%s'" % str(id_var))


    ##### AGGREGATION

    parts = []

    # aggregate numerics
    if num_nums > 0:
        print("- Aggregating numeric features...")
        num_data = data[[id_var] + num_cols].groupby([id_var]).agg(list(num_stats))
        num_data.columns = ["_".join(map(str, col)).strip() for col in num_data.columns.values]
        parts.append(num_data.sort_index())

    # aggregate factors
    if num_facs > 0:
        print("- Aggregating factor features...")
        fac_data = data[[id_var] + fac_cols].groupby([id_var]).agg([("mode",    first_mode),
                                                                    ("nunique", "nunique")])
        fac_data.columns = ["_".join(map(str, col)).strip() for col in fac_data.columns.values]
        parts.append(fac_data.sort_index())


    ##### MERGER

    # numerics and/or factors, side by side on the shared id index
    agg_data = parts[0] if len(parts) == 1 else pd.concat(parts, axis = 1)


    ##### LAST STEPS

    # update labels
    if label is not None:
        agg_data.columns = [label + "_" + str(col) for col in agg_data.columns]

    # impute zeros for SD (std is missing for groups with a single row);
    # assign back rather than filling in place on a column selection, and only
    # touch columns whose statistic suffix is exactly "_std"
    if sd_zeros:
        for var in [c for c in agg_data.columns if str(c).endswith("_std")]:
            agg_data[var] = agg_data[var].fillna(0)

    # display info
    print("- Final dimensions:", agg_data.shape)

    # return dataset
    return agg_data

# 1. DATA IMPORT

In [121]:
# import CSV
# fullVisitorId is an identifier, not a quantity: read it as text so that
# leading zeros and very long ids are kept exactly as stored in the file
# NOTE(review): assumes later joins on fullVisitorId also use text keys — confirm
df = pd.read_csv("../data/prepared/data_v1.csv.gz", compression = "gzip",
                 dtype = {"fullVisitorId": str})
print(df.shape)

(1708337, 41)


In [118]:
# check data: preview the first three rows of the imported frame
df.head(3)

Unnamed: 0,channelGrouping,date,device.browser,device.deviceCategory,device.isMobile,device.operatingSystem,fullVisitorId,geoNetwork.city,geoNetwork.continent,geoNetwork.country,geoNetwork.metro,geoNetwork.networkDomain,geoNetwork.region,geoNetwork.subContinent,sessionId,totals.bounces,totals.hits,totals.newVisits,totals.pageviews,totals.transactionRevenue,trafficSource.adContent,trafficSource.adwordsClickInfo.adNetworkType,trafficSource.adwordsClickInfo.gclId,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source,visitId,visitNumber,visitStartTime
0,Organic Search,20160902,Chrome,desktop,False,Windows,1131660440785968503,Izmir,Asia,Turkey,(not set),ttnet.com.tr,Izmir,Western Asia,1131660440785968503_1472830385,1.0,1,1.0,1.0,,,,,,,,(not set),,(not provided),organic,,google,1472830385,1,1472830385
1,Organic Search,20160902,Firefox,desktop,False,Macintosh,377306020877927890,not available in demo dataset,Oceania,Australia,not available in demo dataset,dodo.net.au,not available in demo dataset,Australasia,377306020877927890_1472880147,1.0,1,1.0,1.0,,,,,,,,(not set),,(not provided),organic,,google,1472880147,1,1472880147
2,Organic Search,20160902,Chrome,desktop,False,Windows,3895546263509774583,Madrid,Europe,Spain,(not set),unknown.unknown,Community of Madrid,Southern Europe,3895546263509774583_1472865386,1.0,1,1.0,1.0,,,,,,,,(not set),,(not provided),organic,,google,1472865386,1,1472865386


# 2. FEATURE ENGINEERING

In [119]:
### DATE FEATURES

# convert dates
# `date` holds YYYYMMDD values (e.g. 20160902). Without an explicit format
# pd.to_datetime reads such integers as nanoseconds since the epoch, which
# puts every row on 1970-01-01. The dtype guards keep this cell re-runnable
# once the columns have already been converted.
if not pd.api.types.is_datetime64_any_dtype(df['date']):
    df['date'] = pd.to_datetime(df['date'].astype(str), format = "%Y%m%d")
if not pd.api.types.is_datetime64_any_dtype(df['visitStartTime']):
    df['visitStartTime'] = pd.to_datetime(df['visitStartTime'], unit = "s")

# features: hour, day and week
# week of year comes from isocalendar(); the older .dt.weekofyear accessor is
# no longer available in current pandas. Cast to int64 to keep a plain
# integer column (assumes no missing timestamps — TODO confirm).
print(df.shape)
df['session_date_dow'] = df['visitStartTime'].dt.dayofweek
df['session_date_dom'] = df['visitStartTime'].dt.day
df['session_date_hou'] = df['visitStartTime'].dt.hour
df['session_date_woy'] = df['visitStartTime'].dt.isocalendar().week.astype("int64")
df['store_date_dow']   = df['date'].dt.dayofweek
df['store_date_dom']   = df['date'].dt.day
# NOTE(review): `date` carries no time of day, so this feature is always 0;
# kept only so the column set stays unchanged — consider dropping it
df['store_date_hou']   = df['date'].dt.hour
df['store_date_woy']   = df['date'].dt.isocalendar().week.astype("int64")
print(df.shape)

# TODO features: time since last session

# TODO features: time before next session

(1708337, 35)
(1708337, 41)
