In [None]:
# Imports
import warnings
import time
import gc
import os
import math
import pandas as pd
import numpy as np
from IPython.display import HTML, display
from numba import jit

# Notebook display setup: two-decimal floats, untruncated frames, wide page.
def _two_decimals(x):
    """Render a float with exactly two digits after the decimal point."""
    return '%.2f' % x

pd.options.display.float_format = _two_decimals
pd.options.display.max_columns = None
pd.options.display.max_rows = None
display(HTML("<style>.container { width: 90% !important; }</style>"))
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Notebook-wide constants
SEED, N_JOBS = 2017, 18


<b>Functions</b>

In [None]:
# Define error metric
def smape_old(y_true, y_pred):
    """Vectorised SMAPE, in percent (0-200), between two equally shaped inputs.

    Each term is ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)``; terms
    whose denominator is exactly zero contribute 0. Inputs are converted to
    float arrays and compared positionally, so lists, arrays and Series are
    all accepted. NaNs propagate into the result.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers (broadcastable to a common shape).

    Returns
    -------
    float
        Mean of the per-element terms (NaN for empty input).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Both magnitudes go in the denominator; using the signed y_true would
    # let a negative truth cancel the prediction and zero the term out.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0

    # Pre-fill with 0 and divide only where defined, so 0/0 pairs score 0
    # without generating inf/NaN (or a RuntimeWarning) first.
    diff = np.zeros_like(denominator)
    np.divide(np.abs(y_true - y_pred), denominator, out=diff, where=denominator != 0)
    return np.mean(diff)

In [None]:
# Define error metric
@jit
def smape(y_true, y_pred):
    """SMAPE, in percent (0-200), computed with an explicit loop for numba.

    Each element contributes ``|a - b| / (|a| + |b|)``; pairs whose
    denominator is exactly zero contribute nothing. The sum is scaled by
    ``200 / n`` where ``n`` is the full length (skipped pairs included).
    NaN in either input propagates into the result.

    Parameters
    ----------
    y_true, y_pred : 1-D numeric arrays indexed 0..y_true.shape[0]-1.

    Returns
    -------
    float
        The score, or NaN when ``y_true`` is empty (instead of dividing by 0).
    """
    n = y_true.shape[0]
    if n == 0:
        return np.nan

    # Float accumulator from the start keeps the type stable for the compiler.
    total = 0.0
    for i in range(n):
        a = y_true[i]
        b = y_pred[i]
        # Magnitudes, not signed values: a signed sum can be zero or negative
        # for non-zero inputs and would skip or flip the sign of the term.
        denom = math.fabs(a) + math.fabs(b)
        if denom == 0:
            continue
        total += math.fabs(a - b) / denom
    return total * (200.0 / n)

In [None]:
# For each data configuration, get CV score and compute test set predictions
def run_generator(data_configs, trains, vals, init_train, test) :
    """Cross-validate every data configuration and collect the scores.

    Parameters
    ----------
    data_configs : dict
        Maps a configuration name to its configuration dict.
    trains, vals : sequences
        Per-fold training and validation frames, passed through to cross_val.
    init_train, test :
        Only needed by the test-set prediction step, which is currently
        disabled (see the commented block in the loop); unused otherwise.

    Returns
    -------
    DataFrame
        One row per configuration with columns "conf" (name) and "cv"
        (score returned by cross_val). The columns exist even when
        data_configs is empty.
    """
    # Clear predictions directories
    #list(map(os.unlink, (os.path.join("../oof_preds/temp/",f) for f in os.listdir("../oof_preds/temp/"))))
    #list(map(os.unlink, (os.path.join("../oof_targets/temp/",f) for f in os.listdir("../oof_targets/temp/"))))
    #list(map(os.unlink, (os.path.join("../preds/temp/",f) for f in os.listdir("../preds/temp/"))))

    # Collect plain records and build the frame once at the end, instead of
    # concatenating a one-row frame per configuration.
    records = []
    for conf_name, conf in data_configs.items() :
        print("***** CONFIG : " + conf_name + " ***************")

        # Cross validation
        cv_score = cross_val(conf, conf_name, trains, vals)
        records.append({"conf": conf_name, "cv": cv_score})

        # Test-set prediction is disabled. The full train/test frames are
        # therefore no longer copied per configuration; restore the copies
        # together with this block if it is re-enabled:
        #     cur_train = init_train.copy()
        #     cur_test = test.copy()
        #     start = time.time()
        #     cur_test["Date"] = cur_test.Page.apply(lambda a: a[-10:])
        #     cur_test["Page"] = cur_test.Page.apply(lambda a: a[:-11])
        #     cur_test, y = fit_predict(conf, cur_train, cur_test, is_test=True)
        #
        #     # Remove log transformation if needed
        #     if (conf["log_target"] > 0) :
        #         cur_test.Visits = np.exp(cur_test.Visits) - conf["log_target"]
        #
        #     # Save preds
        #     file_name = "../preds/temp/" + conf_name + "_preds_test.csv"
        #     cur_test[["Id", "Visits"]].to_csv(file_name, index = False)
        #     print("Elapsed predict test : " + str(time.time() - start))

        gc.collect()

    return pd.DataFrame(records, columns = ["conf", "cv"])
    

In [None]:
# Cross validation
def cross_val(conf, conf_name, trains, vals):
    """Score one configuration over all folds and save out-of-fold values.

    For each fold, fit_predict is run on (trains[i], vals[i]); if
    conf["log_target"] > 0 the log transform is undone on both predictions
    and targets before scoring with smape. Predictions and targets of all
    folds are then concatenated, scored together, and written to
    "../oof_preds/temp/<conf_name>_preds_OOF.csv" and
    "../oof_targets/temp/<conf_name>_targets_OOF.csv" (single column "y").

    Parameters
    ----------
    conf : dict
        Configuration; "log_target" is read here, the rest by fit_predict.
    conf_name : str
        Used in the output file names.
    trains, vals : sequences of equal length, one entry per fold.

    Returns
    -------
    The smape score over the concatenated out-of-fold predictions.

    Raises
    ------
    ValueError
        If there are no folds.
    """
    if len(trains) == 0:
        raise ValueError("cross_val needs at least one fold")

    start_a = time.time()
    fold_preds = []
    fold_targets = []
    for i in range(len(trains)):
        start_f = time.time()

        # Fit and predict on this fold
        validation, y = fit_predict(conf, trains[i], vals[i], is_test=False)
        preds = validation["Visits"]
        targets = y["Visits"]

        # Remove log transformation if needed (inverse of log(x + offset))
        if (conf["log_target"] > 0) :
            preds = np.exp(preds) - conf["log_target"]
            targets = np.exp(targets) - conf["log_target"]

        # Compute validation score
        val_score = smape(targets.values, preds.values)
        print("SMAPE fold " + str(i + 1) + " : " + str(val_score))

        # Keep this fold's series; they are concatenated once after the loop
        fold_preds.append(preds)
        fold_targets.append(targets)

        print("Elapsed in fold " + str(i + 1) + " : " + str(time.time() - start_f))
        del validation, y
        gc.collect()

    oof_preds = pd.concat(fold_preds, axis = 0)
    oof_targets = pd.concat(fold_targets, axis = 0)

    # Compute overall score
    cv_score = smape(oof_targets.values, oof_preds.values)
    print("Global SMAPE score : " + str(cv_score))

    # Save OOF preds; create the target directories so a missing folder does
    # not throw away the whole cross-validation run at the very end.
    file_name = "../oof_preds/temp/" + conf_name + "_preds_OOF.csv"
    os.makedirs(os.path.dirname(file_name), exist_ok = True)
    oof_preds.to_frame("y").to_csv(file_name, index = False)
    file_name = "../oof_targets/temp/" + conf_name + "_targets_OOF.csv"
    os.makedirs(os.path.dirname(file_name), exist_ok = True)
    oof_targets.to_frame("y").to_csv(file_name, index = False)

    print("Elapsed cross val : " + str(time.time() - start_a))
    del oof_targets, oof_preds, fold_preds, fold_targets
    gc.collect()

    return(cv_score)

In [None]:
# Fit and predict
def fit_predict(conf, train, validation, is_test = False) :
    """Predict visits as a per-group mean or median of the training window.

    Parameters
    ----------
    conf : dict
        Keys read here:
        - "length": number of trailing date columns of ``train`` to use, or
          "all" to use every column.
        - "log_target": if > 0, visits are replaced by log(visits + value);
          the returned values are then in that log space.
        - "groupby": "DayOfW" groups by page and day of week, "we" by page
          and weekend flag, anything else by page only.
        - "stat": "median" or "mean", the statistic computed per group.
        - "na": how predictions with no matching group are filled: "0" uses
          0, "all_median" uses the median of all training visits, any other
          value leaves them as NaN.
    train : DataFrame
        Wide frame: a "Page" column plus one visits column per date.
    validation : DataFrame
        When ``is_test`` is False, a wide frame shaped like ``train``. When
        ``is_test`` is True, it is used as-is and must already have "Page"
        and "Date" columns. The caller's frame is not modified.
    is_test : bool
        False: ``validation`` carries the true visits, which are split off
        and returned. True: there is no ground truth.

    Returns
    -------
    (validation, y)
        ``validation``: one row per page/date with the date-derived columns
        "Year", "Month", "DayOfM", "DayOfY", "DayOfW", "is_wknd" and the
        predicted "Visits". ``y``: frame with "Page" and true "Visits", or
        None when ``is_test`` is True. Rows whose truth is missing are
        removed from both, keeping them positionally aligned.

    Raises
    ------
    ValueError
        If conf["stat"] is neither "median" nor "mean".
    """
    # Fail before any heavy work if the statistic is not one we implement.
    if conf["stat"] not in ("median", "mean"):
        raise ValueError('Unknown conf["stat"]: %r (expected "median" or "mean")'
                         % (conf["stat"],))

    # Handle training length (in days). Slicing only the date columns keeps
    # "Page" from being selected twice when length covers the whole frame.
    if (conf["length"] != "all") :
        date_cols = [col for col in train.columns if col != "Page"]
        train = train[["Page"] + date_cols[-conf["length"]:]]

    # Melt df to have one observation per row
    train = pd.melt(train,
                    id_vars="Page",
                    var_name="Date",
                    value_name="Visits")

    # Handle log transformation of the target
    log_offset = conf["log_target"]
    if (log_offset > 0) :
        train["Visits"] = np.log(train["Visits"] + log_offset)

    y = None
    if not is_test:
        validation = pd.melt(validation,
                             id_vars="Page",
                             var_name="Date",
                             value_name="Visits")

        # Handle log transformation of the target
        if (log_offset > 0) :
            validation["Visits"] = np.log(validation["Visits"] + log_offset)

        # Remove y from validation set
        y = validation[["Page", "Visits"]]
        validation = validation.drop("Visits", axis=1)
    else:
        # Work on a copy so the caller's frame does not gain/lose columns.
        validation = validation.copy()

    # Handle date variable
    train = _add_date_features(train)
    validation = _add_date_features(validation)

    # Define grouping variables
    if (conf["groupby"] == "DayOfW") :
        cols_to_groupby = ["Page", "DayOfW"]
    elif (conf["groupby"] == "we") :
        cols_to_groupby = ["Page", "is_wknd"]
    else :
        cols_to_groupby = ["Page"]

    # Per-group statistic of the training visits
    visits_by_group = train.groupby(cols_to_groupby)["Visits"]
    if (conf["stat"] == "median"):
        pages_groups = visits_by_group.median().reset_index()
    else:
        pages_groups = visits_by_group.mean().reset_index()

    # Make validation predictions: a left merge keeps every validation row,
    # in order, and leaves "Visits" as NaN where no group matches.
    validation = validation.merge(pages_groups, how="left", on=cols_to_groupby)

    # Handle NAs in the predictions.
    # "month_median" was sketched at some point but is not implemented; it
    # (like any other unknown value) leaves missing predictions as NaN.
    if (conf["na"] == "0"):
        validation["Visits"] = validation["Visits"].fillna(0)
    elif (conf["na"] == "all_median"):
        validation["Visits"] = validation["Visits"].fillna(train["Visits"].median())

    # Drop rows without ground truth from both frames. The mask is applied
    # positionally: the merge above preserved row order and count.
    if y is not None:
        has_truth = y.notnull().all(axis=1).values
        y = y[has_truth]
        validation = validation[has_truth]

    del(pages_groups)
    gc.collect()
    return(validation, y)


def _add_date_features(frame):
    """Replace the "Date" column of ``frame`` by calendar feature columns.

    Adds "Year" (two-digit), "Month", "DayOfM", "DayOfY", "DayOfW"
    (Monday = 0) and "is_wknd" (1 for Saturday/Sunday) to ``frame`` and
    returns a frame without "Date".
    """
    dates = pd.to_datetime(frame["Date"])
    frame["Year"] = dates.dt.year % 100
    frame["Month"] = dates.dt.month
    frame["DayOfM"] = dates.dt.day
    frame["DayOfY"] = dates.dt.dayofyear
    frame["DayOfW"] = dates.dt.dayofweek
    frame["is_wknd"] = (dates.dt.dayofweek >= 5).astype(int)
    return frame.drop("Date", axis=1)


<b>Config</b>

In [None]:
# Data configurations. Each one is a dict with the keys
#   stat       : per-group statistic, "median" or "mean"
#   length     : number of trailing training days used
#   log_target : offset of the log transform (0 = no transform)
#   groupby    : "Page", "DayOfW" or "we"
#   na         : fill strategy for missing predictions

# Page-level statistic, raw target
dc1 = dict(stat="median", length=14, log_target=0, groupby="Page", na="0")
dc2 = dict(stat="median", length=28, log_target=0, groupby="Page", na="0")
dc3 = dict(stat="mean", length=14, log_target=0, groupby="Page", na="0")
dc4 = dict(stat="mean", length=28, log_target=0, groupby="Page", na="0")
dc5 = dict(stat="median", length=56, log_target=0, groupby="Page", na="0")
dc6 = dict(stat="mean", length=56, log_target=0, groupby="Page", na="0")
dc7 = dict(stat="median", length=84, log_target=0, groupby="Page", na="0")
dc8 = dict(stat="mean", length=84, log_target=0, groupby="Page", na="0")
dc9 = dict(stat="median", length=112, log_target=0, groupby="Page", na="0")
dc10 = dict(stat="mean", length=112, log_target=0, groupby="Page", na="0")
dc11 = dict(stat="median", length=168, log_target=0, groupby="Page", na="0")
dc12 = dict(stat="mean", length=168, log_target=0, groupby="Page", na="0")

# Page-level statistic, log target
dc13 = dict(stat="median", length=28, log_target=1, groupby="Page", na="0")
dc14 = dict(stat="mean", length=28, log_target=1, groupby="Page", na="0")
dc15 = dict(stat="median", length=56, log_target=1, groupby="Page", na="0")
dc16 = dict(stat="mean", length=56, log_target=1, groupby="Page", na="0")

# Page x day-of-week statistic
dc17 = dict(stat="median", length=28, log_target=1, groupby="DayOfW", na="0")
dc18 = dict(stat="mean", length=28, log_target=0, groupby="DayOfW", na="0")
dc19 = dict(stat="median", length=56, log_target=1, groupby="DayOfW", na="0")
dc20 = dict(stat="mean", length=56, log_target=0, groupby="DayOfW", na="0")
dc21 = dict(stat="median", length=84, log_target=1, groupby="DayOfW", na="0")
dc22 = dict(stat="mean", length=84, log_target=0, groupby="DayOfW", na="0")
dc23 = dict(stat="median", length=112, log_target=1, groupby="DayOfW", na="0")
dc24 = dict(stat="mean", length=112, log_target=0, groupby="DayOfW", na="0")
dc25 = dict(stat="median", length=168, log_target=1, groupby="DayOfW", na="0")
dc26 = dict(stat="mean", length=168, log_target=0, groupby="DayOfW", na="0")

# Page x weekend-flag statistic, missing predictions filled with 0
dc27 = dict(stat="median", length=28, log_target=1, groupby="we", na="0")
dc28 = dict(stat="mean", length=28, log_target=0, groupby="we", na="0")
dc29 = dict(stat="median", length=56, log_target=1, groupby="we", na="0")
dc30 = dict(stat="mean", length=56, log_target=0, groupby="we", na="0")
dc31 = dict(stat="median", length=84, log_target=1, groupby="we", na="0")
dc32 = dict(stat="mean", length=84, log_target=0, groupby="we", na="0")
dc33 = dict(stat="median", length=112, log_target=1, groupby="we", na="0")
dc34 = dict(stat="mean", length=112, log_target=0, groupby="we", na="0")
dc35 = dict(stat="median", length=168, log_target=1, groupby="we", na="0")
dc36 = dict(stat="mean", length=168, log_target=0, groupby="we", na="0")

# Page x weekend-flag statistic, missing predictions filled with the global median
dc37 = dict(stat="median", length=28, log_target=1, groupby="we", na="all_median")
dc38 = dict(stat="mean", length=28, log_target=0, groupby="we", na="all_median")
dc39 = dict(stat="median", length=56, log_target=1, groupby="we", na="all_median")
dc40 = dict(stat="mean", length=56, log_target=0, groupby="we", na="all_median")
dc41 = dict(stat="median", length=112, log_target=1, groupby="we", na="all_median")
dc42 = dict(stat="mean", length=112, log_target=0, groupby="we", na="all_median")
dc43 = dict(stat="median", length=168, log_target=1, groupby="we", na="all_median")
dc44 = dict(stat="mean", length=168, log_target=0, groupby="we", na="all_median")
dc45 = dict(stat="median", length=21, log_target=1, groupby="we", na="all_median")
dc46 = dict(stat="mean", length=21, log_target=0, groupby="we", na="all_median")
dc47 = dict(stat="median", length=14, log_target=1, groupby="we", na="all_median")
dc48 = dict(stat="mean", length=14, log_target=0, groupby="we", na="all_median")
# dc49-dc56 are deliberately not defined. They were drafted as copies of
# dc37-dc44 (same stat / length / log_target / groupby, in the same order)
# with "na" : "month_median", but fit_predict has no working implementation
# of that strategy, so their missing predictions would stay NaN.
# TODO: implement "month_median" in fit_predict, then add dc49-dc56 back.

# Configurations evaluated by this run. Every name listed here must be
# defined above; the implemented "all_median" counterparts of the disabled
# dc49-dc56 are used.
data_configs = {
    "dc37" : dc37,
    "dc38" : dc38,
    "dc39" : dc39,
    "dc40" : dc40,
    "dc41" : dc41,
    "dc42" : dc42,
    "dc43" : dc43,
    "dc44" : dc44,
}


<b>Script</b>

In [None]:
# Get data
# Training series, followed by a quick look at its size and first rows
init_train = pd.read_csv("../raw_data/train_1.csv")
display(init_train.shape, init_train.head(3))

# Test key file, inspected the same way
test = pd.read_csv("../raw_data/key_1.csv")
display(test.shape, test.head(3))



In [None]:
# Build five train/validation folds by column position in init_train.
# Every validation window is 60 columns wide and starts 10 columns after the
# end of its training window; each fold is shifted 60 columns later than the
# previous one.
def _page_window(first_col, last_col):
    """Return "Page" plus the init_train columns at positions [first_col, last_col)."""
    return init_train[["Page"] + list(init_train.columns[first_col:last_col])]

train1, validation1 = _page_window(1, 241), _page_window(251, 311)
train2, validation2 = _page_window(1, 301), _page_window(311, 371)
train3, validation3 = _page_window(1, 361), _page_window(371, 431)
train4, validation4 = _page_window(1, 421), _page_window(431, 491)
train5, validation5 = _page_window(1, 481), _page_window(491, 551)

trains = [train1, train2, train3, train4, train5]
vals = [validation1, validation2, validation3, validation4, validation5]
gc.collect()

In [None]:
# Cross-validate every selected configuration and collect the scores
results = run_generator(
    data_configs=data_configs,
    trains=trains,
    vals=vals,
    init_train=init_train,
    test=test,
)

In [None]:
# Rank configurations from best (lowest CV score) to worst and show them
results = results.sort_values("cv").reset_index(drop=True)
display(results)