In [1]:
# Imports
import warnings
import time
import gc
import os
import pandas as pd
import numpy as np
from IPython.display import HTML, display

# Visualization options
pd.set_option('display.float_format', lambda x: '%.2f' % x)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
display(HTML("<style>.container { width: 90% !important; }</style>"))
warnings.filterwarnings("ignore")

# Global variables
SEED = 2017
N_JOBS = 18


<b>Functions</b>

In [2]:
# Define error metric
def smape(y_true, y_pred):
    denominator = (y_true + np.abs(y_pred)) / 200.0
    diff = np.abs(y_true - y_pred) / denominator
    diff[denominator == 0] = 0.0
    return np.mean(diff)
    #return np.nanmean(diff)

In [3]:
# For each data configuration, get CV score and compute test set predictions
def run_generator(data_configs, trains, vals, init_train, test) :
    # Clear predictions directories
    #list(map(os.unlink, (os.path.join("../oof_preds/temp/",f) for f in os.listdir("../oof_preds/temp/"))))
    #list(map(os.unlink, (os.path.join("../oof_targets/temp/",f) for f in os.listdir("../oof_targets/temp/"))))
    #list(map(os.unlink, (os.path.join("../preds/temp/",f) for f in os.listdir("../preds/temp/"))))
    
    # Iterate on each data configuration :
    results = pd.DataFrame()
    for conf_name, conf in data_configs.items() :
        print("***** CONFIG : " + conf_name + " ***************")
        cur_train = init_train.copy()
        cur_test = test.copy()
        
        # Cross validation
        cv_score = cross_val(conf, conf_name, trains, vals)

        # Store results
        cur_result = pd.DataFrame(columns = ["conf", "cv"])
        cur_result.loc[0] = [conf_name, cv_score] 
        results = pd.concat([results, cur_result], axis = 0)
        
        '''# Predict test set
        start = time.time()
        cur_test["Date"] = cur_test.Page.apply(lambda a: a[-10:])
        cur_test["Page"] = cur_test.Page.apply(lambda a: a[:-11])
        print("Elapsed prepare test : " + str(time.time() - start))
        cur_test, y = fit_predict(conf, cur_train, cur_test, is_test=True)
   
        # Save preds
        file_name = "../preds/temp/" + conf_name + "_preds_test.csv"
        cur_test[["Id", "Visits"]].to_csv(file_name, index = False)'''
        
        del(cur_train)
        del(cur_test)
        gc.collect()

    return(results)
    

In [4]:
# Cross validation
def cross_val(conf, conf_name, trains, vals):
    start_a = time.time()
    oof_preds = pd.DataFrame()
    oof_targets = pd.DataFrame()
    for i in range(len(trains)):
        start_f = time.time()
        train = trains[i]
        validation = vals[i]

        validation, y = fit_predict(conf, train, validation, is_test=False)

        # Compute validation score
        val_score = smape(y.Visits, validation.Visits)
        print("SMAPE fold " + str(i + 1) + " : " + str(val_score))

        # Assemble OOF predictions and targets
        oof_preds = pd.concat([pd.Series(oof_preds), pd.Series(validation.Visits)], axis = 0)
        oof_targets = pd.concat([pd.Series(oof_targets), pd.Series(y.Visits)], axis = 0)

        print("Elapsed in fold " + str(i + 1) + " : " + str(time.time() - start_f))
        del(train)
        del(validation)
        gc.collect()

    # Compute overall score
    cv_score = smape(oof_targets, oof_preds)
    print("Global SMAPE score : " + str(cv_score))

    # Save OOF preds
    file_name = "../oof_preds/temp/" + conf_name + "_preds_OOF.csv"
    pd.DataFrame({"y": oof_preds}).to_csv(file_name, index = False)
    file_name = "../oof_targets/temp/" + conf_name + "_targets_OOF.csv"
    pd.DataFrame({"y": oof_targets}).to_csv(file_name, index = False)

    print("Elapsed cross val : " + str(time.time() - start_a))
    del(oof_targets)
    del(oof_preds)
    gc.collect()

    return(cv_score)

In [5]:
# Fit and predict
def fit_predict(conf, train, validation, is_test = False) :
    # Handle training length (in days)
    if (conf["length"] != "all") :
        train = train[["Page"] + list(train.columns[-conf["length"]:])]

    # Melt df to have one observation per row
    train = pd.melt(train, 
                    id_vars="Page", 
                    var_name="Date", 
                    value_name="Visits")
    #print(train.shape)
    #display(train.head(3))

    y = 0
    if (is_test is False) :
        validation = pd.melt(validation, 
                             id_vars="Page", 
                             var_name="Date", 
                             value_name="Visits")  
        
        # Remove y from validation set
        y = validation[["Page", "Visits"]]
        validation.drop("Visits", axis=1, inplace=True)
        #display(validation.head(3))
        #print(validation.shape)
        gc.collect()

    # Handle date variable
    #start = time.time()
    train["Year"] = train["Date"].map(lambda x: int(str(x)[2:4]))
    train["Month"] = train["Date"].map(lambda x: int(str(x)[5:7]))
    train["DayOfM"] = train["Date"].map(lambda x: int(str(x)[8:10]))
    train["Date"] = train["Date"].astype("datetime64[ns]")
    train["DayOfY"] = train.Date.dt.dayofyear
    train["DayOfW"] = train.Date.dt.dayofweek
    train["is_wknd"] = (train.Date.dt.dayofweek >= 5).astype(int)
    train.drop(["Date"], inplace=True, axis=1)
    #display(train.head(3))
    #print("Elapsed train date : " + str(time.time() - start))

    #start = time.time()
    validation["Year"] = validation["Date"].map(lambda x: int(str(x)[2:4]))
    validation["Month"] = validation["Date"].map(lambda x: int(str(x)[5:7]))
    validation["DayOfM"] = validation["Date"].map(lambda x: int(str(x)[8:10]))
    validation["Date"] = validation["Date"].astype("datetime64[ns]")
    validation["DayOfY"] = validation.Date.dt.dayofyear
    validation["DayOfW"] = validation.Date.dt.dayofweek
    validation["is_wknd"] = (validation.Date.dt.dayofweek >= 5).astype(int)
    validation.drop(["Date"], inplace=True, axis=1)
    #display(validation.head(3))
    #print("Elapsed val date : " + str(time.time() - start))
    
    # Handle transformation
    if (conf["stat"] != "median") :
        pages_groups = train.groupby(["Page"]).median().reset_index()
        pages_groups = pages_groups[["Page", "Visits"]]
    elif (conf["stat"] != "mean") :
        pages_groups = train.groupby(["Page"]).mean().reset_index()
        pages_groups = pages_groups[["Page", "Visits"]]

    # Make validation predictions
    validation = validation.merge(pages_groups, how="left")
    #display(validation.head(3))
    #display(validation.tail(3))
    del(pages_groups)
    gc.collect()

    # Handle NAs
    validation.loc[validation.Visits.isnull(), "Visits"] = 0
    
    return(validation, y)


<b>Config</b>

In [6]:
dc1 = {"stat" : "median",
       "length" : 14}
dc2 = {"stat" : "median",
       "length" : 28}
dc3 = {"stat" : "mean",
       "length" : 14}
dc4 = {"stat" : "mean",
       "length" : 28}
dc5 = {"stat" : "median",
       "length" : 56} # 8 weeks
dc6 = {"stat" : "mean",
       "length" : 56}
dc7 = {"stat" : "median", 
       "length" : 84} # 12 weeks
dc8 = {"stat" : "mean",
       "length" : 84}
dc9 = {"stat" : "median", 
       "length" : 112} # 16 weeks
dc10 = {"stat" : "mean",
       "length" : 112}
dc11 = {"stat" : "median", 
       "length" : 168} # 24 weeks (6 * 4 weeks)
dc12 = {"stat" : "mean",
       "length" : 168}

data_configs = {
    "dc11" : dc11,
    "dc12" : dc12,
}


<b>Script</b>

In [7]:
# Get data
init_train = pd.read_csv("../raw_data/train_1.csv")
display(init_train.shape)
display(init_train.head(3))

test = pd.read_csv("../raw_data/key_1.csv")
display(test.shape)
display(test.head(3))



(145063, 551)

Unnamed: 0,Page,2015-07-01,2015-07-02,2015-07-03,2015-07-04,2015-07-05,2015-07-06,2015-07-07,2015-07-08,2015-07-09,2015-07-10,2015-07-11,2015-07-12,2015-07-13,2015-07-14,2015-07-15,2015-07-16,2015-07-17,2015-07-18,2015-07-19,2015-07-20,2015-07-21,2015-07-22,2015-07-23,2015-07-24,2015-07-25,2015-07-26,2015-07-27,2015-07-28,2015-07-29,2015-07-30,2015-07-31,2015-08-01,2015-08-02,2015-08-03,2015-08-04,2015-08-05,2015-08-06,2015-08-07,2015-08-08,2015-08-09,2015-08-10,2015-08-11,2015-08-12,2015-08-13,2015-08-14,2015-08-15,2015-08-16,2015-08-17,2015-08-18,2015-08-19,2015-08-20,2015-08-21,2015-08-22,2015-08-23,2015-08-24,2015-08-25,2015-08-26,2015-08-27,2015-08-28,2015-08-29,2015-08-30,2015-08-31,2015-09-01,2015-09-02,2015-09-03,2015-09-04,2015-09-05,2015-09-06,2015-09-07,2015-09-08,2015-09-09,2015-09-10,2015-09-11,2015-09-12,2015-09-13,2015-09-14,2015-09-15,2015-09-16,2015-09-17,2015-09-18,2015-09-19,2015-09-20,2015-09-21,2015-09-22,2015-09-23,2015-09-24,2015-09-25,2015-09-26,2015-09-27,2015-09-28,2015-09-29,2015-09-30,2015-10-01,2015-10-02,2015-10-03,2015-10-04,2015-10-05,2015-10-06,2015-10-07,2015-10-08,2015-10-09,2015-10-10,2015-10-11,2015-10-12,2015-10-13,2015-10-14,2015-10-15,2015-10-16,2015-10-17,2015-10-18,2015-10-19,2015-10-20,2015-10-21,2015-10-22,2015-10-23,2015-10-24,2015-10-25,2015-10-26,2015-10-27,2015-10-28,2015-10-29,2015-10-30,2015-10-31,2015-11-01,2015-11-02,2015-11-03,2015-11-04,2015-11-05,2015-11-06,2015-11-07,2015-11-08,2015-11-09,2015-11-10,2015-11-11,2015-11-12,2015-11-13,2015-11-14,2015-11-15,2015-11-16,2015-11-17,2015-11-18,2015-11-19,2015-11-20,2015-11-21,2015-11-22,2015-11-23,2015-11-24,2015-11-25,2015-11-26,2015-11-27,2015-11-28,2015-11-29,2015-11-30,2015-12-01,2015-12-02,2015-12-03,2015-12-04,2015-12-05,2015-12-06,2015-12-07,2015-12-08,2015-12-09,2015-12-10,2015-12-11,2015-12-12,2015-12-13,2015-12-14,2015-12-15,2015-12-16,2015-12-17,2015-12-18,2015-12-19,2015-12-20,2015-12-21,2015-12-22,2015-12-23,2015-12-24,2015-12-25,2015-12-26,2015-12-27,2015-12-28,2015-12-29,2015-12-30,2015-12-31,2016-01-01,2016-01-02,2016-01-03,2016-01-04,2016-01-05,2016-01-06,2016-01-07,2016-01-08,2016-01-09,2016-01-10,2016-01-11,2016-01-12,2016-01-13,2016-01-14,2016-01-15,2016-01-16,2016-01-17,2016-01-18,2016-01-19,2016-01-20,2016-01-21,2016-01-22,2016-01-23,2016-01-24,2016-01-25,2016-01-26,2016-01-27,2016-01-28,2016-01-29,2016-01-30,2016-01-31,2016-02-01,2016-02-02,2016-02-03,2016-02-04,2016-02-05,2016-02-06,2016-02-07,2016-02-08,2016-02-09,2016-02-10,2016-02-11,2016-02-12,2016-02-13,2016-02-14,2016-02-15,2016-02-16,2016-02-17,2016-02-18,2016-02-19,2016-02-20,2016-02-21,2016-02-22,2016-02-23,2016-02-24,2016-02-25,2016-02-26,2016-02-27,2016-02-28,2016-02-29,2016-03-01,2016-03-02,2016-03-03,2016-03-04,2016-03-05,2016-03-06,2016-03-07,2016-03-08,2016-03-09,2016-03-10,2016-03-11,2016-03-12,2016-03-13,2016-03-14,2016-03-15,2016-03-16,2016-03-17,2016-03-18,2016-03-19,2016-03-20,2016-03-21,2016-03-22,2016-03-23,2016-03-24,2016-03-25,2016-03-26,2016-03-27,2016-03-28,2016-03-29,2016-03-30,2016-03-31,2016-04-01,2016-04-02,2016-04-03,2016-04-04,2016-04-05,2016-04-06,2016-04-07,2016-04-08,2016-04-09,2016-04-10,2016-04-11,2016-04-12,2016-04-13,2016-04-14,2016-04-15,2016-04-16,2016-04-17,2016-04-18,2016-04-19,2016-04-20,2016-04-21,2016-04-22,2016-04-23,2016-04-24,2016-04-25,2016-04-26,2016-04-27,2016-04-28,2016-04-29,2016-04-30,2016-05-01,2016-05-02,2016-05-03,2016-05-04,2016-05-05,2016-05-06,2016-05-07,2016-05-08,2016-05-09,2016-05-10,2016-05-11,2016-05-12,2016-05-13,2016-05-14,2016-05-15,2016-05-16,2016-05-17,2016-05-18,2016-05-19,2016-05-20,2016-05-21,2016-05-22,2016-05-23,2016-05-24,2016-05-25,2016-05-26,2016-05-27,2016-05-28,2016-05-29,2016-05-30,2016-05-31,2016-06-01,2016-06-02,2016-06-03,2016-06-04,2016-06-05,2016-06-06,2016-06-07,2016-06-08,2016-06-09,2016-06-10,2016-06-11,2016-06-12,2016-06-13,2016-06-14,2016-06-15,2016-06-16,2016-06-17,2016-06-18,2016-06-19,2016-06-20,2016-06-21,2016-06-22,2016-06-23,2016-06-24,2016-06-25,2016-06-26,2016-06-27,2016-06-28,2016-06-29,2016-06-30,2016-07-01,2016-07-02,2016-07-03,2016-07-04,2016-07-05,2016-07-06,2016-07-07,2016-07-08,2016-07-09,2016-07-10,2016-07-11,2016-07-12,2016-07-13,2016-07-14,2016-07-15,2016-07-16,2016-07-17,2016-07-18,2016-07-19,2016-07-20,2016-07-21,2016-07-22,2016-07-23,2016-07-24,2016-07-25,2016-07-26,2016-07-27,2016-07-28,2016-07-29,2016-07-30,2016-07-31,2016-08-01,2016-08-02,2016-08-03,2016-08-04,2016-08-05,2016-08-06,2016-08-07,2016-08-08,2016-08-09,2016-08-10,2016-08-11,2016-08-12,2016-08-13,2016-08-14,2016-08-15,2016-08-16,2016-08-17,2016-08-18,2016-08-19,2016-08-20,2016-08-21,2016-08-22,2016-08-23,2016-08-24,2016-08-25,2016-08-26,2016-08-27,2016-08-28,2016-08-29,2016-08-30,2016-08-31,2016-09-01,2016-09-02,2016-09-03,2016-09-04,2016-09-05,2016-09-06,2016-09-07,2016-09-08,2016-09-09,2016-09-10,2016-09-11,2016-09-12,2016-09-13,2016-09-14,2016-09-15,2016-09-16,2016-09-17,2016-09-18,2016-09-19,2016-09-20,2016-09-21,2016-09-22,2016-09-23,2016-09-24,2016-09-25,2016-09-26,2016-09-27,2016-09-28,2016-09-29,2016-09-30,2016-10-01,2016-10-02,2016-10-03,2016-10-04,2016-10-05,2016-10-06,2016-10-07,2016-10-08,2016-10-09,2016-10-10,2016-10-11,2016-10-12,2016-10-13,2016-10-14,2016-10-15,2016-10-16,2016-10-17,2016-10-18,2016-10-19,2016-10-20,2016-10-21,2016-10-22,2016-10-23,2016-10-24,2016-10-25,2016-10-26,2016-10-27,2016-10-28,2016-10-29,2016-10-30,2016-10-31,2016-11-01,2016-11-02,2016-11-03,2016-11-04,2016-11-05,2016-11-06,2016-11-07,2016-11-08,2016-11-09,2016-11-10,2016-11-11,2016-11-12,2016-11-13,2016-11-14,2016-11-15,2016-11-16,2016-11-17,2016-11-18,2016-11-19,2016-11-20,2016-11-21,2016-11-22,2016-11-23,2016-11-24,2016-11-25,2016-11-26,2016-11-27,2016-11-28,2016-11-29,2016-11-30,2016-12-01,2016-12-02,2016-12-03,2016-12-04,2016-12-05,2016-12-06,2016-12-07,2016-12-08,2016-12-09,2016-12-10,2016-12-11,2016-12-12,2016-12-13,2016-12-14,2016-12-15,2016-12-16,2016-12-17,2016-12-18,2016-12-19,2016-12-20,2016-12-21,2016-12-22,2016-12-23,2016-12-24,2016-12-25,2016-12-26,2016-12-27,2016-12-28,2016-12-29,2016-12-30,2016-12-31
0,2NE1_zh.wikipedia.org_all-access_spider,18.0,11.0,5.0,13.0,14.0,9.0,9.0,22.0,26.0,24.0,19.0,10.0,14.0,15.0,8.0,16.0,8.0,8.0,16.0,7.0,11.0,10.0,20.0,18.0,15.0,14.0,49.0,10.0,16.0,18.0,8.0,5.0,9.0,7.0,13.0,9.0,7.0,4.0,11.0,10.0,5.0,9.0,9.0,9.0,9.0,13.0,4.0,15.0,25.0,9.0,5.0,6.0,20.0,3.0,14.0,46.0,5.0,5.0,13.0,4.0,9.0,10.0,9.0,11.0,11.0,11.0,9.0,15.0,5.0,10.0,7.0,4.0,8.0,9.0,10.0,6.0,13.0,16.0,6.0,24.0,9.0,11.0,12.0,8.0,14.0,6.0,6.0,11.0,14.0,6.0,10.0,20.0,7.0,15.0,8.0,15.0,5.0,8.0,8.0,5.0,11.0,165.0,34.0,6.0,13.0,8.0,9.0,11.0,26.0,18.0,3.0,5.0,12.0,6.0,16.0,19.0,9.0,10.0,11.0,11.0,7.0,9.0,10.0,24.0,6.0,6.0,8.0,16.0,13.0,10.0,10.0,6.0,5.0,20.0,6.0,47.0,9.0,9.0,12.0,11.0,17.0,15.0,14.0,11.0,97.0,11.0,12.0,11.0,14.0,15.0,12.0,104.0,5.0,22.0,45.0,75.0,29.0,34.0,20.0,12.0,25.0,9.0,62.0,20.0,19.0,8.0,23.0,13.0,16.0,34.0,36.0,11.0,18.0,12.0,24.0,30.0,27.0,44.0,35.0,53.0,11.0,26.0,13.0,18.0,9.0,16.0,6.0,19.0,20.0,19.0,22.0,30.0,14.0,16.0,22.0,15.0,15.0,26.0,16.0,13.0,27.0,18.0,13.0,32.0,31.0,16.0,38.0,18.0,9.0,14.0,10.0,24.0,8.0,15.0,18.0,10.0,23.0,17.0,11.0,26.0,14.0,8.0,12.0,9.0,11.0,34.0,17.0,29.0,11.0,9.0,14.0,21.0,12.0,11.0,13.0,11.0,13.0,16.0,13.0,19.0,21.0,14.0,11.0,35.0,18.0,42.0,15.0,5.0,21.0,56.0,9.0,20.0,17.0,18.0,8.0,9.0,17.0,9.0,10.0,14.0,17.0,6.0,18.0,13.0,11.0,12.0,11.0,8.0,15.0,11.0,20.0,59.0,11.0,18.0,17.0,12.0,14.0,13.0,9.0,490.0,189.0,102.0,38.0,126.0,71.0,21.0,57.0,79.0,17.0,17.0,23.0,16.0,23.0,18.0,22.0,44.0,6.0,31.0,17.0,25.0,40.0,19.0,15.0,15.0,29.0,18.0,16.0,13.0,20.0,22.0,19.0,11.0,50.0,22.0,39.0,23.0,21.0,23.0,22.0,16.0,19.0,35.0,16.0,12.0,15.0,13.0,14.0,10.0,21.0,20.0,19.0,14.0,12.0,15.0,17.0,16.0,21.0,27.0,13.0,11.0,15.0,14.0,18.0,18.0,10.0,11.0,14.0,18.0,14.0,13.0,17.0,15.0,14.0,234.0,8.0,62.0,26.0,22.0,8.0,22.0,15.0,69.0,11.0,18.0,23.0,12.0,20.0,17.0,15.0,16.0,18.0,21.0,15.0,30.0,115.0,56.0,45.0,17.0,18.0,15.0,18.0,14.0,15.0,15.0,24.0,22.0,18.0,30.0,12.0,13.0,18.0,17.0,31.0,26.0,29.0,12.0,19.0,19.0,57.0,17.0,20.0,49.0,10.0,19.0,26.0,41.0,23.0,30.0,55.0,17.0,24.0,14.0,12.0,49.0,42.0,37.0,13.0,30.0,20.0,33.0,20.0,14.0,40.0,15.0,18.0,26.0,8.0,25.0,21.0,20.0,25.0,19.0,23.0,18.0,19.0,18.0,55.0,16.0,65.0,11.0,11.0,13.0,20.0,21.0,13.0,24.0,20.0,13.0,32.0,16.0,10.0,13.0,44.0,17.0,13.0,72.0,40.0,19.0,14.0,13.0,12.0,14.0,10.0,26.0,13.0,22.0,14.0,23.0,12.0,8.0,50.0,13.0,10.0,16.0,14.0,10.0,24.0,10.0,20.0,10.0,26.0,25.0,16.0,19.0,20.0,12.0,19.0,50.0,16.0,30.0,18.0,25.0,14.0,20.0,8.0,67.0,13.0,41.0,10.0,21.0,13.0,8.0,15.0,14.0,12.0,6.0,11.0,10.0,42.0,21.0,24.0,14.0,11.0,204.0,14.0,45.0,33.0,28.0,18.0,14.0,47.0,15.0,14.0,18.0,20.0,14.0,16.0,14.0,20.0,60.0,22.0,15.0,17.0,19.0,18.0,21.0,21.0,47.0,65.0,17.0,32.0,63.0,15.0,26.0,14.0,20.0,22.0,19.0,18.0,20.0
1,2PM_zh.wikipedia.org_all-access_spider,11.0,14.0,15.0,18.0,11.0,13.0,22.0,11.0,10.0,4.0,41.0,65.0,57.0,38.0,20.0,62.0,44.0,15.0,10.0,47.0,24.0,17.0,22.0,9.0,39.0,13.0,11.0,12.0,21.0,19.0,9.0,15.0,33.0,8.0,8.0,7.0,13.0,2.0,23.0,12.0,27.0,27.0,36.0,23.0,58.0,80.0,60.0,69.0,42.0,161.0,94.0,77.0,78.0,20.0,24.0,13.0,14.0,26.0,8.0,82.0,22.0,11.0,81.0,37.0,9.0,40.0,47.0,18.0,23.0,6.0,2.0,7.0,16.0,10.0,34.0,14.0,31.0,20.0,23.0,14.0,16.0,34.0,15.0,30.0,13.0,30.0,15.0,25.0,17.0,8.0,12.0,17.0,10.0,21.0,18.0,30.0,13.0,7.0,15.0,23.0,20.0,15.0,9.0,47.0,14.0,11.0,16.0,12.0,7.0,15.0,14.0,12.0,18.0,29.0,39.0,11.0,14.0,28.0,17.0,20.0,17.0,36.0,13.0,11.0,14.0,14.0,14.0,33.0,14.0,13.0,18.0,13.0,11.0,8.0,10.0,11.0,81.0,14.0,20.0,6.0,16.0,18.0,9.0,12.0,10.0,8.0,11.0,14.0,47.0,13.0,13.0,6.0,10.0,8.0,8.0,8.0,18.0,31.0,16.0,15.0,10.0,13.0,9.0,32.0,161.0,6.0,20.0,8.0,11.0,13.0,8.0,19.0,7.0,9.0,16.0,11.0,6.0,38.0,11.0,17.0,13.0,12.0,12.0,9.0,7.0,15.0,14.0,14.0,11.0,13.0,12.0,12.0,24.0,15.0,38.0,18.0,26.0,15.0,12.0,14.0,40.0,19.0,13.0,39.0,19.0,16.0,19.0,11.0,76.0,14.0,19.0,26.0,19.0,17.0,30.0,17.0,17.0,17.0,19.0,11.0,175.0,10.0,5.0,12.0,7.0,12.0,14.0,19.0,11.0,19.0,17.0,15.0,19.0,15.0,9.0,20.0,6.0,11.0,6.0,15.0,20.0,35.0,34.0,21.0,17.0,22.0,26.0,16.0,16.0,28.0,19.0,17.0,15.0,11.0,7.0,15.0,11.0,36.0,16.0,22.0,18.0,46.0,17.0,15.0,17.0,12.0,17.0,14.0,15.0,14.0,15.0,28.0,36.0,23.0,12.0,25.0,18.0,18.0,16.0,20.0,17.0,16.0,13.0,15.0,19.0,14.0,20.0,37.0,16.0,15.0,11.0,42.0,10.0,14.0,61.0,39.0,17.0,17.0,41.0,35.0,16.0,9.0,64.0,22.0,22.0,66.0,33.0,30.0,16.0,18.0,45.0,17.0,88.0,23.0,18.0,12.0,12.0,13.0,13.0,5.0,11.0,13.0,11.0,22.0,10.0,13.0,17.0,10.0,14.0,18.0,9.0,16.0,17.0,6.0,15.0,18.0,10.0,11.0,16.0,10.0,12.0,12.0,13.0,9.0,16.0,19.0,19.0,11.0,15.0,10.0,20.0,25.0,9.0,14.0,10.0,14.0,18.0,25.0,13.0,24.0,14.0,13.0,14.0,24.0,16.0,15.0,13.0,11.0,12.0,28.0,28.0,17.0,27.0,48.0,184.0,64.0,24.0,92.0,31.0,34.0,49.0,21.0,36.0,32.0,16.0,16.0,19.0,22.0,22.0,19.0,18.0,18.0,17.0,35.0,49.0,19.0,25.0,24.0,39.0,19.0,29.0,30.0,16.0,54.0,15.0,39.0,19.0,17.0,60.0,12.0,77.0,63.0,12.0,9.0,34.0,30.0,13.0,20.0,29.0,10.0,14.0,23.0,15.0,12.0,25.0,22.0,144.0,31.0,31.0,17.0,66.0,78.0,19.0,44.0,43.0,35.0,13.0,13.0,25.0,15.0,37.0,38.0,22.0,28.0,19.0,46.0,24.0,22.0,43.0,58.0,26.0,20.0,27.0,35.0,20.0,31.0,24.0,24.0,94.0,18.0,20.0,18.0,16.0,38.0,54.0,29.0,49.0,25.0,72.0,144.0,36.0,97.0,179.0,29.0,12.0,21.0,42.0,53.0,41.0,19.0,25.0,19.0,15.0,21.0,21.0,27.0,33.0,15.0,24.0,13.0,11.0,14.0,26.0,11.0,21.0,14.0,14.0,54.0,5.0,10.0,12.0,11.0,14.0,28.0,23.0,20.0,9.0,12.0,11.0,14.0,14.0,15.0,15.0,11.0,20.0,13.0,19.0,621.0,57.0,17.0,23.0,19.0,21.0,47.0,28.0,22.0,22.0,65.0,27.0,17.0,17.0,13.0,9.0,18.0,22.0,17.0,15.0,22.0,23.0,19.0,17.0,42.0,28.0,15.0,9.0,30.0,52.0,45.0,26.0,20.0
2,3C_zh.wikipedia.org_all-access_spider,1.0,0.0,1.0,1.0,0.0,4.0,0.0,3.0,4.0,4.0,1.0,1.0,1.0,6.0,8.0,6.0,4.0,5.0,1.0,2.0,3.0,8.0,8.0,6.0,6.0,2.0,2.0,3.0,2.0,4.0,3.0,3.0,5.0,3.0,5.0,4.0,2.0,5.0,1.0,4.0,5.0,0.0,0.0,7.0,3.0,5.0,1.0,6.0,2.0,5.0,0.0,3.0,1.0,0.0,1.0,1.0,2.0,4.0,2.0,1.0,1.0,3.0,4.0,3.0,6.0,6.0,4.0,3.0,3.0,2.0,9.0,7.0,2.0,3.0,1.0,3.0,1.0,6.0,7.0,1.0,2.0,5.0,2.0,3.0,8.0,5.0,0.0,4.0,1.0,5.0,3.0,0.0,1.0,8.0,2.0,1.0,3.0,0.0,0.0,5.0,3.0,3.0,0.0,2.0,5.0,2.0,5.0,10.0,5.0,6.0,1.0,4.0,4.0,1.0,3.0,13.0,2.0,1.0,3.0,2.0,1.0,10.0,5.0,6.0,2.0,5.0,2.0,2.0,3.0,2.0,6.0,3.0,2.0,1.0,2.0,3.0,1.0,1.0,2.0,2.0,3.0,2.0,2.0,5.0,7.0,2.0,3.0,4.0,6.0,1.0,3.0,6.0,3.0,3.0,4.0,2.0,2.0,4.0,3.0,1.0,5.0,5.0,4.0,2.0,4.0,5.0,4.0,2.0,1.0,6.0,1.0,1.0,3.0,1.0,3.0,5.0,3.0,3.0,0.0,5.0,3.0,2.0,2.0,2.0,2.0,0.0,3.0,3.0,3.0,4.0,4.0,8.0,3.0,5.0,8.0,1.0,4.0,0.0,3.0,6.0,3.0,1.0,3.0,3.0,3.0,1.0,3.0,8.0,4.0,3.0,2.0,5.0,6.0,3.0,6.0,5.0,6.0,7.0,3.0,1.0,5.0,1.0,2.0,0.0,1.0,4.0,3.0,3.0,9.0,4.0,7.0,5.0,10.0,2.0,3.0,3.0,4.0,2.0,3.0,5.0,3.0,6.0,4.0,5.0,5.0,2.0,1.0,4.0,7.0,2.0,2.0,5.0,1.0,0.0,3.0,3.0,1.0,2.0,4.0,2.0,2.0,3.0,4.0,7.0,1.0,1.0,10.0,9.0,5.0,1.0,6.0,7.0,4.0,6.0,2.0,4.0,155.0,155.0,83.0,48.0,31.0,16.0,6.0,13.0,8.0,8.0,5.0,7.0,3.0,4.0,6.0,7.0,10.0,9.0,7.0,8.0,4.0,6.0,5.0,2.0,7.0,3.0,7.0,6.0,3.0,1.0,6.0,2.0,1.0,3.0,8.0,3.0,5.0,4.0,7.0,5.0,2.0,5.0,0.0,3.0,12.0,4.0,2.0,4.0,6.0,4.0,5.0,9.0,4.0,5.0,7.0,1.0,5.0,1.0,5.0,4.0,5.0,7.0,7.0,5.0,3.0,4.0,1.0,9.0,3.0,4.0,6.0,2.0,2.0,1.0,16.0,6.0,3.0,3.0,6.0,1.0,6.0,1.0,4.0,3.0,5.0,1.0,6.0,5.0,1.0,4.0,5.0,4.0,2.0,4.0,3.0,4.0,2.0,0.0,1.0,3.0,12.0,4.0,7.0,5.0,6.0,6.0,6.0,3.0,3.0,3.0,5.0,5.0,2.0,11.0,6.0,2.0,2.0,3.0,7.0,5.0,4.0,5.0,3.0,3.0,9.0,7.0,2.0,1.0,5.0,6.0,7.0,13.0,3.0,5.0,6.0,2.0,4.0,1.0,2.0,7.0,2.0,2.0,4.0,4.0,2.0,5.0,3.0,2.0,3.0,5.0,4.0,2.0,5.0,7.0,5.0,2.0,7.0,6.0,11.0,10.0,5.0,19.0,7.0,11.0,4.0,10.0,3.0,4.0,6.0,3.0,4.0,8.0,10.0,3.0,3.0,1.0,10.0,5.0,4.0,4.0,3.0,4.0,1.0,3.0,6.0,6.0,6.0,3.0,5.0,11.0,6.0,3.0,7.0,6.0,0.0,2.0,4.0,4.0,3.0,6.0,4.0,3.0,4.0,1.0,6.0,5.0,5.0,2.0,3.0,3.0,2.0,2.0,6.0,1.0,3.0,3.0,3.0,2.0,10.0,2.0,2.0,2.0,7.0,3.0,6.0,4.0,2.0,4.0,6.0,5.0,4.0,4.0,3.0,3.0,9.0,3.0,5.0,4.0,0.0,1.0,4.0,5.0,8.0,8.0,1.0,1.0,2.0,5.0,3.0,3.0,3.0,7.0,3.0,9.0,8.0,3.0,210.0,5.0,4.0,6.0,2.0,2.0,4.0,3.0,3.0,1.0,1.0,7.0,4.0,4.0,6.0,3.0,4.0,17.0


(8703780, 2)

Unnamed: 0,Page,Id
0,!vote_en.wikipedia.org_all-access_all-agents_2...,bf4edcf969af
1,!vote_en.wikipedia.org_all-access_all-agents_2...,929ed2bf52b9
2,!vote_en.wikipedia.org_all-access_all-agents_2...,ff29d0f51d5c


In [8]:
# Create validation data - 60 days val sets, 10 days between train and val
validation5 = init_train[["Page"] + list(init_train.columns[491:551])]
train5 = init_train[["Page"] + list(init_train.columns[1:481])]

validation4 = init_train[["Page"] + list(init_train.columns[431:491])]
train4 = init_train[["Page"] + list(init_train.columns[1:421])]

validation3 = init_train[["Page"] + list(init_train.columns[371:431])]
train3 = init_train[["Page"] + list(init_train.columns[1:361])]

validation2 = init_train[["Page"] + list(init_train.columns[311:371])]
train2 = init_train[["Page"] + list(init_train.columns[1:301])]

validation1 = init_train[["Page"] + list(init_train.columns[251:311])]
train1 = init_train[["Page"] + list(init_train.columns[1:241])]

trains = [train1, train2, train3, train4, train5]
vals = [validation1, validation2, validation3, validation4, validation5]
gc.collect()

27

In [9]:
# Generate preds
results = run_generator(data_configs, trains, vals, init_train, test)

***** CONFIG : dc11 ***************
SMAPE fold 1 : 55.8237945678
Elapsed in fold 1 : 240.81540060043335
SMAPE fold 2 : 62.940635399
Elapsed in fold 2 : 237.71068382263184
SMAPE fold 3 : 69.1071397552
Elapsed in fold 3 : 238.89702725410461
SMAPE fold 4 : 65.2478760226
Elapsed in fold 4 : 240.39991188049316
SMAPE fold 5 : 65.8067628754
Elapsed in fold 5 : 239.20644426345825
Global SMAPE score : 63.8397972784
Elapsed cross val : 2044.7637009620667
***** CONFIG : dc12 ***************
SMAPE fold 1 : 48.9870190577
Elapsed in fold 1 : 241.87780904769897
SMAPE fold 2 : 50.9469663169
Elapsed in fold 2 : 240.80409622192383
SMAPE fold 3 : 53.4400846606
Elapsed in fold 3 : 240.12034559249878
SMAPE fold 4 : 50.3973908527
Elapsed in fold 4 : 242.08399438858032
SMAPE fold 5 : 52.1803124493
Elapsed in fold 5 : 242.20941591262817
Global SMAPE score : 51.204375353
Elapsed cross val : 2000.975704908371


In [10]:
# Show ordered results
results.sort_values(["cv"], ascending=True, inplace=True)
results.reset_index(drop=True, inplace=True)
display(results)

Unnamed: 0,conf,cv
0,dc12,51.2
1,dc11,63.84
