In [2]:
# Imports
import pandas as pd
import numpy as np
from IPython.display import HTML, display
import warnings
import os
import time
from sklearn.model_selection import KFold
from sklearn.linear_model import LassoCV, LassoLarsCV, RidgeCV
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor, DMatrix

# Visualization options
# Render floats with 5 decimals and show every column when a frame is displayed
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.set_option('display.max_columns', None)
%matplotlib inline
# Widen the notebook container to 90% of the browser window
display(HTML("<style>.container { width: 90% !important; }</style>"))
# NOTE(review): this silences every warning for the whole session, including deprecation notices from the libraries used below
warnings.filterwarnings("ignore")

# Global variables
# Seed passed as random_state to the KFold splitter and to the seeded estimators
SEED = 420
# Worker count handed to the estimators that accept n_jobs
N_JOBS = 18
# Number of KFold splits used when cross-validating
FOLDS = 10

<b>Functions</b>

In [3]:
# Stack all single Level 1 models OOF and test preds as entries for a Level 2 model
def _stack_y_columns(csv_dir) :
    """Read the ``y`` column of every file in ``csv_dir`` into one DataFrame.

    Files are visited in sorted name order; each one contributes a column
    named after the first 7 characters of its file name.
    Returns an empty DataFrame when the directory holds no file.
    """
    columns = []
    for file_name in sorted(os.listdir(csv_dir)) :
        # os.path.join works whether or not csv_dir ends with a separator
        frame = pd.read_csv(os.path.join(csv_dir, file_name))
        columns.append(pd.Series(frame["y"], name = file_name[0:7]))

    if not columns :
        return(pd.DataFrame())

    # One concat at the end instead of re-copying the frame for every file
    return(pd.concat(columns, axis = 1))


def get_level_one_data(oof_preds_dir, oof_targets_dir, preds_dir) :
    """Assemble the Level 2 inputs from the Level 1 CSV outputs.

    Parameters
    ----------
    oof_preds_dir : str
        Directory of CSV files holding out-of-fold predictions in a ``y`` column.
    oof_targets_dir : str
        Directory of CSV files holding the matching out-of-fold targets in a ``y`` column.
    preds_dir : str
        Directory of CSV files holding test-set predictions in a ``y`` column.

    Returns
    -------
    tuple of DataFrame
        ``(level_one_preds, level_one_targets, level_one_test)``, each with one
        column per file of the corresponding directory.
    """
    level_one_preds = _stack_y_columns(oof_preds_dir)
    level_one_targets = _stack_y_columns(oof_targets_dir)
    # Same for test set preds
    level_one_test = _stack_y_columns(preds_dir)

    return(level_one_preds, level_one_targets, level_one_test)

In [4]:
def get_cv_score(X_train, X_train_y, model, model_name = None, conf = None, conf_name = None) :
    """Cross-validate ``model`` and return the r2 score of its pooled out-of-fold predictions.

    Parameters
    ----------
    X_train : DataFrame
        Feature matrix, indexed positionally with ``.iloc``.
    X_train_y : DataFrame
        Targets; must expose a ``y`` column and have as many rows as ``X_train``.
    model : estimator
        Object with ``fit(X, y)`` and ``predict(X)``; it is refitted on every fold.
    model_name : str, optional
        Not used by the computation; kept for call-site compatibility.
    conf : dict, optional
        When given, must contain ``"log_target"``. A value >= 0 triggers the
        back-transform ``exp(x) - conf["log_target"]`` on both predictions and
        targets before scoring (presumably undoing an upstream
        ``log(y + offset)`` -- confirm). ``None`` means no back-transform.
    conf_name : str, optional
        Not used by the computation; kept for call-site compatibility.

    Returns
    -------
    float
        r2 score computed once over the concatenation of all folds.
    """
    fold_preds = []
    fold_targets = []
    kf = KFold(n_splits = FOLDS, shuffle = True, random_state = SEED)
    for train_index, test_index in kf.split(X_train) :
        X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_fold, y_test_fold = X_train_y.iloc[train_index], X_train_y.iloc[test_index]

        # Fit and predict
        model.fit(X_train_fold, y_train_fold.y)
        preds_X_test_fold = model.predict(X_test_fold)

        # Collect per-fold pieces; predictions and targets stay in the same row order
        fold_preds.append(pd.Series(preds_X_test_fold))
        fold_targets.append(pd.Series(y_test_fold.y))

    # Single concat at the end: no empty-DataFrame-to-Series coercion, no quadratic copying
    oof_preds = pd.concat(fold_preds, axis = 0)
    oof_targets = pd.concat(fold_targets, axis = 0)

    if (conf is not None and conf["log_target"] >= 0) :
        # Transform preds back if needed
        oof_preds = np.exp(oof_preds) - conf["log_target"]
        oof_targets = np.exp(oof_targets) - conf["log_target"]

    # Compute error on concatenated OOF predictions
    cv_score = r2_score(oof_targets, oof_preds)
    print("Global OOF r2_score : " + str(cv_score))

    return(cv_score)

<b>Models</b>

In [5]:
# Level 2 (stacker) candidates; each key is the short tag that ends up in the output file name
models = {}

# --- Linear models with cross-validated regularisation ---
models["la"] = LassoCV(
    eps=0.0001,
    n_alphas=100,
    max_iter=10000,
    tol=0.0001,
    normalize=True,
    precompute=True,
    random_state=SEED,
    n_jobs=N_JOBS,
)
models["ll"] = LassoLarsCV(
    max_n_alphas=1000,
    max_iter=10000,
    normalize=True,
    precompute=True,
    n_jobs=N_JOBS,
)
models["ri"] = RidgeCV(
    alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6],
    normalize=True,
)

# --- Bagged tree ensembles (identical hyper-parameters on purpose) ---
models["rf"] = RandomForestRegressor(
    n_estimators=200,
    criterion="mse",
    max_features=0.75,
    max_depth=8,
    min_samples_split=5,
    min_samples_leaf=2,
    bootstrap=True,
    n_jobs=N_JOBS,
    random_state=SEED,
)
models["et"] = ExtraTreesRegressor(
    n_estimators=200,
    criterion="mse",
    max_features=0.75,
    max_depth=8,
    min_samples_split=5,
    min_samples_leaf=2,
    bootstrap=True,
    n_jobs=N_JOBS,
    random_state=SEED,
)

# --- Disabled candidates, kept for reference ---
# models["kn"] = KNeighborsRegressor(
#     n_neighbors=100,
#     weights="distance",
#     p=3,
#     n_jobs=N_JOBS,
# )
# models["gb"] = GradientBoostingRegressor(
#     loss="ls",
#     learning_rate=0.03,
#     n_estimators=200,
#     max_depth=8,
#     criterion="friedman_mse",
#     min_samples_split=5,
#     min_samples_leaf=2,
#     subsample=0.75,
#     max_features=0.75,
#     random_state=SEED,
# )
# models["xg"] = XGBRegressor(
#     objective="reg:linear",
#     learning_rate=0.03,
#     n_estimators=200,
#     max_depth=8,
#     min_child_weight=2,
#     subsample=0.75,
#     colsample_bytree=0.75,
#     colsample_bylevel=0.75,
#     nthread=N_JOBS,
#     seed=SEED,
# )

<b>Script</b>

In [6]:
# Load the raw test set and keep its ID column at hand as an array
init_test = pd.read_csv("raw_data/test.csv")
test_ids = init_test["ID"].values


In [7]:
# Define probe df
# Test-set IDs for which a y value is imposed, and the value to impose for each
# (probe_ids[k] pairs with probe_values[k])
probe_ids = [1, 12, 23, 28, 42, 
             43, 45, 57, 72, 78, 
             88, 89, 93, 94, 104, 
             105, 110, 253, 259, 262, 
             337, 409, 437, 488, 493, 
             973, 1001, 1004, 1008, 1009, 
             1644, 1652, 1664, 2129, 2342, 
             3977, 7055, 8002, 8007, 8416]
probe_values = [71.34112, 109.30903, 115.21953, 92.00675, 87.73572, 
                129.79876, 99.55671, 116.02167, 110.54742, 125.28849, 
                90.33211, 130.55165, 105.79792, 103.04672, 92.37968, 
                108.5069, 83.31692, 115.93724, 93.33662, 75.35182, 
                101.23135, 91.00760, 85.96960, 113.39009, 108.40135, 
                106.76189, 111.65212, 91.472, 106.71967, 108.21841, 
                99.14157, 89.77625, 112.93977, 112.03, 93.06, 
                132.08556, 91.549, 95.84858, 87.44019, 96.84773]
assert len(probe_ids) == len(probe_values), "probe_ids and probe_values must pair up"

# Attach each value through its ID rather than by row position: the result no
# longer depends on the row order of the test file, and an ID that is absent
# from the test set is simply skipped instead of raising a length mismatch
probe_value_by_id = dict(zip(probe_ids, probe_values))
new_train = init_test[init_test["ID"].isin(probe_ids)]
new_y = pd.DataFrame({"y" : new_train["ID"].map(probe_value_by_id)}, index = new_train.index)
probe_df = pd.concat([new_train, new_y], axis = 1)


In [8]:
# Prepare process
# Three parallel lists: position k of each list refers to the same Level 1 configuration
oof_preds_dirs = [
    "oof_preds/dc1_4/",
    "oof_preds/dc5_8/",
    "oof_preds/dc9_12/",
    "oof_preds/dc13_16/",
]

oof_targets_dirs = [
    "oof_targets/dc1_4/",
    "oof_targets/dc5_8/",
    "oof_targets/dc9_12/",
    "oof_targets/dc13_16/",
]

preds_dirs = [
    "preds/dc1_4/",
    "preds/dc5_8/",
    "preds/dc9_12/",
    "preds/dc13_16/",
]


In [None]:
# Run stacker
# The Level 2 models are scored without a log back-transform:
# a negative "log_target" disables that step in get_cv_score
stacker_conf = {"log_target" : -1}

# Make sure the output directory exists before the first to_csv
os.makedirs("stack_preds", exist_ok = True)

for oof_preds_dir, oof_targets_dir, preds_dir in zip(oof_preds_dirs, oof_targets_dirs, preds_dirs) :
    print("********************")
    # e.g. "oof_preds/dc1_4/" -> "dc1_4"
    conf_name = oof_preds_dir.split("/")[1]

    # Create Level 2 inputs
    level_one_preds, level_one_targets, level_one_test = get_level_one_data(oof_preds_dir, oof_targets_dir, preds_dir)
    display(level_one_preds.head(2))
    display(level_one_preds.tail(2))

    # Get error of each OOF column
    for col in level_one_preds.columns :
        cv_score = r2_score(level_one_targets[col], level_one_preds[col])
        print("Global OOF r2_score for " + col + " : " + str(cv_score))

    # Level 2 target: a single "y" column, as get_cv_score expects.
    # NOTE(review): assumes every Level 1 model was scored against the same targets
    # in the same row order, so the first target column stands for all -- confirm.
    stack_targets = level_one_targets.iloc[:, [0]].copy()
    stack_targets.columns = ["y"]

    # Try different level 2 models
    for model_name, model in models.items() :
        print("***** MODEL : " + model_name + " *****")

        # Get CV score (argument order follows the get_cv_score signature)
        cv_score = get_cv_score(level_one_preds, stack_targets, model, model_name, stacker_conf, conf_name)

        # Fit on the full OOF matrix, predict the test set
        model.fit(level_one_preds, stack_targets["y"])
        preds = np.asarray(model.predict(level_one_test))

        # Handle format problems : some estimators hand back a 2-D array
        if preds.ndim > 1 :
            preds = preds[:, 0]

        # Modify probed values
        file_name = "stack_preds/stacker_" + conf_name + "_" + model_name + ".csv"
        final_preds = pd.DataFrame({"ID": init_test.ID.values, "y": preds})
        # After the merge, y_x is the model prediction and y_y the probed value (NaN when not probed)
        final_preds = pd.merge(final_preds, probe_df[["ID", "y"]], how = "left", on = "ID")
        # NOTE(review): replace_probed_y is not defined in this notebook; the probed value
        # is taken when present, the prediction otherwise -- confirm this matches its intent.
        final_preds["y"] = final_preds["y_y"].fillna(final_preds["y_x"])
        final_preds.drop(["y_x", "y_y"], axis = 1, inplace = True)

        # Save predictions
        final_preds.to_csv(file_name, index = False)
        display(final_preds.head(2))
