In [1]:
# Imports
import os
import pandas as pd
import numpy as np
import datetime
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.optimize import minimize
from sklearn.metrics import r2_score
from IPython.display import HTML, display

# Definitions
pd.set_option('display.float_format', lambda x: '%.3f' % x)
%matplotlib inline
display(HTML("<style>.container { width: 90% !important; }</style>"))
warnings.simplefilter('ignore')
N_JOBS = -1
SEED = 2017

In [2]:
# Stack all single Level 1 models OOF and test preds as entries for a Level 2 model
def _stack_y_columns(dir_path) :
    """Read the ``y`` column of every file in ``dir_path`` into one DataFrame.

    Files are visited in sorted name order; each becomes one column named
    after the first 7 characters of its file name. Entries that are not
    regular files (for example a ``.ipynb_checkpoints`` directory) are
    skipped. An empty DataFrame is returned when nothing was read.
    """
    columns = []
    for file_name in sorted(os.listdir(dir_path)) :
        # os.path.join works whether or not dir_path ends with a separator
        file_path = os.path.join(dir_path, file_name)
        if not os.path.isfile(file_path) :
            continue
        columns.append(pd.read_csv(file_path)["y"].rename(file_name[0:7]))

    if not columns :
        return pd.DataFrame()
    # Concatenate once instead of growing a DataFrame inside the loop
    return pd.concat(columns, axis = 1)


def get_level_one_data(oof_preds_dir, oof_targets_dir, preds_dir) :
    """Load the Level 1 outputs that feed the Level 2 (ensembling) step.

    Parameters
    ----------
    oof_preds_dir : str
        Directory of CSV files holding out-of-fold predictions in a ``y`` column.
    oof_targets_dir : str
        Directory of CSV files holding the matching out-of-fold targets in ``y``.
    preds_dir : str
        Directory of CSV files holding test-set predictions in ``y``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(level_one_preds, level_one_targets, level_one_test)``, each with one
        column per file, ordered by sorted file name and named after the first
        7 characters of the file name.
    """
    level_one_preds = _stack_y_columns(oof_preds_dir)
    level_one_targets = _stack_y_columns(oof_targets_dir)
    level_one_test = _stack_y_columns(preds_dir)

    return(level_one_preds, level_one_targets, level_one_test)

In [3]:
# Plot predictions
def plot_preds(targets, preds) :
    """Scatter each model's predictions against its targets on one figure.

    Parameters
    ----------
    targets : sequence of array-like
        ``targets[i]`` holds the real values plotted on the x axis for model ``i``.
    preds : sequence of array-like
        ``preds[i]`` holds the predicted values plotted on the y axis for model ``i``.

    A red diagonal from (50, 50) to (250, 250) is drawn as a reference line,
    and each model is labelled "Model<i+1>" in the legend.
    """
    plt.style.use("fivethirtyeight")
    plt.figure(figsize=(10, 20))
    colormap = plt.cm.gist_rainbow
    if len(preds) > 0 :
        # Axes.set_color_cycle no longer exists in matplotlib;
        # set_prop_cycle(color=...) is the supported equivalent.
        plt.gca().set_prop_cycle(color = [colormap(i) for i in np.linspace(0, 0.9, len(preds))])
    plt.plot([50, 250], [50, 250], c = "red", linewidth = 2)
    for i in range(len(preds)) :
        plt.scatter(targets[i], preds[i], marker = "o", s = 5, label = "Model" + str(i + 1))
    plt.xlabel("Real y")
    plt.ylabel("Projected y")
    plt.legend()
    plt.show()

In [4]:
# Define function to minimize
def target_func(weights, targets_X_test, preds_X_test=None) :
    """Objective for ``minimize``: negated R^2 of the weighted blend.

    Parameters
    ----------
    weights : sequence of float
        One blending weight per entry of ``preds_X_test``.
    targets_X_test : sequence of array-like
        Targets, one entry per model. Only ``targets_X_test[0]`` is scored
        against -- assumes every model shares the same targets; TODO confirm.
    preds_X_test : sequence of array-like, optional
        Predictions, in the same order as ``weights``. When omitted, the
        function falls back to a module-level ``preds_X_test`` variable,
        which is what the original two-argument form read implicitly.

    Returns
    -------
    float
        ``-r2_score(targets_X_test[0], blend)``; negated so that minimising
        this value maximises R^2.
    """
    if preds_X_test is None :
        # Backward compatibility with callers that rely on the notebook global
        try :
            preds_X_test = globals()["preds_X_test"]
        except KeyError :
            raise NameError("preds_X_test was neither passed in nor defined globally")

    final_pred = 0
    for weight, pred in zip(weights, preds_X_test):
        final_pred = final_pred + weight * pred
    return(-r2_score(targets_X_test[0], final_pred))


# Optimize ensemble weights on held out fold
def optimize(preds_X_test, targets_X_test, n_starts=100, random_state=None) :
    """Search for blending weights in [0, 1] that maximise R^2.

    SLSQP is run from ``n_starts`` random starting points and the best
    result is kept.

    Parameters
    ----------
    preds_X_test : sequence of array-like
        One set of predictions per model.
    targets_X_test : sequence of array-like
        One set of targets per model (see ``target_func``).
    n_starts : int, optional
        Number of random restarts (default 100, as before).
    random_state : int, optional
        Seed for a private ``numpy.random.RandomState``. When ``None`` (the
        default) NumPy's global generator is used, as before.

    Returns
    -------
    tuple
        ``(bestSC, bestWght)``: the best R^2 found and the weight vector
        that achieved it.

    Raises
    ------
    ValueError
        If ``preds_X_test`` is empty or ``n_starts`` is smaller than 1.
    """
    n_models = len(preds_X_test)
    if n_models == 0 :
        raise ValueError("optimize() needs at least one set of predictions")
    if n_starts < 1 :
        raise ValueError("n_starts must be at least 1")

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Our weights are bound between 0 and 1 (loop-invariant, so built once)
    bounds = [(0, 1)] * n_models

    scores = []
    weights = []
    for _ in range(n_starts):
        # Choose many random starting weights
        starting_values = rng.uniform(size = n_models)

        res = minimize(
            fun=target_func,
            x0=starting_values,
            # Pass the predictions explicitly so the objective scores the
            # data given to this call rather than a notebook global
            args=(targets_X_test, preds_X_test),
            method="SLSQP",
            bounds=bounds,
            options={"maxiter" : 20000}
        )

        scores.append(res["fun"])
        weights.append(res["x"])

    # Scores are negated R^2 values, so the smallest one is the best blend
    best_idx = int(np.argmin(scores))
    bestSC = -scores[best_idx]
    bestWght = weights[best_idx]

    print("\n Ensemble Score: {}".format(bestSC))
    print("\n Best Weights: {}".format(bestWght))

    return(bestSC, bestWght)

In [5]:
def replace_probed_y(row):
    """Pick the final ``y`` for one row: ``y_y`` unless it is NaN, else ``y_x``.

    ``row`` is anything indexable by the keys "y_x" and "y_y".
    """
    probed = row["y_y"]
    return row["y_x"] if np.isnan(probed) else probed

<b>Script</b>

In [6]:
# Get data
# Read the raw test set and keep its ID column as a plain array
init_test = pd.read_csv("raw_data/test.csv")
test_ids = init_test["ID"].values


In [7]:
# Define probe df
# IDs of test rows with a known target, and those targets;
# the two lists are paired element by element.
probe_ids = [1, 12, 23, 28, 42, 
             43, 45, 57, 72, 78, 
             88, 89, 93, 94, 104, 
             105, 110, 253, 259, 262, 
             337, 409, 437, 488, 493, 
             973, 1001, 1004, 1008, 1009, 
             1644, 1652, 1664, 2129, 2342, 
             3977, 7055, 8002, 8007, 8416]
probe_values = [71.34112, 109.30903, 115.21953, 92.00675, 87.73572, 
                129.79876, 99.55671, 116.02167, 110.54742, 125.28849, 
                90.33211, 130.55165, 105.79792, 103.04672, 92.37968, 
                108.5069, 83.31692, 115.93724, 93.33662, 75.35182, 
                101.23135, 91.00760, 85.96960, 113.39009, 108.40135, 
                106.76189, 111.65212, 91.472, 106.71967, 108.21841, 
                99.14157, 89.77625, 112.93977, 112.03, 93.06, 
                132.08556, 91.549, 95.84858, 87.44019, 96.84773]
if len(probe_ids) != len(probe_values) :
    raise ValueError("probe_ids and probe_values must have the same length")

# Look every probed target up by its ID rather than attaching the values by
# position: positional assignment is only right when probe_ids happens to be
# in the same order as the matching rows of init_test.
probe_y_by_id = dict(zip(probe_ids, probe_values))
new_train = init_test[init_test["ID"].isin(probe_ids)]
# The mapped Series keeps new_train's index, so the concat below lines up row by row
new_y = pd.DataFrame({"y" : new_train["ID"].map(probe_y_by_id)})
probe_df = pd.concat([new_train, new_y], axis = 1)


In [8]:
# Prepare process
# One entry per group of Level 1 configurations; the three directory lists
# below are parallel (same position -> same configuration group).
conf_names = ["dc1_4", "dc5_8", "dc9_12", "dc13_16"]

oof_preds_dirs = ["oof_preds/" + conf + "/" for conf in conf_names]
oof_targets_dirs = ["oof_targets/" + conf + "/" for conf in conf_names]
preds_dirs = ["preds/" + conf + "/" for conf in conf_names]


In [9]:
# Create weighted averages
# For every configuration group: load the Level 1 outputs, fit blending
# weights on the OOF predictions, apply them to the test predictions,
# overwrite the probed rows with their known targets and write a CSV.
for oof_preds_dir, oof_targets_dir, preds_dir in zip(oof_preds_dirs, oof_targets_dirs, preds_dirs) :
    print("********************")

    # Create Level 2 inputs
    level_one_preds, level_one_targets, level_one_test = get_level_one_data(oof_preds_dir, oof_targets_dir, preds_dir)
    display(level_one_preds.head(2))
    display(level_one_preds.tail(2))

    # Get error of each OOF column
    preds_X_test = []
    targets_X_test = []
    for col in level_one_preds.columns :
        cv_score = r2_score(level_one_targets[col], level_one_preds[col])
        print("Global OOF r2_score for " + col + " : " + str(cv_score))
        preds_X_test.append(level_one_preds[col])
        targets_X_test.append(level_one_targets[col])
    #plot_preds(targets_X_test, preds_X_test)

    # Optimize weights
    bestSC, bestWght = optimize(preds_X_test, targets_X_test)

    # Prepare submission using optimal weights
    # Weights are applied to the test columns by position, so the counts must agree
    if level_one_test.shape[1] != len(bestWght) :
        raise ValueError("Got {} weights for {} test prediction columns in {}".format(
            len(bestWght), level_one_test.shape[1], preds_dir))
    # Positional weighted sum; selecting columns with level_one_test[[j]] would
    # look up integer *labels*, which do not exist on these string-named columns
    preds = np.dot(level_one_test.values, bestWght)

    # Modify probed values
    conf_name = oof_preds_dir.split("/")[1]
    file_name = "ens_preds/avg_" + conf_name + ".csv"
    final_preds = pd.DataFrame({"ID": init_test.ID.values, "y": preds})
    # After the merge, y_x is the blended prediction and y_y the probed target
    # (NaN for rows that were not probed)
    final_preds = pd.merge(final_preds, probe_df[["ID", "y"]], how = "left", on = "ID")
    final_preds["y"] = final_preds["y_y"].fillna(final_preds["y_x"])
    final_preds = final_preds.drop(["y_x", "y_y"], axis = 1)

    # Save predictions
    out_dir = os.path.dirname(file_name)
    if not os.path.isdir(out_dir) :
        os.makedirs(out_dir)
    final_preds.to_csv(file_name, index = False)
    
    display(final_preds.head(2))


********************


Unnamed: 0,et_dc1_,et_dc2_,et_dc3_,et_dc4_,la_dc1_,la_dc2_,la_dc3_,la_dc4_,ll_dc1_,ll_dc2_,...,zl_dc3_,zl_dc4_,zr_dc1_,zr_dc2_,zr_dc3_,zr_dc4_,zx_dc1_,zx_dc2_,zx_dc3_,zx_dc4_
0,115.651,113.732,115.672,114.257,115.337,109.853,115.337,109.225,115.334,109.883,...,115.976,114.836,114.16,114.471,114.833,114.809,113.632,112.437,114.715,112.924
1,113.669,112.557,113.434,113.101,114.192,109.862,114.192,109.225,114.195,109.896,...,114.569,113.173,113.174,114.644,113.34,114.584,112.632,111.176,112.034,111.063


Unnamed: 0,et_dc1_,et_dc2_,et_dc3_,et_dc4_,la_dc1_,la_dc2_,la_dc3_,la_dc4_,ll_dc1_,ll_dc2_,...,zl_dc3_,zl_dc4_,zr_dc1_,zr_dc2_,zr_dc3_,zr_dc4_,zx_dc1_,zx_dc2_,zx_dc3_,zx_dc4_
3950,93.51,94.085,93.66,93.998,94.989,95.501,94.989,95.595,94.985,95.923,...,94.239,94.834,93.41,93.895,93.946,93.914,95.431,94.697,94.046,94.159
3951,94.003,96.841,94.357,97.246,95.708,95.062,95.708,95.595,95.711,95.084,...,95.719,96.367,94.882,94.312,95.572,94.923,95.293,96.181,96.411,97.372


Global OOF r2_score for et_dc1_ : 0.572323747277
Global OOF r2_score for et_dc2_ : 0.568627561169
Global OOF r2_score for et_dc3_ : 0.572625155259
Global OOF r2_score for et_dc4_ : 0.568350859384
Global OOF r2_score for la_dc1_ : 0.588591992586
Global OOF r2_score for la_dc2_ : 0.543162967347
Global OOF r2_score for la_dc3_ : 0.58859205522
Global OOF r2_score for la_dc4_ : 0.542897499065
Global OOF r2_score for ll_dc1_ : 0.588604960758
Global OOF r2_score for ll_dc2_ : 0.53469695369
Global OOF r2_score for ll_dc3_ : 0.588626551317
Global OOF r2_score for ll_dc4_ : 0.533852923565
Global OOF r2_score for rf_dc1_ : 0.572086287767
Global OOF r2_score for rf_dc2_ : 0.567588587842
Global OOF r2_score for rf_dc3_ : 0.574554941169
Global OOF r2_score for rf_dc4_ : 0.567794212108
Global OOF r2_score for xg_dc1_ : 0.566760912849
Global OOF r2_score for xg_dc2_ : 0.568222417871
Global OOF r2_score for xg_dc3_ : 0.566301402214
Global OOF r2_score for xg_dc4_ : 0.56726708706
Global OOF r2_score for

Unnamed: 0,ID,y
0,1,71.341
1,2,94.882


********************


Unnamed: 0,et_dc5_,et_dc6_,et_dc7_,et_dc8_,la_dc5_,la_dc6_,la_dc7_,la_dc8_,ll_dc5_,ll_dc6_,...,zl_dc7_,zl_dc8_,zr_dc5_,zr_dc6_,zr_dc7_,zr_dc8_,zx_dc5_,zx_dc6_,zx_dc7_,zx_dc8_
0,114.971,115.539,116.241,116.046,113.642,111.447,113.642,111.493,113.639,110.779,...,115.103,114.035,113.295,118.91,114.269,117.06,111.509,115.315,112.595,113.827
1,113.454,112.292,113.087,112.45,115.402,112.185,115.402,112.277,115.405,110.878,...,116.88,115.558,114.397,113.406,115.025,113.12,114.192,112.227,116.017,113.294


Unnamed: 0,et_dc5_,et_dc6_,et_dc7_,et_dc8_,la_dc5_,la_dc6_,la_dc7_,la_dc8_,ll_dc5_,ll_dc6_,...,zl_dc7_,zl_dc8_,zr_dc5_,zr_dc6_,zr_dc7_,zr_dc8_,zx_dc5_,zx_dc6_,zx_dc7_,zx_dc8_
3909,93.457,93.789,93.703,93.755,94.707,92.337,94.707,92.728,94.736,95.543,...,94.454,95.066,93.498,93.675,94.003,93.789,94.106,94.213,94.196,94.431
3910,93.789,94.948,94.007,95.319,95.564,93.778,95.564,94.519,95.578,95.016,...,95.029,95.41,94.37,93.889,94.708,94.146,95.424,94.603,95.67,96.188


Global OOF r2_score for et_dc5_ : 0.573364336308
Global OOF r2_score for et_dc6_ : 0.569874339637
Global OOF r2_score for et_dc7_ : 0.572714809399
Global OOF r2_score for et_dc8_ : 0.570165875463
Global OOF r2_score for la_dc5_ : 0.589199496845
Global OOF r2_score for la_dc6_ : 0.552598544422
Global OOF r2_score for la_dc7_ : 0.589199548943
Global OOF r2_score for la_dc8_ : 0.553238938818
Global OOF r2_score for ll_dc5_ : 0.589283662232
Global OOF r2_score for ll_dc6_ : 0.534467672611
Global OOF r2_score for ll_dc7_ : 0.589354738638
Global OOF r2_score for ll_dc8_ : 0.532249067099
Global OOF r2_score for rf_dc5_ : 0.568335207598
Global OOF r2_score for rf_dc6_ : 0.565916819734
Global OOF r2_score for rf_dc7_ : 0.573755763549
Global OOF r2_score for rf_dc8_ : 0.569944956905
Global OOF r2_score for xg_dc5_ : 0.564725343136
Global OOF r2_score for xg_dc6_ : 0.557063110958
Global OOF r2_score for xg_dc7_ : 0.564171403775
Global OOF r2_score for xg_dc8_ : 0.561883823855
Global OOF r2_score 

Unnamed: 0,ID,y
0,1,71.341
1,2,93.996


********************


Unnamed: 0,et_dc10,et_dc11,et_dc12,et_dc9_,la_dc10,la_dc11,la_dc12,la_dc9_,ll_dc10,ll_dc11,...,zl_dc12,zl_dc9_,zr_dc10,zr_dc11,zr_dc12,zr_dc9_,zx_dc10,zx_dc11,zx_dc12,zx_dc9_
0,103.82,104.163,103.607,104.414,106.707,105.891,106.61,105.891,106.205,105.903,...,105.107,111.282,104.255,105.772,104.704,105.416,107.132,110.455,105.579,109.129
1,114.472,114.481,113.58,115.335,112.243,112.141,112.091,112.141,108.993,112.156,...,112.921,115.115,114.575,114.301,114.281,115.314,115.555,115.375,114.69,120.351


Unnamed: 0,et_dc10,et_dc11,et_dc12,et_dc9_,la_dc10,la_dc11,la_dc12,la_dc9_,ll_dc10,ll_dc11,...,zl_dc12,zl_dc9_,zr_dc10,zr_dc11,zr_dc12,zr_dc9_,zx_dc10,zx_dc11,zx_dc12,zx_dc9_
4253,109.037,109.891,110.726,109.653,110.273,110.718,110.366,110.718,109.268,110.717,...,110.014,109.619,110.224,110.425,109.878,110.512,109.094,108.152,109.342,110.607
4254,104.212,105.25,104.465,104.995,104.361,102.592,104.324,102.592,105.747,102.591,...,102.086,101.44,103.043,102.618,102.577,102.455,102.332,100.984,101.742,103.352


Global OOF r2_score for et_dc10 : 0.562810837396
Global OOF r2_score for et_dc11 : 0.564323245292
Global OOF r2_score for et_dc12 : 0.562516717039
Global OOF r2_score for et_dc9_ : 0.564959275388
Global OOF r2_score for la_dc10 : 0.557892287725
Global OOF r2_score for la_dc11 : 0.581696439515
Global OOF r2_score for la_dc12 : 0.557728291183
Global OOF r2_score for la_dc9_ : 0.581696508861
Global OOF r2_score for ll_dc10 : 0.520786806046
Global OOF r2_score for ll_dc11 : 0.581622089326
Global OOF r2_score for ll_dc12 : 0.534204611386
Global OOF r2_score for ll_dc9_ : 0.581608733311
Global OOF r2_score for rf_dc10 : 0.560412962826
Global OOF r2_score for rf_dc11 : 0.566863737935
Global OOF r2_score for rf_dc12 : 0.562714926069
Global OOF r2_score for rf_dc9_ : 0.564593335368
Global OOF r2_score for xg_dc10 : 0.553783802074
Global OOF r2_score for xg_dc11 : 0.562203922193
Global OOF r2_score for xg_dc12 : 0.556376976454
Global OOF r2_score for xg_dc9_ : 0.55623335427
Global OOF r2_score f

Unnamed: 0,ID,y
0,1,71.341
1,2,94.042


********************


Unnamed: 0,et_dc13,et_dc14,et_dc15,et_dc16,la_dc13,la_dc14,la_dc15,la_dc16,ll_dc13,ll_dc14,...,zl_dc15,zl_dc16,zr_dc13,zr_dc14,zr_dc15,zr_dc16,zx_dc13,zx_dc14,zx_dc15,zx_dc16
0,94.111,93.955,93.924,93.803,94.684,95.179,94.669,95.343,94.678,95.415,...,95.501,95.413,94.308,94.48,94.506,94.096,93.672,95.419,95.075,94.577
1,93.928,94.017,93.845,93.742,94.684,95.622,94.669,95.206,94.678,96.17,...,94.281,94.845,94.23,94.277,94.16,94.096,94.824,94.408,94.248,94.725


Unnamed: 0,et_dc13,et_dc14,et_dc15,et_dc16,la_dc13,la_dc14,la_dc15,la_dc16,ll_dc13,ll_dc14,...,zl_dc15,zl_dc16,zr_dc13,zr_dc14,zr_dc15,zr_dc16,zx_dc13,zx_dc14,zx_dc15,zx_dc16
4207,92.018,92.439,92.953,93.075,93.459,94.734,93.459,94.013,93.445,96.502,...,92.607,93.207,91.954,92.201,92.468,92.163,92.359,92.262,92.987,93.65
4208,92.74,92.601,93.959,93.767,93.891,94.293,93.891,94.413,93.877,95.46,...,93.709,93.718,92.88,92.483,93.226,92.422,94.063,93.243,94.057,93.792


Global OOF r2_score for et_dc13 : 0.568367227604
Global OOF r2_score for et_dc14 : 0.567381624839
Global OOF r2_score for et_dc15 : 0.56871831139
Global OOF r2_score for et_dc16 : 0.567136181641
Global OOF r2_score for la_dc13 : 0.582234231814
Global OOF r2_score for la_dc14 : 0.565018718854
Global OOF r2_score for la_dc15 : 0.582436417144
Global OOF r2_score for la_dc16 : 0.565794115392
Global OOF r2_score for ll_dc13 : 0.582289495597
Global OOF r2_score for ll_dc14 : 0.529728456129
Global OOF r2_score for ll_dc15 : 0.58239370996
Global OOF r2_score for ll_dc16 : 0.542533148316
Global OOF r2_score for rf_dc13 : 0.565654644625
Global OOF r2_score for rf_dc14 : 0.562030182284
Global OOF r2_score for rf_dc15 : 0.568633845006
Global OOF r2_score for rf_dc16 : 0.56645038039
Global OOF r2_score for xg_dc13 : 0.554577936653
Global OOF r2_score for xg_dc14 : 0.550293617439
Global OOF r2_score for xg_dc15 : 0.561992506787
Global OOF r2_score for xg_dc16 : 0.559730978313
Global OOF r2_score for

Unnamed: 0,ID,y
0,1,71.341
1,2,94.184
