In [1]:
# Imports
import os
import pandas as pd
import numpy as np
import datetime
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.optimize import minimize
from sklearn.metrics import r2_score
from IPython.display import HTML, display

# Definitions: notebook-wide display options and shared constants.
# Render floats with three decimals in pandas output.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Draw matplotlib figures inline in the notebook.
%matplotlib inline
# Inject CSS so the notebook container uses 90% of the page width.
display(HTML("<style>.container { width: 90% !important; }</style>"))
# NOTE(review): this silences every warning, including deprecation notices
# raised by the libraries used further down.
warnings.simplefilter('ignore')
# Shared constants. Presumably N_JOBS is a parallelism setting and SEED the
# random seed for reproducible runs -- TODO confirm where each is consumed.
N_JOBS = -1
SEED = 2017

In [2]:
# Stack all single Level 1 models OOF and test preds as entries for a Level 2 model
def _stack_y_columns(csv_dir) :
    """Collect the ``y`` column of every file in ``csv_dir`` into one DataFrame.

    Files are visited in sorted name order; each contributes one column named
    after the first seven characters of its file name. Returns an empty
    DataFrame when the directory holds no files.
    """
    columns = []
    for file_name in sorted(os.listdir(csv_dir)) :
        # os.path.join works whether or not csv_dir ends with a separator
        file_path = os.path.join(csv_dir, file_name)
        # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints): read_csv cannot load them
        if not os.path.isfile(file_path) :
            continue
        frame = pd.read_csv(file_path)
        columns.append(pd.Series(frame["y"], name = file_name[0:7]))

    if not columns :
        return pd.DataFrame()
    # One concat at the end instead of growing the frame inside the loop
    return pd.concat(columns, axis = 1)


def get_level_one_data(oof_preds_dir, oof_targets_dir, preds_dir) :
    """Build the Level 2 input frames from three directories of CSV files.

    Parameters
    ----------
    oof_preds_dir : str
        Directory of out-of-fold prediction files.
    oof_targets_dir : str
        Directory of out-of-fold target files.
    preds_dir : str
        Directory of test-set prediction files.

    Every file must be a CSV with a ``y`` column. A trailing path separator on
    the directory names is optional.

    Returns
    -------
    tuple
        ``(level_one_preds, level_one_targets, level_one_test)``, each a
        DataFrame with one column per file, ordered by sorted file name.
    """
    level_one_preds = _stack_y_columns(oof_preds_dir)
    level_one_targets = _stack_y_columns(oof_targets_dir)

    # Same for test set preds
    level_one_test = _stack_y_columns(preds_dir)

    return(level_one_preds, level_one_targets, level_one_test)

In [3]:
# Plot predictions
def plot_preds(targets, preds) :
    """Scatter every model's predictions against its targets on one figure.

    Parameters
    ----------
    targets : sequence
        ``targets[i]`` holds the real values plotted on the x axis for model i.
    preds : sequence
        ``preds[i]`` holds the predicted values plotted on the y axis for model i.

    Switches the global matplotlib style to "fivethirtyeight" and shows the
    figure; returns nothing.
    """
    plt.style.use("fivethirtyeight")
    plt.figure(figsize=(10, 20))
    colormap = plt.cm.gist_rainbow
    # One colour per model, spread over the colormap. Colours are handed to
    # scatter() explicitly: Axes.set_color_cycle is gone from current matplotlib,
    # and explicit colours do not depend on scatter following the property cycle.
    colors = [colormap(i) for i in np.linspace(0, 0.9, len(preds))]
    # Reference diagonal where predicted == real
    plt.plot([50, 250], [50, 250], c = "red", linewidth = 2)
    for i in range(len(preds)) :
        plt.scatter(targets[i], preds[i], marker = "o", s = 5,
                    color = colors[i], label = "Model" + str(i + 1))
    plt.xlabel("Real y")
    plt.ylabel("Projected y")
    plt.legend()
    plt.show()

In [4]:
# Define function to minimize
def target_func(weights, targets_X_test, preds_X_test = None) :
    """Return the negative R^2 of the weighted sum of the model predictions.

    Parameters
    ----------
    weights : sequence of float
        One weight per prediction vector.
    targets_X_test : sequence
        Target vectors; only ``targets_X_test[0]`` is used as ground truth.
    preds_X_test : sequence, optional
        Prediction vectors, one per weight. When omitted, the function falls
        back to a module-level variable named ``preds_X_test`` (the implicit
        behaviour of the earlier two-argument version).

    Returns
    -------
    float
        ``-r2_score(targets_X_test[0], sum_i weights[i] * preds_X_test[i])``,
        negated so that a minimiser maximises R^2.
    """
    if preds_X_test is None :
        # Legacy two-argument call style: read the notebook-level variable
        preds_X_test = globals()["preds_X_test"]
    final_pred = 0
    for weight, pred in zip(weights, preds_X_test):
        final_pred += weight * pred
    return(-r2_score(targets_X_test[0], final_pred))


# Optimize ensemble weights on held out fold
def optimize(preds_X_test, targets_X_test, n_starts = 100, seed = None) :
    """Search for ensemble weights that maximise R^2 using random restarts.

    Runs SLSQP ``n_starts`` times from uniformly drawn starting weights and
    keeps the best run. Each weight is bounded to [0, 1]; the weights are not
    constrained to sum to one.

    Parameters
    ----------
    preds_X_test : sequence
        Prediction vectors, one per model.
    targets_X_test : sequence
        Target vectors; only the first one is used as ground truth.
    n_starts : int, optional
        Number of random restarts (default 100).
    seed : int, optional
        Seed for the starting weights. With the default ``None`` the draws
        come from NumPy's global random state.

    Returns
    -------
    tuple
        ``(bestSC, bestWght)``: the best R^2 found and its weight vector.

    Raises
    ------
    ValueError
        If ``preds_X_test`` is empty or ``n_starts`` is smaller than 1.
    """
    n_models = len(preds_X_test)
    if n_models == 0 :
        raise ValueError("optimize() needs at least one prediction vector")
    if n_starts < 1 :
        raise ValueError("n_starts must be at least 1")

    # A private generator makes seeded runs repeatable without touching global state
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Our weights are bound between 0 and 1
    bounds = [(0, 1)] * n_models

    scores = []
    weights = []
    for _ in range(n_starts):
        # Choose many random starting weights
        starting_values = rng.uniform(size = n_models)

        res = minimize(
            fun=target_func,
            x0=starting_values,
            # Predictions are passed explicitly so the objective does not rely on globals
            args=(targets_X_test, preds_X_test),
            method="SLSQP",
            bounds=bounds,
            options={"maxiter" : 20000}
        )

        scores.append(res["fun"])
        weights.append(res["x"])

    # Scores are negative R^2 values, so the smallest one is the best run
    best_idx = np.argmin(scores)
    bestSC = -scores[best_idx]
    bestWght = weights[best_idx]

    print("\n Ensemble Score: {}".format(bestSC))
    print("\n Best Weights: {}".format(bestWght))

    return(bestSC, bestWght)

In [5]:
def replace_probed_y(row):
    """Choose between the two ``y`` columns produced by a merge.

    Returns ``row["y_y"]`` unless it is NaN, in which case ``row["y_x"]`` is
    returned instead. In this notebook ``y_x`` is the ensemble prediction and
    ``y_y`` the probed value joined in from ``probe_df``.
    """
    probed = row["y_y"]
    return row["y_x"] if np.isnan(probed) else probed

<b>Script</b>

In [6]:
# Load the raw test set and keep its ID column as a plain array
init_test = pd.read_csv("raw_data/test.csv")
test_ids = init_test["ID"].values


In [7]:
# Define probe df
# Test-set IDs and their y values, paired by position:
# probe_values[k] belongs to probe_ids[k].
# NOTE(review): presumably these values were obtained by probing the
# leaderboard -- confirm the provenance.
probe_ids = [1, 12, 23, 28, 42, 
             43, 45, 57, 72, 78, 
             88, 89, 93, 94, 104, 
             105, 110, 253, 259, 262, 
             289, 337, 409, 437, 488, 
             493, 973, 1001, 1004, 1008, 
             1009, 1259, 1644, 1652, 1664, 
             2129, 2342, 3853, 3977, 4193, 
             4958, 4960, 7055, 7805, 8002, 
             8007, 8416]
probe_values = [71.34112, 109.30903, 115.21953, 92.00675, 87.73572, 
                129.79876, 99.55671, 116.02167, 110.54742, 125.28849, 
                90.33211, 130.55165, 105.79792, 103.04672, 92.37968, 
                108.5069, 87.70757, 115.93724, 93.33662, 75.35182, 
                89.27667, 101.23135, 91.00760, 85.96960, 113.39009, 
                108.40135, 106.76189, 111.65212, 91.472, 106.71967, 
                108.21841, 112.3909, 99.14157, 89.77625, 112.93977, 
                112.03, 93.06, 105.481283411, 132.08556, 132.78216, 
                113.58711, 89.83957, 91.549, 105.8472, 95.84858, 
                87.44019, 96.84773]
if len(probe_ids) != len(probe_values) :
    raise ValueError("probe_ids and probe_values must have the same length")

# Rows of the test set that have a probed value
new_train = init_test[init_test["ID"].isin(probe_ids)]
missing_ids = sorted(set(probe_ids) - set(new_train["ID"]))
if missing_ids :
    raise ValueError("probe IDs missing from test set: {}".format(missing_ids))

# Attach each value to its row by ID rather than by row position, so the
# pairing stays correct even when test.csv is not ordered like probe_ids.
probe_lookup = dict(zip(probe_ids, probe_values))
new_y = pd.DataFrame({"y" : new_train["ID"].map(probe_lookup)})
probe_df = pd.concat([new_train, new_y], axis = 1)


In [8]:
# Prepare process: one entry per group of model configurations, in the same
# order across the three lists (entry k of each list belongs together)
oof_preds_dirs = [
    "oof_preds/dc1_4/",
    "oof_preds/dc5_8/",
    "oof_preds/dc9_12/",
    "oof_preds/dc13_16/",
]

oof_targets_dirs = [
    "oof_targets/dc1_4/",
    "oof_targets/dc5_8/",
    "oof_targets/dc9_12/",
    "oof_targets/dc13_16/",
]

preds_dirs = [
    "preds/dc1_4/",
    "preds/dc5_8/",
    "preds/dc9_12/",
    "preds/dc13_16/",
]


In [9]:
# Create weighted averages
# For every configuration group: load the Level 1 outputs, fit ensemble weights
# on the out-of-fold predictions, apply them to the test predictions, overwrite
# the probed IDs with their known values and write the result to ens_preds/.

# Seed NumPy's global generator so the random restarts in optimize() are repeatable
np.random.seed(SEED)

for oof_preds_dir, oof_targets_dir, preds_dir in zip(oof_preds_dirs, oof_targets_dirs, preds_dirs) :
    print("********************")

    # Create Level 2 inputs
    level_one_preds, level_one_targets, level_one_test = get_level_one_data(oof_preds_dir, oof_targets_dir, preds_dir)
    display(level_one_preds.head(2))
    display(level_one_preds.tail(2))

    # Get error of each OOF column
    # Keep the notebook-level name preds_X_test: target_func may look the
    # predictions up under that name.
    preds_X_test = []
    targets_X_test = []
    for col in level_one_preds.columns :
        cv_score = r2_score(level_one_targets[col], level_one_preds[col])
        print("Global OOF r2_score for " + col + " : " + str(cv_score))
        preds_X_test.append(level_one_preds[col])
        targets_X_test.append(level_one_targets[col])
    #plot_preds(targets_X_test, preds_X_test)

    # Optimize weights
    bestSC, bestWght = optimize(preds_X_test, targets_X_test)

    # Prepare submission using optimal weights
    # Weights are applied to the test columns by position, so both frames must
    # hold the same number of models (in the same sorted-file order).
    if level_one_test.shape[1] != len(bestWght) :
        raise ValueError("Expected {} test prediction columns in {}, found {}".format(
            len(bestWght), preds_dir, level_one_test.shape[1]))
    # Positional matrix product; selecting columns with level_one_test[[j]] would
    # be a label lookup and fails because the columns are named by strings.
    preds = np.dot(level_one_test.values, bestWght)

    # Modify probed values
    conf_name = oof_preds_dir.split("/")[1]
    file_name = "ens_preds/avg_" + conf_name + ".csv"
    final_preds = pd.DataFrame({"ID": init_test.ID.values, "y": preds})
    # The left merge yields y_x (ensemble prediction) and y_y (probed value, NaN when absent)
    final_preds = pd.merge(final_preds, probe_df[["ID", "y"]], how = "left", on = "ID")
    final_preds["y"] = final_preds["y_y"].fillna(final_preds["y_x"])
    final_preds = final_preds.drop(["y_x", "y_y"], axis = 1)

    # Save predictions
    out_dir = os.path.dirname(file_name)
    if not os.path.isdir(out_dir) :
        os.makedirs(out_dir)
    final_preds.to_csv(file_name, index = False)

    display(final_preds.head(2))


********************


Unnamed: 0,et_dc1_,et_dc2_,et_dc3_,et_dc4_,la_dc1_,la_dc2_,la_dc3_,la_dc4_,ll_dc1_,ll_dc2_,...,zl_dc3_,zl_dc4_,zr_dc1_,zr_dc2_,zr_dc3_,zr_dc4_,zx_dc1_,zx_dc2_,zx_dc3_,zx_dc4_
0,116.134,114.13,116.186,114.255,115.397,110.616,115.396,110.473,115.421,108.013,...,116.255,114.706,114.472,115.386,115.259,115.102,114.874,112.751,116.039,113.866
1,113.334,112.932,113.403,113.084,114.199,110.668,114.199,110.473,114.207,108.021,...,114.405,113.376,112.364,115.337,112.845,114.966,112.059,111.378,112.275,111.957


Unnamed: 0,et_dc1_,et_dc2_,et_dc3_,et_dc4_,la_dc1_,la_dc2_,la_dc3_,la_dc4_,ll_dc1_,ll_dc2_,...,zl_dc3_,zl_dc4_,zr_dc1_,zr_dc2_,zr_dc3_,zr_dc4_,zx_dc1_,zx_dc2_,zx_dc3_,zx_dc4_
3951,93.758,94.023,93.779,94.006,94.905,94.381,94.905,94.36,94.909,95.243,...,94.128,94.794,93.486,93.74,93.587,93.874,93.702,93.821,93.5,94.137
3952,94.224,95.205,94.171,95.804,95.638,96.447,95.638,96.54,95.638,96.055,...,95.618,95.933,94.821,94.118,95.15,94.585,96.389,94.937,96.284,96.862


Global OOF r2_score for et_dc1_ : 0.572600357478
Global OOF r2_score for et_dc2_ : 0.56949721826
Global OOF r2_score for et_dc3_ : 0.572999418005
Global OOF r2_score for et_dc4_ : 0.569112855322
Global OOF r2_score for la_dc1_ : 0.588467268349
Global OOF r2_score for la_dc2_ : 0.548622901878
Global OOF r2_score for la_dc3_ : 0.588467136131
Global OOF r2_score for la_dc4_ : 0.550172023209
Global OOF r2_score for ll_dc1_ : 0.588498942363
Global OOF r2_score for ll_dc2_ : 0.53041351412
Global OOF r2_score for ll_dc3_ : 0.588555948425
Global OOF r2_score for ll_dc4_ : 0.522219953052
Global OOF r2_score for rf_dc1_ : 0.572124235188
Global OOF r2_score for rf_dc2_ : 0.567171849033
Global OOF r2_score for rf_dc3_ : 0.574580627741
Global OOF r2_score for rf_dc4_ : 0.568533648688
Global OOF r2_score for xg_dc1_ : 0.564998266541
Global OOF r2_score for xg_dc2_ : 0.564896677099
Global OOF r2_score for xg_dc3_ : 0.565711036022
Global OOF r2_score for xg_dc4_ : 0.565157028005
Global OOF r2_score fo

Unnamed: 0,ID,y
0,1,71.341
1,2,94.375


********************


Unnamed: 0,et_dc5_,et_dc6_,et_dc7_,et_dc8_,la_dc5_,la_dc6_,la_dc7_,la_dc8_,ll_dc5_,ll_dc6_,...,zl_dc7_,zl_dc8_,zr_dc5_,zr_dc6_,zr_dc7_,zr_dc8_,zx_dc5_,zx_dc6_,zx_dc7_,zx_dc8_
0,114.451,116.14,116.241,116.046,113.642,111.461,113.642,111.493,113.639,110.796,...,115.103,114.035,113.043,119.119,114.269,117.06,111.111,115.204,112.595,113.827
1,113.558,112.337,113.087,112.45,115.402,112.171,115.402,112.277,115.405,110.775,...,116.88,115.558,114.569,113.542,115.025,113.12,114.813,112.898,116.017,113.294


Unnamed: 0,et_dc5_,et_dc6_,et_dc7_,et_dc8_,la_dc5_,la_dc6_,la_dc7_,la_dc8_,ll_dc5_,ll_dc6_,...,zl_dc7_,zl_dc8_,zr_dc5_,zr_dc6_,zr_dc7_,zr_dc8_,zx_dc5_,zx_dc6_,zx_dc7_,zx_dc8_
3909,93.261,93.805,93.703,93.755,94.707,92.783,94.707,92.728,94.702,95.418,...,94.454,95.066,93.529,93.678,94.003,93.789,93.827,94.471,94.196,94.431
3910,93.852,95.395,94.007,95.319,95.564,94.55,95.564,94.519,95.562,94.941,...,95.029,95.41,94.575,93.902,94.708,94.146,95.366,94.165,95.67,96.188


Global OOF r2_score for et_dc5_ : 0.573266659609
Global OOF r2_score for et_dc6_ : 0.570046349556
Global OOF r2_score for et_dc7_ : 0.572714809399
Global OOF r2_score for et_dc8_ : 0.570165875463
Global OOF r2_score for la_dc5_ : 0.589199490928
Global OOF r2_score for la_dc6_ : 0.552576201935
Global OOF r2_score for la_dc7_ : 0.589199548943
Global OOF r2_score for la_dc8_ : 0.553238938818
Global OOF r2_score for ll_dc5_ : 0.589294859787
Global OOF r2_score for ll_dc6_ : 0.534845839354
Global OOF r2_score for ll_dc7_ : 0.589354738638
Global OOF r2_score for ll_dc8_ : 0.532249067099
Global OOF r2_score for rf_dc5_ : 0.569976670549
Global OOF r2_score for rf_dc6_ : 0.565657243398
Global OOF r2_score for rf_dc7_ : 0.573755763549
Global OOF r2_score for rf_dc8_ : 0.569944956905
Global OOF r2_score for xg_dc5_ : 0.565180063288
Global OOF r2_score for xg_dc6_ : 0.556619230532
Global OOF r2_score for xg_dc7_ : 0.564171403775
Global OOF r2_score for xg_dc8_ : 0.561883823855
Global OOF r2_score 

Unnamed: 0,ID,y
0,1,71.341
1,2,94.006


********************


Unnamed: 0,et_dc10,et_dc11,et_dc12,et_dc9_,la_dc10,la_dc11,la_dc12,la_dc9_,ll_dc10,ll_dc11,...,zl_dc12,zl_dc9_,zr_dc10,zr_dc11,zr_dc12,zr_dc9_,zx_dc10,zx_dc11,zx_dc12,zx_dc9_
0,114.857,114.982,114.428,115.467,112.039,112.087,111.918,112.087,109.051,112.059,...,112.238,115.132,114.504,114.107,113.983,115.125,115.724,114.714,114.285,120.287
1,112.183,111.961,111.535,112.369,112.686,111.477,111.956,111.477,109.405,111.469,...,113.746,113.443,113.223,113.561,112.951,113.123,113.59,113.371,113.354,111.849


Unnamed: 0,et_dc10,et_dc11,et_dc12,et_dc9_,la_dc10,la_dc11,la_dc12,la_dc9_,ll_dc10,ll_dc11,...,zl_dc12,zl_dc9_,zr_dc10,zr_dc11,zr_dc12,zr_dc9_,zx_dc10,zx_dc11,zx_dc12,zx_dc9_
4254,109.256,109.855,110.448,109.358,110.513,110.724,110.261,110.724,109.519,110.707,...,110.007,108.99,110.206,110.145,109.968,110.18,110.271,108.018,109.172,111.026
4255,104.583,104.999,104.786,104.237,104.356,102.547,103.606,102.547,105.861,102.529,...,101.712,101.69,102.583,102.51,102.855,102.294,102.002,100.961,101.074,101.881


Global OOF r2_score for et_dc10 : 0.560934010843
Global OOF r2_score for et_dc11 : 0.563664062419
Global OOF r2_score for et_dc12 : 0.560809641914
Global OOF r2_score for et_dc9_ : 0.564630020252
Global OOF r2_score for la_dc10 : 0.558537475667
Global OOF r2_score for la_dc11 : 0.581673030285
Global OOF r2_score for la_dc12 : 0.55766286836
Global OOF r2_score for la_dc9_ : 0.581740299839
Global OOF r2_score for ll_dc10 : 0.523361850702
Global OOF r2_score for ll_dc11 : 0.581691062679
Global OOF r2_score for ll_dc12 : 0.536840856246
Global OOF r2_score for ll_dc9_ : 0.581653684042
Global OOF r2_score for rf_dc10 : 0.559147985309
Global OOF r2_score for rf_dc11 : 0.566022537383
Global OOF r2_score for rf_dc12 : 0.56036283879
Global OOF r2_score for rf_dc9_ : 0.564071493423
Global OOF r2_score for xg_dc10 : 0.552616613884
Global OOF r2_score for xg_dc11 : 0.562122026962
Global OOF r2_score for xg_dc12 : 0.556242846329
Global OOF r2_score for xg_dc9_ : 0.557551822905
Global OOF r2_score fo

Unnamed: 0,ID,y
0,1,71.341
1,2,94.223


********************


Unnamed: 0,et_dc13,et_dc14,et_dc15,et_dc16,la_dc13,la_dc14,la_dc15,la_dc16,ll_dc13,ll_dc14,...,zl_dc15,zl_dc16,zr_dc13,zr_dc14,zr_dc15,zr_dc16,zx_dc13,zx_dc14,zx_dc15,zx_dc16
0,93.85,93.861,93.924,93.803,94.684,95.131,94.669,95.343,94.678,95.546,...,95.501,95.413,94.352,94.166,94.506,94.096,94.25,95.161,95.075,94.577
1,93.787,93.882,93.845,93.742,94.684,95.614,94.669,95.206,94.678,96.108,...,94.281,94.845,94.225,94.218,94.16,94.096,94.864,94.766,94.248,94.725


Unnamed: 0,et_dc13,et_dc14,et_dc15,et_dc16,la_dc13,la_dc14,la_dc15,la_dc16,ll_dc13,ll_dc14,...,zl_dc15,zl_dc16,zr_dc13,zr_dc14,zr_dc15,zr_dc16,zx_dc13,zx_dc14,zx_dc15,zx_dc16
4207,92.474,92.215,92.953,93.075,93.459,94.877,93.459,94.013,93.45,96.504,...,92.607,93.207,92.175,92.385,92.468,92.163,92.397,92.841,92.987,93.65
4208,92.52,92.443,93.959,93.767,93.891,94.136,93.891,94.413,93.882,95.475,...,93.709,93.718,92.313,92.641,93.226,92.422,94.472,93.088,94.057,93.792


Global OOF r2_score for et_dc13 : 0.56833788137
Global OOF r2_score for et_dc14 : 0.567201128191
Global OOF r2_score for et_dc15 : 0.56871831139
Global OOF r2_score for et_dc16 : 0.567136181641
Global OOF r2_score for la_dc13 : 0.582247141372
Global OOF r2_score for la_dc14 : 0.566013706864
Global OOF r2_score for la_dc15 : 0.582436417144
Global OOF r2_score for la_dc16 : 0.565794115392
Global OOF r2_score for ll_dc13 : 0.582304687756
Global OOF r2_score for ll_dc14 : 0.534194872518
Global OOF r2_score for ll_dc15 : 0.58239370996
Global OOF r2_score for ll_dc16 : 0.542533148316
Global OOF r2_score for rf_dc13 : 0.564452366033
Global OOF r2_score for rf_dc14 : 0.561725084315
Global OOF r2_score for rf_dc15 : 0.568633845006
Global OOF r2_score for rf_dc16 : 0.56645038039
Global OOF r2_score for xg_dc13 : 0.557136680385
Global OOF r2_score for xg_dc14 : 0.549924384186
Global OOF r2_score for xg_dc15 : 0.561992506787
Global OOF r2_score for xg_dc16 : 0.559730978313
Global OOF r2_score for 

Unnamed: 0,ID,y
0,1,71.341
1,2,94.188
