In [1]:
# Imports
import os
import pandas as pd
import numpy as np
import datetime
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.optimize import minimize
from sklearn.metrics import r2_score
from IPython.display import HTML, display

# Definitions
pd.set_option('display.float_format', lambda x: '%.3f' % x)
%matplotlib inline
display(HTML("<style>.container { width: 90% !important; }</style>"))
warnings.simplefilter('ignore')
N_JOBS = -1
SEED = 2017

In [2]:
# Stack all single Level 1 models OOF and test preds as entries for a Level 2 model
def _stack_y_columns(csv_dir) :
    """Read the ``y`` column of every CSV in ``csv_dir`` into one DataFrame (one column per file)."""
    columns = []
    for file_name in sorted(os.listdir(csv_dir)) :
        path = os.path.join(csv_dir, file_name)
        # Skip sub-directories and hidden entries (e.g. ".ipynb_checkpoints") that
        # are not prediction files and would make read_csv fail.
        if file_name.startswith(".") or not os.path.isfile(path) :
            continue
        frame = pd.read_csv(path)
        # The first 7 characters of the file name are used as the model identifier
        columns.append(pd.Series(frame.y, name = file_name[0:7]))

    if not columns :
        return pd.DataFrame()
    # Concatenate once instead of growing a DataFrame inside the loop
    return pd.concat(columns, axis = 1)


def get_level_one_data(oof_preds_dir, oof_targets_dir, preds_dir) :
    """Collect level 1 model outputs as inputs for a level 2 model.

    Each directory is listed in sorted order; for every file the ``y`` column is
    read and stored under a column named after the first 7 characters of the file
    name. Directories may be given with or without a trailing slash.

    Parameters
    ----------
    oof_preds_dir : str
        Directory holding the out-of-fold prediction CSVs.
    oof_targets_dir : str
        Directory holding the matching out-of-fold target CSVs.
    preds_dir : str
        Directory holding the test set prediction CSVs.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(level_one_preds, level_one_targets, level_one_test)``. A directory
        without any readable file yields an empty DataFrame.
    """
    level_one_preds = _stack_y_columns(oof_preds_dir)
    level_one_targets = _stack_y_columns(oof_targets_dir)

    # Same for test set preds
    level_one_test = _stack_y_columns(preds_dir)

    return(level_one_preds, level_one_targets, level_one_test)

In [3]:
# Plot predictions
def plot_preds(targets, preds) :
    """Scatter each model's predictions against the real targets.

    Parameters
    ----------
    targets : sequence
        One array-like of real ``y`` values per model.
    preds : sequence
        One array-like of predicted ``y`` values per model, in the same order
        as ``targets``.

    A red diagonal from (50, 50) to (250, 250) marks perfect predictions. The
    figure is shown immediately; nothing is returned.
    """
    plt.style.use("fivethirtyeight")
    plt.figure(figsize=(10, 20))
    colormap = plt.cm.gist_rainbow
    # One distinct colour per model. set_prop_cycle replaces the deprecated
    # (and since removed) Axes.set_color_cycle.
    model_colors = [colormap(i) for i in np.linspace(0, 0.9, len(preds))]
    plt.gca().set_prop_cycle(color = model_colors)
    plt.plot([50, 250], [50, 250], c = "red", linewidth = 2)
    for i in range(len(preds)) :
        plt.scatter(targets[i], preds[i], marker = "o", s = 5, label = "Model" + str(i + 1))
    plt.xlabel("Real y")
    plt.ylabel("Projected y")
    plt.legend()
    plt.show()

In [4]:
# Define function to minimize
def target_func(weights, targets_X_test, preds_X_test = None) :
    """Negative R2 score of the weighted sum of the level 1 predictions.

    Parameters
    ----------
    weights : sequence of float
        One weight per model.
    targets_X_test : sequence
        Per-model target vectors; only the first entry is scored against, so
        all models are presumed to share the same targets - verify against caller.
    preds_X_test : sequence, optional
        Per-model prediction vectors, in the same order as ``weights``. When
        omitted, the notebook-level ``preds_X_test`` variable is used, which is
        how this function previously obtained its data.

    Returns
    -------
    float
        ``-r2_score`` of the blended prediction (negated because the optimizer minimizes).
    """
    if preds_X_test is None :
        # Backward compatibility for callers that only pass the targets
        preds_X_test = globals()["preds_X_test"]

    final_pred = 0
    for weight, pred in zip(weights, preds_X_test) :
        final_pred += weight * pred
    return(-r2_score(targets_X_test[0], final_pred))


# Optimize ensemble weights on held out fold
def optimize(preds_X_test, targets_X_test, n_starts = 200, seed = None) :
    """Search blending weights that maximize the R2 score of the ensemble.

    SLSQP is started from ``n_starts`` random points and the best result is kept.
    Each weight is bounded to [0, 1]; no constraint forces the weights to sum to 1.

    Parameters
    ----------
    preds_X_test : sequence
        Per-model prediction vectors.
    targets_X_test : sequence
        Per-model target vectors (see ``target_func``).
    n_starts : int, optional
        Number of random restarts (default 200, the previous fixed value).
    seed : int, optional
        Seed for a private random generator, making the restarts reproducible.
        With the default ``None`` the global NumPy generator is used as before.

    Returns
    -------
    tuple
        ``(best_score, best_weights)``.

    Raises
    ------
    ValueError
        If no predictions are supplied.
    """
    n_models = len(preds_X_test)
    if n_models == 0 :
        raise ValueError("optimize() needs at least one prediction vector")

    rng = np.random if seed is None else np.random.RandomState(seed)

    # Our weights are bound between 0 and 1
    bounds = [(0, 1)] * n_models

    scores = []
    weights = []
    for _ in range(n_starts) :
        # Choose many random starting weights
        starting_values = rng.uniform(size = n_models)

        res = minimize(
            fun=target_func,
            x0=starting_values,
            # Pass the predictions explicitly so the objective does not depend on
            # whatever happens to be stored in a notebook-level variable
            args=(targets_X_test, preds_X_test),
            method="SLSQP",
            bounds=bounds,
            options={"maxiter" : 10000}
        )

        scores.append(res["fun"])
        weights.append(res["x"])

    # Scores are negated R2 values, so the smallest one is the best ensemble
    best_idx = np.argmin(scores)
    bestSC = -scores[best_idx]
    bestWght = weights[best_idx]

    print("\n Ensemble Score: {}".format(bestSC))
    print("\n Best Weights: {}".format(bestWght))

    return(bestSC, bestWght)

In [5]:
def replace_probed_y(row):
    """Pick the final ``y`` for one merged row.

    ``row["y_y"]`` wins whenever it is not NaN; otherwise ``row["y_x"]`` is returned.
    """
    probed = row["y_y"]
    return row["y_x"] if np.isnan(probed) else probed

<b>Script</b>

In [6]:
# Get data
# Read the raw test set and keep its ID column separately as an array
init_test = pd.read_csv("raw_data/test.csv")
test_ids = init_test["ID"].values


In [7]:
# Define probe df
# IDs of test rows whose real y value is known, and those values in the same order
probe_ids = [1, 12, 23, 28, 42, 
             43, 45, 57, 72, 78, 
             88, 89, 93, 94, 104, 
             105, 110, 253, 259, 262, 
             337, 409, 437, 488, 493, 
             973, 1001, 1004, 1008, 1009, 
             1644, 1652, 1664, 2129, 2342, 
             3977, 7055, 8002, 8007, 8416]
probe_values = [71.34112, 109.30903, 115.21953, 92.00675, 87.73572, 
                129.79876, 99.55671, 116.02167, 110.54742, 125.28849, 
                90.33211, 130.55165, 105.79792, 103.04672, 92.37968, 
                108.5069, 83.31692, 115.93724, 93.33662, 75.35182, 
                101.23135, 91.00760, 85.96960, 113.39009, 108.40135, 
                106.76189, 111.65212, 91.472, 106.71967, 108.21841, 
                99.14157, 89.77625, 112.93977, 112.03, 93.06, 
                132.08556, 91.549, 95.84858, 87.44019, 96.84773]
assert len(probe_ids) == len(probe_values), "every probed ID needs exactly one value"

# Pair each ID with its value explicitly. Attaching the values by row position
# would only be correct if the test rows appear in exactly the order of probe_ids
# and every probed ID is present in the test set.
probe_y_by_id = dict(zip(probe_ids, probe_values))
new_train = init_test[init_test["ID"].isin(probe_ids)]
# Mapping from the ID column keeps new_train's index, so the concat below aligns row by row
new_y = pd.DataFrame({"y" : new_train["ID"].map(probe_y_by_id)})
probe_df = pd.concat([new_train, new_y], axis = 1)


In [8]:
# Prepare process
# One entry per group of configurations; the three lists are index-aligned.
oof_preds_dirs = [
    "oof_preds/dc1_4/",
    "oof_preds/dc5_8/",
    "oof_preds/dc9_12/",
    "oof_preds/dc13_16/",
]

oof_targets_dirs = [
    "oof_targets/dc1_4/",
    "oof_targets/dc5_8/",
    "oof_targets/dc9_12/",
    "oof_targets/dc13_16/",
]

preds_dirs = [
    "preds/dc1_4/",
    "preds/dc5_8/",
    "preds/dc9_12/",
    "preds/dc13_16/",
]


In [9]:
# Create weighted averages
# For every group of configurations: load the level 1 outputs, report each model's
# OOF score, fit blending weights on the OOF predictions, apply them to the test
# predictions, overwrite the probed rows with their known values and save a CSV.
for oof_preds_dir, oof_targets_dir, preds_dir in zip(oof_preds_dirs, oof_targets_dirs, preds_dirs) :
    print("********************")

    # Create Level 2 inputs
    level_one_preds, level_one_targets, level_one_test = get_level_one_data(oof_preds_dir, oof_targets_dir, preds_dir)
    display(level_one_preds.head(2))
    display(level_one_preds.tail(2))

    # Get error of each OOF column
    model_names = list(level_one_preds.columns)
    preds_X_test = []
    targets_X_test = []
    for col in model_names :
        cv_score = r2_score(level_one_targets[col], level_one_preds[col])
        print("Global OOF r2_score for " + col + " : " + str(cv_score))
        preds_X_test.append(level_one_preds[col])
        targets_X_test.append(level_one_targets[col])
    #plot_preds(targets_X_test, preds_X_test)

    # Optimize weights
    bestSC, bestWght = optimize(preds_X_test, targets_X_test)

    # Prepare submission using optimal weights
    # The weights follow the order of the OOF columns, so pick the test columns by
    # model name: selecting them by integer position fails on current pandas and
    # would mis-assign weights if the two directories ever list models differently.
    missing_models = [col for col in model_names if col not in level_one_test.columns]
    if missing_models :
        raise ValueError("No test predictions found for model(s): {}".format(missing_models))
    preds = level_one_test[model_names].values.dot(bestWght)

    # Modify probed values
    conf_name = oof_preds_dir.split("/")[1]
    file_name = "ens_preds/avg_" + conf_name + ".csv"
    final_preds = pd.DataFrame({"ID": init_test.ID.values, "y": preds})
    # After the merge, y_x is the ensemble prediction and y_y the probed value
    # (NaN for rows that were not probed)
    final_preds = pd.merge(final_preds, probe_df[["ID", "y"]], how = "left", on = "ID")
    final_preds["y"] = final_preds["y_y"].fillna(final_preds["y_x"])
    final_preds = final_preds.drop(["y_x", "y_y"], axis = 1)

    # Save predictions
    out_dir = os.path.dirname(file_name)
    if not os.path.isdir(out_dir) :
        os.makedirs(out_dir)
    final_preds.to_csv(file_name, index = False)

    display(final_preds.head(2))


********************


Unnamed: 0,et_dc1_,et_dc2_,et_dc3_,et_dc4_,gb_dc3_,gb_dc4_,la_dc1_,la_dc2_,la_dc3_,la_dc4_,...,rf_dc3_,rf_dc4_,ri_dc1_,ri_dc2_,ri_dc3_,ri_dc4_,xg_dc1_,xg_dc2_,xg_dc3_,xg_dc4_
0,115.406,113.857,116.123,113.452,116.776,112.425,115.483,114.729,115.483,114.745,...,116.157,113.645,114.792,114.197,114.792,114.158,115.379,113.252,119.797,115.086
1,113.613,112.956,113.466,112.843,110.68,110.578,114.306,112.519,114.306,112.586,...,112.512,112.697,112.711,111.948,112.71,111.89,112.644,111.495,111.419,111.959


Unnamed: 0,et_dc1_,et_dc2_,et_dc3_,et_dc4_,gb_dc3_,gb_dc4_,la_dc1_,la_dc2_,la_dc3_,la_dc4_,...,rf_dc3_,rf_dc4_,ri_dc1_,ri_dc2_,ri_dc3_,ri_dc4_,xg_dc1_,xg_dc2_,xg_dc3_,xg_dc4_
3945,93.349,93.669,93.515,93.704,93.862,94.708,94.885,94.738,94.885,95.362,...,93.296,93.759,93.2,94.213,93.27,94.19,94.247,93.893,93.023,94.31
3946,94.015,94.853,94.151,94.883,95.887,95.809,95.576,95.419,95.576,95.362,...,95.788,94.855,93.199,94.724,93.315,94.744,96.242,94.497,99.251,97.424


Global OOF r2_score for et_dc1_ : 0.572880491667
Global OOF r2_score for et_dc2_ : 0.569650608321
Global OOF r2_score for et_dc3_ : 0.571729325242
Global OOF r2_score for et_dc4_ : 0.56899946984
Global OOF r2_score for gb_dc3_ : 0.559796897974
Global OOF r2_score for gb_dc4_ : 0.555797134133
Global OOF r2_score for la_dc1_ : 0.588814231061
Global OOF r2_score for la_dc2_ : 0.558060408734
Global OOF r2_score for la_dc3_ : 0.588819290786
Global OOF r2_score for la_dc4_ : 0.551603743705
Global OOF r2_score for ll_dc1_ : 0.588857942044
Global OOF r2_score for ll_dc2_ : 0.548257980377
Global OOF r2_score for ll_dc3_ : 0.588857942044
Global OOF r2_score for ll_dc4_ : 0.552249064208
Global OOF r2_score for rf_dc1_ : 0.571961297233
Global OOF r2_score for rf_dc2_ : 0.567866364945
Global OOF r2_score for rf_dc3_ : 0.576381176152
Global OOF r2_score for rf_dc4_ : 0.568825836525
Global OOF r2_score for ri_dc1_ : 0.552033133111
Global OOF r2_score for ri_dc2_ : 0.557608308391
Global OOF r2_score f

Unnamed: 0,ID,y
0,1,71.341
1,2,94.987


********************


Unnamed: 0,et_dc5_,et_dc6_,et_dc7_,et_dc8_,gb_dc5_,gb_dc6_,gb_dc7_,gb_dc8_,la_dc5_,la_dc6_,...,rf_dc7_,rf_dc8_,ri_dc5_,ri_dc6_,ri_dc7_,ri_dc8_,xg_dc5_,xg_dc6_,xg_dc7_,xg_dc8_
0,114.596,115.651,116.241,116.046,111.013,117.01,112.297,114.147,113.642,111.476,...,114.275,115.985,113.465,111.878,113.465,111.83,111.178,113.416,113.042,115.168
1,113.246,112.114,113.087,112.45,111.271,110.734,111.268,109.616,115.402,112.22,...,113.953,112.444,116.376,116.01,116.376,116.024,113.35,111.667,112.473,112.632


Unnamed: 0,et_dc5_,et_dc6_,et_dc7_,et_dc8_,gb_dc5_,gb_dc6_,gb_dc7_,gb_dc8_,la_dc5_,la_dc6_,...,rf_dc7_,rf_dc8_,ri_dc5_,ri_dc6_,ri_dc7_,ri_dc8_,xg_dc5_,xg_dc6_,xg_dc7_,xg_dc8_
3909,93.618,93.687,93.703,93.755,95.309,94.555,94.665,94.748,94.707,92.955,...,94.127,93.864,92.938,93.57,92.938,93.528,94.522,93.643,94.515,94.072
3910,93.947,94.978,94.007,95.319,95.66,95.02,97.201,96.519,95.564,94.649,...,95.248,95.288,93.857,94.66,93.857,94.742,96.225,94.955,99.097,99.595


Global OOF r2_score for et_dc5_ : 0.57288594796
Global OOF r2_score for et_dc6_ : 0.570022030614
Global OOF r2_score for et_dc7_ : 0.572714809399
Global OOF r2_score for et_dc8_ : 0.570165875463
Global OOF r2_score for gb_dc5_ : 0.55582088096
Global OOF r2_score for gb_dc6_ : 0.549236754468
Global OOF r2_score for gb_dc7_ : 0.561802013128
Global OOF r2_score for gb_dc8_ : 0.557940853633
Global OOF r2_score for la_dc5_ : 0.589199526121
Global OOF r2_score for la_dc6_ : 0.552599350302
Global OOF r2_score for la_dc7_ : 0.589199548943
Global OOF r2_score for la_dc8_ : 0.553238938818
Global OOF r2_score for ll_dc5_ : 0.589353705941
Global OOF r2_score for ll_dc6_ : 0.54482853007
Global OOF r2_score for ll_dc7_ : 0.589353705941
Global OOF r2_score for ll_dc8_ : 0.541309429169
Global OOF r2_score for rf_dc5_ : 0.570574708459
Global OOF r2_score for rf_dc6_ : 0.564357954559
Global OOF r2_score for rf_dc7_ : 0.573755763549
Global OOF r2_score for rf_dc8_ : 0.569944956905
Global OOF r2_score for

Unnamed: 0,ID,y
0,1,71.341
1,2,94.273


********************


Unnamed: 0,et_dc10,et_dc11,et_dc12,et_dc9_,gb_dc10,gb_dc12,la_dc10,la_dc11,la_dc12,la_dc9_,...,rf_dc12,rf_dc9_,ri_dc10,ri_dc11,ri_dc12,ri_dc9_,xg_dc10,xg_dc11,xg_dc12,xg_dc9_
0,103.375,103.209,102.503,102.997,103.49,103.605,107.03,105.883,107.243,105.883,...,103.772,104.968,109.84,111.141,109.757,112.328,107.501,108.018,104.542,107.443
1,115.046,115.36,114.825,115.909,118.994,116.835,112.336,111.982,112.275,111.982,...,116.731,119.879,114.376,113.706,114.389,114.904,121.991,120.821,120.155,122.889


Unnamed: 0,et_dc10,et_dc11,et_dc12,et_dc9_,gb_dc10,gb_dc12,la_dc10,la_dc11,la_dc12,la_dc9_,...,rf_dc12,rf_dc9_,ri_dc10,ri_dc11,ri_dc12,ri_dc9_,xg_dc10,xg_dc11,xg_dc12,xg_dc9_
4247,109.778,110.075,110.416,109.511,107.8,108.026,109.924,110.404,109.944,110.413,...,109.341,109.397,109.51,109.275,109.333,109.67,109.008,109.039,108.8,108.331
4248,103.964,104.543,103.762,104.542,103.715,104.205,104.261,102.581,104.248,102.613,...,103.136,102.749,100.19,98.553,99.964,98.47,101.84,103.297,101.958,103.304


Global OOF r2_score for et_dc10 : 0.564286215115
Global OOF r2_score for et_dc11 : 0.566139214969
Global OOF r2_score for et_dc12 : 0.562896679185
Global OOF r2_score for et_dc9_ : 0.565766954545
Global OOF r2_score for gb_dc10 : 0.545413505931
Global OOF r2_score for gb_dc12 : 0.547616515472
Global OOF r2_score for la_dc10 : 0.564610784797
Global OOF r2_score for la_dc11 : 0.581812077052
Global OOF r2_score for la_dc12 : 0.564653215722
Global OOF r2_score for la_dc9_ : 0.58177883074
Global OOF r2_score for ll_dc10 : 0.531316248405
Global OOF r2_score for ll_dc11 : 0.581810763046
Global OOF r2_score for ll_dc12 : 0.546206815662
Global OOF r2_score for ll_dc9_ : 0.581799114493
Global OOF r2_score for rf_dc10 : 0.562180629152
Global OOF r2_score for rf_dc11 : 0.565837118747
Global OOF r2_score for rf_dc12 : 0.56399966403
Global OOF r2_score for rf_dc9_ : 0.563108152623
Global OOF r2_score for ri_dc10 : 0.552743740526
Global OOF r2_score for ri_dc11 : 0.548465732044
Global OOF r2_score fo

Unnamed: 0,ID,y
0,1,71.341
1,2,93.898


********************


Unnamed: 0,et_dc13,et_dc14,et_dc15,et_dc16,gb_dc13,gb_dc14,gb_dc16,la_dc13,la_dc14,la_dc15,...,rf_dc15,rf_dc16,ri_dc13,ri_dc14,ri_dc15,ri_dc16,xg_dc13,xg_dc14,xg_dc15,xg_dc16
0,93.788,93.808,93.924,93.803,94.474,95.656,94.723,94.684,95.18,94.669,...,94.378,93.84,93.908,93.813,94.081,93.841,93.706,94.73,94.164,94.796
1,93.908,93.982,93.845,93.742,94.293,95.717,94.731,94.684,95.621,94.669,...,94.15,93.923,94.281,95.684,94.453,95.713,96.139,95.882,94.563,94.705


Unnamed: 0,et_dc13,et_dc14,et_dc15,et_dc16,gb_dc13,gb_dc14,gb_dc16,la_dc13,la_dc14,la_dc15,...,rf_dc15,rf_dc16,ri_dc13,ri_dc14,ri_dc15,ri_dc16,xg_dc13,xg_dc14,xg_dc15,xg_dc16
4207,92.296,92.295,92.953,93.075,93.961,93.915,94.121,93.459,94.733,93.459,...,92.447,92.22,93.905,93.375,93.067,93.388,93.528,94.549,93.273,93.728
4208,92.164,92.304,93.959,93.767,94.524,93.455,95.215,93.891,94.293,93.891,...,94.843,94.208,97.236,95.576,96.371,95.654,92.028,91.521,91.743,93.981


Global OOF r2_score for et_dc13 : 0.568284095652
Global OOF r2_score for et_dc14 : 0.566985312873
Global OOF r2_score for et_dc15 : 0.56871831139
Global OOF r2_score for et_dc16 : 0.567136181641
Global OOF r2_score for gb_dc13 : 0.549207196931
Global OOF r2_score for gb_dc14 : 0.544464440003
Global OOF r2_score for gb_dc16 : 0.55338428102
Global OOF r2_score for la_dc13 : 0.58224936712
Global OOF r2_score for la_dc14 : 0.565803815256
Global OOF r2_score for la_dc15 : 0.582436417144
Global OOF r2_score for la_dc16 : 0.565794115392
Global OOF r2_score for ll_dc13 : 0.582330225043
Global OOF r2_score for ll_dc14 : 0.538534284043
Global OOF r2_score for ll_dc15 : 0.58238101556
Global OOF r2_score for ll_dc16 : 0.551534008851
Global OOF r2_score for rf_dc13 : 0.564697458651
Global OOF r2_score for rf_dc14 : 0.562391838344
Global OOF r2_score for rf_dc15 : 0.568633845006
Global OOF r2_score for rf_dc16 : 0.56645038039
Global OOF r2_score for ri_dc13 : 0.55235209127
Global OOF r2_score for ri

Unnamed: 0,ID,y
0,1,71.341
1,2,94.865


In [10]:
# NOTE(review): this cell contains no live code. The triple-quoted string below is
# a disabled snippet that scored each entry of a `models_OOF` list; evaluating the
# bare string only makes the notebook echo it back as the cell output.
# `models_OOF` is not defined by any active cell shown here, so the snippet would
# not run if re-enabled as is. Consider deleting the cell.
'''# Get error of each OOF column
for i in range(len(models_OOF)) :
    cur_model_OOF = models_OOF[i]
    model_name = cur_model_OOF.columns[0][0:7]
    cv_score = r2_score(cur_model_OOF[[1]], cur_model_OOF[[0]])
    print("Global OOF r2_score for " + model_name + " : " + str(cv_score))
#plot_preds(targets_X_test, preds_X_test)'''


'# Get error of each OOF column\nfor i in range(len(models_OOF)) :\n    cur_model_OOF = models_OOF[i]\n    model_name = cur_model_OOF.columns[0][0:7]\n    cv_score = r2_score(cur_model_OOF[[1]], cur_model_OOF[[0]])\n    print("Global OOF r2_score for " + model_name + " : " + str(cv_score))\n#plot_preds(targets_X_test, preds_X_test)'

In [11]:
# NOTE(review): this cell contains no live code. The triple-quoted string below is
# a disabled, argument-less variant of get_level_one_data that reads from the fixed
# "oof_preds/", "oof_targets/" and "preds/" directories and returns a list of
# per-model frames plus the stacked test predictions. Because it is only a string,
# it does not redefine the active get_level_one_data. Consider deleting the cell.
'''# Stack all single Level 1 models OOF and test preds as entries for a Level 2 model
def get_level_one_data() :
    # Get level 1 preds and targets
    models_OOF = []
    i = 0
    preds_dir = "oof_preds/"
    targets_dir = "oof_targets/"
    for file_name in sorted(os.listdir(preds_dir)) :
        preds_OOF = pd.read_csv(preds_dir + file_name)
        targets_OOF = pd.read_csv(targets_dir + file_name.replace("preds", "targets"))
        cur_preds_OOF = pd.Series(preds_OOF.y, name = file_name[0:7])
        cur_targets_OOF = pd.Series(targets_OOF.y, name = file_name[0:7])
        cur_model_OOF = pd.DataFrame({file_name[0:7] + "_preds" : cur_preds_OOF, file_name[0:7] + "_targets" : cur_targets_OOF})
        models_OOF.append(cur_model_OOF)

    # Get test set preds
    level_one_test = pd.DataFrame()
    i = 0
    test_dir = "preds/"
    for file_name in sorted(os.listdir(test_dir)) :
        preds_test = pd.read_csv(test_dir + file_name)
        cur_preds_test = pd.Series(preds_test.y, name = file_name[0:7])
        level_one_test = pd.concat([level_one_test, cur_preds_test], axis = 1)
        i += 1
    display(level_one_test.head())
    
    return(models_OOF, level_one_test)'''

'# Stack all single Level 1 models OOF and test preds as entries for a Level 2 model\ndef get_level_one_data() :\n    # Get level 1 preds and targets\n    models_OOF = []\n    i = 0\n    preds_dir = "oof_preds/"\n    targets_dir = "oof_targets/"\n    for file_name in sorted(os.listdir(preds_dir)) :\n        preds_OOF = pd.read_csv(preds_dir + file_name)\n        targets_OOF = pd.read_csv(targets_dir + file_name.replace("preds", "targets"))\n        cur_preds_OOF = pd.Series(preds_OOF.y, name = file_name[0:7])\n        cur_targets_OOF = pd.Series(targets_OOF.y, name = file_name[0:7])\n        cur_model_OOF = pd.DataFrame({file_name[0:7] + "_preds" : cur_preds_OOF, file_name[0:7] + "_targets" : cur_targets_OOF})\n        models_OOF.append(cur_model_OOF)\n\n    # Get test set preds\n    level_one_test = pd.DataFrame()\n    i = 0\n    test_dir = "preds/"\n    for file_name in sorted(os.listdir(test_dir)) :\n        preds_test = pd.read_csv(test_dir + file_name)\n        cur_preds_test = pd