In [1]:
# Imports
import os
import pandas as pd
import numpy as np
import datetime
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.optimize import minimize
from sklearn.metrics import r2_score
from IPython.display import HTML, display

# Definitions
pd.set_option('display.float_format', lambda x: '%.3f' % x)
%matplotlib inline
display(HTML("<style>.container { width: 90% !important; }</style>"))
warnings.simplefilter('ignore')
N_JOBS = -1
SEED = 2017

In [6]:
# Stack all single Level 1 models OOF and test preds as entries for a Level 2 model
def get_level_one_outputs(OOF_targets) :
    level_one_OOF = pd.DataFrame()
    i = 0
    root_dir = "oof_preds/"
    for file_name in sorted(os.listdir(root_dir)) :
        preds_OOF = pd.read_csv(root_dir + file_name)
        cur_preds_OOF = pd.Series(preds_OOF.y, name = file_name[0:7])
        level_one_OOF = pd.concat([level_one_OOF, cur_preds_OOF], axis = 1)
        i += 1
    display(level_one_OOF.head())

    # Same for test set preds
    level_one_test = pd.DataFrame()
    i = 0
    root_dir = "preds/"
    for file_name in sorted(os.listdir(root_dir)) :
        preds_test = pd.read_csv(root_dir + file_name)
        cur_preds_test = pd.Series(preds_test.y, name = file_name[0:7])
        level_one_test = pd.concat([level_one_test, cur_preds_test], axis = 1)
        i += 1
    display(level_one_test.head())
    
    return(level_one_OOF, level_one_test)

In [7]:
# Plot predictions
def plot_preds(targets, preds) :
    plt.style.use("fivethirtyeight")
    plt.figure(figsize=(10, 4))
    colormap = plt.cm.gist_ncar
    plt.gca().set_color_cycle([colormap(i) for i in np.linspace(0, 0.9, len(preds))])
    plt.plot([0, 50], [0, 300], c = "red", linewidth = 2)
    for i in range(len(preds)) :        
        plt.scatter(targets, preds[i], marker = "o", s = 5, label = "Model" + str(i + 1))
    plt.xlabel("Real y")
    plt.ylabel("Projected y")
    plt.legend()
    plt.show()

In [8]:
# Define function to minimize
def target_func(weights) :
    final_prediction = 0
    for weight, prediction in zip(weights, preds_X_test):
        final_prediction += weight * prediction
    return(-r2_score(OOF_targets.y, final_prediction))


# Optimize ensemble weights on held out fold
def optimize(preds_X_test) :
    scores = []
    weights = []
    for i in range(100):
        # Choose many random starting weights
        starting_values = np.random.uniform(size = len(preds_X_test))

        # Our weights are bound between 0 and 1
        bounds = [(0, 1)] * len(preds_X_test)

        res = minimize(
            target_func, 
            starting_values, 
            method = "SLSQP", 
            bounds = bounds, 
            options = {"maxiter" : 10000})

        scores.append(res["fun"])
        weights.append(res["x"])

    bestSC = -np.min(scores)
    bestWght = weights[np.argmin(scores)]

    print("\n Ensemble Score: {}".format(bestSC))
    print("\n Best Weights: {}".format(bestWght))

    return(bestSC, bestWght)

<b>Script</b>

In [9]:
# Get data
OOF_targets = pd.read_csv("clean_data/OOF_targets.csv")
init_test = pd.read_csv("raw_data/test.csv")
test_ids = init_test.ID.values

display(OOF_targets.head())

Unnamed: 0,ID,y
0,1,99.15
1,2,91.98
2,3,91.52
3,4,91.57
4,5,90.11


In [10]:
# Create Level 2 inputs
level_one_OOF, level_one_test = get_level_one_outputs(OOF_targets)

Unnamed: 0,et_dc15,et_dc16,et_dc17,et_dc18,et_dc19,et_dc20,xg_dc15,xg_dc16,xg_dc17,xg_dc18,xg_dc19,xg_dc20
0,93.633,93.613,93.642,93.635,93.696,93.711,94.17,93.896,92.817,93.582,94.268,93.549
1,93.838,94.145,93.893,94.162,93.93,94.041,94.164,95.514,93.873,95.439,94.164,94.902
2,93.407,93.346,93.266,93.385,93.333,93.497,90.89,91.675,90.846,91.561,90.932,91.555
3,93.894,93.744,93.947,93.81,94.081,93.75,89.279,90.51,89.449,91.293,89.725,90.99
4,94.192,93.768,94.207,93.771,94.294,93.831,94.547,93.463,94.465,93.755,94.396,93.907


Unnamed: 0,et_dc15,et_dc16,et_dc17,et_dc18,et_dc19,et_dc20,xg_dc15,xg_dc16,xg_dc17,xg_dc18,xg_dc19,xg_dc20
0,75.893,76.32,75.927,76.277,76.134,76.531,74.369,75.761,74.41,75.386,74.336,75.876
1,93.852,93.632,93.884,93.68,93.754,93.764,94.994,94.77,94.697,94.399,96.554,94.848
2,76.968,77.346,77.353,77.362,77.29,77.294,79.332,78.784,79.06,78.761,79.793,79.056
3,77.787,77.418,77.641,77.158,77.84,77.349,76.799,76.739,77.274,76.604,76.859,76.543
4,112.413,112.611,112.542,112.568,112.518,112.52,115.185,115.414,117.067,116.204,116.415,115.469


In [11]:
# Get error of each OOF column
preds_X_test = []
for col in level_one_OOF.columns :
    cv_score = r2_score(OOF_targets.y, level_one_OOF[col])
    print("Global OOF r2_score for " + col + " : " + str(cv_score))

    preds_X_test.append(level_one_OOF[col])
plot_preds(OOF_targets.y, preds_X_test)


ValueError: Found input variables with inconsistent numbers of samples: [4209, 4244]

In [None]:
# Optimize weights
bestSC, bestWght = optimize(preds_X_test)

In [None]:
# Prepare submission using optimal weights
preds = 0
for i in range(0, len(bestWght)) :
    preds = preds + (bestWght[i] * level_one_test[[i]].values)

In [None]:
pd.DataFrame(preds)[0]

In [None]:
# Save predictions
file_name = "ens_preds/avg_" + str(level_one_OOF.shape[1]) + ".csv"
pd.DataFrame({"ID": init_test.ID.values, "y": pd.DataFrame(preds)[0]}).to_csv(file_name, index = False)


In [None]:
'''# Get validation set data
la_preds_X_test = np.array(pd.read_csv("preds/la_preds_X_test.csv", header = None))
et_preds_X_test = np.array(pd.read_csv("preds/et_preds_X_test.csv", header = None))
xg_preds_X_test = np.array(pd.read_csv("preds/xg_preds_X_test.csv", header = None))

# Get test set data
la_preds_test = np.array(pd.read_csv("preds/la_preds_test.csv"))
et_preds_test = np.array(pd.read_csv("preds/et_preds_test.csv"))
xg_preds_test = np.array(pd.read_csv("preds/xg_preds_test.csv"))

preds_X_test = []
preds_X_test.append(la_preds_X_test[:, 0])
preds_X_test.append(et_preds_X_test[:, 0])
preds_X_test.append(xg_preds_X_test[:, 0])

preds_test = []
preds_test.append(la_preds_test[:, 1])
preds_test.append(et_preds_test[:, 1])
preds_test.append(xg_preds_test[:, 1])

names = []
names.append("la")
names.append("et")
names.append("xg")'''


In [None]:
'''# Scores of single models on validation set
print("la on held out fold : " + str(r2_score(X_test_y, la_preds_X_test)))
print("et on held out fold : " + str(r2_score(X_test_y, et_preds_X_test)))
print("xg on held out fold : " + str(r2_score(X_test_y, xg_preds_X_test)))'''


In [None]:
'''# Optimize weights
bestSC, bestWght, names = optimize(preds_X_test, names)'''

In [None]:
'''# Prepare submission using optimal weights
preds = 0
for i in range(0, len(bestWght)) :
    (bestWght[i] * preds_test[i])
    preds += (bestWght[i] * preds_test[i])'''

In [None]:
'''# Save predictions
pd.DataFrame({"ID": test_ids[0], "y": preds}).to_csv("preds/blend.csv", index = False)'''
