In [1]:
# Imports
import pandas as pd
import numpy as np
import datetime
import warnings
from scipy.optimize import minimize
from sklearn.metrics import r2_score
from IPython.display import display

# Definitions
pd.set_option('display.float_format', lambda x: '%.3f' % x)
%matplotlib inline
warnings.simplefilter('ignore')
N_JOBS = -1
SEED = 2017

In [2]:
# Define function to minimize
def target_func(weights, preds=None, y_true=None, score_func=None) :
    """Return the negated score of a weighted blend of model predictions.

    The blend is ``sum(weight * prediction)`` over the paired entries of
    ``weights`` and ``preds``. The score is negated so that a minimizer
    maximises it.

    Parameters
    ----------
    weights : sequence of float
        One weight per prediction vector.
    preds : sequence of array-like, optional
        Prediction vectors to blend. Defaults to the notebook-level
        ``preds_X_test`` (the original behaviour).
    y_true : array-like, optional
        Targets the blend is scored against. Defaults to the notebook-level
        ``X_test_y`` (the original behaviour).
    score_func : callable, optional
        ``score_func(y_true, y_pred)`` returning a number where higher is
        better. Defaults to ``r2_score``.

    Returns
    -------
    float
        ``-score_func(y_true, blended_prediction)``.
    """
    # Fall back to the notebook globals only when the caller gave no data,
    # so existing single-argument calls keep working unchanged.
    if preds is None:
        preds = preds_X_test
    if y_true is None:
        y_true = X_test_y
    if score_func is None:
        score_func = r2_score

    final_prediction = sum(
        weight * prediction for weight, prediction in zip(weights, preds)
    )
    return -score_func(y_true, final_prediction)


# Optimize ensemble weights on held out fold
def optimize(preds_X_test, names, y_true=None, score_func=None, n_starts=100, seed=None) :
    """Search for blending weights that maximise a score on held-out predictions.

    SLSQP is restarted ``n_starts`` times from starting weights drawn
    uniformly at random, and the run with the lowest objective value (the
    highest score) is kept. Each weight is bounded to [0, 1]; no sum-to-one
    constraint is passed to the optimizer.

    Parameters
    ----------
    preds_X_test : sequence of array-like
        One held-out prediction vector per model. The objective is built from
        this argument, so the returned weights always correspond to the
        predictions that were passed in.
    names : list
        Model labels. They are only printed and returned unchanged.
    y_true : array-like, optional
        Targets the blend is scored against. Defaults to the notebook-level
        ``X_test_y``.
    score_func : callable, optional
        ``score_func(y_true, y_pred)`` returning a number where higher is
        better. Defaults to ``r2_score``.
    n_starts : int, optional
        Number of random restarts (default 100).
    seed : int, optional
        Seed for a private ``np.random.RandomState``. When ``None`` the
        starting weights come from NumPy's global RNG, as before.

    Returns
    -------
    tuple
        ``(bestSC, bestWght, names)``: best score, the weight vector that
        achieved it, and ``names`` as given.

    Raises
    ------
    ValueError
        If ``preds_X_test`` is empty or ``n_starts`` is smaller than 1.
    """
    n_models = len(preds_X_test)
    if n_models == 0:
        raise ValueError("preds_X_test must contain at least one prediction vector")
    if n_starts < 1:
        raise ValueError("n_starts must be at least 1")

    # Resolve the defaults lazily so the notebook globals are only needed
    # when the caller does not supply their own data / metric.
    if y_true is None:
        y_true = X_test_y
    if score_func is None:
        score_func = r2_score

    def negative_score(candidate_weights):
        # Negated because `minimize` minimizes and we want the best score.
        blended = sum(
            weight * prediction
            for weight, prediction in zip(candidate_weights, preds_X_test)
        )
        return -score_func(y_true, blended)

    rng = np.random if seed is None else np.random.RandomState(seed)

    # Our weights are bound between 0 and 1 (same bounds for every restart)
    bounds = [(0, 1)] * n_models

    scores = []
    weights = []
    for _ in range(n_starts):
        # Choose many random starting weights
        starting_values = rng.uniform(size = n_models)

        res = minimize(
            negative_score,
            starting_values,
            method = "SLSQP",
            bounds = bounds,
            options = {"maxiter" : 10000})

        scores.append(res["fun"])
        weights.append(res["x"])

    best_run = int(np.argmin(scores))
    bestSC = -scores[best_run]
    bestWght = weights[best_run]

    print("\n Ensemble Score: {}".format(bestSC))
    print("\n Best Weights: {}".format(bestWght))
    print("\n Names: {}".format(names))

    return(bestSC, bestWght, names)

<b>Script</b>

In [3]:
# Get data
# Held-out targets: the single-model and ensemble scores are computed against these.
X_test_y = pd.read_csv("clean_data/X_test_y.csv")

# Test-set IDs (read with header=None, so no row is consumed as a header)
# and the test set itself.
test_ids = pd.read_csv("clean_data/test_ids.csv", header=None)
test = pd.read_csv("clean_data/test.csv")



In [25]:
# Get validation set data (read with header=None: every row is data)
la_preds_X_test, et_preds_X_test, xg_preds_X_test = (
    np.array(pd.read_csv(csv_path, header=None))
    for csv_path in (
        "preds/la_preds_X_test.csv",
        "preds/et_preds_X_test.csv",
        "preds/xg_preds_X_test.csv",
    )
)

# Get test set data (default header handling: the first row becomes the header)
la_preds_test, et_preds_test, xg_preds_test = (
    np.array(pd.read_csv(csv_path))
    for csv_path in (
        "preds/la_preds_test.csv",
        "preds/et_preds_test.csv",
        "preds/xg_preds_test.csv",
    )
)

# Held-out predictions per model: first column of each validation file
preds_X_test = [
    la_preds_X_test[:, 0],
    et_preds_X_test[:, 0],
    xg_preds_X_test[:, 0],
]

# Test predictions per model: second column of each test file
preds_test = [
    la_preds_test[:, 1],
    et_preds_test[:, 1],
    xg_preds_test[:, 1],
]

# Model labels, in the same order as the two prediction lists
names = ["la", "et", "xg"]


In [26]:
# Sanity check: show the per-model test-set prediction vectors
# (one array per entry of `names`, in the same order).
print(preds_test)


[array([  79.14059146,   94.0194818 ,   79.14059146, ...,   93.47911443,
        111.01956104,   93.17548759]), array([  77.77629834,   94.3189357 ,   77.56576577, ...,   93.07136846,
        111.13477199,   93.06974713]), array([  79.18000031,   94.309021  ,   80.21730042, ...,   92.15676117,
        109.28131104,   91.15718842])]


In [27]:
# Scores of single models on validation set
# R^2 of each model's held-out predictions against the held-out targets,
# paired with the label it is printed under.
held_out_scores = (
    ("la on held out fold : ", r2_score(X_test_y, la_preds_X_test)),
    ("et on held out fold : ", r2_score(X_test_y, et_preds_X_test)),
    ("xg on held out fold : ", r2_score(X_test_y, xg_preds_X_test)),
)
for label, held_out_score in held_out_scores:
    # str() is kept on purpose so the printed precision matches the original.
    print(label + str(held_out_score))


la on held out fold : 0.627059169621
et on held out fold : 0.614172687261
xg on held out fold : 0.611368707735


In [28]:
# Optimize weights
# Returns the best ensemble score, the matching weight vector and the model
# labels (handed back unchanged, so rebinding `names` is harmless).
(bestSC, bestWght, names) = optimize(preds_X_test=preds_X_test, names=names)


 Ensemble Score: 0.6278532631267271

 Best Weights: [  8.64478837e-01   1.32718692e-01   1.60378407e-19]

 Names: ['la', 'et', 'xg']


In [30]:
# Prepare submission using optimal weights
# A silent mismatch between weights and prediction vectors would produce a
# wrong blend, so fail loudly instead.
if len(bestWght) != len(preds_test):
    raise ValueError(
        "got {} weights for {} prediction vectors".format(len(bestWght), len(preds_test))
    )

# Weighted sum of the per-model test-set predictions.
preds = 0
for weight, model_preds in zip(bestWght, preds_test):
    preds = preds + weight * model_preds

In [31]:
# Save predictions
# Two-column submission: the first column of the ID file next to the blended predictions.
submission = pd.DataFrame({"ID": test_ids[0], "y": preds})
submission.to_csv("preds/blend.csv", index=False)
