# Find optimal weights with OOF predictions using weight minimization technique 

In this notebook we look at how to implement the weight-minimization technique for finding optimal ensembling weights. We apply SciPy's `minimize` function to the out-of-fold (OOF) predictions of our base models to find the weights.

In [19]:
#load important libraries
#author:sohaib
from __future__ import division
#import required modules
from datetime import datetime
import pandas as pd 
from IPython.display import display
import numpy as np

from sklearn.metrics import roc_auc_score
from scipy.optimize import minimize
from numba import jit

# Suppress exponential notation when pandas prints floats (fixed-point with
# thousands separators instead) and allow up to 100 columns to be displayed.
pd.set_option('display.max_columns', 100)
pd.set_option('display.float_format', '{:20,.2f}'.format)

In [17]:
def auc_to_gini(score):
    """Map an AUC score onto the Gini scale.

    The value returned is ``2 * score - 1``.

    Arguments:
        score {float} -- AUC score

    Returns:
        [float] -- the corresponding gini score
    """
    return 2 * score - 1


# __author__: Tilii (see the Kaggle link below)

#TODO: Multiprocessing takes more time here, because the inter-process communication overhead is
#more computationally costly in this scenario than the function computation itself

#https://www.kaggle.com/tilii7/ensemble-weights-minimization-vs-mcmc/code
def weight_minimize(weights, y, predictions_list):
    """Objective function for scipy's ``minimize``: negative Gini of a blend.

    Each prediction in ``predictions_list`` is multiplied by the weight at the
    same position and the products are summed. The blend is scored with
    ``roc_auc_score`` against ``y``, converted to Gini, and negated so that
    minimizing this function maximizes the Gini.

    Arguments:
        weights -- one weight per prediction (scipy passes a numpy array)
        y -- true labels, forwarded to ``roc_auc_score``
        predictions_list -- predictions of the base models, in the same
            order as ``weights``

    Returns:
        the negated Gini score of the weighted blend
    """
    blended = 0
    for coef, model_preds in zip(weights, predictions_list):
        blended += coef * model_preds

    auc = roc_auc_score(y, blended)
    return -auc_to_gini(auc)

def timer(start_time=None):
    """Start a stopwatch, or print the time elapsed since it was started.

    Arguments:
        start_time {datetime} -- value returned by an earlier ``timer()``
            call; leave it as None (or any falsy value) to start timing

    Returns:
        [datetime] -- the current time when no start time is given;
            None when the elapsed time is printed instead
    """
    # No start time yet: hand back "now" so the caller can pass it in later.
    if not start_time:
        return datetime.now()

    # Break the elapsed seconds into hours, minutes and leftover seconds.
    elapsed_sec = (datetime.now() - start_time).total_seconds()
    thour, remainder = divmod(elapsed_sec, 3600)
    tmin, tsec = divmod(remainder, 60)
    print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [24]:
# Optimizer settings: the number of random restarts of the weight search, and
# the 'maxiter' cap handed to scipy's minimize on each restart.
MAX_WEIGHTS_ITER = 10
MAX_ITER_FOR_MINIMIZE_FUNC = 1000

# Porto Seguro competition data; download it from
# https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/data
TEST_PATH = 'porto_data/test.csv'
TRAIN_PATH = 'porto_data/train.csv'

## Load OOF predictions of base models


In [14]:
# Training data; only its 'target' column is used here, as the true labels.
train = pd.read_csv(TRAIN_PATH)
y = train['target']

# Out-of-fold predictions of the base models, one CSV per model.
# NOTE(review): presumably each file has one row per training row, in the
# same order as train.csv -- confirm before blending.
lgb_oof = pd.read_csv('base_models_results/oof_lgb_base.csv')
xgb_oof = pd.read_csv('base_models_results/oof_xgb_base.csv')

# Order matters: the i-th optimized weight applies to the i-th entry.
oof_predictions = [xgb_oof['xgb_basetrain'], lgb_oof['lgb_basetrain']]

In [22]:
# Search for blending weights: run the bounded optimizer from several random
# starting points and keep the run with the lowest objective value.
start_time = timer(None)
# print(...) with a single argument works on both Python 2 and Python 3 and
# matches the call style already used in timer().
print('Finding weights by minimization ...')
gini = []          # objective value (negative Gini) reached by each restart
weights_list = []  # weight vector found by each restart

# Every weight is constrained to [0, 1]; the bounds do not change between
# restarts, so build them once.
bounds = [(0, 1)] * len(oof_predictions)

for i in range(MAX_WEIGHTS_ITER):
    # NOTE(review): no random seed is set in the visible cells, so the
    # starting points (and possibly the result) differ between runs.
    starting_values = np.random.uniform(size=len(oof_predictions))
    res = minimize(
        weight_minimize,
        starting_values,
        method='L-BFGS-B',
        args=(y, oof_predictions),
        bounds=bounds,
        options={'disp': False, 'maxiter': MAX_ITER_FOR_MINIMIZE_FUNC},
    )
    gini.append(res['fun'])
    weights_list.append(res['x'])

# weight_minimize returns the *negative* Gini, so the smallest value is the
# best run; the score printed below is therefore negative as well.
best_iter = np.argmin(gini)
bestscore = np.min(gini)
print("Best score = {:.6f} on {} iteration".format(bestscore, str(best_iter)))

bestweights = weights_list[best_iter]
print("Best Weights:  {}".format(bestweights))
timer(start_time)

Finding weights by minimization ...
Best score = -0.254704 on 7 iteration
Best Weights:  [ 0.79956133  0.        ]

 Time taken: 0 hours 0 minutes and 3.28 seconds.


In [23]:
#use weights found, multiply and add with base models test predictions