# Differential Evolution for Hyperparameter Tuning

In [112]:
import numpy as np
import pandas as pd
import data_preprocessing

### Loading data and bringing in the right format

In [113]:
inputDF, df = data_preprocessing.preprocess_data()

reading file...
Tokenizing Summary
Combining...
Preprocessing done!


In [118]:
inputDF, df.shape

(<83041x15940 sparse matrix of type '<type 'numpy.float64'>'
 	with 3359482 stored elements in Compressed Sparse Row format>, (83041, 40))

### ML model building

In [119]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_val_score
# Differential evolution optimizer
from scipy.optimize import differential_evolution

In [120]:
args = (inputDF,df['Priority'])
bounds = [ (10,100),(5,20),(2,10),(2,10) ] # order: n_estimators,max_depth,min_samples_split,min_samples_leaf

In [130]:
def func(parameters, *args):
    #print args[0], args[1]
    parameters = map(int,parameters)
    clf = RandomForestClassifier(
        n_jobs=-1, 
        n_estimators=parameters[0], 
        max_depth=parameters[1],
        min_samples_split=parameters[2],
        min_samples_leaf=parameters[3],
        class_weight="balanced"
    )
    clf = RandomForestClassifier(class_weight="balanced", n_jobs=-1)
    f1scorer_macro = make_scorer(f1_score, average='macro')
    f1 = cross_val_score(clf, args[0], args[1], scoring=f1scorer_macro, cv=3, n_jobs=-1)
    average_f1 = np.mean(f1)
    print average_f1, parameters
    return -1*average_f1

In [132]:
%time result = differential_evolution(func, bounds, args, popsize=10, mutation=(0.5,1.9), recombination=0.7, maxiter=1)

0.385716100753 [14, 8, 2, 9]
0.385716100753 [23, 8, 5, 2]
0.385716100753 [47, 16, 8, 7]
0.385716100753 [30, 5, 8, 8]
0.385716100753 [68, 6, 4, 4]
0.385716100753 [53, 11, 4, 5]
0.385716100753 [10, 15, 8, 6]
0.385716100753 [37, 19, 9, 3]
0.385716100753 [70, 15, 3, 7]
0.385716100753 [45, 12, 3, 7]
0.385716100753 [62, 10, 5, 4]
0.385716100753 [77, 13, 4, 3]
0.385716100753 [59, 19, 6, 2]
0.385716100753 [97, 11, 3, 8]
0.385716100753 [73, 7, 7, 6]
0.385716100753 [89, 14, 8, 6]
0.385716100753 [99, 13, 6, 7]
0.385716100753 [39, 12, 4, 9]
0.385716100753 [17, 8, 2, 9]
0.385716100753 [35, 5, 2, 3]
0.385716100753 [88, 9, 4, 7]
0.385716100753 [84, 7, 7, 2]
0.385716100753 [55, 14, 7, 2]
0.385716100753 [65, 6, 8, 9]
0.385716100753 [84, 7, 2, 2]
0.385716100753 [72, 5, 2, 8]
0.385716100753 [24, 18, 9, 4]
0.385716100753 [95, 18, 9, 3]
0.385716100753 [28, 11, 5, 8]
0.385716100753 [19, 17, 6, 8]
0.385716100753 [14, 10, 3, 5]
0.385716100753 [76, 16, 3, 6]
0.385716100753 [58, 17, 9, 5]
0.385716100753 [81, 16

In [135]:
result.x

array([ 10.04737239,  15.42451623,   6.78045424,   4.63141731])

In [136]:
parameters = map(int,result.x)
print 'Tuned Parameters:',parameters

Tuned Parameters: [10, 15, 6, 4]


In [139]:
clf = RandomForestClassifier(
        n_jobs=-1, 
        n_estimators=parameters[0], 
        max_depth=parameters[1],
        min_samples_split=parameters[2],
        min_samples_leaf=parameters[3],
        class_weight='balanced'
    )
f1scorer_macro = make_scorer(f1_score, average='macro')
f1 = cross_val_score(clf, inputDF, df['Priority'], scoring=f1scorer_macro, cv=10, n_jobs=-1)
average_f1 = np.mean(f1)
print 'Cross validation metrics:',average_f1, np.std(f1)
%time clf.fit(inputDF, df['Priority'])

Cross validation metrics: 0.293654716975 0.0175176356588
CPU times: user 1 s, sys: 4 ms, total: 1.01 s
Wall time: 407 ms


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=15, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=4,
            min_samples_split=6, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [140]:
clf = RandomForestClassifier(n_jobs=-1)
f1scorer_macro = make_scorer(f1_score, average='macro')
f1 = cross_val_score(clf, inputDF, df['Priority'], scoring=f1scorer_macro, cv=10, n_jobs=-1)
average_f1 = np.mean(f1)
print 'Cross validation metrics:',average_f1, np.std(f1)
%time clf.fit(inputDF, df['Priority'])

Cross validation metrics: 0.414096109438 0.0138912232415
CPU times: user 30.7 s, sys: 0 ns, total: 30.7 s
Wall time: 6.3 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)