# Differential Evolution for Hyperparameter Tuning

In [112]:
import numpy as np
import pandas as pd
import data_preprocessing

### Loading data and bringing in the right format

In [113]:
inputDF, df = data_preprocessing.preprocess_data()

reading file...
Tokenizing Summary
Combining...
Preprocessing done!


In [118]:
inputDF, df.shape

(<83041x15940 sparse matrix of type '<type 'numpy.float64'>'
 	with 3359482 stored elements in Compressed Sparse Row format>, (83041, 40))

### ML model building

In [119]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_val_score
# Differential evolution optimizer
from scipy.optimize import differential_evolution

In [120]:
args = (inputDF,df['Priority'])
bounds = [ (10,100),(5,20),(2,10),(2,10) ] # order: n_estimators,max_depth,min_samples_split,min_samples_leaf

In [130]:
def func(parameters, *args):
    #print args[0], args[1]
    parameters = map(int,parameters)
    clf = RandomForestClassifier(
        n_jobs=-1, 
        n_estimators=parameters[0], 
        max_depth=parameters[1],
        min_samples_split=parameters[2],
        min_samples_leaf=parameters[3],
        class_weight="balanced"
    )
    clf = RandomForestClassifier(class_weight="balanced", n_jobs=-1)
    f1scorer_macro = make_scorer(f1_score, average='macro')
    f1 = cross_val_score(clf, args[0], args[1], scoring=f1scorer_macro, cv=3, n_jobs=-1)
    average_f1 = np.mean(f1)
    print average_f1, parameters
    return -1*average_f1

In [131]:
func(np.array([10,10,2,1]),args)

IndexError: tuple index out of range

In [None]:
%time result = differential_evolution(func, bounds, args, popsize=10, mutation=(0.5,1.9), recombination=0.7, maxiter=1)

0.385716100753 [14, 8, 2, 9]
0.385716100753 [23, 8, 5, 2]
0.385716100753 [47, 16, 8, 7]
0.385716100753 [30, 5, 8, 8]
0.385716100753 [68, 6, 4, 4]
0.385716100753 [53, 11, 4, 5]
0.385716100753 [10, 15, 8, 6]
0.385716100753 [37, 19, 9, 3]
0.385716100753 [70, 15, 3, 7]
0.385716100753 [45, 12, 3, 7]
0.385716100753 [62, 10, 5, 4]
0.385716100753 [77, 13, 4, 3]
0.385716100753 [59, 19, 6, 2]
0.385716100753 [97, 11, 3, 8]
0.385716100753 [73, 7, 7, 6]
0.385716100753 [89, 14, 8, 6]
0.385716100753 [99, 13, 6, 7]
0.385716100753 [39, 12, 4, 9]
0.385716100753 [17, 8, 2, 9]
0.385716100753 [35, 5, 2, 3]
0.385716100753 [88, 9, 4, 7]
0.385716100753 [84, 7, 7, 2]
0.385716100753 [55, 14, 7, 2]
0.385716100753 [65, 6, 8, 9]
0.385716100753 [84, 7, 2, 2]
0.385716100753 [72, 5, 2, 8]
0.385716100753 [24, 18, 9, 4]
0.385716100753 [95, 18, 9, 3]
0.385716100753 [28, 11, 5, 8]
0.385716100753 [19, 17, 6, 8]
0.385716100753 [14, 10, 3, 5]
0.385716100753 [76, 16, 3, 6]
0.385716100753 [58, 17, 9, 5]
0.385716100753 [81, 16

In [62]:
parameters = map(int,result.x)
print 'Tuned Parameters:',parameters

Tuned Parameters: [99, 5, 8, 5]


In [None]:
clf = RandomForestClassifier(
        n_jobs=-1, 
        n_estimators=parameters[0], 
        max_depth=parameters[1],
        min_samples_split=parameters[2],
        min_samples_leaf=parameters[3]
    )
f1scorer_macro = make_scorer(f1_score, average='macro')
f1 = cross_val_score(clf, args[0], args[1], scoring=f1scorer_macro, cv=10, n_jobs=-1)
average_f1 = np.mean(f1)
print 'Cross validation metrics:',average, np.std(f1)
%time clf.fit(inputDF, df['Priority'])

In [18]:
from scipy.optimize import rosen, differential_evolution
bounds = [(0,2), (0, 2), (0, 2), (0, 2), (0, 2)]
result = differential_evolution(rosen, bounds)
result.x, result.fun

(array([ 1.,  1.,  1.,  1.,  1.]), 0.0)

In [91]:
f1scorer_macro = make_scorer(f1_score, average='macro')

In [92]:
clf = RandomForestClassifier(n_jobs=-1)

In [109]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

clf = RandomForestClassifier(class_weight="balanced", n_jobs=-1)
kf = KFold(n_splits=10)

In [111]:
cvscores = []
for train,test in kf.split(inputDF):
    print len(train)
    train_input = inputDF[train]
    test_input = inputDF[test]
    train_output = df['Priority'][train]
    test_output = df['Priority'][test]
    %time clf.fit(train_input,train_output)
    prediction = clf.predict(test_input)
    scores = f1_score(test_output, prediction, average=None)
    print scores
    cvscores.append(scores)

74736


TypeError: only integer scalar arrays can be converted to a scalar index

In [96]:
inputDF.shape

(83041, 15980)