# Differential Evolution for Hyperparameter Tuning

In [1]:
import numpy as np
import pandas as pd
from data_preprocessing import DataProcessor

### Loading the data and bringing it into the right format

In [2]:
# Read the already-preprocessed test and train splits from disk (relative paths).
test_df = pd.read_csv('../data/processed/test_processed.csv')
train_df = pd.read_csv('../data/processed/train_processed.csv')

In [4]:
# Single preprocessor instance: it is fitted on the training frame and then
# reused, already fitted, to transform the test frame.
p = DataProcessor()

In [5]:
# Fit the preprocessor on the training data and transform it in one call.
# Two objects come back: the feature matrix used for modelling, and a second
# object that is later indexed with 'Priority' to obtain the target labels.
train_processed, train_original = p.fit_transform(train_df)

removed nan values
count vectorizer finished fitting
count vector finished transforming
dummy variables created
created a sparse matrix of all features


In [6]:
# Display the training feature matrix (type and shape) as a sanity check.
train_processed

<83041x15940 sparse matrix of type '<type 'numpy.float64'>'
	with 3359482 stored elements in Compressed Sparse Row format>

In [9]:
# Transform the test data with the preprocessor fitted on the training data
# (transform only, no refit). The second name mirrors `train_original`.
test_processed, test_original = p.transform(test_df)

count vector finished transforming
dummy variables created
created a sparse matrix of all features


In [10]:
# Display the test feature matrix; its column count should match the training matrix.
test_processed

<20761x15940 sparse matrix of type '<type 'numpy.float64'>'
	with 837223 stored elements in Compressed Sparse Row format>

### ML model building

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_val_score
# Differential evolution optimizer
from scipy.optimize import differential_evolution

In [15]:
# Extra positional arguments forwarded to the objective: (feature matrix, target labels).
args = (train_processed,train_original['Priority'])
# One (min, max) pair per tuned hyperparameter, in the order func() indexes them.
# max_depth is NOT tuned: func() always passes max_depth=None.
# func() truncates each sampled float with int() before using it.
bounds = [ (5,50),(2,10),(1,10) ] # order: n_estimators,min_samples_split,min_samples_leaf

In [16]:
def func(parameters, *args):
    """Objective for ``differential_evolution``: negated 3-fold macro-F1 of a random forest.

    Parameters
    ----------
    parameters : sequence of float
        Candidate vector in the order ``n_estimators, min_samples_split,
        min_samples_leaf``. Every entry is truncated to ``int`` before use.
    *args
        ``args[0]`` is the feature matrix and ``args[1]`` the target labels;
        both are passed unchanged to ``cross_val_score``.

    Returns
    -------
    float
        ``-1 *`` the mean macro-averaged F1 over the folds. The sign is flipped
        because the optimizer minimizes.
    """
    # Materialize a list: the values are indexed and printed below, and on
    # Python 3 a bare map() is a one-shot iterator that supports neither.
    parameters = list(map(int, parameters))
    clf = RandomForestClassifier(
        n_jobs=-1,
        n_estimators=parameters[0],
        max_depth=None,
        min_samples_split=parameters[1],
        min_samples_leaf=parameters[2],
        class_weight="balanced",
        # Fixed seed so the forest is the same every time a given candidate is
        # scored; otherwise the optimizer compares candidates through noise.
        random_state=0
    )
    f1scorer_macro = make_scorer(f1_score, average='macro')
    f1 = cross_val_score(clf, args[0], args[1], scoring=f1scorer_macro, cv=3, n_jobs=-1)
    average_f1 = np.mean(f1)
    # Progress line per evaluation; formatted so it is valid on Python 2 and 3.
    print('%s %s' % (average_f1, parameters))
    return -1*average_f1

In [17]:
%time result = differential_evolution(func, bounds, args, strategy='rand2bin', popsize=20, mutation=(0.5,1.9), recombination=0.7, maxiter=3)

0.334418278537 [18, 7, 7]
0.340315834973 [5, 2, 3]
0.363937865759 [34, 7, 4]
0.337260309336 [36, 8, 8]
0.380586872998 [33, 7, 3]
0.348004345503 [25, 6, 5]
0.403365908261 [30, 6, 2]
0.344603678767 [47, 5, 7]
0.419048858025 [40, 3, 1]
0.376379487192 [25, 6, 3]
0.409730192482 [45, 4, 2]
0.291916088049 [5, 5, 9]
0.350922592249 [31, 4, 5]
0.345540199046 [10, 2, 4]
0.336786367535 [12, 4, 6]
0.336057429559 [29, 8, 8]
0.397496439435 [22, 9, 2]
0.375139917676 [21, 2, 3]
0.351490293515 [45, 3, 6]
0.409976774126 [49, 4, 2]
0.316133732602 [13, 4, 9]
0.317990056465 [8, 8, 8]
0.338910884086 [14, 3, 6]
0.361528878601 [28, 9, 4]
0.452192157912 [6, 9, 1]
0.340758955735 [26, 7, 7]
0.33046319552 [41, 3, 9]
0.382808637824 [42, 7, 3]
0.467912638813 [21, 6, 1]
0.341315228876 [15, 6, 6]
0.331299640498 [46, 9, 9]
0.32407696659 [26, 4, 9]
0.333345926183 [17, 8, 7]
0.332290909092 [43, 5, 9]
0.387778472762 [10, 3, 2]
0.387853137912 [15, 2, 1]
0.34273759296 [16, 2, 5]
0.335496145362 [8, 7, 4]
0.328531040379 [11, 

In [18]:
# Raw solution vector returned by the optimizer (still floats, not yet truncated).
result.x

array([ 32.60605168,   9.39161969,   1.89580894])

In [19]:
# Truncate the optimizer's float solution to integers, the same way func()
# does when it scores a candidate. Built as a list so it can be indexed and
# printed on Python 3, where map() alone returns a lazy iterator.
parameters = list(map(int, result.x))
print('Tuned Parameters: %s' % (parameters,))

Tuned Parameters: [32, 9, 1]


In [25]:
clf = RandomForestClassifier(
        n_jobs=-1, 
        n_estimators=parameters[0], 
        max_depth=None,
        min_samples_split=parameters[1],
        min_samples_leaf=parameters[2],
        class_weight='balanced'
    )
f1scorer_macro = make_scorer(f1_score, average='macro')
f1 = cross_val_score(clf, train_processed, train_original['Priority'], scoring=f1scorer_macro, cv=10, n_jobs=-1)
average_f1 = np.mean(f1)
print 'Cross validation metrics:',average_f1, np.std(f1)
%time clf.fit(inputDF, df['Priority'])

Cross validation metrics: 0.361053886553 0.00499499768331


NameError: name 'inputDF' is not defined

In [24]:
clf = RandomForestClassifier(n_jobs=-1)
f1scorer_macro = make_scorer(f1_score, average='macro')
f1 = cross_val_score(clf, train_processed, train_original['Priority'], scoring=f1scorer_macro, cv=10, n_jobs=-1)
average_f1 = np.mean(f1)
print 'Cross validation metrics:',average_f1, np.std(f1)
%time clf.fit(train_processed, train_original['Priority'])

KeyboardInterrupt: 