# Differential Evolution for Hyperparameter Tuning

In [1]:
import numpy as np
import pandas as pd
from data_preprocessing import DataProcessor

### Loading data and bringing it into the right format

In [2]:
# Read the train and test CSVs (files named *_processed.csv) from a path
# relative to the notebook's working directory.
train_df = pd.read_csv('../data/processed/train_processed.csv')
test_df = pd.read_csv('../data/processed/test_processed.csv')

In [3]:
# One DataProcessor instance: fit_transform is called on the training frame
# and transform on the test frame further down.
p = DataProcessor()

In [4]:
# Fit the processor on the training frame and transform it in one call.
# Two values come back: the feature matrix used for modelling, and a second
# frame whose 'Priority' column is used as the target later in the notebook.
train_processed, train_original = p.fit_transform(train_df)

removed nan values
count vectorizer finished fitting
count vector finished transforming
dummy variables created
created a sparse matrix of all features


In [5]:
# Display the repr of the training feature matrix (shape / storage format).
train_processed

<83041x15940 sparse matrix of type '<type 'numpy.float64'>'
	with 3359482 stored elements in Compressed Sparse Row format>

In [6]:
# Transform the test frame with the same processor instance used for training.
# NOTE(review): presumably this reuses the fitted vocabulary without re-fitting
# (the log below shows no "finished fitting" line) - confirm in DataProcessor.
test_processed = p.transform(test_df)

count vector finished transforming
dummy variables created
created a sparse matrix of all features


In [7]:
# Display the repr of the test feature matrix; its column count should match
# the training matrix shown earlier.
test_processed

<20761x15940 sparse matrix of type '<type 'numpy.float64'>'
	with 837223 stored elements in Compressed Sparse Row format>

### ML model building

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_val_score
# Differential evolution optimizer
from scipy.optimize import differential_evolution

In [17]:
# Extra positional arguments handed to the objective on every call:
# (feature matrix, target labels).
args = (train_processed,train_original['Priority'])
# One (min, max) pair per tuned hyperparameter, in the order the objective
# reads them. max_depth is not tuned: the objective fixes it to None.
bounds = [ (5,100),(5,20),(5,25) ] # order: n_estimators,min_samples_split,min_samples_leaf

In [18]:
def func(parameters, *args):
    """Objective for the optimizer: negative macro-F1 of a random forest.

    Parameters
    ----------
    parameters : sequence of float
        Candidate vector in the order (n_estimators, min_samples_split,
        min_samples_leaf). Each entry is truncated to an int, since the
        optimizer proposes real-valued candidates.
    *args
        ``args[0]`` is the feature matrix and ``args[1]`` the target labels.

    Returns
    -------
    float
        Mean 3-fold cross-validated macro-F1, negated so that minimising
        this function maximises F1.
    """
    # Build a real list: on Python 3 ``map`` returns an iterator, which
    # cannot be indexed below.
    parameters = [int(value) for value in parameters]
    features, target = args[0], args[1]
    clf = RandomForestClassifier(
        n_jobs=-1,
        n_estimators=parameters[0],
        max_depth=None,
        min_samples_split=parameters[1],
        min_samples_leaf=parameters[2],
        class_weight="balanced"
    )
    f1scorer_macro = make_scorer(f1_score, average='macro')
    f1 = cross_val_score(clf, features, target, scoring=f1scorer_macro, cv=3, n_jobs=-1)
    average_f1 = np.mean(f1)
    # Single-argument print call: same output on Python 2 and Python 3.
    print('%s %s' % (average_f1, parameters))
    return -1*average_f1

In [19]:
%time result = differential_evolution(func, bounds, args, strategy='rand2bin', popsize=15, mutation=(0.5,1.9), recombination=0.7, maxiter=2)

0.343635463661 [13, 5, 5]
0.32037007655 [19, 9, 13]
0.315756533928 [98, 11, 19]
0.330654260912 [45, 11, 10]
0.301185085018 [8, 12, 12]
0.317887453694 [65, 11, 17]
0.321983892985 [38, 6, 13]
0.316816652694 [83, 9, 18]
0.343370873513 [88, 16, 7]
0.282815077261 [5, 17, 20]
0.351471032253 [25, 9, 5]
0.310337904842 [57, 5, 24]
0.326205175916 [44, 10, 12]
0.312781011969 [96, 18, 21]
0.32910939736 [14, 16, 8]
0.31416542802 [60, 17, 18]
0.300919561841 [21, 12, 23]
0.313273033782 [54, 7, 17]
0.306841267446 [30, 15, 20]
0.339114753529 [66, 18, 9]
0.315148839827 [90, 13, 19]
0.292271850335 [9, 6, 16]
0.311069967404 [74, 17, 23]
0.326929731088 [27, 15, 10]
0.319435247495 [79, 7, 16]
0.339172419565 [72, 10, 8]
0.315343653394 [57, 8, 15]
0.304910456788 [21, 6, 21]
0.326822703137 [74, 19, 11]
0.320620221595 [93, 15, 15]
0.345006735692 [95, 19, 7]
0.338545315818 [53, 14, 9]
0.329479632971 [17, 13, 7]
0.323642760641 [81, 12, 14]
0.309952773909 [48, 8, 19]
0.306409662161 [41, 10, 24]
0.304397161628 [37,

In [20]:
# Best parameter vector found by the optimizer (real-valued, before int truncation).
result.x

array([ 95.20061021,  19.34546557,   5.87881766])

In [21]:
# Truncate the best vector to ints, exactly as the objective does.
# A list (not a bare `map`) is required because later cells index `parameters`,
# which fails on a Python 3 map iterator.
parameters = [int(value) for value in result.x]
# Single-argument print call: same output on Python 2 and Python 3.
print('Tuned Parameters: %s' % (parameters,))

Tuned Parameters: [95, 19, 5]


In [25]:
clf = RandomForestClassifier(
        n_jobs=-1, 
        n_estimators=parameters[0], 
        max_depth=None,
        min_samples_split=parameters[1],
        min_samples_leaf=parameters[2],
        class_weight='balanced'
    )
f1scorer_macro = make_scorer(f1_score, average='macro')
f1 = cross_val_score(clf, train_processed, train_original['Priority'], scoring=f1scorer_macro, cv=10, n_jobs=-1)
average_f1 = np.mean(f1)
print 'Cross validation metrics:',average_f1, np.std(f1)
%time clf.fit(inputDF, df['Priority'])

Cross validation metrics: 0.361053886553 0.00499499768331


NameError: name 'inputDF' is not defined

In [24]:
clf = RandomForestClassifier(n_jobs=-1)
f1scorer_macro = make_scorer(f1_score, average='macro')
f1 = cross_val_score(clf, train_processed, train_original['Priority'], scoring=f1scorer_macro, cv=10, n_jobs=-1)
average_f1 = np.mean(f1)
print 'Cross validation metrics:',average_f1, np.std(f1)
%time clf.fit(train_processed, train_original['Priority'])

KeyboardInterrupt: 