# Differential Evolution for Hyperparameter Tuning

In [1]:
import numpy as np
import pandas as pd
from data_preprocessing import DataProcessor

### Loading the data and bringing it into the right format

In [2]:
# Read the already-processed train and test CSV files into DataFrames.
train_df, test_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/train_processed.csv',
        '../data/processed/test_processed.csv',
    )
]

In [3]:
# Preprocessing object: it is fitted on the training frame and then reused to
# transform the test frame in the cells below.
p = DataProcessor()

In [4]:
# Fit the processor on the training frame and transform it in the same call.
# NOTE(review): the second return value is later indexed with ['Priority'] to
# obtain the labels, so it is presumably the cleaned source frame - confirm
# against DataProcessor.fit_transform.
train_processed, train_original = p.fit_transform(train_df)

removed nan values
count vectorizer finished fitting
count vector finished transforming
dummy variables created
created a sparse matrix of all features


In [5]:
# Show the summary repr of the transformed training features.
train_processed

<83041x15940 sparse matrix of type '<type 'numpy.float64'>'
	with 3359482 stored elements in Compressed Sparse Row format>

In [6]:
# Transform the test frame with the same processor `p` that was fitted on the
# training data above.
test_processed = p.transform(test_df)

count vector finished transforming
dummy variables created
created a sparse matrix of all features


In [7]:
# Show the summary repr of the transformed test features.
test_processed

<20761x15940 sparse matrix of type '<type 'numpy.float64'>'
	with 837223 stored elements in Compressed Sparse Row format>

### ML model building

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_val_score
# Differential evolution optimizer
from scipy.optimize import differential_evolution

In [17]:
# Extra positional arguments handed to `func` on every evaluation:
# (feature matrix, target labels).
args = (train_processed,train_original['Priority'])
# One (min, max) pair per tuned hyperparameter. `func` reads them by position and
# keeps max_depth fixed at None, so max_depth has no entry here.
bounds = [ (5,100),(5,20),(5,25) ] # order: n_estimators, min_samples_split, min_samples_leaf

In [18]:
def func(parameters, *args):
    """Objective for ``differential_evolution``: negated mean macro-F1 of a random forest.

    Parameters
    ----------
    parameters : sequence of float
        Candidate vector proposed by the optimizer, in the order
        ``[n_estimators, min_samples_split, min_samples_leaf]``. Every entry is
        truncated to ``int`` before it is used.
    *args
        ``args[0]`` is the feature matrix and ``args[1]`` the target labels; both
        are passed unchanged to ``cross_val_score``.

    Returns
    -------
    float
        ``-1 * mean(F1)`` over 3 cross-validation folds, so that minimising this
        value maximises the macro-averaged F1 score.
    """
    # Build a real list: a bare ``map`` object cannot be indexed on Python 3.
    parameters = [int(value) for value in parameters]
    features, labels = args[0], args[1]

    clf = RandomForestClassifier(
        n_jobs=-1,
        n_estimators=parameters[0],
        max_depth=None,
        min_samples_split=parameters[1],
        min_samples_leaf=parameters[2],
        class_weight="balanced",
        # Fixed seed: the optimizer ranks candidates by this score, so the
        # forest's own randomness should not vary between evaluations.
        random_state=0,
    )
    f1scorer_macro = make_scorer(f1_score, average='macro')
    f1 = cross_val_score(clf, features, labels, scoring=f1scorer_macro, cv=3, n_jobs=-1)
    average_f1 = np.mean(f1)
    # Progress trace, one line per evaluation; this form prints the same text on
    # Python 2 and Python 3.
    print('%s %s' % (average_f1, parameters))
    return -1*average_f1

In [None]:
%time result = differential_evolution(func, bounds, args, strategy='rand2bin', popsize=15, mutation=(0.5,1.9), recombination=0.7, maxiter=2)

0.343635463661 [13, 5, 5]
0.32037007655 [19, 9, 13]


In [38]:
# Best parameter vector returned by the optimizer (raw floats; they are
# truncated to integers in the next cell before being used).
result.x

array([ 18.90506696,  12.07981089,   4.43601826,   1.16490721])

In [39]:
# Truncate the optimizer's float solution to integers, the same conversion
# `func` applies to every candidate. A list is built explicitly because a bare
# `map` object cannot be indexed on Python 3.
parameters = [int(value) for value in result.x]
# Prints the same text on Python 2 and Python 3.
print('Tuned Parameters: %s' % (parameters,))

Tuned Parameters: [18, 12, 4, 1]


In [40]:
clf = RandomForestClassifier(
        n_jobs=-1, 
        n_estimators=parameters[0], 
        max_depth=parameters[1],
        min_samples_split=parameters[2],
        min_samples_leaf=parameters[3],
        class_weight='balanced'
    )
f1scorer_macro = make_scorer(f1_score, average='macro')
f1 = cross_val_score(clf, inputDF, df['Priority'], scoring=f1scorer_macro, cv=10, n_jobs=-1)
average_f1 = np.mean(f1)
print 'Cross validation metrics:',average_f1, np.std(f1)
%time clf.fit(inputDF, df['Priority'])

Cross validation metrics: 0.312145369846 0.0127815664628
CPU times: user 1.6 s, sys: 76.9 ms, total: 1.67 s
Wall time: 737 ms


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=12, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=4,
            min_weight_fraction_leaf=0.0, n_estimators=18, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [41]:
clf = RandomForestClassifier(n_jobs=-1)
f1scorer_macro = make_scorer(f1_score, average='macro')
f1 = cross_val_score(clf, inputDF, df['Priority'], scoring=f1scorer_macro, cv=10, n_jobs=-1)
average_f1 = np.mean(f1)
print 'Cross validation metrics:',average_f1, np.std(f1)
%time clf.fit(inputDF, df['Priority'])

Cross validation metrics: 0.412665846083 0.0134582934844
CPU times: user 34.3 s, sys: 414 ms, total: 34.7 s
Wall time: 11.9 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)