# Differential Evolution for Hyperparameter Tuning

In [1]:
import numpy as np
import pandas as pd
from data_preprocessing import DataProcessor

### Loading the data and bringing it into the right format

In [8]:
# Read the processed train and test splits from their CSV files.
train_df, test_df = map(pd.read_csv, (
    '../data/processed/train_processed.csv',
    '../data/processed/test_processed.csv',
))

In [9]:
# Create the project's DataProcessor; the same instance is fitted on the
# training frame and then reused to transform the test frame below.
p = DataProcessor()

In [3]:
# Fit the processor on the training frame and transform it in the same call.
train_processed = p.fit_transform(train_df)

removed nan values
count vectorizer finished fitting
count vector finished transforming
dummy variables created
created a sparse matrix of all features


In [4]:
# Show the repr of the transformed training data (type, shape, stored elements).
train_processed

<83041x15940 sparse matrix of type '<type 'numpy.float64'>'
	with 3359482 stored elements in Compressed Sparse Row format>

In [6]:
# Transform the test frame with the processor fitted above (transform only,
# no refit).
test_processed = p.transform(test_df)

count vector finished transforming
dummy variables created
created a sparse matrix of all features


In [7]:
# Show the repr of the transformed test data; its column count should match
# the training matrix.
test_processed

<20761x15940 sparse matrix of type '<type 'numpy.float64'>'
	with 837223 stored elements in Compressed Sparse Row format>

In [2]:
# The import cell only binds `DataProcessor`, not the module name, so
# `data_preprocessing.preprocess_data` would raise NameError on a fresh kernel.
# Import the module here so this cell runs on its own.
import data_preprocessing

# inputDF: feature matrix used for modelling; df: the accompanying DataFrame
# (its 'Priority' column is used as the target further down).
inputDF, df = data_preprocessing.preprocess_data()

reading file...
Tokenizing Summary
Combining...
Preprocessing done!


In [3]:
# Displays a 2-tuple: the repr of inputDF (not just its shape) followed by
# the shape of df.
inputDF, df.shape

(<83041x15940 sparse matrix of type '<type 'numpy.float64'>'
 	with 3359482 stored elements in Compressed Sparse Row format>, (83041, 40))

### ML model building

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_val_score
# Differential evolution optimizer
from scipy.optimize import differential_evolution

In [18]:
# Extra arguments the optimizer forwards to the objective: features, then labels.
args = (inputDF, df['Priority'])

# Search range per hyperparameter, in the order the objective reads them.
bounds = [
    (5, 20),  # n_estimators
    (5, 15),  # max_depth
    (2, 10),  # min_samples_split
    (1, 8),   # min_samples_leaf
]

In [36]:
def func(parameters, *args):
    """Objective for ``differential_evolution``: negated mean macro-F1 of a random forest.

    Parameters
    ----------
    parameters : sequence of 4 values
        Candidate hyperparameters in the order ``n_estimators``, ``max_depth``,
        ``min_samples_split``, ``min_samples_leaf``. The optimizer supplies
        floats; each one is truncated with ``int``. ``None`` entries are passed
        through unchanged (e.g. ``max_depth=None``).
    *args
        The feature matrix followed by the target labels. They may be given as
        two positional arguments (how ``differential_evolution`` forwards its
        ``args``) or packed into a single tuple/list, as in ``func(x, args)``.

    Returns
    -------
    float
        ``-1 *`` the mean of the 3-fold cross-validated macro-averaged F1
        scores. The sign is flipped so that minimising this value maximises F1.
    """
    # Accept func(x, (X, y)) as well as func(x, X, y).
    if len(args) == 1 and isinstance(args[0], (tuple, list)):
        args = tuple(args[0])
    features, labels = args[0], args[1]

    # Build a real list (a lazy ``map`` object cannot be indexed on Python 3)
    # and leave None alone, since int(None) raises TypeError.
    parameters = [None if value is None else int(value) for value in parameters]

    clf = RandomForestClassifier(
        n_jobs=-1,
        n_estimators=parameters[0],
        max_depth=parameters[1],
        min_samples_split=parameters[2],
        min_samples_leaf=parameters[3],
        class_weight="balanced"
    )
    f1scorer_macro = make_scorer(f1_score, average='macro')

    # Single pre-formatted strings print identically on Python 2 and 3.
    print('%s %s' % (features.shape, labels.shape))
    f1 = cross_val_score(clf, features, labels, scoring=f1scorer_macro, cv=3, n_jobs=-1)
    average_f1 = np.mean(f1)
    print('%s %s' % (average_f1, parameters))
    return -1*average_f1

In [31]:
# One-off evaluation of the objective with n_estimators=10, max_depth=None,
# min_samples_split=2, min_samples_leaf=1.
# NOTE(review): `args` is passed here as one tuple (not unpacked) and
# max_depth is None. The output recorded below carries execution count 31,
# earlier than the In [36] definition of func, so it may come from an older
# version of func -- confirm on a fresh kernel.
func([10, None, 2, 1], args)

(83041, 15940) (83041,)
0.386329938755 [10, None, 2, 1]


-0.38632993875491289

In [37]:
# Minimise func over `bounds` and time the run; `args` is forwarded to func
# after the candidate vector. maxiter=1 keeps this to a very short search.
# NOTE(review): no seed is passed and the forest has no random_state, so
# repeated runs will not reproduce the scores recorded below.
%time result = differential_evolution(func, bounds, args, strategy='rand1bin', popsize=10, mutation=(0.5,1.9), recombination=0.7, maxiter=1)

(83041, 15940) (83041,)
0.267430939196 [6, 10, 7, 3]
(83041, 15940) (83041,)
0.289290685853 [16, 9, 8, 2]
(83041, 15940) (83041,)
0.277088052572 [6, 14, 5, 4]
(83041, 15940) (83041,)
0.313735783334 [17, 12, 5, 1]
(83041, 15940) (83041,)
0.270045508412 [12, 8, 2, 4]
(83041, 15940) (83041,)
0.285646425806 [15, 6, 3, 2]
(83041, 15940) (83041,)
0.300864597744 [17, 11, 8, 7]
(83041, 15940) (83041,)
0.259415537582 [7, 9, 3, 4]
(83041, 15940) (83041,)
0.270013240147 [9, 9, 9, 6]
(83041, 15940) (83041,)
0.28728099567 [8, 14, 2, 1]
(83041, 15940) (83041,)
0.263325216102 [5, 12, 3, 3]
(83041, 15940) (83041,)
0.253492850529 [5, 8, 5, 2]
(83041, 15940) (83041,)
0.29570906617 [14, 11, 6, 7]
(83041, 15940) (83041,)
0.307205839237 [19, 13, 5, 4]
(83041, 15940) (83041,)
0.304406516303 [13, 13, 2, 5]
(83041, 15940) (83041,)
0.272904588285 [14, 6, 4, 6]
(83041, 15940) (83041,)
0.295426910825 [11, 13, 3, 1]
(83041, 15940) (83041,)
0.240198257083 [7, 7, 6, 7]
(83041, 15940) (83041,)
0.264759244958 [8, 10,

In [38]:
# Best parameter vector found by the optimizer (still floats at this point).
result.x

array([ 18.90506696,  12.07981089,   4.43601826,   1.16490721])

In [39]:
# Truncate with int, exactly as func does, so the reported values are the ones
# the objective actually evaluated. list(...) makes the result indexable on
# Python 3, where map() is lazy.
parameters = list(map(int, result.x))
# A single pre-formatted string prints identically on Python 2 and 3.
print('Tuned Parameters: %s' % (parameters,))

Tuned Parameters: [18, 12, 4, 1]


In [40]:
clf = RandomForestClassifier(
        n_jobs=-1, 
        n_estimators=parameters[0], 
        max_depth=parameters[1],
        min_samples_split=parameters[2],
        min_samples_leaf=parameters[3],
        class_weight='balanced'
    )
f1scorer_macro = make_scorer(f1_score, average='macro')
f1 = cross_val_score(clf, inputDF, df['Priority'], scoring=f1scorer_macro, cv=10, n_jobs=-1)
average_f1 = np.mean(f1)
print 'Cross validation metrics:',average_f1, np.std(f1)
%time clf.fit(inputDF, df['Priority'])

Cross validation metrics: 0.312145369846 0.0127815664628
CPU times: user 1.6 s, sys: 76.9 ms, total: 1.67 s
Wall time: 737 ms


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=12, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=4,
            min_weight_fraction_leaf=0.0, n_estimators=18, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [41]:
clf = RandomForestClassifier(n_jobs=-1)
f1scorer_macro = make_scorer(f1_score, average='macro')
f1 = cross_val_score(clf, inputDF, df['Priority'], scoring=f1scorer_macro, cv=10, n_jobs=-1)
average_f1 = np.mean(f1)
print 'Cross validation metrics:',average_f1, np.std(f1)
%time clf.fit(inputDF, df['Priority'])

Cross validation metrics: 0.412665846083 0.0134582934844
CPU times: user 34.3 s, sys: 414 ms, total: 34.7 s
Wall time: 11.9 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)