In [1]:
import numpy as np
import pandas as pd

In [2]:
data1 = pd.read_csv("glass.csv")
data1
X1 = data1.iloc[:, :9]
y1 = data1.iloc[:, 9]

In [3]:
data2 = pd.read_csv("riceClassification.csv")
data2
X2 = data2.iloc[:, 1:11]
y2 = data2.iloc[:, 11]

### Chromosomes 

We need to create the genetic individuals (chromosome) with these hyper parameters. Let's see how we can create them ...

Remember we need to define the Individual type with creator. Then we need to fill it up with the appropriate function in IndividualCreator. With the IndividualCreator then we fill up the population.

In [4]:
from deap import base
from deap import creator
from deap import tools
import random

creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)


# define a single objective, maximizing fitness strategy:
# create the Individual class based on list:


### Creating the individual

In order to fill up the individual chromosome, the DEAP toolbox provides 3 choices:

- initRepeat
- initIterate
- initCycle

`initRepeat` is for filling up the Individual with a single function call in a repeated loop.

`initIterate` is for filling up the Individual with a Generator that remembers which previous values have been called.

`initCycle` is for filling up the Individual with multiple functions that can be called repeatedly (if n is greater than 1).

In [5]:
toolbox = base.Toolbox()
toolbox.register("h1", random.uniform,1,100)
toolbox.register("h2", random.uniform,1,10)
toolbox.register("h3", random.uniform,0,1)
toolbox.register("IndividualCreator",tools.initCycle,creator.Individual,
                 (toolbox.h1,toolbox.h2,toolbox.h3),n=1)

# fill in the with random floats for each type of hyper parameter

In [6]:
# Test IndividualCreator
for i in range(10):
    print(toolbox.IndividualCreator())

[37.02545539539016, 9.046544468999498, 0.8689302277201331]
[75.22107954726013, 2.1489550285176975, 0.10961637390943546]
[48.10864187205121, 4.019294081923508, 0.6409282190789272]
[73.53676585773505, 4.858646145863747, 0.43156932615668064]
[95.4691672280852, 8.20922251906261, 0.23799431877549093]
[78.18287622647787, 4.9663605694206545, 0.32899067499399537]
[66.42844797350672, 1.289796237245516, 0.7934422384880807]
[83.58110872856672, 7.729786571619465, 0.6232787750707485]
[58.257093503916224, 1.177759655767333, 0.5335846300219832]
[38.13037018425884, 4.803456977199684, 0.5567419018930078]


### Individual Chromosome to Hyper Parameter

We need to convert back the Individual chromosome back to hyper parameter values.


In [7]:
def getParams(individual):
    n_est = round(individual[0])
    depth = round(individual[1])
    criterion = ['gini', 'entropy'][round(individual[2])] 
    return n_est, depth, criterion


 #for tree classifier

In [8]:
getParams([94.81418290008662, 0.629829049481967, 0.56079237793822816])

(95, 1, 'entropy')

## Fitness Evaluation

We use the accuracy of the machine learning algorithm - AdaBoostClassifier - for evaluation of the hyper parameters. 

In [9]:
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier

kfold = model_selection.KFold(n_splits=10, random_state=42, shuffle=True)

def getAccuracy(individual):
    n_estimators, max_depth, criterion = getParams(individual)
    classifier = RandomForestClassifier(random_state=69,
                                         n_estimators=n_estimators,
                                         max_depth=max_depth,
                                         criterion=criterion
                                         )

    cv_results = model_selection.cross_val_score(classifier,
                                                 X1,
                                                 y1,
                                                 cv=kfold,
                                                 scoring='accuracy')
    return cv_results.mean()

### Population and Evaluation Fitness

We register PopulationCreator and Evaluate with toolboox.

In [10]:
# create the population operator to generate a list of individuals:

toolbox.register("populationCreator", tools.initRepeat, list, 
                 toolbox.IndividualCreator)

def classificationAccuracy(individual):
    return getAccuracy(individual),
toolbox.register("evaluate", classificationAccuracy)

# fitness calculation

In [22]:
# genetic operators:

toolbox.register("mate", tools.cxUniform, indpb = 0.7)
toolbox.register("mutate", tools.mutGaussian, mu=[6,1,0.2], sigma=[3,1,0.05], indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=3)


## The main algorithm

In [23]:
from deap import algorithms

# Genetic Algorithm constants:
POPULATION_SIZE = 40
P_CROSSOVER = 0.8  # probability for crossover
P_MUTATION = 0.3  # probability for mutating an individual
MAX_GENERATIONS = 10
HALL_OF_FAME_SIZE = 5

# create initial population (generation 0):
population = toolbox.populationCreator(n=POPULATION_SIZE)

# prepare the statistics object:
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("max", np.max)
stats.register("avg", np.mean)

# define the hall-of-fame object:
hof = tools.HallOfFame(HALL_OF_FAME_SIZE)

# perform the Genetic Algorithm flow with hof feature added:
population, logbook = algorithms.eaSimple(population,
                                          toolbox,
                                          cxpb=P_CROSSOVER,
                                          mutpb=P_MUTATION,
                                          ngen=MAX_GENERATIONS,
                                          stats=stats,
                                          halloffame=hof,
                                          verbose=True)

# print best solution found:
print("- Best solution is: ")
print("params = ", hof.items[0])
print("Accuracy = %1.5f" % hof.items[0].fitness.values[0])

gen	nevals	max    	avg     
0  	40    	0.81342	0.734345
1  	37    	0.81342	0.769535
2  	37    	0.813853	0.797251
3  	39    	0.813853	0.804978
4  	39    	0.813853	0.810135
5  	35    	0.813853	0.811521
6  	32    	0.818615	0.812197
7  	40    	0.818615	0.813847
8  	35    	0.818615	0.811759
9  	30    	0.818615	0.813431
10 	38    	0.818615	0.814719
- Best solution is: 
params =  [40.030369342232575, 7.943308126486621, 0.49409809599806287]
Accuracy = 0.81861


In [17]:
print(getAccuracy([38, 8, 0]))

0.8186147186147187
