In [None]:
import pandas as pd
import numpy as np

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split

In [None]:
# Read perth_clean.csv and one-hot encode the 'suburb' column in one step.
perth = pd.get_dummies(pd.read_csv('perth_clean.csv'), columns=['suburb'])

# Hold out 20% of the row labels as the test set; random_state pins the split.
train_indices, test_indices = train_test_split(
    perth.index,
    test_size=0.2,
    random_state=0,
)

# Independent copies, so later edits to one frame never touch `perth`.
perth_train, perth_test = (
    perth.loc[row_labels].copy() for row_labels in (train_indices, test_indices)
)

In [None]:
# Seed NumPy's global generator before any random draw. The shuffle below and
# every later np.random call in this notebook (initial population, crossover,
# mutation) then give the same values on a fresh Restart & Run All.
np.random.seed(5)

# Shuffle the training row labels and keep the first 20% as a smaller working set.
indices = list(perth_train.index.copy())
np.random.shuffle(indices)

sampled_indices = indices[:int(len(indices) * 0.2)]
sampled_perth = perth_train.loc[sampled_indices, :].copy()

print("Sampled set length", len(sampled_indices))

Sampled set length 5384


In [None]:
# Split the subsample 80/20 by row position (not by label); random_state pins the split.
n = len(sampled_perth)
sampled_train_indices, sampled_valid_indices = train_test_split(
    np.arange(n),
    test_size=0.2,
    random_state=0,
)

# Features are every column except the target 'log10_price'; rows are picked by position.
sampled_x_train = sampled_perth.drop(columns='log10_price').iloc[sampled_train_indices]
sampled_y_train = sampled_perth['log10_price'].iloc[sampled_train_indices].copy()

sampled_x_valid = sampled_perth.drop(columns='log10_price').iloc[sampled_valid_indices]
sampled_y_valid = sampled_perth['log10_price'].iloc[sampled_valid_indices].copy()

In [None]:
# Bundle features and target as (x, y) pairs -- the form score_model unpacks.
train_data, valid_data = (
    (sampled_x_train, sampled_y_train),
    (sampled_x_valid, sampled_y_valid),
)

In [None]:
# Report how many rows ended up in each part of the subsample.
print("Training length of sampled data", sampled_x_train.shape[0])
print("Validiation length of sampled data", sampled_x_valid.shape[0])

Training length of sampled data 4307
Validiation length of sampled data 1077


In [None]:
def score_population(population, train_data, valid_data):
    """Score every individual in the population.

    Each individual is a dict of keyword arguments for RandomForestRegressor.
    A forest is built from those arguments (with ``random_state=0``) and scored
    through ``score_model``. The forest is seeded and the data is the same for
    the whole call, so two individuals with identical parameters always get the
    same score. Such duplicates are therefore fitted once per call and reuse the
    cached score instead of refitting.

    Parameters
    ----------
    population : sequence of dict
        Individuals to score; must support ``len``.
    train_data, valid_data : tuple
        ``(x, y)`` pairs passed unchanged to ``score_model``.

    Returns
    -------
    list
        One score per individual, in the same order as ``population``.
    """
    population_scores = []
    scores_by_dna = {}  # per-call memo: hashable view of the parameters -> score
    n = len(population)
    for i, person in enumerate(population):
        # Keys are unique strings, so sorting the items never compares the values.
        dna = tuple(sorted(person.items()))
        if dna not in scores_by_dna:
            base_model = RandomForestRegressor(criterion='squared_error', random_state=0, **person)
            scores_by_dna[dna] = score_model(base_model, train_data, valid_data)
        score = scores_by_dna[dna]
        population_scores.append(score)

        print(f"{i + 1} of {n}: DNA {person} has score {score}")

    return population_scores

def score_model(model, train_data, valid_data):
    """Fit ``model`` on the training pair and return its validation RMSE.

    Parameters
    ----------
    model : object with ``fit(x, y)`` and ``predict(x)``
    train_data, valid_data : tuple
        ``(x, y)`` pairs.

    Returns
    -------
    The square root of the mean squared difference between the validation
    targets and the model's predictions.
    """
    features_train, target_train = train_data
    features_valid, target_valid = valid_data

    model.fit(features_train, target_train)

    residuals = target_valid - model.predict(features_valid)
    return np.sqrt(np.mean(residuals ** 2))

In [None]:
def make_children_from_population(population, population_scores, n_children, mutation=0.5):
    """Breed ``n_children`` new individuals from ``population``.

    For each child, two distinct parents are drawn uniformly at random
    (without replacement), combined by ``generate_child`` and then passed
    through ``mutate_child`` with the given ``mutation`` rate.

    ``population_scores`` is accepted but not used: parent selection does not
    depend on the scores.

    Returns
    -------
    list of dict
        The newly created children.
    """
    def _breed_one():
        mother, father = np.random.choice(population, 2, replace=False)
        return mutate_child(generate_child(mother, father), mutation=mutation)

    return [_breed_one() for _ in range(n_children)]

def generate_child(person1, person2, p=0.5):
    """Build a child by picking each gene from one of the two parents.

    For every key of ``person1`` (in its iteration order) one uniform random
    number is drawn; the child takes ``person1``'s value when the draw is
    below ``p`` and ``person2``'s value otherwise.

    Returns
    -------
    dict
        A new dict with the same keys as ``person1``.
    """
    return {
        gene: (person1[gene] if np.random.rand() < p else person2[gene])
        for gene in person1
    }

def mutate_child(child, mutation=0.5):
    """Return a copy of ``child`` with some genes randomly resampled.

    For each key one uniform random number is drawn; when it is below
    ``mutation`` the value is replaced by a random choice from
    ``parameter_grid[key]``. ``parameter_grid`` is a module-level dict that
    must be defined before this function is called. The input dict is left
    unmodified.
    """
    mutated = dict(child)
    for gene in child:
        if np.random.rand() < mutation:
            mutated[gene] = np.random.choice(parameter_grid[gene])

    return mutated

In [None]:
def generate_next_population(current_population, train_data, valid_data, n_children=10, mutation=0.5):
    """Score the current generation and breed the next one.

    Steps:
      1. Score every individual via ``score_population``.
      2. Sort ascending by score (lower is better) and keep the first half as parents.
      3. Breed ``n_children - 1`` children from those parents.
      4. Append the best individual unchanged (elitism), so the result has
         ``n_children`` members.

    Prints progress and the best score/DNA of the current generation.

    Returns
    -------
    list of dict
        The next generation.
    """
    print("Scoring Population")
    scores = score_population(current_population, train_data, valid_data)
    ranked_people, ranked_scores = sort_population_by_scores(current_population, scores)

    # Only the better-scoring half is allowed to reproduce.
    half = len(ranked_people) // 2
    offspring = make_children_from_population(
        ranked_people[:half],
        ranked_scores[:half],
        n_children - 1,
        mutation=mutation,
    )

    champion = ranked_people[0]
    offspring.append(champion)  # Elitism

    print(f"Current Population Best Score {ranked_scores[0]}, with DNA {champion}")
    return offspring

def sort_population_by_scores(population, population_scores):
    """Order the population from lowest (best) to highest score.

    The sort uses the score alone as its key. Sorting the raw
    ``(score, person)`` tuples would fall back to comparing the ``person``
    dicts whenever two different individuals tie on score, and dicts do not
    support ``<`` (``TypeError``). The sort is stable, so tied individuals
    keep their original relative order.

    Parameters
    ----------
    population : iterable of dict
    population_scores : iterable
        Scores aligned with ``population``.

    Returns
    -------
    (list, list)
        ``(sorted_population, sorted_scores)``, both in ascending score order.
    """
    ranked = sorted(zip(population_scores, population), key=lambda pair: pair[0])

    sorted_population = [person for _, person in ranked]
    sorted_scores = [score for score, _ in ranked]

    return sorted_population, sorted_scores

In [None]:
# Candidate values for each hyperparameter ("gene") the search may choose from.
# max_features steps by 5 from 1 up to (but excluding) the column count minus one.
parameter_grid = dict(
    min_samples_leaf=np.arange(1, 101),
    max_features=np.arange(1, perth_train.shape[1] - 1, 5),
)

In [None]:
# 20 individuals, each with one random value per hyperparameter drawn from parameter_grid.
# zip consumes the lazy map in key order, so the draws happen one gene at a time.
initial_population = [
    dict(zip(parameter_grid, map(np.random.choice, parameter_grid.values())))
    for _ in range(20)
]

Here I intentionally started off with an initial population where `min_samples_leaf` is all very high. But, based on what we know of random forests, we should expect that we want `min_samples_leaf` to be small. So let's see if genetic algorithms can actually find this.

In [None]:
# Generation 1: score the initial population and breed 20 successors (high mutation).
generation1 = generate_next_population(
    current_population=initial_population,
    train_data=train_data,
    valid_data=valid_data,
    n_children=20,
    mutation=0.5,
)

Scoring Population
1 of 20: DNA {'min_samples_leaf': 21, 'max_features': 151} has score 0.11538892000627582
2 of 20: DNA {'min_samples_leaf': 50, 'max_features': 246} has score 0.12891387539840599
3 of 20: DNA {'min_samples_leaf': 60, 'max_features': 276} has score 0.13272264547129287
4 of 20: DNA {'min_samples_leaf': 13, 'max_features': 271} has score 0.10981016146979641
5 of 20: DNA {'min_samples_leaf': 15, 'max_features': 116} has score 0.11360288593478063
6 of 20: DNA {'min_samples_leaf': 90, 'max_features': 121} has score 0.13804981290326476
7 of 20: DNA {'min_samples_leaf': 59, 'max_features': 236} has score 0.13185571519911496
8 of 20: DNA {'min_samples_leaf': 70, 'max_features': 131} has score 0.1349720843575971
9 of 20: DNA {'min_samples_leaf': 24, 'max_features': 196} has score 0.11696525912713024
10 of 20: DNA {'min_samples_leaf': 11, 'max_features': 116} has score 0.10999899715307844
11 of 20: DNA {'min_samples_leaf': 90, 'max_features': 131} has score 0.1383872030694755
12

In [None]:
# Generation 2: score generation 1 and breed 20 successors (high mutation).
generation2 = generate_next_population(
    current_population=generation1,
    train_data=train_data,
    valid_data=valid_data,
    n_children=20,
    mutation=0.5,
)

Scoring Population
1 of 20: DNA {'min_samples_leaf': 37, 'max_features': 211} has score 0.12397019457926041
2 of 20: DNA {'min_samples_leaf': 49, 'max_features': 266} has score 0.13024105426353075
3 of 20: DNA {'min_samples_leaf': 4, 'max_features': 266} has score 0.1034871133408857
4 of 20: DNA {'min_samples_leaf': 21, 'max_features': 151} has score 0.11538892000627582
5 of 20: DNA {'min_samples_leaf': 50, 'max_features': 246} has score 0.12891387539840599
6 of 20: DNA {'min_samples_leaf': 95, 'max_features': 246} has score 0.1397793346952394
7 of 20: DNA {'min_samples_leaf': 16, 'max_features': 196} has score 0.11210459663541276
8 of 20: DNA {'min_samples_leaf': 72, 'max_features': 116} has score 0.13543117903324753
9 of 20: DNA {'min_samples_leaf': 69, 'max_features': 31} has score 0.18036857577079077
10 of 20: DNA {'min_samples_leaf': 37, 'max_features': 266} has score 0.1241478037430166
11 of 20: DNA {'min_samples_leaf': 49, 'max_features': 26} has score 0.18187221737660067
12 of 

In [None]:
# Generation 3: score generation 2 and breed 20 successors (high mutation).
generation3 = generate_next_population(
    current_population=generation2,
    train_data=train_data,
    valid_data=valid_data,
    n_children=20,
    mutation=0.5,
)

Scoring Population
1 of 20: DNA {'min_samples_leaf': 58, 'max_features': 151} has score 0.13059337464275864
2 of 20: DNA {'min_samples_leaf': 26, 'max_features': 101} has score 0.12117516382749478
3 of 20: DNA {'min_samples_leaf': 4, 'max_features': 271} has score 0.10395497080216104
4 of 20: DNA {'min_samples_leaf': 35, 'max_features': 176} has score 0.1219392934481044
5 of 20: DNA {'min_samples_leaf': 86, 'max_features': 266} has score 0.13860797602538297
6 of 20: DNA {'min_samples_leaf': 21, 'max_features': 151} has score 0.11538892000627582
7 of 20: DNA {'min_samples_leaf': 77, 'max_features': 51} has score 0.15297115205966091
8 of 20: DNA {'min_samples_leaf': 26, 'max_features': 181} has score 0.11845698748859514
9 of 20: DNA {'min_samples_leaf': 72, 'max_features': 271} has score 0.13560644104105357
10 of 20: DNA {'min_samples_leaf': 89, 'max_features': 71} has score 0.14579861317730688
11 of 20: DNA {'min_samples_leaf': 37, 'max_features': 211} has score 0.12397019457926041
12 o

Very cool: in 3 generations we now have 3 people with low `min_samples_leaf`. So now we are at a point where we want the algorithm to exploit this new information it found. But because the mutation rate is so high, it is likely that `min_samples_leaf` will mutate to something bigger.

In [None]:
# Generation 4, variant A: keep the high mutation rate (0.5) when breeding from generation 3.
generation4_high_mutation = generate_next_population(
    current_population=generation3,
    train_data=train_data,
    valid_data=valid_data,
    n_children=20,
    mutation=0.5,
)

Scoring Population
1 of 20: DNA {'min_samples_leaf': 57, 'max_features': 101} has score 0.13196686441543531
2 of 20: DNA {'min_samples_leaf': 36, 'max_features': 271} has score 0.12343741080081441
3 of 20: DNA {'min_samples_leaf': 47, 'max_features': 116} has score 0.12787457591718618
4 of 20: DNA {'min_samples_leaf': 1, 'max_features': 271} has score 0.10260411007269372
5 of 20: DNA {'min_samples_leaf': 4, 'max_features': 176} has score 0.10313711470499154
6 of 20: DNA {'min_samples_leaf': 36, 'max_features': 181} has score 0.12383760174700248
7 of 20: DNA {'min_samples_leaf': 4, 'max_features': 271} has score 0.10395497080216104
8 of 20: DNA {'min_samples_leaf': 26, 'max_features': 266} has score 0.11818852404011145
9 of 20: DNA {'min_samples_leaf': 4, 'max_features': 146} has score 0.10336221724165372
10 of 20: DNA {'min_samples_leaf': 19, 'max_features': 266} has score 0.11344773150806396
11 of 20: DNA {'min_samples_leaf': 77, 'max_features': 271} has score 0.13644383622135833
12 o

In [None]:
# List every individual bred with the high mutation rate, one per line.
for person in generation4_high_mutation:
    print(person)

{'min_samples_leaf': 26, 'max_features': 266}
{'min_samples_leaf': 42, 'max_features': 136}
{'min_samples_leaf': 92, 'max_features': 161}
{'min_samples_leaf': 26, 'max_features': 161}
{'min_samples_leaf': 26, 'max_features': 36}
{'min_samples_leaf': 97, 'max_features': 266}
{'min_samples_leaf': 89, 'max_features': 271}
{'min_samples_leaf': 4, 'max_features': 146}
{'min_samples_leaf': 26, 'max_features': 211}
{'min_samples_leaf': 93, 'max_features': 266}
{'min_samples_leaf': 19, 'max_features': 266}
{'min_samples_leaf': 1, 'max_features': 26}
{'min_samples_leaf': 6, 'max_features': 266}
{'min_samples_leaf': 26, 'max_features': 266}
{'min_samples_leaf': 4, 'max_features': 111}
{'min_samples_leaf': 41, 'max_features': 271}
{'min_samples_leaf': 65, 'max_features': 266}
{'min_samples_leaf': 4, 'max_features': 146}
{'min_samples_leaf': 89, 'max_features': 271}
{'min_samples_leaf': 1, 'max_features': 271}


In [None]:
# Generation 4, variant B: breed from the same generation 3 with a low mutation rate (0.1).
generation4_low_mutation = generate_next_population(
    current_population=generation3,
    train_data=train_data,
    valid_data=valid_data,
    n_children=20,
    mutation=0.1,
)

Scoring Population
1 of 20: DNA {'min_samples_leaf': 57, 'max_features': 101} has score 0.13196686441543531
2 of 20: DNA {'min_samples_leaf': 36, 'max_features': 271} has score 0.12343741080081441
3 of 20: DNA {'min_samples_leaf': 47, 'max_features': 116} has score 0.12787457591718618
4 of 20: DNA {'min_samples_leaf': 1, 'max_features': 271} has score 0.10260411007269372
5 of 20: DNA {'min_samples_leaf': 4, 'max_features': 176} has score 0.10313711470499154
6 of 20: DNA {'min_samples_leaf': 36, 'max_features': 181} has score 0.12383760174700248
7 of 20: DNA {'min_samples_leaf': 4, 'max_features': 271} has score 0.10395497080216104
8 of 20: DNA {'min_samples_leaf': 26, 'max_features': 266} has score 0.11818852404011145
9 of 20: DNA {'min_samples_leaf': 4, 'max_features': 146} has score 0.10336221724165372
10 of 20: DNA {'min_samples_leaf': 19, 'max_features': 266} has score 0.11344773150806396
11 of 20: DNA {'min_samples_leaf': 77, 'max_features': 271} has score 0.13644383622135833
12 o

In [None]:
# List every individual bred with the low mutation rate, one per line.
for person in generation4_low_mutation:
    print(person)

{'min_samples_leaf': 4, 'max_features': 271}
{'min_samples_leaf': 44, 'max_features': 271}
{'min_samples_leaf': 4, 'max_features': 146}
{'min_samples_leaf': 4, 'max_features': 271}
{'min_samples_leaf': 15, 'max_features': 271}
{'min_samples_leaf': 1, 'max_features': 271}
{'min_samples_leaf': 1, 'max_features': 266}
{'min_samples_leaf': 16, 'max_features': 271}
{'min_samples_leaf': 16, 'max_features': 196}
{'min_samples_leaf': 4, 'max_features': 266}
{'min_samples_leaf': 1, 'max_features': 271}
{'min_samples_leaf': 1, 'max_features': 271}
{'min_samples_leaf': 16, 'max_features': 196}
{'min_samples_leaf': 19, 'max_features': 161}
{'min_samples_leaf': 16, 'max_features': 146}
{'min_samples_leaf': 15, 'max_features': 176}
{'min_samples_leaf': 16, 'max_features': 266}
{'min_samples_leaf': 26, 'max_features': 266}
{'min_samples_leaf': 26, 'max_features': 161}
{'min_samples_leaf': 1, 'max_features': 271}


In [None]:
# Generation 5: continue from the low-mutation generation 4, still with mutation 0.1.
generation5 = generate_next_population(
    current_population=generation4_low_mutation,
    train_data=train_data,
    valid_data=valid_data,
    n_children=20,
    mutation=0.1,
)

Scoring Population
1 of 20: DNA {'min_samples_leaf': 4, 'max_features': 271} has score 0.10395497080216104
2 of 20: DNA {'min_samples_leaf': 44, 'max_features': 271} has score 0.12714005659568547
3 of 20: DNA {'min_samples_leaf': 4, 'max_features': 146} has score 0.10336221724165372
4 of 20: DNA {'min_samples_leaf': 4, 'max_features': 271} has score 0.10395497080216104
5 of 20: DNA {'min_samples_leaf': 15, 'max_features': 271} has score 0.1111275363614617
6 of 20: DNA {'min_samples_leaf': 1, 'max_features': 271} has score 0.10260411007269372
7 of 20: DNA {'min_samples_leaf': 1, 'max_features': 266} has score 0.10309294850322123
8 of 20: DNA {'min_samples_leaf': 16, 'max_features': 271} has score 0.11190263569393659
9 of 20: DNA {'min_samples_leaf': 16, 'max_features': 196} has score 0.11210459663541276
10 of 20: DNA {'min_samples_leaf': 4, 'max_features': 266} has score 0.1034871133408857
11 of 20: DNA {'min_samples_leaf': 1, 'max_features': 271} has score 0.10260411007269372
12 of 20:

Note that we only used 5 generations here, with 20 models each time, so a total of 100 models were sampled. So let's compare this to what we did in previous weeks, where we sampled 159 models.

In [None]:
%%time
best_parameters = {'min_samples_leaf': 1, 'max_features': 271}

final_model = RandomForestRegressor(criterion='squared_error', random_state=0, **best_parameters)
final_model.fit(perth_train.drop('log10_price', axis=1), perth_train['log10_price'])

y = perth_test['log10_price']
y_pred = final_model.predict(perth_test.drop('log10_price', axis=1))

print(np.mean((y - y_pred) ** 2))

0.007931673841989126
CPU times: user 56.6 s, sys: 76.6 ms, total: 56.6 s
Wall time: 56.7 s


In [None]:
# Random Search
# Same evaluation as the other final-model cells: fit on the full training set, report test-set MSE.
best_parameters = dict(min_samples_leaf=1, max_features=191)

final_model = RandomForestRegressor(criterion='squared_error', random_state=0, **best_parameters)
final_model.fit(
    perth_train.drop(columns='log10_price'),
    perth_train['log10_price'],
)

y = perth_test['log10_price']
y_pred = final_model.predict(perth_test.drop(columns='log10_price'))

print(np.mean((y - y_pred) ** 2))

0.007857488928888677


In [None]:
# Coordinate Descent
# Same evaluation as the other final-model cells: fit on the full training set, report test-set MSE.
base_parameters = dict(min_samples_leaf=1, max_features=106)

final_model = RandomForestRegressor(criterion='squared_error', random_state=0, **base_parameters)
final_model.fit(
    perth_train.drop(columns='log10_price'),
    perth_train['log10_price'],
)

y = perth_test['log10_price']
y_pred = final_model.predict(perth_test.drop(columns='log10_price'))
print(np.mean((y - y_pred) ** 2))


0.00781962457241284
