In [None]:
import pandas as pd
import numpy as np

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split

In [None]:
# Load `perth_clean.csv` and one-hot encode the `suburb` column in one step.
perth = pd.get_dummies(pd.read_csv('perth_clean.csv'), columns=['suburb'])

# Split the index labels 80/20 (fixed random_state), then materialise each side
# as an independent copy of the corresponding rows.
train_indices, test_indices = train_test_split(perth.index, test_size=0.2, random_state=0)

perth_train, perth_test = perth.loc[train_indices].copy(), perth.loc[test_indices].copy()

In [None]:
# Seed NumPy's global RNG so the shuffle below (and every later `np.random`
# call in a top-to-bottom run) is reproducible.
# NOTE(review): the value 5 comes from the line that was previously commented
# out; the recorded outputs may have been produced without it - confirm.
np.random.seed(5)

# Shuffle the training index labels and keep the first 20% as a smaller working set.
indices = list(perth_train.index.copy())
np.random.shuffle(indices)

sampled_indices = indices[:int(len(indices) * 0.2)]
sampled_perth = perth_train.loc[sampled_indices, :].copy()

print("Sampled set length", len(sampled_indices))

Sampled set length 5384


In [None]:
# Split the row *positions* of the sampled frame 80/20 into train / validation.
n = len(sampled_perth)
sampled_train_indices, sampled_valid_indices = train_test_split(
    np.arange(n), test_size=0.2, random_state=0
)

# Select each side's rows once, then separate the features from the target column.
sampled_train_rows = sampled_perth.iloc[sampled_train_indices]
sampled_valid_rows = sampled_perth.iloc[sampled_valid_indices]

sampled_x_train = sampled_train_rows.drop(columns='log10_price')
sampled_y_train = sampled_train_rows['log10_price'].copy()

sampled_x_valid = sampled_valid_rows.drop(columns='log10_price')
sampled_y_valid = sampled_valid_rows['log10_price'].copy()

In [None]:
# Bundle each split as a (features, target) pair, the shape `score_model` unpacks.
train_data, valid_data = (sampled_x_train, sampled_y_train), (sampled_x_valid, sampled_y_valid)

In [None]:
# Report how many rows ended up in each side of the sampled split.
n_train_rows, n_valid_rows = len(sampled_x_train), len(sampled_x_valid)
print("Training length of sampled data", n_train_rows)
print("Validiation length of sampled data", n_valid_rows)

Training length of sampled data 4307
Validiation length of sampled data 1077


Here we use the trick mentioned earlier: instead of optimizing `n_estimators` and `learning_rate` together, we fix `n_estimators=200` and optimize the learning rate on its own.

In [None]:
# Notice how `n_estimators` is held fixed here (200 by default) instead of being part of the DNA
def score_population(population, train_data, valid_data, n_estimators=200):
    """Score every individual of the population on the given dataset.

    Parameters
    ----------
    population : list of dict
        Each dict ("DNA") is unpacked as keyword arguments into `XGBRegressor`.
    train_data, valid_data : tuple
        Passed through unchanged to `score_model`.
    n_estimators : int, default 200
        Number of boosting rounds used for every individual; kept outside the
        DNA so that it is not searched together with the learning rate.

    Returns
    -------
    list
        One score per individual, in the same order as `population`.
    """
    population_scores = []
    n = len(population)
    for i, person in enumerate(population, start=1):
        base_model = XGBRegressor(objective='reg:squarederror', random_state=0,
                                  n_estimators=n_estimators, **person)
        score = score_model(base_model, train_data, valid_data)
        population_scores.append(score)

        # Progress log: one line per scored individual.
        print(f"{i} of {n}: DNA {person} has score {score}")

    return population_scores

def score_model(model, train_data, valid_data):
    """Fit `model` on the training pair and return its RMSE on the validation pair.

    Both `train_data` and `valid_data` are `(features, target)` pairs.
    """
    features_fit, target_fit = train_data
    features_eval, target_eval = valid_data

    model.fit(features_fit, target_fit)

    residuals = target_eval - model.predict(features_eval)
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)

In [None]:
def make_children_from_population(population, population_scores, n_children, mutation=0.5):
    """Breed `n_children` new individuals from `population`.

    For every child, two distinct parents are drawn uniformly at random, their
    genes are mixed with `generate_child`, and the result is passed through
    `mutate_child` with the given `mutation` probability.
    `population_scores` is accepted but not used by the selection.
    """
    offspring = []
    for _ in range(n_children):
        mother, father = np.random.choice(population, 2, replace=False)
        newborn = mutate_child(generate_child(mother, father), mutation=mutation)
        offspring.append(newborn)

    return offspring

def generate_child(person1, person2, p=0.5):
    """Combine two parents into a new individual, gene by gene.

    For each key of `person1`, the value is taken from `person1` with
    probability `p` and from `person2` otherwise.
    """
    return {
        gene: (person1[gene] if np.random.rand() < p else person2[gene])
        for gene in person1
    }

def mutate_child(child, mutation=0.5):
    """Return a mutated copy of `child`; the input dict is left untouched.

    Each gene is, independently with probability `mutation`, replaced by a
    random value drawn from the module-level `parameter_grid` entry for that key.
    """
    mutated = child.copy()
    for gene in child:
        if np.random.rand() < mutation:
            mutated[gene] = np.random.choice(parameter_grid[gene])

    return mutated

In [None]:
def generate_next_population(current_population, train_data, valid_data, n_children=10, mutation=0.5):
    """Score the current generation and breed the next one.

    The population is ranked by ascending score, the top half is used as the
    parent pool for `n_children - 1` children, and the single best individual
    is appended unchanged, so the returned list has `n_children` entries.
    """
    print("Scoring Population")
    scores = score_population(current_population, train_data, valid_data)
    ranked_population, ranked_scores = sort_population_by_scores(current_population, scores)

    # Only the better half of the ranking is allowed to reproduce.
    half = len(ranked_population) // 2
    best_person, best_score = ranked_population[0], ranked_scores[0]

    next_population = make_children_from_population(
        ranked_population[:half], ranked_scores[:half], n_children - 1, mutation=mutation
    )
    next_population.append(best_person)  # Elitism

    print(f"Current Population Best Score {best_score}, with DNA {best_person}")
    return next_population

def sort_population_by_scores(population, population_scores):
    """Return `(population, scores)` reordered by ascending score.

    Only the score is used as the sort key, so individuals with equal scores
    keep their original relative order.
    """
    ranked_pairs = sorted(zip(population_scores, population), key=lambda pair: pair[0])

    sorted_scores = [pair[0] for pair in ranked_pairs]
    sorted_population = [pair[1] for pair in ranked_pairs]

    return sorted_population, sorted_scores

In [None]:
# Candidate values for each gene; ten options per hyperparameter.
parameter_grid = dict(
    learning_rate=np.linspace(0.01, 0.10, 10),
    max_depth=np.arange(1, 101, 10),
    subsample=np.linspace(0.1, 1.0, 10),
)

In [None]:
# Generation 0: 20 individuals, each gene drawn at random from its grid of candidates.
initial_population = [
    {gene: np.random.choice(candidates) for gene, candidates in parameter_grid.items()}
    for _ in range(20)
]

In [None]:
# Generation 1: score the initial population, breed 19 children from its top half
# (per-gene mutation probability 0.5) and carry the best individual over unchanged.
generation1 = generate_next_population(initial_population, train_data, valid_data, n_children=20, mutation=0.5)

Scoring Population
1 of 20: DNA {'learning_rate': 0.01, 'max_depth': 61, 'subsample': 1.0} has score 0.7201481637298258
2 of 20: DNA {'learning_rate': 0.01, 'max_depth': 71, 'subsample': 0.6} has score 0.7214479650478723
3 of 20: DNA {'learning_rate': 0.06000000000000001, 'max_depth': 1, 'subsample': 0.6} has score 0.11537296530008394
4 of 20: DNA {'learning_rate': 0.04000000000000001, 'max_depth': 31, 'subsample': 0.7000000000000001} has score 0.09282861050285945
5 of 20: DNA {'learning_rate': 0.07, 'max_depth': 41, 'subsample': 0.5} has score 0.09178697948141408
6 of 20: DNA {'learning_rate': 0.07, 'max_depth': 81, 'subsample': 0.2} has score 0.0938800576177339
7 of 20: DNA {'learning_rate': 0.09000000000000001, 'max_depth': 11, 'subsample': 0.30000000000000004} has score 0.0938354276404361
8 of 20: DNA {'learning_rate': 0.06000000000000001, 'max_depth': 81, 'subsample': 0.5} has score 0.09138679506902327
9 of 20: DNA {'learning_rate': 0.01, 'max_depth': 31, 'subsample': 1.0} has sco

In [None]:
# Generation 2: same procedure applied to generation 1, with the per-gene
# mutation probability lowered to 0.2.
generation2 = generate_next_population(generation1, train_data, valid_data, n_children=20, mutation=0.2)

Scoring Population
1 of 20: DNA {'learning_rate': 0.06000000000000001, 'max_depth': 81, 'subsample': 0.4} has score 0.09296876809485832
2 of 20: DNA {'learning_rate': 0.08, 'max_depth': 31, 'subsample': 0.4} has score 0.09328076916730711
3 of 20: DNA {'learning_rate': 0.06000000000000001, 'max_depth': 51, 'subsample': 0.2} has score 0.09517230489528206
4 of 20: DNA {'learning_rate': 0.01, 'max_depth': 81, 'subsample': 0.5} has score 0.7218440411153014
5 of 20: DNA {'learning_rate': 0.04000000000000001, 'max_depth': 61, 'subsample': 0.1} has score 0.09224404553919113
6 of 20: DNA {'learning_rate': 0.08, 'max_depth': 91, 'subsample': 0.9} has score 0.09328348685064124
7 of 20: DNA {'learning_rate': 0.06000000000000001, 'max_depth': 51, 'subsample': 0.8} has score 0.09233212995206895
8 of 20: DNA {'learning_rate': 0.04000000000000001, 'max_depth': 81, 'subsample': 0.1} has score 0.09224404553919113
9 of 20: DNA {'learning_rate': 0.06000000000000001, 'max_depth': 81, 'subsample': 0.3000000

In [None]:
# Generation 3: score generation 2 and breed again, mutation probability still 0.2.
generation3 = generate_next_population(generation2, train_data, valid_data, n_children=20, mutation=0.2)

Scoring Population
1 of 20: DNA {'learning_rate': 0.06000000000000001, 'max_depth': 11, 'subsample': 0.5} has score 0.09149892777963936
2 of 20: DNA {'learning_rate': 0.06000000000000001, 'max_depth': 81, 'subsample': 0.2} has score 0.09412301455300627
3 of 20: DNA {'learning_rate': 0.06000000000000001, 'max_depth': 21, 'subsample': 1.0} has score 0.0950909032128807
4 of 20: DNA {'learning_rate': 0.020000000000000004, 'max_depth': 71, 'subsample': 0.2} has score 0.1434825485143911
5 of 20: DNA {'learning_rate': 0.06000000000000001, 'max_depth': 81, 'subsample': 0.5} has score 0.09138679506902327
6 of 20: DNA {'learning_rate': 0.020000000000000004, 'max_depth': 61, 'subsample': 0.6} has score 0.13934085847349692
7 of 20: DNA {'learning_rate': 0.04000000000000001, 'max_depth': 31, 'subsample': 0.7000000000000001} has score 0.09282861050285945
8 of 20: DNA {'learning_rate': 0.06000000000000001, 'max_depth': 51, 'subsample': 0.7000000000000001} has score 0.09104498351661133
9 of 20: DNA {'

In [None]:
# Generation 4: score generation 3 and breed again, mutation probability lowered to 0.1.
generation4 = generate_next_population(generation3, train_data, valid_data, n_children=20, mutation=0.1)

Scoring Population
1 of 20: DNA {'learning_rate': 0.06000000000000001, 'max_depth': 31, 'subsample': 0.9} has score 0.0918065757203164
2 of 20: DNA {'learning_rate': 0.05000000000000001, 'max_depth': 81, 'subsample': 0.5} has score 0.09150340793283983
3 of 20: DNA {'learning_rate': 0.01, 'max_depth': 21, 'subsample': 0.8} has score 0.7206724865581686
4 of 20: DNA {'learning_rate': 0.05000000000000001, 'max_depth': 41, 'subsample': 0.4} has score 0.09085954641563716
5 of 20: DNA {'learning_rate': 0.030000000000000006, 'max_depth': 21, 'subsample': 0.8} has score 0.09421761785437727
6 of 20: DNA {'learning_rate': 0.01, 'max_depth': 81, 'subsample': 0.30000000000000004} has score 0.7242967554004879
7 of 20: DNA {'learning_rate': 0.06000000000000001, 'max_depth': 1, 'subsample': 0.5} has score 0.11438636194738296
8 of 20: DNA {'learning_rate': 0.06000000000000001, 'max_depth': 81, 'subsample': 0.5} has score 0.09138679506902327
9 of 20: DNA {'learning_rate': 0.04000000000000001, 'max_depth

In [22]:
# Generation 5: score generation 4 and breed again, mutation probability still 0.1.
# Note that the returned generation5 itself is not scored by this call.
generation5 = generate_next_population(generation4, train_data, valid_data, n_children=20, mutation=0.1)

Scoring Population
1 of 20: DNA {'learning_rate': 0.05000000000000001, 'max_depth': 81, 'subsample': 0.5} has score 0.09150340793283983
2 of 20: DNA {'learning_rate': 0.05000000000000001, 'max_depth': 81, 'subsample': 0.5} has score 0.09150340793283983
3 of 20: DNA {'learning_rate': 0.06000000000000001, 'max_depth': 81, 'subsample': 0.5} has score 0.09138679506902327
4 of 20: DNA {'learning_rate': 0.06000000000000001, 'max_depth': 81, 'subsample': 0.5} has score 0.09138679506902327
5 of 20: DNA {'learning_rate': 0.05000000000000001, 'max_depth': 81, 'subsample': 0.7000000000000001} has score 0.09023588206159663
6 of 20: DNA {'learning_rate': 0.07, 'max_depth': 81, 'subsample': 0.5} has score 0.09141323435133396
7 of 20: DNA {'learning_rate': 0.07, 'max_depth': 81, 'subsample': 0.5} has score 0.09141323435133396
8 of 20: DNA {'learning_rate': 0.06000000000000001, 'max_depth': 81, 'subsample': 0.9} has score 0.09219880973739603
9 of 20: DNA {'learning_rate': 0.06000000000000001, 'max_dep

The comparison here is not entirely fair to coordinate descent and random search, because `n_estimators` is fixed for the genetic algorithm (GA) but was tuned for the other two. It would therefore be worth rerunning coordinate descent and random search with `n_estimators` held fixed. Regardless, the results below show that the GA gives better results than both coordinate descent and random search.

In [23]:
%%time
# Best parameters by genetic algorithm - 5 generations each with 20 population size (100 total)
# The values are hard-coded here rather than read from the generation variables.
best_parameters = dict(learning_rate=0.06000000000000001, max_depth=11, subsample=0.7000000000000001)

# Refit on the full training set with the same fixed `n_estimators` used during the search.
final_model = XGBRegressor(objective='reg:squarederror', random_state=0, n_estimators=200, **best_parameters)
final_model.fit(perth_train.drop(columns='log10_price'), perth_train['log10_price'])

y = perth_test['log10_price']
y_pred = final_model.predict(perth_test.drop(columns='log10_price'))

# Mean squared error on the held-out test set (no square root is taken here).
squared_errors = (y - y_pred) ** 2
print(np.mean(squared_errors))

0.00724586724861683
CPU times: user 2min 43s, sys: 384 ms, total: 2min 44s
Wall time: 2min 43s


In [25]:
%%time
# Best parameters by coordinate descent - 153 different models sampled
# Unlike the genetic-algorithm cell, `n_estimators` is part of the parameter set here.
best_parameters = dict(learning_rate=0.09, max_depth=11, n_estimators=170, subsample=0.8)

# Refit on the full training set.
final_model = XGBRegressor(objective='reg:squarederror', random_state=0, **best_parameters)
final_model.fit(perth_train.drop(columns='log10_price'), perth_train['log10_price'])

y = perth_test['log10_price']
y_pred = final_model.predict(perth_test.drop(columns='log10_price'))

# Mean squared error on the held-out test set (no square root is taken here).
squared_errors = (y - y_pred) ** 2
print(np.mean(squared_errors))

0.007313312056798277
CPU times: user 2min 8s, sys: 242 ms, total: 2min 8s
Wall time: 2min 8s


In [27]:
%%time
# Best parameters by random search (75 models sampled)
# Unlike the genetic-algorithm cell, `n_estimators` is part of the parameter set here.
best_parameters = dict(learning_rate=0.08, max_depth=21, n_estimators=160, subsample=0.5)

# Refit on the full training set.
final_model = XGBRegressor(objective='reg:squarederror', random_state=0, **best_parameters)
final_model.fit(perth_train.drop(columns='log10_price'), perth_train['log10_price'])

y = perth_test['log10_price']
y_pred = final_model.predict(perth_test.drop(columns='log10_price'))

# Mean squared error on the held-out test set (no square root is taken here).
squared_errors = (y - y_pred) ** 2
print(np.mean(squared_errors))

0.007533777469334824
CPU times: user 4min 14s, sys: 358 ms, total: 4min 15s
Wall time: 4min 13s
