In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import scipy.stats as st
import seaborn as sns
from mpmath import *
from decimal import Decimal
from timeit import default_timer as timer
import json

import tools as tl
import benchmark_func as bf
from hyperheuristic import Hyperheuristic
from metaheuristic import Metaheuristic
from experiment import read_config_file

In [2]:
sns.set(context="paper", font_scale=1.8, palette="husl", style="ticks",
        rc={'font.family': 'serif', 'font.size': 18,
            "xtick.major.top": False, "ytick.major.right": False})
plt.rc('font', size=18) 

# Saving images flag
is_saving = True
show_plots = True
saving_format = 'png'

figures_folder = 'data_files/exp_figures/'
experiment_folder = 'data_files/exp_output'
ml_model_folder = "./data_files/ml_models_times/"
exp_label = 'default_nn_best_double_lstm'
bm_exp_label = 'basic_metaheuristic'

if is_saving:
    # Read (or create if so) a folder for storing images
    if not os.path.isdir(figures_folder):
        os.mkdir(figures_folder)

In [3]:
files = os.listdir(ml_model_folder)
files = [file for file in files if 'DS_S' not in file]

# Number of dimensions
dimensions = [2, 10, 30, 50]
num_dims = len(dimensions)

problems = list(set(file.split('-')[0] for file in files))
len(problems), dimensions

(119, [2, 10, 30, 50])

In [4]:
data = tl.read_json('./data_files/exp_output/'+exp_label+'.json')

In [5]:
def sort_dataset(dataset):
    # Sort the results of the 428 problems in lexicographically order
    list_pair_results = list(zip(dataset['problem'],
                                 dataset['dimensions'],
                                 dataset['results']))
    list_pair_results.sort()
    dataset['problem'], dataset['dimensions'], dataset['results'] = [], [], []
    for problem, dimension, result in list_pair_results:
        dataset['problem'].append(problem)
        dataset['dimensions'].append(dimension)
        dataset['results'].append(result)
        
def filter_by_dimensions(dataset):
    # Filter the dataset allowing only the dimensions from 'dimensions' list.
    # The usercase is for the basic metaheuristics that has results for extra 
    # dimensions out of scope of this paper.
    allowed_dim_inds = [index for d in dimensions for index in tl.listfind(dataset['dimensions'], d)]
    dict_filtered = {key: [val[x] for x in allowed_dim_inds] for key, val in dataset.items()}
    sort_dataset(dict_filtered)
    return dict_filtered


In [6]:
# Read operators and find their alias
collections = ['default.txt', 'basicmetaheuristics.txt']

encoded_heuristic_space = dict()
for collection_file in collections:
    with open('./collections/' + collection_file, 'r') as operators_file:
        encoded_heuristic_space[collection_file] = [eval(line.rstrip('\n')) for line in operators_file]

# Search operator aliases
perturbator_alias = {
    'random_search': 'RS',
    'central_force_dynamic': 'CF',
    'differential_mutation': 'DM',
    'firefly_dynamic': 'FD',
    'genetic_crossover': 'GC',
    'genetic_mutation': 'GM',
    'gravitational_search': 'GS',
    'random_flight': 'RF',
    'local_random_walk': 'RW',
    'random_sample': 'RX',
    'spiral_dynamic': 'SD',
    'swarm_dynamic': 'PS'}

selector_alias = {'greedy': 'g', 'all': 'd', 'metropolis': 'm', 'probabilistic': 'p'}

operator_families = {y: i for i, y in enumerate(sorted([x for x in perturbator_alias.values()]))}

# Pre-build the alias list
heuristic_space = dict()
for collection_file in collections:
    if all(isinstance(x, tuple) for x in encoded_heuristic_space[collection_file]):
        heuristic_space[collection_file] = [perturbator_alias[x[0]] + selector_alias[x[2]]
                                            for x in encoded_heuristic_space[collection_file]]


In [7]:
# Read basic metaheuristics
basic_mhs_collection = encoded_heuristic_space['basicmetaheuristics.txt']

# Read basic metaheuristics cardinality
basic_mhs_cadinality = [1 if isinstance(x, tuple) else len(x) for x in basic_mhs_collection]

# Load data from basic metaheuristics
basic_mhs_data = filter_by_dimensions(tl.read_json(f'{experiment_folder}/basic-metaheuristics-data_v2.json'))

# Mathematical attributes
chosen_categories = ['Differentiable', 'Unimodal']
case_label = 'DU'

# Call the problem categories
problem_features = bf.list_functions(fts=chosen_categories)
categories = sorted(set([problem_features[x]['Code'] for x in basic_mhs_data['problem']]), reverse=True)


In [8]:
# Retrive the results per experiment
data_frame = filter_by_dimensions(data)

# Summary
id = 'dlstm'
data_info = {
    'Dim': [int(x) for x in data_frame['dimensions']],
    'Pop': 30,
    'Problem': data_frame['problem'],
    'Cat': [problem_features[x]['Code'] for x in data_frame['problem']],
    'mhs': [x['encoded_solution'] for x in data_frame['results']],
    'hist': [x['hist_fitness'] for x in data_frame['results']]
}

In [9]:
def get_correct_id(problem_name, dimension):
  for id in range(len(data_info['Problem'])):
    correct_problem = data_info['Problem'][id] == problem_name
    correct_dimension = data_info['Dim'][id] == dimension
    if correct_problem and correct_dimension:
      return id
  return -1

In [11]:
text = """
exp_config, hh_config, prob_config = read_config_file(exp_label)
# Message to print and to store in folders
problem_name = 'Sphere'
dimension = 2
label = '{}-{}D-{}'.format(problem_name, dimension, exp_label)

# Get and format the problem
# problem = eval('bf.{}({})'.format(problem, dimension))
problem = bf.choose_problem(problem_name, dimension)

problem.set_offset_domain(2)
problem.set_offset_function(3)
problem.set_scale_domain(1.5)
problem.set_scale_function(4)
problem.set_noise_type('normal')
problem.set_noise_level(2)

problem_to_solve = problem.get_formatted_problem(True, ['Differentiable', 'Separable', 'Unimodal'])
from optproblems.cec2005 import F1
fun = F1(2, None)
problem_to_solve['function'] = fun

# Call the hyper-heuristic object
hh = Hyperheuristic(heuristic_space=exp_config['heuristic_collection_file'],
                        problem=problem_to_solve, parameters=hh_config,
                        file_label=label, weights_array=None)
"""

In [12]:
def get_best_mhs(problem_name, dimension):
  id = get_correct_id(problem_name, dimension)
  hists = [historial[-1] for historial in data_info['hist'][id]]
  optimal_position = np.argmin(hists)
  return data_info['mhs'][id][optimal_position]

In [56]:
exp_config, hh_config, prob_config = read_config_file(exp_label)


# Get and format the problem
# problem = eval('bf.{}({})'.format(problem, dimension))
problem_name = 'Sphere'
dimension = 10
problem = bf.choose_problem(problem_name, dimension)
problem_to_solve = problem.get_formatted_problem(True, ['Differentiable', 'Separable', 'Unimodal'])

label = '{}-{}D-{}'.format(problem_name, dimension, exp_label)
hh_config['verbose'] = False
hh = Hyperheuristic(heuristic_space=exp_config['heuristic_collection_file'],
                    problem=problem_to_solve, parameters=hh_config,
                    file_label=label, weights_array=None)

start_time = timer()
hh.parameters['num_replicas'] = 1000
results = hh._solve_neural_network()

end_time = timer() - start_time



In [57]:
# Test
encoded_heuristic_space['default.txt'][:5]
str(encoded_heuristic_space['default.txt'][0])

# Convert
sequences = results[1]
collection = encoded_heuristic_space['default.txt']
sequences_readable = [', '.join(
  [str(collection[operator]) for operator in seq[1:]]
  ) for seq in sequences]

In [58]:
if not os.path.isdir('vocabulary'):
  os.mkdir('vocabulary')

counting = 0
limit_seq = 100
paths = []
for i in range(0, len(sequences_readable), limit_seq):
  counting += 1
  with open(f'vocabulary/seq_read_{counting}_2.txt', 'w', encoding='utf-8') as file:
    file.write('\n'.join(sequences_readable[i:i+limit_seq]))
  with open(f'vocabulary/score_{counting}_2.txt', 'w', encoding='utf-8') as file:
    file.write('\n'.join([str(x) for x in results[0][i:i+limit_seq]]))
  paths.append(f'vocabulary/seq_read_{counting}_2.txt')


In [78]:
min(len(seq) for seq in sequences_readable)

4093

In [79]:
max(len(seq) for seq in sequences_readable)

11421

In [59]:
from tokenizers import ByteLevelBPETokenizer
# initialize
tokenizer = ByteLevelBPETokenizer()
# and train
tokenizer.train(files=paths, vocab_size=1024, min_frequency=2,
                special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'])

model_name = 'hybertheuristic'
if not os.path.isdir(model_name):
  os.mkdir(model_name)

tokenizer.save_model(model_name)






['hybertheuristic/vocab.json', 'hybertheuristic/merges.txt']

In [70]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained(model_name)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizerFast'.


In [81]:
context_length = 1024

def tokenize(element):
    outputs = tokenizer(
        element,
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_datasets = [tokenize(seq) for seq in sequences_readable]
len(tokenized_datasets)

1000

In [82]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

Downloading: 100%|██████████| 665/665 [00:00<00:00, 212kB/s]


In [83]:
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 86.2M parameters


In [84]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [85]:
out = data_collator([tokenized_datasets[i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([5, 3, 1024])
attention_mask shape: torch.Size([5, 3])
labels shape: torch.Size([5, 3, 1024])


In [91]:
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to /Users/josetapia/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [94]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="customhys_transformer",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    #fp16=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
)

Cloning https://huggingface.co/josetapia/customhys_transformer into local empty directory.


In [107]:
max([len(tokenized_datasets[i]['input_ids']) for i in range(1000)])#for x in tokenized_datasets[i]['input_ids']])

3

In [95]:
trainer.train()

  0%|          | 0/4 [00:00<?, ?it/s]

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length.

In [71]:
sequences_readable[0][:112]

"('genetic_crossover', {'pairing': 'random', 'crossover': 'linear_0.5_0.5', 'mating_pool_factor': 0.4}, 'greedy')"

In [76]:
tokens = tokenizer(sequences_readable[0][:112])['input_ids']
print(tokens)

[0, 277, 299, 67, 275, 263, 280, 307, 262, 261, 318, 263, 261, 275, 262, 261, 398, 67, 20, 18, 25, 67, 20, 18, 25, 263, 261, 306, 67, 304, 67, 290, 262, 291, 18, 24, 278, 261, 342, 276, 2]


In [77]:
ids = tokenizer.convert_ids_to_tokens(tokens)
print(ids)

['<s>', "('", 'genetic', '_', 'crossover', "',", "Ġ{'", 'pairing', "':", "Ġ'", 'random', "',", "Ġ'", 'crossover', "':", "Ġ'", 'linear', '_', '0', '.', '5', '_', '0', '.', '5', "',", "Ġ'", 'mating', '_', 'pool', '_', 'factor', "':", 'Ġ0', '.', '4', '},', "Ġ'", 'greedy', "')", '</s>']


In [47]:
from transformers import pipeline

text2text_generator = pipeline("text2text-generation")


No model was supplied, defaulted to t5-base (https://huggingface.co/t5-base)
Downloading: 100%|██████████| 1.17k/1.17k [00:00<00:00, 540kB/s]
Downloading: 100%|██████████| 850M/850M [02:09<00:00, 6.89MB/s] 
Downloading: 100%|██████████| 773k/773k [00:00<00:00, 2.06MB/s]
Downloading: 100%|██████████| 1.32M/1.32M [00:00<00:00, 2.98MB/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (558 > 512). Running this sequence through the model will result in indexing errors


[{'generated_text': "'swarm_dynamic': 'pairing': 'random"}]

In [52]:
text2text_generator(sequences_readable[0][:512], min_length=1000)

[{'generated_text': "'swarm_dynamic': 1.0, 'pairing': '"}]