In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from random import choices 

import operators as Operators
from metaheuristic import Metaheuristic
import benchmark_func as bf
from hyperheuristic import Hyperheuristic, _save_step
from neural_network import ModelPredictorTransformer, ModelPredictorTransformerOriginal, DatasetSequences
from encode_operators import compress_operator, decompress_operator

import torch
from transformers import PreTrainedTokenizerFast, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import load_metric as load_metric_hf 
from datasets import Dataset as Dataset_hf

from timeit import default_timer as timer

In [2]:
limit_seq = 100

# Gather every sequence line and every score line from the ten numbered
# vocabulary files. Entry i of `seqs` is presumably paired with entry i of
# `costs` (both lists are consumed in parallel further down).
seqs = []
costs = []
for counting in range(1, 11):
  with open(f'vocabulary/seq_read_{counting}.txt', 'r', encoding='utf-8') as file:
    seqs.extend(file.read().split('\n'))
  with open(f'vocabulary/score_{counting}.txt', 'r', encoding='utf-8') as file:
    costs.extend(file.read().split('\n'))
    

In [3]:
# Read operators and find their alias
collections = ['default.txt', 'basicmetaheuristics.txt']

encoded_heuristic_space = dict()
operators_string = dict()
for collection_file in collections:
    with open('./collections/' + collection_file, 'r') as operators_file:
        operators_string[collection_file] = [line.rstrip('\n') for line in operators_file]
        encoded_heuristic_space[collection_file] = [eval(line) for line in operators_string[collection_file]]

In [4]:
def parse_sequence(seq):
  """Split a serialized sequence into its top-level parenthesized chunks.

  The string is scanned once while tracking parenthesis depth. Every span that
  starts at a '(' opened at depth 0 and ends at its matching ')' is returned as
  one operator string; nested parentheses stay inside their enclosing chunk.

  The start of each chunk is taken from the position of its opening '(' rather
  than from a fixed offset after the previous chunk, so any separator between
  chunks (', ', ',', whitespace, surrounding brackets) is tolerated.

  Limitation: parentheses that appear inside quoted text within the sequence
  are counted like any other parenthesis.

  Args:
    seq: String holding zero or more parenthesized operator descriptions.

  Returns:
    List of substrings of `seq`, one per top-level parenthesized chunk, in
    order of appearance. Empty list when `seq` has no complete chunk.
  """
  operators = []
  depth = 0
  start = 0
  for i, c in enumerate(seq):
    if c == '(':
      if depth == 0:
        # A new top-level chunk begins exactly here.
        start = i
      depth += 1
    elif c == ')':
      depth -= 1
      if depth == 0:
        operators.append(seq[start:i + 1])
  return operators

def get_ids_operators(operators):
  """Map operator strings to their positions in the 'default.txt' collection.

  Each operator is compared for exact string equality against the lines stored
  in operators_string['default.txt']; the index of the first matching line is
  used.

  Args:
    operators: Iterable of operator strings.

  Returns:
    List with one index (numpy integer) per operator, in input order.

  Raises:
    IndexError: If an operator does not match any line of the collection.
  """
  # Build the comparison array once instead of once per operator.
  known_operators = np.array(operators_string['default.txt'])
  ids = []
  for operator in operators:
    matches = np.where(known_operators == operator)[0]
    if len(matches) == 0:
      # Same exception type as the bare out-of-range lookup, but it names the culprit.
      raise IndexError(f"operator not found in 'default.txt' collection: {operator!r}")
    ids.append(matches[0])
  return ids

def generate_seqs():
  """Turn the module-level `seqs` and `costs` lists into model-ready lists.

  Every entry of `seqs` is split into operator strings with parse_sequence and
  those strings are translated to indices with get_ids_operators. Every entry
  of `costs` is evaluated as a Python expression.

  NOTE(review): eval() runs whatever code the score files contain, so they must
  be trusted; ast.literal_eval would be safer if they only hold literals.

  Returns:
    Tuple (seqs_operators, seqs_ids, fitnesses): the operator strings per
    sequence, the operator indices per sequence, and the evaluated costs.
  """
  seqs_operators = [parse_sequence(seq) for seq in seqs]
  seqs_ids = [get_ids_operators(operators) for operators in seqs_operators]
  fitnesses = [eval(cost) for cost in costs]
  return seqs_operators, seqs_ids, fitnesses

      
# Parse everything loaded above into operator strings, operator indices and evaluated costs.
seqs_operators, seqs_ids, fitnesses = generate_seqs()

In [5]:
# Dataset built from operator indices. 205 is passed as the third positional argument;
# it equals params['num_operators'] defined later in the notebook — presumably the
# number of operators, TODO confirm against DatasetSequences.
_, seqs_ids, fitnesses = generate_seqs()
ds = DatasetSequences(seqs_ids, fitnesses, 205, fitness_to_weight='rank')
# NOTE(review): generate_seqs() is re-run here although the previous cell already
# produced the same three values. Confirm whether DatasetSequences mutates its inputs
# before replacing these calls with the existing variables.
seqs_operators, _, fitnesses = generate_seqs()
# NOTE(review): eval() executes each operator string as Python code before compressing it.
seqs_compressed_op = [[compress_operator(eval(operator)) for operator in seq] for seq in seqs_operators]
ds2 = DatasetSequences(seqs_compressed_op, fitnesses, fitness_to_weight='rank')
Xid, yid, fit = ds.obtain_dataset()
Xop, yop, _ = ds2.obtain_dataset()

# Combine the `fit` values from the index dataset with the inputs of the compressed
# dataset and the targets of the index dataset, then order the tuples from largest to
# smallest. Ties on `fit` fall through to comparing Xop and then yid.
A = list(zip(fit, Xop, yid))
A.sort(reverse=True)


Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



In [6]:
# Build three parallel lists from the leading share of A (A is sorted in descending
# order of its first tuple element, so this keeps the entries with the largest `fit`).
readable_seqs = []     # space-joined string form of each input sequence
readable_next = []     # target paired with each sequence
readable_fitness = []  # `fit` value paired with each sequence

percentage = 0.3
B = A[:int(len(A) * percentage)]
for fi, xop, y_op in B:
  readable_seqs.append(' '.join([str(x) for x in xop]))
  readable_next.append(y_op)
  # Collect the fitness per selected tuple so it stays aligned with, and has the
  # same length as, readable_seqs / readable_next. Reusing the full unsorted
  # `fit` would pair sequences with fitness values of unrelated samples.
  readable_fitness.append(fi)

In [8]:
def train_tokenizer(sequences=None, vocab_size=30522,
                    save_directory='vocabulary/HyGpt-tokenizer-compress',
                    hub_repo='HyGpt-tokenizer-compress', push_to_hub=True):
    """Train a new tokenizer from the "gpt2" tokenizer and persist it.

    The "gpt2" tokenizer is loaded, given a '[PAD]' pad token, and used as the
    template for training a new tokenizer on `sequences`. The new tokenizer is
    given the same pad token, saved to `save_directory` and, unless disabled,
    pushed to the Hugging Face Hub.

    All parameters default to the values that were previously hard-coded, so
    calling train_tokenizer() with no arguments behaves as before.

    Args:
        sequences: Iterable of text used as the training corpus. Defaults to
            the module-level `readable_seqs`.
        vocab_size: Target vocabulary size passed to the trainer.
        save_directory: Local directory the tokenizer is saved to.
        hub_repo: Repository name used when pushing to the Hub.
        push_to_hub: When False, skip the Hub upload (which needs network
            access and credentials).

    Returns:
        The newly trained tokenizer.
    """
    if sequences is None:
        sequences = readable_seqs

    old_tokenizer = PreTrainedTokenizerFast.from_pretrained(
        "gpt2",
    )
    print(old_tokenizer.pad_token)
    old_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    tokenizer = old_tokenizer.train_new_from_iterator(sequences, vocab_size=vocab_size)
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    tokenizer.save_pretrained(save_directory)
    if push_to_hub:
        tokenizer.push_to_hub(hub_repo)
    return tokenizer
#tokenizer = train_tokenizer()

In [9]:
# Configuration dictionary handed to ModelPredictorTransformer in the next cell.
params = {
    "file_label": "HyGpt-compress-tests-class",
    "num_steps": 100,
    "num_operators": 205,
    "load_model": True,
    "save_model": False,
    "encoder": "identity",
    "model_architecture": "transformer",
    "pretrained_tokenizer": "josetapia/HyGpt-tokenizer-compress",
    "pretrained_model": "gpt2",
    "epochs": 2,
    "fitness_to_weight": "rank",
    "sample_params": {
        "retrieve_sequences": False,
        "limit_seqs": 400,
        "filter": "first_quartile",
        "store_sequences": False,
    },
}

In [10]:
# Build the predictor from the configuration dict; the trailing expression displays
# the pad token of the tokenizer it holds.
model = ModelPredictorTransformer(params)
model._tokenizer.pad_token

'[PAD]'

In [21]:
# Train on the readable sequences and their targets.
# NOTE(review): the positional `3` and `True` are not explained in this notebook —
# check ModelPredictorTransformer.fit for their meaning and prefer keyword arguments.
model.fit(readable_seqs, readable_next, 3, readable_fitness, True)


100%|██████████| 95/95 [01:41<00:00,  1.07s/ba]
  0%|          | 0/8865 [00:00<?, ?it/s]

: 

: 

In [None]:
# Save the model through the predictor's trainer object to a relative path.
model._trainer.save_model('HyGpt/hygpt-class-compress')

In [None]:
# Display the tokenizer's padding attributes as a tuple.
# NOTE(review): `pad` is referenced without being called, so the last element is
# whatever object that attribute is rather than a padded result.
model._tokenizer.pad_token_id, model._tokenizer.pad_token, model._tokenizer.pad

In [18]:
# Tokenize the first 200 readable sequences with truncation at 1024 and padding enabled.
# NOTE(review): this rebinds `A`, which earlier held the sorted (fit, Xop, yid) tuples
# that the sampling cell slices; re-running that cell after this one would operate on
# the tokenizer output instead. A distinct name would avoid the hidden-state hazard.
A = model._tokenizer(readable_seqs[:200], max_length=1024, truncation=True, padding=True)

In [19]:
# Shortest encoded length among the tokenized sequences. The recorded output (1024)
# equals the max_length requested above; since padding was enabled, the lengths are
# presumably equalized, so this mostly shows that the batch reaches the limit.
min(len(a) for a in A["input_ids"])

1024