In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from random import choices 

import operators as Operators
from metaheuristic import Metaheuristic
import benchmark_func as bf
from hyperheuristic import Hyperheuristic, _save_step
from neural_network import ModelPredictorTransformerOriginal, DatasetSequences
from encode_operators import compress_operator, decompress_operator

import torch
from transformers import PreTrainedTokenizerFast, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import load_metric as load_metric_hf 
from datasets import Dataset as Dataset_hf

from timeit import default_timer as timer

In [2]:
limit_seq = 100

seqs, costs = [], []
for counting in range(1, 11):
  with open(f'vocabulary/seq_read_{counting}.txt', 'r', encoding='utf-8') as file:
    seqs = seqs + file.read().split('\n')  
  with open(f'vocabulary/score_{counting}.txt', 'r', encoding='utf-8') as file:
    costs = costs + file.read().split('\n')
    

In [3]:
# Read operators and find their alias
collections = ['default.txt', 'basicmetaheuristics.txt']

encoded_heuristic_space = dict()
operators_string = dict()
for collection_file in collections:
    with open('./collections/' + collection_file, 'r') as operators_file:
        operators_string[collection_file] = [line.rstrip('\n') for line in operators_file]
        encoded_heuristic_space[collection_file] = [eval(line) for line in operators_string[collection_file]]

In [4]:
collection_compressed = [compress_operator(op) for op in encoded_heuristic_space['default.txt']]

In [5]:
min(len(a) for a in collection_compressed)

25

In [6]:
max(len(a) for a in collection_compressed)

25

In [7]:
for a in collection_compressed[:10]:
  if len(a) == 25:
    print(a)

RS,1.0;u,g_______________
CF,0.001;0.01;1.5;1.0,d__
CF,0.001;0.01;1.5;1.0,g__
CF,0.001;0.01;1.5;1.0,m__
CF,0.001;0.01;1.5;1.0,p__
DM,rand;1;1.0,d__________
DM,rand;1;1.0,g__________
DM,rand;1;1.0,m__________
DM,rand;1;1.0,p__________
DM,best;1;1.0,d__________


In [8]:
def parse_sequence(seq):
  operators = []
  prev_idx = 0
  counting = 0
  for i, c in enumerate(seq):
    if c == '(':
      counting += 1
    if c == ')':
      counting -= 1
      if counting == 0:
        operators.append(seq[prev_idx:i+1])
        prev_idx = i + 3
  return operators

def get_ids_operators(operators):
  ids = []
  for operator in operators:
    ids_bool = np.array(operators_string['default.txt']) == operator
    ids.append(np.where(ids_bool)[0][0])
  return ids

def generate_seqs():
  seqs_operators = []
  seqs_ids = []
  for seq in seqs:
    operators = parse_sequence(seq)
    seq_ids = get_ids_operators(operators)
    seqs_operators.append(operators)
    seqs_ids.append(seq_ids)
  fitnesses = [eval(cost) for cost in costs]
  return seqs_operators, seqs_ids, fitnesses

      
seqs_operators, seqs_ids, fitnesses = generate_seqs()

In [9]:
_, seqs_ids, fitnesses = generate_seqs()
ds = DatasetSequences(seqs_ids, fitnesses, fitness_to_weight='rank')
seqs_operators, _, fitnesses = generate_seqs()
seqs_compressed_op = [[compress_operator(eval(operator)) for operator in seq] for seq in seqs_operators]
ds2 = DatasetSequences(seqs_compressed_op, fitnesses, fitness_to_weight='rank')
Xid, yid, fit = ds.obtain_dataset()
Xop, yop, fit = ds2.obtain_dataset()

A = list(zip(fit, Xop, yid))
A.sort(reverse=True)


Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



In [10]:
readable_seqs = []
readable_next = []
readable_fitness = []
for fi, xop, y_op in A:
  readable_seqs.append(' '.join([str(x) for x in xop]))
  readable_next.append(y_op)
  readable_fitness.append(fi)

In [11]:
seqs_compressed_op = [[compress_operator(eval(operator)) for operator in seq] for seq in seqs_operators]


In [26]:
from huggingface_hub import notebook_login
# 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
notebook_login()

Login successful
Your token has been saved to /Users/josetapia/.huggingface/token


In [25]:
from huggingface_hub import notebook_login
# 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
notebook_login()

VBox(children=(HTML(value='<center>\n<img src=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
from transformers import GPT2Tokenizer, TFGPT2Model
def train_tokenizer():
    old_tokenizer = PreTrainedTokenizerFast.from_pretrained(
        "gpt2",
    )   
    tokenizer = old_tokenizer.train_new_from_iterator(readable_seqs, vocab_size=30522)
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    tokenizer.save_pretrained('vocabulary/HyGpt-token-compress')
    tokenizer.push_to_hub('HyGpt-tokenizer-compress')
    return tokenizer
#tokenizer = train_tokenizer()

In [14]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("vocabulary/HyGpt-token-compress")

In [15]:
seqs_str = [' '.join(a) for a in seqs_compressed_op]

ds_dict = Dataset_hf.from_dict({
  'content': seqs_str,
})

context_length = 1024

train_dataset = ds_dict.map(lambda w: tokenizer(w['content'], 
                                                      truncation=True,
                                                      max_length=context_length,
                                                      #return_overflowing_tokens=True,
                                                      return_length=True),
                                batched=True)

input_batch = []
for length, input_ids in zip(train_dataset["length"], train_dataset["input_ids"]):
    if length == context_length:
        input_batch.append(input_ids)
ds_train = Dataset_hf.from_dict({"input_ids": input_batch})
#train_dataset.set_format(type='torch', columns=['input_ids',
#                                                'label',
#                                                'attention_mask'])

100%|██████████| 1/1 [00:01<00:00,  1.56s/ba]


In [16]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    #pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

In [17]:
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 85.9M parameters


In [34]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

config = AutoConfig.from_pretrained(
    "hygpt-generator-2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    #pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
model = GPT2LMHeadModel(config)

model.push_to_hub('hygpt2-cml-gen')

Cloning https://huggingface.co/josetapia/hygpt2-cml-gen into local empty directory.
remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/josetapia/hygpt2-cml-gen
   1395130..5765eea  main -> main

Upload file pytorch_model.bin: 100%|██████████| 340M/340M [10:02<00:00, 591kB/s]


'https://huggingface.co/josetapia/hygpt2-cml-gen/commit/5765eea97e9df01395637da32c58a45a78c9b8d9'

In [18]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

Using eos_token, but it is not set yet.


In [19]:
from transformers import Trainer, TrainingArguments


In [20]:

args = TrainingArguments(
    output_dir="hygpt2-cml-2",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=15,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    #fp16=True,
    disable_tqdm=False,
    #push_to_hub=True,
)
"""
# Training arguments
batch_size = 8
epochs = 1
torch.cuda.empty_cache()
args = TrainingArguments(
    output_dir='HyGpt',
    logging_dir='HyGpt',
    evaluation_strategy='epoch',
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    eval_steps=1,
    num_train_epochs=epochs, 
    weight_decay=0.01,
    logging_steps = 1,
    disable_tqdm=False)
    """

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=ds_train,
    eval_dataset=ds_train,
)

In [21]:
ds_train

Dataset({
    features: ['input_ids'],
    num_rows: 880
})

In [22]:
trainer.train()
trainer.save_model('hygpt-generator-2')
#trainer.push_to_hub('hygpt2-cml-2-10')

100%|██████████| 90/90 [20:59:35<00:00, 839.72s/it]    


{'train_runtime': 75575.2102, 'train_samples_per_second': 0.175, 'train_steps_per_second': 0.001, 'train_loss': 2.0761405097113714, 'epoch': 14.87}


In [23]:
trainer.train()

  0%|          | 0/90 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [28]:
trainer.push_to_hub('hygpt2-cml-gen')

OSError: Tried to clone a repository in a non-empty folder that isn't a git repository. If you really want to do this, do it manually:
git init && git remote add origin && git pull origin main
 or clone repo to a new folder and move your existing files there afterwards.

In [29]:
import torch
from transformers import pipeline

#device = torch.device("cpu")
pipe = pipeline(
    "text-generation", model="hygpt-generator-2", max_length=1024#, device='cpu'
)

In [None]:
seqs_compressed_op[0][:15]

In [31]:
' '.join(seqs_compressed_op[0][:30])

'GC,r;l;0.4,g_____________ GS,1.0;0.02,p____________ GC,t;b;0.4,g_____________ GC,t;b;0.4,d_____________ GC,t;b;0.4,g_____________ GC,t;two;0.4,m___________ GC,t;b;0.4,d_____________ RS,0.01;u,p______________ RW,0.75;1.0;u,d__________ PS,1.0;2.54;2.56;c;g,g___ RS,0.01;levy,g___________ GC,t;u;0.4,g_____________ GC,t;u;0.4,g_____________ GC,r;u;0.4,m_____________ GC,t;s;0.4,m_____________ GC,r;b;0.4,p_____________ GC,r;two;0.4,p___________ SD,0.9;22.5;0.1,p________ GC,cost;u;0.4,d__________ GC,t;two;0.4,p___________ DM,rtbc;1;1.0,p__________ GC,t;b;0.4,g_____________ SD,0.9;22.5;0.1,p________ DM,best;1;1.0,d__________ DM,rtb;1;1.0,g___________ GC,r;l;0.4,m_____________ GC,r;l;0.4,m_____________ GC,cost;b;0.4,p__________ RW,0.75;1.0;u,m__________ DM,best;1;1.0,p__________'

In [32]:
txt = 'GC,r;l;0.4,g_____________ GS,1.0;0.02,p____________ GC,t;b;0.4,g_____________ GC,t;b;0.4,d_____________ GC,t;b;0.4,g_____________ GC,t;two;0.4,m___________ GC,t;b;0.4,d_____________ RS,0.01;u,p______________ RW,0.75;1.0;u,d__________ PS,1.0;2.54;2.56;c;g,g___ RS,0.01;levy,g___________ GC,t;u;0.4,g_____________ GC,t;u;0.4,g_____________ GC,r;u;0.4,m_____________ GC,t;s;0.4,m_____________ GC,r;b;0.4,p_____________ GC,r;two;0.4,p___________ SD,0.9;22.5;0.1,p________ GC,cost;u;0.4,d__________ GC,t;two;0.4,p___________ DM,rtbc;1;1.0,p__________ GC,t;b;0.4,g_____________ SD,0.9;22.5;0.1,p________ DM,best;1;1.0,d__________ DM,rtb;1;1.0,g___________ GC,r;l;0.4,m_____________ GC,r;l;0.4,m_____________ GC,cost;b;0.4,p__________ RW,0.75;1.0;u,m__________ DM,best;1;1.0,p__________'
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

GC,r;l;0.4,g_____________ GS,1.0;0.02,p____________ GC,t;b;0.4,g_____________ GC,t;b;0.4,d_____________ GC,t;b;0.4,g_____________ GC,t;two;0.4,m___________ GC,t;b;0.4,d_____________ RS,0.01;u,p______________ RW,0.75;1.0;u,d__________ PS,1.0;2.54;2.56;c;g,g___ RS,0.01;levy,g___________ GC,t;u;0.4,g_____________ GC,t;u;0.4,g_____________ GC,r;u;0.4,m_____________ GC,t;s;0.4,m_____________ GC,r;b;0.4,p_____________ GC,r;two;0.4,p___________ SD,0.9;22.5;0.1,p________ GC,cost;u;0.4,d__________ GC,t;two;0.4,p___________ DM,rtbc;1;1.0,p__________ GC,t;b;0.4,g_____________ SD,0.9;22.5;0.1,p________ DM,best;1;1.0,d__________ DM,rtb;1;1.0,g___________ GC,r;l;0.4,m_____________ GC,r;l;0.4,m_____________ GC,cost;b;0.4,p__________ RW,0.75;1.0;u,m__________ DM,best;1;1.0,p__________ GC,1.0.01;two;1.56;u;0,d______________ GC,t;1.75;1,p__________ DM,p_____________ DM,1.4,cost;0.4,g___ PS,g,1.56;r;0;g________ DM,t;0;l;1;u,t;0.4,1.5;2.0.4,p________ GC,0.0;2.5;1.4,0,m_____________ GC,p___________ GC,1.54

In [None]:
txt = 'GC,r;l;0.4,g'
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

In [None]:
from encode_operators import get_operator_param_names
get_operator_param_names()