In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from random import choices 

import operators as Operators
from metaheuristic import Metaheuristic
import benchmark_func as bf
from hyperheuristic import Hyperheuristic, _save_step
from neural_network import ModelPredictorTransformerOriginal, DatasetSequences
from encode_operators import compress_operator, decompress_operator

import torch
from transformers import PreTrainedTokenizerFast, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import load_metric as load_metric_hf 
from datasets import Dataset as Dataset_hf

from timeit import default_timer as timer

# Prepare sequences

In [2]:
# Not used in this cell; the name is reassigned in a later cell.
limit_seq = 100

# Collect every stored sequence line and every score line from the ten
# numbered vocabulary files (1..10), keeping the two lists in file order.
seqs, costs = [], []
for counting in range(1, 11):
  with open(f'vocabulary/seq_read_{counting}.txt', 'r', encoding='utf-8') as file:
    seqs.extend(file.read().split('\n'))
  with open(f'vocabulary/score_{counting}.txt', 'r', encoding='utf-8') as file:
    costs.extend(file.read().split('\n'))
    

In [2]:
# Read each operator collection and keep it in two forms, keyed by file name:
#   operators_string        -> the raw text of every line
#   encoded_heuristic_space -> the Python object each line evaluates to
collections = ['default.txt', 'basicmetaheuristics.txt']

encoded_heuristic_space = {}
operators_string = {}
for collection_file in collections:
    with open('./collections/' + collection_file, 'r') as operators_file:
        operators_string[collection_file] = [row.rstrip('\n') for row in operators_file]
    # NOTE(review): eval() executes every line of the collection file as Python
    # code; only use collection files from a trusted source.
    encoded_heuristic_space[collection_file] = [
        eval(row) for row in operators_string[collection_file]
    ]

'RS,1.0;u,g'

In [4]:
def parse_sequence(seq):
  """Split `seq` into its top-level parenthesised groups.

  The string is scanned once while tracking parenthesis depth. Each time the
  depth returns to zero on a ')' the text from the current start position up to
  and including that ')' is emitted as one group, and the next group is assumed
  to start three characters later (the ')' itself plus two more characters).

  Returns the list of group substrings, in order of appearance.
  """
  groups = []
  start = 0
  depth = 0
  for pos, char in enumerate(seq):
    if char == '(':
      depth += 1
    elif char == ')':
      depth -= 1
      if depth == 0:
        groups.append(seq[start:pos + 1])
        start = pos + 3
  return groups

def get_ids_operators(operators):
  """Translate operator strings into their positions in the 'default.txt' collection.

  Parameters
  ----------
  operators : iterable of str
      Operator strings, compared for equality against the lines stored in
      operators_string['default.txt'].

  Returns
  -------
  list
      For each operator, the index of its first match in the collection.

  Raises
  ------
  IndexError
      If an operator does not appear in the collection.
  """
  # Build the reference array once; it is the same for every operator.
  known_operators = np.array(operators_string['default.txt'])
  ids = []
  for operator in operators:
    matches = np.where(known_operators == operator)[0]
    if matches.size == 0:
      # Same exception type as the bare [0] lookup, but it names the culprit.
      raise IndexError(f'operator not found in default.txt: {operator!r}')
    ids.append(matches[0])
  return ids

def generate_seqs():
  """Parse the loaded sequence and score lines.

  Reads the notebook-level lists `seqs` and `costs`.

  Returns a 3-tuple:
    * one list of operator strings per sequence (from parse_sequence),
    * one list of operator ids per sequence (from get_ids_operators),
    * the evaluated value of every line in `costs`.
  """
  seqs_operators = [parse_sequence(seq) for seq in seqs]
  seqs_ids = [get_ids_operators(operators) for operators in seqs_operators]
  # NOTE(review): eval() runs each score line as Python code; the score files
  # must come from a trusted source.
  fitnesses = [eval(cost) for cost in costs]
  return seqs_operators, seqs_ids, fitnesses

      
# Parse the loaded files into operator strings, operator ids and evaluated scores.
seqs_operators, seqs_ids, fitnesses = generate_seqs()

In [5]:
# Dataset built from operator ids.
_, seqs_ids, fitnesses = generate_seqs()
ds = DatasetSequences(seqs_ids, fitnesses, fitness_to_weight='rank')
# Dataset built from the compressed form of every operator.
seqs_operators, _, fitnesses = generate_seqs()
# NOTE(review): eval() executes each operator string as Python code.
seqs_compressed_op = [[compress_operator(eval(operator)) for operator in seq] for seq in seqs_operators]
ds2 = DatasetSequences(seqs_compressed_op, fitnesses, fitness_to_weight='rank')
Xid, yid, fit = ds.obtain_dataset()
# `fit` is rebound here, so the value used below is the one returned by ds2.
Xop, yop, fit = ds2.obtain_dataset()

# Combine ds2's third output and inputs with ds's targets, then sort the tuples
# in descending order (ties on the first field fall through to the next fields).
A = list(zip(fit, Xop, yid))
A.sort(reverse=True)


Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



In [None]:
# Compress every operator of every sequence (each operator string is eval()-ed first).
# NOTE(review): the same assignment also appears in the dataset-building cell.
seqs_compressed_op = [[compress_operator(eval(operator)) for operator in seq] for seq in seqs_operators]


In [6]:

# Fraction of A to keep.
percentage = 0.1
# First int(len(A) * percentage) entries of A (A is sorted in descending order where it is built).
B = A[:int(len(A) * percentage)]

In [7]:
# Unpack the selected tuples into three parallel lists: the input rendered as
# space-separated text, the target value, and the first tuple field.
readable_seqs, readable_next, readable_fitness = [], [], []
for fi, xop, y_op in B:
  readable_seqs.append(' '.join(map(str, xop)))
  readable_next.append(y_op)
  readable_fitness.append(fi)

# Write the texts and their targets in chunks of `limit_seq` lines; files are
# numbered from 1 and the text-file path of every chunk is recorded in `paths`.
limit_seq = 2000
paths = []
counting = 0
for counting, i in enumerate(range(0, len(readable_seqs), limit_seq), start=1):
  with open(f'vocabulary/seq_read_ops_compressed_{counting}.txt', 'w', encoding='utf-8') as file:
    file.write('\n'.join(readable_seqs[i:i+limit_seq]))
  with open(f'vocabulary/classification_ids_{counting}.txt', 'w', encoding='utf-8') as file:
    file.write('\n'.join(map(str, readable_next[i:i+limit_seq])))
  paths.append(f'vocabulary/seq_read_ops_compressed_{counting}.txt')

# Tokenizer

In [None]:
from huggingface_hub import notebook_login
# SECURITY: a Hugging Face access token used to be pasted in a comment here.
# It has been removed; treat that token as compromised and revoke it.
# Type the token at the interactive prompt instead of storing it in the notebook.
notebook_login()

In [None]:
from transformers import GPT2Tokenizer, TFGPT2Model


In [None]:
def train_tokenizer():
    """Train a new tokenizer on `readable_seqs`, starting from the "gpt2" one.

    The new tokenizer is trained with vocab_size=30522, receives '[PAD]' as its
    padding token, is saved to 'vocabulary/HyBert-token-compress' and is pushed
    to the Hub as 'HyBert-tokenizer-compress'.

    Returns the trained tokenizer.
    """
    base_tokenizer = PreTrainedTokenizerFast.from_pretrained("gpt2", do_lower_case=True)
    new_tokenizer = base_tokenizer.train_new_from_iterator(readable_seqs, vocab_size=30522)
    new_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    new_tokenizer.save_pretrained('vocabulary/HyBert-token-compress')
    new_tokenizer.push_to_hub('HyBert-tokenizer-compress')
    return new_tokenizer


tokenizer = train_tokenizer()

In [None]:
# Largest number of input_ids the tokenizer produces for any entry of readable_seqs.
max(len(tokenizer(seq)['input_ids']) for seq in readable_seqs)

In [None]:
# Smallest number of input_ids the tokenizer produces for any entry of readable_seqs.
min(len(tokenizer(seq)['input_ids']) for seq in readable_seqs)

# Model

In [3]:
# Settings handed to ModelPredictorTransformerOriginal; the evaluation cell
# later overrides 'pretrained_tokenizer', 'pretrained_model' and 'file_label'.
params = dict(
    file_label='HyGpt-compress-tests',
    num_steps=100,
    num_operators=205,
    load_model=True,
    save_model=False,
    encoder='identity',
    model_architecture='transformer_orig',
    # Alternative noted in the notebook: "josetapia/HyBert-tokenizer-compress"
    pretrained_tokenizer='vocabulary/HyGpt-tokenizer-compress',
    pretrained_model='gpt2',
    epochs=3,
    fitness_to_weight='rank',
    sample_params=dict(
        retrieve_sequences=False,
        limit_seqs=400,
        filter='first_quartile',
        store_sequences=False,
    ),
)

In [4]:
# Instantiate the predictor from the params dictionary.
model = ModelPredictorTransformerOriginal(params)


In [5]:
from transformers import AutoTokenizer, GPT2Config, GPT2Tokenizer, GPT2ForSequenceClassification
# Default GPT-2 configuration object.
configuration = GPT2Config()
# set up your tokenizer, just like you described, and set the pad token
# instantiate the model

# Benchmark problem: bf.Sphere built with argument 10 (the file labels call it "10D").
problem = bf.Sphere(10)

# Accumulators that receive one entry per evaluated model case.
sequences_good, fitness_good = [], []

# For every model case: load the fine-tuned classifier and tokenizer, then run
# `num_replicas` independent searches in which the classifier's prediction
# weights the choice of the next search operator. Sequences, fitness traces
# and per-replica wall-clock times are recorded.
for case in ['']:#, '-2', '-3', '-4', '-5', '-6', '-7', '-8', '-9']:
    params["pretrained_tokenizer"] = "josetapia/HyGpt-tokenizer-compress"
    params["pretrained_model"] = f"josetapia/hygpt-compress-class{case}"#f"./hybertheuristic{case}/checkpoint-500/",
    model = ModelPredictorTransformerOriginal(params)

    # from_pretrained is a classmethod: call it on the class instead of first
    # building a throw-away, randomly initialised model from `configuration`.
    model_2 = GPT2ForSequenceClassification.from_pretrained(f"josetapia/hygpt-compress-class{case}", num_labels=205).to('cpu')
    # set the pad token of the model's configuration
    model_2.config.pad_token_id = model_2.config.eos_token_id

    GPT2_tokenizer = AutoTokenizer.from_pretrained("josetapia/HyGpt-tokenizer-compress")
    #GPT2_tokenizer.pad_token = GPT2_tokenizer.eos_token
    GPT2_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # Replace the wrapper's tokenizer and model with the ones loaded above.
    model._tokenizer = GPT2_tokenizer
    #model._tokenizer.pad_token_id = model_2.config.eos_token_id
    #model._tokenizer.pad_token = GPT2_tokenizer.eos_token
    #tokenizer.pad_token = tokenizer.eos_token
    model._model = model_2

    training_args = TrainingArguments(output_dir=f'HyGpt-compress{case}', disable_tqdm=True)
    metric = load_metric_hf("accuracy")
    def compute_metrics(eval_pred):
        """Accuracy of the arg-max prediction against the reference labels."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)
    # Inference-only trainer: no datasets are attached, only predict() is used.
    model._trainer = Trainer(
        model=model._model,
        args=training_args,
        tokenizer=model._tokenizer,
        compute_metrics=compute_metrics
    )
    model._predict = model._trainer.predict

    num_replicas = 100

    file_label = "Sphere-10D-HyGpt-compress-tests{}".format(case)
    params['file_label'] = file_label
    hh = Hyperheuristic(problem=problem.get_formatted_problem(),
                        heuristic_space='default.txt',  # 'default.txt',  #short_collection  automatic medium_collection
                        file_label=file_label)

    hh.parameters['num_agents'] = 30
    hh.parameters['num_steps'] = 100
    hh.parameters['stagnation_percentage'] = 0.5
    hh.parameters['verbose'] = True
    sequence_per_repetition = list()
    fitness_per_repetition = list()

    logs_time = []
    for rep in range(num_replicas):
        # Metaheuristic
        start_time = timer()
        mh = Metaheuristic(hh.problem, num_agents=hh.parameters['num_agents'], num_iterations=hh.num_iterations)

        # Initialiser
        mh.apply_initialiser()

        # Extract the population and fitness values, and their best values
        current_fitness = np.copy(mh.pop.global_best_fitness)
        current_position = np.copy(mh.pop.rescale_back(mh.pop.global_best_position))

        # Heuristic sets
        hh.current_space = np.arange(hh.num_operators)

        # Initialise additional variables
        candidate_enc_so = list()
        current_sequence = []

        best_fitness = [current_fitness]
        best_position = [current_position]

        step = 0
        stag_counter = 0
        exclude_indices = []
        # Scale weights so they sum to one; fall back to a uniform distribution
        # when every weight is zero.
        normalize_weights = lambda w: w / sum(w) if sum(w) > 0 else np.ones(hh.num_operators) / hh.num_operators

        # Finalisator
        while not hh._check_finalisation(step, stag_counter):
            # Use the trained model to predict operators weights.
            # Predictions are refreshed only right after an improvement
            # (stag_counter == 0); otherwise the previous weights are reused.
            if stag_counter == 0:
                #print(model._tokenizer.pad_token)
                if len(current_sequence) == 0:
                    # Empty sequence: start from equal weights for all operators.
                    operator_prediction = np.array(np.ones(hh.num_operators))
                else:
                    compress_seq = [compress_operator(encoded_heuristic_space['default.txt'][op]) for op in current_sequence]
                    operator_prediction = model.predict(compress_seq)
                operators_weights = normalize_weights(operator_prediction)

            # Select a simple heuristic and apply it
            candidate_enc_so = hh._obtain_candidate_solution(sol=1, operators_weights=operators_weights)
            candidate_search_operator = hh.get_operators([candidate_enc_so[-1]])
            perturbators, selectors = Operators.process_operators(candidate_search_operator)

            mh.apply_search_operator(perturbators[0], selectors[0])

            # Extract population and fitness values
            current_fitness = np.copy(mh.pop.global_best_fitness)
            current_position = np.copy(mh.pop.rescale_back(mh.pop.global_best_position))

            # Print update
            if hh.parameters['verbose']:
                print(
                    '{} :: Transformer, Rep: {:3d}, Step: {:3d}, Trial: {:3d}, SO: {:30s}, currPerf: {:.2e}, candPerf: {:.2e}, '
                    'csl: {:3d}'.format(
                        hh.file_label, rep + 1, step + 1, stag_counter,
                        candidate_search_operator[0][0] + ' & ' + candidate_search_operator[0][2][:4],
                        best_fitness[-1], current_fitness, len(hh.current_space)), end=' ')

            # Accept the candidate only when it is strictly better than the current best
            if current_fitness < best_fitness[-1]:
                # Update the current sequence and its characteristics
                current_sequence.append(candidate_enc_so[-1])

                best_fitness.append(current_fitness)
                best_position.append(current_position)

                # Update counters
                step += 1
                stag_counter = 0
                # Reset tabu list
                exclude_indices = []

                # Add improvement mark
                if hh.parameters['verbose']:
                    print('+', end='')

            else:  # Then try another search operator
                # Revert the modification to the population in the mh object
                mh.pop.revert_positions()

                # Update stagnation
                stag_counter += 1
                if stag_counter % 5 == 0:
                    # Include last search operator's index to the tabu list
                    # and zero its weight so it is no longer drawn.
                    exclude_indices.append(candidate_enc_so[-1])
                    operator_prediction[exclude_indices[-1]] = 0
                    operators_weights = normalize_weights(operator_prediction)

            # Add ending mark
            if hh.parameters['verbose']:
                print('')

        # Print the best one
        # NOTE(review): this prints the fitness/position read after the last
        # trial, not best_fitness[-1] / best_position[-1] — confirm intended.
        if hh.parameters['verbose']:
            print('\nBest fitness: {},\nBest position: {}'.format(current_fitness, current_position))

        # Update the repetition register
        sequence_per_repetition.append(np.double(current_sequence).astype(int).tolist())
        fitness_per_repetition.append(np.double(best_fitness).tolist())

        # Save this historical register
        _save_step(rep,
                    dict(encoded_solution=np.array(current_sequence),
                        best_fitness=np.double(best_fitness),
                        best_positions=np.double(best_position),
                        details=dict(
                            fitness_per_rep=fitness_per_repetition,
                            sequence_per_rep=sequence_per_repetition,
                        )),
                    hh.file_label)

        logs_time.append(timer() - start_time)

    # Store the wall-clock time of every replica; make sure the target folder
    # exists so the timings are not lost after the whole run has finished.
    os.makedirs('./data_files/ml_models', exist_ok=True)
    df_times = pd.DataFrame({"time": logs_time})
    df_times.to_csv(f'./data_files/ml_models/{hh.file_label}_mhs_time_prediction_logs.csv')

    fitness_good.append(fitness_per_repetition)
    sequences_good.append(sequence_per_repetition)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Sphere-10D-HyGpt-compress-tests :: Transformer, Rep:   1, Step:   1, Trial:   0, SO: genetic_crossover & prob      , currPerf: 1.95e+04, candPerf: 1.95e+04, csl: 205 
Sphere-10D-HyGpt-compress-tests :: Transformer, Rep:   1, Step:   1, Trial:   1, SO: differential_mutation & prob  , currPerf: 1.95e+04, candPerf: 1.95e+04, csl: 205 
Sphere-10D-HyGpt-compress-tests :: Transformer, Rep:   1, Step:   1, Trial:   2, SO: genetic_crossover & gree      , currPerf: 1.95e+04, candPerf: 7.91e+03, csl: 205 +
Sphere-10D-HyGpt-compress-tests :: Transformer, Rep:   1, Step:   2, Trial:   0, SO: genetic_crossover & metr      , currPerf: 7.91e+03, candPerf: 1.77e+03, csl: 205 +
Sphere-10D-HyGpt-compress-tests :: Transformer, Rep:   1, Step:   3, Trial:   0, SO: genetic_crossover & gree      , currPerf: 1.77e+03, candPerf: 1.53e+03, csl: 205 +
Sphere-10D-HyGpt-compress-tests :: Transformer, Rep:   1, Step:   4, Trial:   0, SO: genetic_crossover & prob      , currPerf: 1.53e+03, candPerf: 1.48e+03, csl: 

In [None]:
# Show the tokenizer's padding token and end-of-sequence token.
GPT2_tokenizer.pad_token, GPT2_tokenizer.eos_token

In [None]:
# Tokenise a one-item batch holding the string '1' and display the result.
elem = model._tokenizer(['1'])
elem

In [None]:
# Hand the tokenised sample, wrapped in a list, to the trainer's predict().
# NOTE(review): the predict() helper in this notebook passes a datasets.Dataset instead; confirm a plain list is accepted.
model._trainer.predict([elem])

In [None]:
# Query the predictor with the single-element sequence [1].
model.predict([1])

In [None]:
# Write the sequences and fitness traces of every evaluated model case to
# HyGpt/seq_good_<idx>.txt and HyGpt/score_good_<idx>.txt (one item per line).
#
# The loop variables are deliberately not named `seqs`: that global holds the
# raw sequence lines read by generate_seqs(), and rebinding it here would break
# every later generate_seqs() call.
os.makedirs('HyGpt', exist_ok=True)  # open(..., 'w') does not create folders
paths = []
for idx, (good_seqs, good_fits) in enumerate(zip(sequences_good, fitness_good)):
  with open(f'HyGpt/seq_good_{idx}.txt', 'w', encoding='utf-8') as file:
    file.write('\n'.join([str(x) for x in good_seqs]))
  with open(f'HyGpt/score_good_{idx}.txt', 'w', encoding='utf-8') as file:
    file.write('\n'.join([str(x) for x in good_fits]))
  paths.append(f'HyGpt/seq_good_{idx}.txt')

In [None]:
# Number of entries in `fitnesses`.
len(fitnesses)

In [None]:

# Path prefix and file format used when saving figures.
figures_folder = 'HyGpt/'
saving_format = 'png'
def prove_comparison_fitness(sort_by_fitness=False, subset_to_use=(0, 1, 2, 3)):
    """Plot and save the fitness-per-step curves of the selected result sets.

    Result sets are addressed by index:
      0 -> ``fitnesses[:100]``  (legend 'NNHH', colormap 'Reds')
      1 -> ``fitness_good[0]``  (legend 'HyBert - 1 epoch', 'Blues')
      2 -> ``fitness_good[1]``  (legend 'HyBert - 2 epochs', 'Greens')
      3 -> ``fitness_good[2]``  (legend 'HyBert - 3 epochs', 'Purples')

    Within a set the runs are ordered by final fitness, largest first; colours
    follow that order and only the last run carries the legend label.

    Parameters
    ----------
    sort_by_fitness : bool
        When True, the runs of all selected sets are merged and drawn in
        descending order of final fitness; otherwise set by set.
    subset_to_use : iterable of int
        Indices of the result sets to draw. Only these sets are read, so a set
        that does not exist is a problem only if it is requested.

    Raises
    ------
    IndexError
        If a requested index refers to an entry missing from ``fitness_good``.

    The figure is saved under ``figures_folder`` in ``saving_format`` and shown.
    """
    # (index, data accessor, colormap name, legend label). Accessors are lazy
    # so that sets outside `subset_to_use` are never touched.
    series_specs = [
        (0, lambda: fitnesses[:100], 'Reds', 'NNHH'),
        (1, lambda: fitness_good[0], 'Blues', 'HyBert - 1 epoch'),
        (2, lambda: fitness_good[1], 'Greens', 'HyBert - 2 epochs'),
        (3, lambda: fitness_good[2], 'Purples', 'HyBert - 3 epochs'),
    ]

    all_xc = []
    for series_id, get_runs, cmap_name, legend in series_specs:
        if series_id not in subset_to_use:
            continue
        # sorted() returns a copy: the shared result lists keep their original
        # run order (an in-place sort would reorder them for every other cell).
        runs = sorted(get_runs(), key=lambda w: w[-1], reverse=True)
        if not runs:
            continue
        colours = plt.get_cmap(cmap_name)(np.linspace(0, 1, len(runs)))
        labels = [None] * len(runs)
        labels[-1] = legend
        # Plain tuples instead of a NumPy array: the runs may have different
        # lengths, which cannot be stored in a regular array.
        all_xc.extend(zip(runs, colours, labels))

    if sort_by_fitness:
        all_xc.sort(key=lambda w: w[0][-1], reverse=True)

    fi1 = plt.figure(figsize=(8, 3.5))
    for x, c, l in all_xc:
        plt.plot(x, '-o', color=c, alpha=0.5, label=l)

    plt.legend(loc='best', prop={'size': 15})
    title = 'Sphere-10D'
    plt.title(title)
    plt.xlabel('Step')
    plt.ylabel('Fitness')
    fi1.subplots_adjust()
    fi1.tight_layout()
    plt.savefig(figures_folder + f'comparison_options{"_".join(str(x) for x in subset_to_use)}_{title}.' + saving_format,
                format=saving_format, bbox_inches='tight', dpi=333, transparent=True)

    plt.show()

In [None]:
# Result set 0 (fitnesses[:100]) against set 1 (fitness_good[0]), merged and sorted by final fitness.
prove_comparison_fitness(sort_by_fitness=True, subset_to_use=[0, 1])

In [None]:
# Result set 0 (fitnesses[:100]) against set 2 (fitness_good[1]), merged and sorted by final fitness.
prove_comparison_fitness(sort_by_fitness=True, subset_to_use=[0, 2])

In [None]:
# Result set 0 (fitnesses[:100]) against set 3 (fitness_good[2]), merged and sorted by final fitness.
prove_comparison_fitness(sort_by_fitness=True, subset_to_use=[0, 3])

In [None]:
# Result sets 0, 1 and 2 together, merged and sorted by final fitness.
prove_comparison_fitness(sort_by_fitness=True, subset_to_use=[0, 1, 2])

In [None]:
# All four result sets together, merged and sorted by final fitness.
prove_comparison_fitness(sort_by_fitness=True, subset_to_use=[0, 1, 2, 3])

In [7]:
import scipy.stats as st

def compute_performance():
  """Score every result set as median + interquartile range of its final fitness values.

  The result sets are the entries of `fitness_good` followed by `fitnesses[:100]`.
  Returns one score per result set, in that order.
  """
  def score(runs):
    finals = [run[-1] for run in runs]
    return np.median(finals) + st.iqr(finals)

  return [score(runs) for runs in fitness_good + [fitnesses[:100]]]

def get_statistics(arr):
  """Print summary statistics of `arr`.

  The first line is "mean +- std" with four decimals; the following lines show
  the mean, median, maximum and minimum. Nothing is returned.
  """
  print(f'{np.mean(arr):.4f} +- {np.std(arr):.4f}')
  reducers = (
    ("Mean:", np.mean),
    ("Median:", np.median),
    ("Max:", np.max),
    ("Min:", np.min),
  )
  for caption, reducer in reducers:
    print(caption, reducer(arr))

In [8]:
# Median + IQR of the final fitness, one value per result set.
compute_performance()

[4.891829972022509]

In [9]:
# Statistics of the final fitness of every run in fitness_good[0].
get_statistics([x[-1] for x in fitness_good[0]])

23.1867 +- 90.7959
Mean: 23.186675879641626
Median: 0.49176689846030575
Max: 761.236236343018
Min: 1.0342229937997366e-05


In [None]:
# Statistics of the final fitness of every run in fitness_good[1].
get_statistics([x[-1] for x in fitness_good[1]])

In [None]:
# Statistics of the final fitness of every run in fitness_good[2].
get_statistics([x[-1] for x in fitness_good[2]])

In [None]:
# Statistics of the final fitness of the first 100 entries of `fitnesses`.
get_statistics([x[-1] for x in fitnesses[:100]])

In [None]:
# Re-parse the stored files; `fitnesses` is used below as the last box.
seqs_operators, seqs_ids, fitnesses = generate_seqs()
fig = plt.figure(figsize =(7, 3))

# Creating axes instance
# The rectangle [0, 0, 1, 1] is passed to add_axes as the axes position.
ax = fig.add_axes([0, 0, 1, 1])
 
def last_fitness(seqs):
  """Return the last element of every item in `seqs`, in order."""
  finals = []
  for run in seqs:
    finals.append(run[-1])
  return finals
 
# One list of final fitness values per result set; fitnesses[:100] comes last.
data = [last_fitness(seqs) for seqs in fitness_good] + [last_fitness(fitnesses[:100])]
# Creating plot
bp = ax.boxplot(data, showmeans=True)
# Per-box mean, median and standard deviation (med1 is only used by the commented-out label).
m1 = [np.mean(x) for x in data]
med1 = [np.median(x) for x in data]
st1 = [np.std(x) for x in data]

# Annotate every box with its mean and standard deviation, positioned
# relative to the second point of that box's median line.
for i, line in enumerate(bp['medians']):
  x, y = line.get_xydata()[1]
  
  text = '\n'.join((
  r'$\mu=%.1f$' % (m1[i], ),
  #r'$\mathrm{median}=%.2f$' % (med1[i], ),
  r'$\sigma=%.1f$' % (st1[i], )))
  #print(text)
  print(x, y)
  # NOTE(review): the +0.03 / -2.95 offsets look hand-tuned for this data range — confirm before reuse.
  ax.annotate(text, xy=(x+0.03, y-2.95))
#ax.set_xticklabels(['HyGpt - 1 epoch', 'HyGpt - 2 epoch', 'HyGpt - 3 epoch', 'HyGpt - 4 epoch', 'NNHH'])
fig.subplots_adjust()
fig.tight_layout()
plt.ylabel('Fitness')
# show plot
# Save the figure under figures_folder before displaying it.
figures_folder = 'HyGpt/'
saving_format = 'png'
fig.savefig(figures_folder + 'bmhs_vs_strategies.' + saving_format, 
                    format=saving_format, bbox_inches="tight", dpi=333, transparent=True)
plt.show()

In [None]:
# Print 1/205 next to each listed value multiplied by 205.
# NOTE(review): 205 matches 'num_operators' in params; the meaning of the listed values is not shown here.
for lr in [0.022901570846776326, 0.022901570846776326, 0.026868355635478923]:
  print(1/205, lr*205)

In [None]:
# Probe predict() with a long repeated sequence, an empty one and a short one.
# NOTE(review): this rebinds A and B, which earlier cells use for the sorted
# dataset and its top fraction; re-running those cells afterwards needs A/B rebuilt.
A = model.predict([100]*80)
B = model.predict([])
C = model.predict([3, 5])

In [None]:
def predict(seq):
  """Return the softmax of the model's first prediction row for `seq`.

  The items of `seq` are rendered as text joined by ', ', tokenised with
  `model._tokenizer`, wrapped in a dataset and passed to `model._predict`.
  The tokenizer output and the dataset are printed on the way.
  """
  seq_str = ', '.join(map(str, seq))
  token = model._tokenizer([seq_str])
  print(token)
  sequence_dataset = Dataset_hf.from_dict(token)
  print(sequence_dataset)
  prediction = model._predict(sequence_dataset).predictions[0]

  exp_scores = np.exp(prediction)
  return exp_scores / sum(exp_scores)


# Probabilities returned by the local predict() helper for the sequence [0].
# NOTE(review): rebinds the name A.
A = predict([0])
A

In [None]:
# Query the predictor with the single-element sequence [1].
model.predict([1])

In [None]:

# Dataset over the raw operator strings (not the compressed form).
seqs_operators, _, fitnesses = generate_seqs()
ds2 = DatasetSequences(seqs_operators, fitnesses, fitness_to_weight='rank')
Xop, yop, _ = ds2.obtain_dataset()

# Dataset over operator ids; 205 is passed as the third positional argument.
# NOTE(review): presumably the number of operators — confirm against DatasetSequences.
_, seqs_ids, fitnesses = generate_seqs()
ds = DatasetSequences(seqs_ids, fitnesses, 205, fitness_to_weight='rank')
Xid, yid, _ = ds.obtain_dataset()


In [10]:

# Classification dataset: the text of each sequence with its target as the label.
ds_dict = Dataset_hf.from_dict({
  'text': readable_seqs,
  'label': readable_next
})

# Tokenise the 'text' column in batches, truncating to at most 512 tokens.
train_dataset = ds_dict.map(lambda w: model._tokenizer(w['text'], truncation=True, max_length=512),
                                batched=True)
# Expose only the listed columns, formatted as torch tensors.
train_dataset.set_format(type='torch', columns=['input_ids',
                                                'label',
                                                'attention_mask'])

100%|██████████| 10/10 [00:10<00:00,  1.08s/ba]


In [None]:
from transformers import AutoTokenizer

In [None]:
# Load the tokenizer published as "josetapia/HyBert-tokenizer-compress".
tokenizer = AutoTokenizer.from_pretrained("josetapia/HyBert-tokenizer-compress")

In [None]:
# Tokenise the 'content' column, truncated to 512 tokens, also returning lengths.
# NOTE(review): a 'content' column is only created by the ds_dict defined in a later cell; run that cell first. Also rebinds A.
A = tokenizer(ds_dict['content'], truncation=True, max_length=512, return_length=True)

In [None]:
# Longest tokenised sequence in A.
max(len(b) for b in A["input_ids"])

In [None]:
# Tokenise the 'content' column in batches, truncated to context_length tokens.
# NOTE(review): context_length and the 'content' column are defined in a later cell, which also repeats this statement.
train_dataset = ds_dict.map(lambda w: tokenizer(w['content'], 
                                                      truncation=True,
                                                      max_length=context_length,
                                                      #return_overflowing_tokens=True,
                                                      return_length=True),
                                batched=True)

In [None]:
# One space-joined string per compressed operator sequence.
seqs_str = [' '.join(a) for a in seqs_compressed_op]

ds_dict = Dataset_hf.from_dict({'content': seqs_str})

context_length = 512

# Tokenise in batches, truncating to context_length and recording each length.
train_dataset = ds_dict.map(
    lambda batch: tokenizer(batch['content'],
                            truncation=True,
                            max_length=context_length,
                            return_length=True),
    batched=True)

# Keep only the examples whose tokenised length equals context_length.
# NOTE(review): with overflowing tokens disabled this drops every sequence
# shorter than context_length — confirm that is intended.
input_batch = [
    input_ids
    for length, input_ids in zip(train_dataset["length"], train_dataset["input_ids"])
    if length == context_length
]
ds_train = Dataset_hf.from_dict({"input_ids": input_batch})

In [None]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

# Start from the "gpt2" configuration and override the vocabulary size,
# n_ctx and the BOS/EOS token ids with values taken from `tokenizer`.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

In [None]:
# Build a GPT-2 language model from `config` (no pretrained weights are loaded here).
# NOTE(review): this rebinds `model`; other cells in this notebook use
# model._model / model._tokenizer and need `model` to be the predictor object.
model = GPT2LMHeadModel(config)
# Total number of parameter elements, reported in millions.
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

In [None]:
from transformers import DataCollatorForLanguageModeling

# The padding token is set twice: first to the EOS token, then to '[PAD]';
# the second assignment is the one in effect afterwards.
# NOTE(review): if '[PAD]' is new to the vocabulary, the model's embedding size may need updating — confirm.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# mlm=False: the collator is configured without masked-language-model masking.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration for the language-model run; push_to_hub=True uploads the result.
args = TrainingArguments(
    output_dir="hygpt2-cml",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    #fp16=True,
    disable_tqdm=False,
    push_to_hub=True,
)
# The string literal below is an unused expression holding an earlier
# configuration; it has no effect at runtime.
"""
# Training arguments
batch_size = 8
epochs = 1
torch.cuda.empty_cache()
args = TrainingArguments(
    output_dir='HyGpt',
    logging_dir='HyGpt',
    evaluation_strategy='epoch',
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    eval_steps=1,
    num_train_epochs=epochs, 
    weight_decay=0.01,
    logging_steps = 1,
    disable_tqdm=False)
    """

# NOTE(review): the evaluation set is the training set itself, so the
# evaluation metrics do not measure generalisation.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=ds_train,
    eval_dataset=ds_train,
)

In [None]:
# Run the training configured on `trainer`.
trainer.train()

In [None]:

def train_model(model, output_dir='HyGpt', batch_size=8, epochs=1,
                learning_rate=5e-5, dataset=None):
    """Fine-tune ``model._model`` and install the trainer's predict function.

    Parameters
    ----------
    model : object
        Must expose ``_model`` and ``_tokenizer``; ``_trainer`` and
        ``_predict`` are (re)assigned by this function.
    output_dir : str
        Used both as the output and the logging directory.
    batch_size : int
        Per-device training batch size.
    epochs : int
        Number of training epochs.
    learning_rate : float
        Learning rate passed to TrainingArguments.
    dataset : datasets.Dataset or None
        Data used for training AND evaluation. When None, the notebook-level
        ``train_dataset`` is used.

    The defaults reproduce the previously hard-coded configuration, so
    ``train_model(model)`` behaves as before.
    """
    if dataset is None:
        # Looked up at call time so the notebook's current dataset is used.
        dataset = train_dataset

    # Prepare 'accuracy' metric
    metric = load_metric_hf("accuracy")

    def compute_metrics(eval_pred):
        """Accuracy of the arg-max prediction against the reference labels."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)

    # Training arguments
    torch.cuda.empty_cache()
    training_args = TrainingArguments(
        output_dir=output_dir,
        logging_dir=output_dir,
        evaluation_strategy='epoch',
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        eval_steps=1,
        num_train_epochs=epochs,
        weight_decay=0.01,
        logging_steps=1,
        disable_tqdm=False)

    data_collator = DataCollatorWithPadding(tokenizer=model._tokenizer, padding=True)
    # NOTE(review): evaluation runs on the training data itself.
    model._trainer = Trainer(
        model=model._model,
        args=training_args,
        train_dataset=dataset,
        eval_dataset=dataset,
        tokenizer=model._tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # Fit model
    model._trainer.train()

    # Save predict function
    model._predict = model._trainer.predict

In [None]:
# Replace the wrapped model with a "gpt2" sequence classifier with 205 labels
# and attach an inference-only trainer (no datasets are passed).
model._model = AutoModelForSequenceClassification.from_pretrained('gpt2', num_labels=205)
training_args = TrainingArguments(output_dir='HyGpt2', disable_tqdm=True)
metric = load_metric_hf("accuracy")
# NOTE(review): compute_metrics is defined with this same body in several cells of this notebook.
def compute_metrics(eval_pred):
    # Accuracy of the arg-max prediction against the reference labels.
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
model._trainer = Trainer(
    model=model._model,
    args=training_args,
    tokenizer=model._tokenizer,
    compute_metrics=compute_metrics
)        
#self._trainer = Trainer(self._model)
model._predict = model._trainer.predict

In [None]:
from transformers import AutoTokenizer, GPT2Config, GPT2ForSequenceClassification
# Default GPT-2 configuration object; kept defined because other cells refer to this name.
configuration = GPT2Config()
# Load the stock "gpt2" tokenizer and use its end-of-sequence token for padding.
# NOTE(review): GPT2Tokenizer is imported in a different cell of this notebook.
GPT2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
GPT2_tokenizer.pad_token = GPT2_tokenizer.eos_token
# from_pretrained is a classmethod: call it on the class instead of first
# building a throw-away, randomly initialised model from `configuration`.
model_2 = GPT2ForSequenceClassification.from_pretrained('josetapia/HyGpt-trainer-4', num_labels=205).to('cpu')
# set the pad token of the model's configuration
model_2.config.pad_token_id = model_2.config.eos_token_id

In [None]:
# Plug the loaded classifier and tokenizer into the predictor, then fine-tune it.
model._model = model_2
model._tokenizer = GPT2_tokenizer
#model._tokenizer = AutoTokenizer.from_pretrained('josetapia/HyGpt-tokenizer-ids')
#model._tokenizer.add_special_tokens({'pad_token': '[PAD]'})
train_model(model)

In [None]:
# Upload the wrapped model to the Hub under the name 'HyGpt-trainer'.
model._model.push_to_hub('HyGpt-trainer')

In [None]:

# Plug the loaded tokenizer and classifier into the predictor and attach an
# inference-only trainer (no datasets are passed).
model._tokenizer = GPT2_tokenizer
#model._tokenizer.pad_token_id = model_2.config.eos_token_id
#model._tokenizer.pad_token = GPT2_tokenizer.eos_token
#tokenizer.pad_token = tokenizer.eos_token
model._model = model_2

training_args = TrainingArguments(output_dir=f'HyGpt-4', disable_tqdm=True)
metric = load_metric_hf("accuracy")
# NOTE(review): compute_metrics is defined with this same body in several cells of this notebook.
def compute_metrics(eval_pred):
    # Accuracy of the arg-max prediction against the reference labels.
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
model._trainer = Trainer(
    model=model._model,
    args=training_args,
    tokenizer=model._tokenizer,
    compute_metrics=compute_metrics
)      
model._predict = model._trainer.predict

# Training arguments
batch_size = 8
epochs = 1
# Train the same model object for one more epoch per iteration (num = 5..9)
# and push the state after each iteration as 'HyGpt-trainer-<num>'.
# NOTE(review): the loop body repeats the logic of train_model().
for num in range(5, 10):
    torch.cuda.empty_cache()
    training_args = TrainingArguments(
        output_dir=f'HyGpt-{num}',
        logging_dir=f'HyGpt-{num}',
        evaluation_strategy='epoch',
        learning_rate=5e-5,
        per_device_train_batch_size=batch_size,
        eval_steps=1,
        num_train_epochs=epochs, 
        weight_decay=0.01,
        logging_steps = 1,
        disable_tqdm=False)

    data_collator = DataCollatorWithPadding(tokenizer=model._tokenizer, padding=True)
    # Evaluation uses the training dataset itself.
    model._trainer = Trainer(
        model=model._model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=train_dataset,
        tokenizer=model._tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # Fit model
    model._trainer.train()

    # Save predict function        
    model._predict = model._trainer.predict
    model._model.push_to_hub(f'HyGpt-trainer-{num}')

In [None]:
# Load the classifier stored as 'hybertheuristic-trainer-3' (205 labels) and
# attach an inference-only trainer (no datasets are passed).
model._model = AutoModelForSequenceClassification.from_pretrained('hybertheuristic-trainer-3', num_labels=205)
training_args = TrainingArguments(output_dir='hybertheuristic-trainer-3', disable_tqdm=True)
metric = load_metric_hf("accuracy")
# NOTE(review): compute_metrics is defined with this same body in several cells of this notebook.
def compute_metrics(eval_pred):
    # Accuracy of the arg-max prediction against the reference labels.
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
model._trainer = Trainer(
    model=model._model,
    args=training_args,
    tokenizer=model._tokenizer,
    compute_metrics=compute_metrics
)        
#self._trainer = Trainer(self._model)
model._predict = model._trainer.predict

In [11]:

# Training arguments
batch_size = 32
epochs = 1
torch.cuda.empty_cache()
training_args = TrainingArguments(
    output_dir='hygpt-compress-class-4',
    logging_dir='hygpt-compress-class-4',
    evaluation_strategy='epoch',
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    eval_steps=1,
    num_train_epochs=epochs, 
    weight_decay=0.01,
    logging_steps = 1,
    disable_tqdm=False)
metric = load_metric_hf("accuracy")
def compute_metrics(eval_pred):
    """Turn ``(logits, labels)`` into the result of the module-level ``metric``.

    Args:
        eval_pred: Iterable of exactly two items: the model logits and the
            reference labels. Class ids are the arg-max over the last axis.

    Returns:
        The value returned by ``metric.compute`` for those class ids and labels.
    """
    raw_scores, gold_labels = eval_pred
    return metric.compute(
        predictions=np.argmax(raw_scores, axis=-1),
        references=gold_labels,
    )
# Collator that pads each batch using the model's tokenizer
data_collator = DataCollatorWithPadding(padding=True, tokenizer=model._tokenizer)

# NOTE(review): eval_dataset is the same object as train_dataset, so the
# evaluation metrics are measured on the training data — confirm this is intended.
model._trainer = Trainer(
    args=training_args,
    model=model._model,
    tokenizer=model._tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
)

# Run fine-tuning, then write the result to disk
model._trainer.train()
model._trainer.save_model('HyGpt/hygpt-compress-class-4')

  0%|          | 1/296 [01:08<5:36:26, 68.43s/it]

{'loss': 5.4185, 'learning_rate': 4.983108108108108e-05, 'epoch': 0.0}


  1%|          | 2/296 [02:09<5:13:18, 63.94s/it]

{'loss': 5.6456, 'learning_rate': 4.9662162162162164e-05, 'epoch': 0.01}


  1%|          | 3/296 [03:11<5:09:21, 63.35s/it]

{'loss': 5.414, 'learning_rate': 4.949324324324325e-05, 'epoch': 0.01}


  1%|▏         | 4/296 [04:14<5:06:11, 62.92s/it]

{'loss': 5.4014, 'learning_rate': 4.9324324324324325e-05, 'epoch': 0.01}


  2%|▏         | 5/296 [05:12<4:57:45, 61.39s/it]

{'loss': 5.3139, 'learning_rate': 4.9155405405405406e-05, 'epoch': 0.02}


  2%|▏         | 6/296 [06:15<4:58:39, 61.79s/it]

{'loss': 5.3113, 'learning_rate': 4.8986486486486486e-05, 'epoch': 0.02}


  2%|▏         | 7/296 [07:16<4:57:01, 61.67s/it]

{'loss': 4.7851, 'learning_rate': 4.881756756756757e-05, 'epoch': 0.02}


  3%|▎         | 8/296 [08:18<4:55:32, 61.57s/it]

{'loss': 5.3405, 'learning_rate': 4.8648648648648654e-05, 'epoch': 0.03}


  3%|▎         | 9/296 [09:20<4:55:22, 61.75s/it]

{'loss': 4.9734, 'learning_rate': 4.8479729729729735e-05, 'epoch': 0.03}


  3%|▎         | 10/296 [10:20<4:52:11, 61.30s/it]

{'loss': 4.9457, 'learning_rate': 4.8310810810810816e-05, 'epoch': 0.03}


  4%|▎         | 11/296 [11:22<4:51:26, 61.36s/it]

{'loss': 4.9551, 'learning_rate': 4.8141891891891896e-05, 'epoch': 0.04}


  4%|▍         | 12/296 [12:21<4:47:25, 60.72s/it]

{'loss': 4.8888, 'learning_rate': 4.797297297297298e-05, 'epoch': 0.04}


  4%|▍         | 13/296 [13:25<4:50:40, 61.63s/it]

{'loss': 4.7811, 'learning_rate': 4.780405405405405e-05, 'epoch': 0.04}


  5%|▍         | 14/296 [14:24<4:47:11, 61.10s/it]

{'loss': 4.6881, 'learning_rate': 4.763513513513514e-05, 'epoch': 0.05}


  5%|▌         | 15/296 [15:27<4:47:28, 61.38s/it]

{'loss': 4.7361, 'learning_rate': 4.746621621621622e-05, 'epoch': 0.05}


  5%|▌         | 16/296 [16:27<4:45:51, 61.25s/it]

{'loss': 5.0232, 'learning_rate': 4.72972972972973e-05, 'epoch': 0.05}


  6%|▌         | 17/296 [17:27<4:42:54, 60.84s/it]

{'loss': 4.9509, 'learning_rate': 4.712837837837838e-05, 'epoch': 0.06}


  6%|▌         | 18/296 [18:28<4:40:59, 60.65s/it]

{'loss': 4.7918, 'learning_rate': 4.695945945945946e-05, 'epoch': 0.06}


  6%|▋         | 19/296 [19:28<4:39:53, 60.63s/it]

{'loss': 4.962, 'learning_rate': 4.679054054054055e-05, 'epoch': 0.06}


  7%|▋         | 20/296 [20:30<4:40:27, 60.97s/it]

{'loss': 4.8482, 'learning_rate': 4.662162162162162e-05, 'epoch': 0.07}


  7%|▋         | 21/296 [21:30<4:38:09, 60.69s/it]

{'loss': 4.607, 'learning_rate': 4.64527027027027e-05, 'epoch': 0.07}


  7%|▋         | 22/296 [22:32<4:39:41, 61.25s/it]

{'loss': 5.3591, 'learning_rate': 4.628378378378378e-05, 'epoch': 0.07}


  8%|▊         | 23/296 [23:33<4:38:18, 61.17s/it]

{'loss': 5.1558, 'learning_rate': 4.6114864864864864e-05, 'epoch': 0.08}


  8%|▊         | 24/296 [24:34<4:35:57, 60.87s/it]

{'loss': 4.7819, 'learning_rate': 4.594594594594595e-05, 'epoch': 0.08}


  8%|▊         | 25/296 [25:34<4:34:31, 60.78s/it]

{'loss': 4.8903, 'learning_rate': 4.577702702702703e-05, 'epoch': 0.08}


  9%|▉         | 26/296 [26:35<4:34:00, 60.89s/it]

{'loss': 4.8344, 'learning_rate': 4.560810810810811e-05, 'epoch': 0.09}


  9%|▉         | 27/296 [27:36<4:32:56, 60.88s/it]

{'loss': 5.0906, 'learning_rate': 4.543918918918919e-05, 'epoch': 0.09}


  9%|▉         | 28/296 [28:37<4:31:25, 60.77s/it]

{'loss': 5.0619, 'learning_rate': 4.5270270270270274e-05, 'epoch': 0.09}


 10%|▉         | 29/296 [29:38<4:31:41, 61.05s/it]

{'loss': 4.8591, 'learning_rate': 4.510135135135135e-05, 'epoch': 0.1}


 10%|█         | 30/296 [30:39<4:30:04, 60.92s/it]

{'loss': 4.8892, 'learning_rate': 4.4932432432432435e-05, 'epoch': 0.1}


 10%|█         | 31/296 [31:40<4:29:12, 60.95s/it]

{'loss': 4.6756, 'learning_rate': 4.4763513513513516e-05, 'epoch': 0.1}


 11%|█         | 32/296 [32:40<4:26:26, 60.56s/it]

{'loss': 4.7827, 'learning_rate': 4.4594594594594596e-05, 'epoch': 0.11}


 11%|█         | 33/296 [33:43<4:28:41, 61.30s/it]

{'loss': 4.8861, 'learning_rate': 4.442567567567568e-05, 'epoch': 0.11}


 11%|█▏        | 34/296 [34:43<4:26:34, 61.05s/it]

{'loss': 4.9077, 'learning_rate': 4.425675675675676e-05, 'epoch': 0.11}


 12%|█▏        | 35/296 [35:44<4:25:31, 61.04s/it]

{'loss': 4.6996, 'learning_rate': 4.4087837837837845e-05, 'epoch': 0.12}


 12%|█▏        | 36/296 [36:46<4:24:53, 61.13s/it]

{'loss': 4.5376, 'learning_rate': 4.391891891891892e-05, 'epoch': 0.12}


 12%|█▎        | 37/296 [37:46<4:23:23, 61.02s/it]

{'loss': 4.7337, 'learning_rate': 4.375e-05, 'epoch': 0.12}


 13%|█▎        | 38/296 [38:48<4:23:35, 61.30s/it]

{'loss': 4.7178, 'learning_rate': 4.358108108108108e-05, 'epoch': 0.13}


 13%|█▎        | 39/296 [39:51<4:24:14, 61.69s/it]

{'loss': 4.7534, 'learning_rate': 4.341216216216216e-05, 'epoch': 0.13}


 14%|█▎        | 40/296 [40:52<4:22:12, 61.46s/it]

{'loss': 4.758, 'learning_rate': 4.324324324324325e-05, 'epoch': 0.14}


 14%|█▍        | 41/296 [41:52<4:19:55, 61.16s/it]

{'loss': 4.5137, 'learning_rate': 4.307432432432433e-05, 'epoch': 0.14}


 14%|█▍        | 42/296 [42:53<4:18:15, 61.01s/it]

{'loss': 4.6541, 'learning_rate': 4.290540540540541e-05, 'epoch': 0.14}


 15%|█▍        | 43/296 [43:54<4:17:34, 61.08s/it]

{'loss': 4.5836, 'learning_rate': 4.273648648648649e-05, 'epoch': 0.15}


 15%|█▍        | 44/296 [44:55<4:16:44, 61.13s/it]

{'loss': 4.6091, 'learning_rate': 4.256756756756757e-05, 'epoch': 0.15}


 15%|█▌        | 45/296 [45:56<4:14:40, 60.88s/it]

{'loss': 4.5096, 'learning_rate': 4.239864864864865e-05, 'epoch': 0.15}


 16%|█▌        | 46/296 [46:56<4:13:21, 60.81s/it]

{'loss': 4.6814, 'learning_rate': 4.222972972972973e-05, 'epoch': 0.16}


 16%|█▌        | 47/296 [47:58<4:13:32, 61.09s/it]

{'loss': 4.6916, 'learning_rate': 4.206081081081081e-05, 'epoch': 0.16}


 16%|█▌        | 48/296 [49:01<4:15:03, 61.71s/it]

{'loss': 4.7494, 'learning_rate': 4.189189189189189e-05, 'epoch': 0.16}


 17%|█▋        | 49/296 [50:03<4:13:38, 61.61s/it]

{'loss': 4.6566, 'learning_rate': 4.1722972972972974e-05, 'epoch': 0.17}


 17%|█▋        | 50/296 [51:03<4:11:24, 61.32s/it]

{'loss': 4.5897, 'learning_rate': 4.1554054054054055e-05, 'epoch': 0.17}


 17%|█▋        | 51/296 [52:05<4:10:34, 61.37s/it]

{'loss': 4.9295, 'learning_rate': 4.138513513513514e-05, 'epoch': 0.17}


 18%|█▊        | 52/296 [53:07<4:10:57, 61.71s/it]

{'loss': 4.9804, 'learning_rate': 4.1216216216216216e-05, 'epoch': 0.18}


 18%|█▊        | 53/296 [54:07<4:07:52, 61.20s/it]

{'loss': 4.62, 'learning_rate': 4.1047297297297297e-05, 'epoch': 0.18}


 18%|█▊        | 54/296 [55:07<4:04:57, 60.73s/it]

{'loss': 4.6714, 'learning_rate': 4.087837837837838e-05, 'epoch': 0.18}


 19%|█▊        | 55/296 [56:09<4:05:34, 61.14s/it]

{'loss': 4.6536, 'learning_rate': 4.070945945945946e-05, 'epoch': 0.19}


 19%|█▉        | 56/296 [57:12<4:06:18, 61.58s/it]

{'loss': 4.2976, 'learning_rate': 4.0540540540540545e-05, 'epoch': 0.19}


 19%|█▉        | 57/296 [58:14<4:05:48, 61.71s/it]

{'loss': 4.6075, 'learning_rate': 4.0371621621621626e-05, 'epoch': 0.19}


 20%|█▉        | 58/296 [59:14<4:03:11, 61.31s/it]

{'loss': 4.7296, 'learning_rate': 4.0202702702702707e-05, 'epoch': 0.2}


 20%|█▉        | 59/296 [1:00:15<4:01:36, 61.17s/it]

{'loss': 4.5294, 'learning_rate': 4.003378378378379e-05, 'epoch': 0.2}


 20%|██        | 60/296 [1:01:14<3:57:52, 60.48s/it]

{'loss': 4.7949, 'learning_rate': 3.986486486486487e-05, 'epoch': 0.2}


 21%|██        | 61/296 [1:02:13<3:55:22, 60.09s/it]

{'loss': 4.9071, 'learning_rate': 3.969594594594595e-05, 'epoch': 0.21}


 21%|██        | 62/296 [1:03:13<3:54:07, 60.03s/it]

{'loss': 4.7502, 'learning_rate': 3.952702702702703e-05, 'epoch': 0.21}


 21%|██▏       | 63/296 [1:04:11<3:51:31, 59.62s/it]

{'loss': 4.9145, 'learning_rate': 3.935810810810811e-05, 'epoch': 0.21}


 22%|██▏       | 64/296 [1:05:10<3:49:32, 59.37s/it]

{'loss': 4.4942, 'learning_rate': 3.918918918918919e-05, 'epoch': 0.22}


 22%|██▏       | 65/296 [1:06:09<3:47:35, 59.11s/it]

{'loss': 4.7135, 'learning_rate': 3.902027027027027e-05, 'epoch': 0.22}


 22%|██▏       | 66/296 [1:07:09<3:47:30, 59.35s/it]

{'loss': 4.615, 'learning_rate': 3.885135135135135e-05, 'epoch': 0.22}


 23%|██▎       | 67/296 [1:08:09<3:47:44, 59.67s/it]

{'loss': 4.5525, 'learning_rate': 3.868243243243244e-05, 'epoch': 0.23}


 23%|██▎       | 68/296 [1:09:07<3:45:15, 59.28s/it]

{'loss': 4.5769, 'learning_rate': 3.851351351351351e-05, 'epoch': 0.23}


 23%|██▎       | 69/296 [1:10:08<3:45:12, 59.52s/it]

{'loss': 4.743, 'learning_rate': 3.8344594594594594e-05, 'epoch': 0.23}


 24%|██▎       | 70/296 [1:11:08<3:44:53, 59.71s/it]

{'loss': 4.6037, 'learning_rate': 3.8175675675675674e-05, 'epoch': 0.24}


 24%|██▍       | 71/296 [1:12:09<3:45:13, 60.06s/it]

{'loss': 4.4965, 'learning_rate': 3.8006756756756755e-05, 'epoch': 0.24}


 24%|██▍       | 72/296 [1:13:07<3:42:22, 59.56s/it]

{'loss': 4.7803, 'learning_rate': 3.783783783783784e-05, 'epoch': 0.24}


 25%|██▍       | 73/296 [1:14:07<3:41:53, 59.70s/it]

{'loss': 4.8131, 'learning_rate': 3.766891891891892e-05, 'epoch': 0.25}


 25%|██▌       | 74/296 [1:15:07<3:41:07, 59.76s/it]

{'loss': 4.7108, 'learning_rate': 3.7500000000000003e-05, 'epoch': 0.25}


 25%|██▌       | 75/296 [1:16:05<3:38:45, 59.39s/it]

{'loss': 4.7275, 'learning_rate': 3.7331081081081084e-05, 'epoch': 0.25}


 26%|██▌       | 76/296 [1:17:07<3:40:06, 60.03s/it]

{'loss': 5.0988, 'learning_rate': 3.7162162162162165e-05, 'epoch': 0.26}


 26%|██▌       | 77/296 [1:18:06<3:38:00, 59.73s/it]

{'loss': 4.5171, 'learning_rate': 3.6993243243243245e-05, 'epoch': 0.26}


 26%|██▋       | 78/296 [1:19:04<3:35:28, 59.30s/it]

{'loss': 4.5433, 'learning_rate': 3.6824324324324326e-05, 'epoch': 0.26}


 27%|██▋       | 79/296 [1:20:04<3:34:41, 59.36s/it]

{'loss': 4.7105, 'learning_rate': 3.665540540540541e-05, 'epoch': 0.27}


 27%|██▋       | 80/296 [1:21:04<3:34:56, 59.71s/it]

{'loss': 4.5544, 'learning_rate': 3.648648648648649e-05, 'epoch': 0.27}


 27%|██▋       | 81/296 [1:22:03<3:33:20, 59.54s/it]

{'loss': 4.7761, 'learning_rate': 3.631756756756757e-05, 'epoch': 0.27}


 28%|██▊       | 82/296 [1:23:03<3:32:47, 59.66s/it]

{'loss': 4.7107, 'learning_rate': 3.6148648648648655e-05, 'epoch': 0.28}


 28%|██▊       | 83/296 [1:24:03<3:32:14, 59.79s/it]

{'loss': 4.7218, 'learning_rate': 3.5979729729729736e-05, 'epoch': 0.28}


 28%|██▊       | 84/296 [1:25:03<3:30:49, 59.67s/it]

{'loss': 4.7769, 'learning_rate': 3.581081081081081e-05, 'epoch': 0.28}


 29%|██▊       | 85/296 [1:26:02<3:29:00, 59.44s/it]

{'loss': 4.6214, 'learning_rate': 3.564189189189189e-05, 'epoch': 0.29}


 29%|██▉       | 86/296 [1:27:01<3:28:15, 59.50s/it]

{'loss': 4.7069, 'learning_rate': 3.547297297297297e-05, 'epoch': 0.29}


 29%|██▉       | 87/296 [1:27:59<3:25:36, 59.03s/it]

{'loss': 4.7706, 'learning_rate': 3.530405405405405e-05, 'epoch': 0.29}


 30%|██▉       | 88/296 [1:29:00<3:26:22, 59.53s/it]

{'loss': 4.498, 'learning_rate': 3.513513513513514e-05, 'epoch': 0.3}


 30%|███       | 89/296 [1:29:59<3:24:53, 59.39s/it]

{'loss': 4.5443, 'learning_rate': 3.496621621621622e-05, 'epoch': 0.3}


 30%|███       | 90/296 [1:30:57<3:22:40, 59.03s/it]

{'loss': 4.6415, 'learning_rate': 3.47972972972973e-05, 'epoch': 0.3}


 31%|███       | 91/296 [1:31:56<3:21:31, 58.98s/it]

{'loss': 4.551, 'learning_rate': 3.462837837837838e-05, 'epoch': 0.31}


 31%|███       | 92/296 [1:32:54<3:19:23, 58.65s/it]

{'loss': 4.6373, 'learning_rate': 3.445945945945946e-05, 'epoch': 0.31}


 31%|███▏      | 93/296 [1:33:53<3:19:14, 58.89s/it]

{'loss': 4.5267, 'learning_rate': 3.429054054054054e-05, 'epoch': 0.31}


 32%|███▏      | 94/296 [1:34:53<3:19:03, 59.13s/it]

{'loss': 4.7589, 'learning_rate': 3.412162162162162e-05, 'epoch': 0.32}


 32%|███▏      | 95/296 [1:35:52<3:17:43, 59.02s/it]

{'loss': 4.5384, 'learning_rate': 3.3952702702702704e-05, 'epoch': 0.32}


 32%|███▏      | 96/296 [1:36:52<3:17:29, 59.25s/it]

{'loss': 4.8916, 'learning_rate': 3.3783783783783784e-05, 'epoch': 0.32}


 33%|███▎      | 97/296 [1:37:51<3:16:55, 59.37s/it]

{'loss': 4.5144, 'learning_rate': 3.3614864864864865e-05, 'epoch': 0.33}


 33%|███▎      | 98/296 [1:38:50<3:15:28, 59.24s/it]

{'loss': 4.6652, 'learning_rate': 3.344594594594595e-05, 'epoch': 0.33}


 33%|███▎      | 99/296 [1:39:52<3:17:19, 60.10s/it]

{'loss': 5.0936, 'learning_rate': 3.327702702702703e-05, 'epoch': 0.33}


 34%|███▍      | 100/296 [1:40:52<3:15:24, 59.82s/it]

{'loss': 4.9058, 'learning_rate': 3.310810810810811e-05, 'epoch': 0.34}


 34%|███▍      | 101/296 [1:41:53<3:15:57, 60.30s/it]

{'loss': 4.7252, 'learning_rate': 3.293918918918919e-05, 'epoch': 0.34}


 34%|███▍      | 102/296 [1:42:51<3:13:10, 59.74s/it]

{'loss': 4.4658, 'learning_rate': 3.277027027027027e-05, 'epoch': 0.34}


 35%|███▍      | 103/296 [1:43:52<3:13:10, 60.06s/it]

{'loss': 4.6371, 'learning_rate': 3.260135135135135e-05, 'epoch': 0.35}


 35%|███▌      | 104/296 [1:44:51<3:11:20, 59.79s/it]

{'loss': 4.6821, 'learning_rate': 3.2432432432432436e-05, 'epoch': 0.35}


 35%|███▌      | 105/296 [1:45:53<3:11:41, 60.22s/it]

{'loss': 4.5278, 'learning_rate': 3.226351351351352e-05, 'epoch': 0.35}


 36%|███▌      | 106/296 [1:46:53<3:10:40, 60.21s/it]

{'loss': 4.3948, 'learning_rate': 3.20945945945946e-05, 'epoch': 0.36}


 36%|███▌      | 107/296 [1:47:52<3:09:09, 60.05s/it]

{'loss': 4.7448, 'learning_rate': 3.192567567567568e-05, 'epoch': 0.36}


 36%|███▋      | 108/296 [1:48:52<3:07:44, 59.91s/it]

{'loss': 4.583, 'learning_rate': 3.175675675675676e-05, 'epoch': 0.36}


 37%|███▋      | 109/296 [1:49:51<3:05:25, 59.49s/it]

{'loss': 4.7203, 'learning_rate': 3.158783783783784e-05, 'epoch': 0.37}


 37%|███▋      | 110/296 [1:50:50<3:04:27, 59.50s/it]

{'loss': 4.6392, 'learning_rate': 3.141891891891892e-05, 'epoch': 0.37}


 38%|███▊      | 111/296 [1:51:50<3:03:49, 59.62s/it]

{'loss': 4.8485, 'learning_rate': 3.125e-05, 'epoch': 0.38}


 38%|███▊      | 112/296 [1:52:48<3:01:31, 59.19s/it]

{'loss': 4.6704, 'learning_rate': 3.108108108108108e-05, 'epoch': 0.38}


 38%|███▊      | 113/296 [1:53:47<3:00:30, 59.18s/it]

{'loss': 4.8432, 'learning_rate': 3.091216216216216e-05, 'epoch': 0.38}


 39%|███▊      | 114/296 [1:54:47<3:00:12, 59.41s/it]

{'loss': 4.8171, 'learning_rate': 3.074324324324325e-05, 'epoch': 0.39}


 39%|███▉      | 115/296 [1:55:46<2:58:54, 59.30s/it]

{'loss': 4.7959, 'learning_rate': 3.057432432432433e-05, 'epoch': 0.39}


 39%|███▉      | 116/296 [1:56:45<2:57:44, 59.25s/it]

{'loss': 4.7584, 'learning_rate': 3.0405405405405407e-05, 'epoch': 0.39}


 40%|███▉      | 117/296 [1:57:45<2:57:02, 59.34s/it]

{'loss': 4.6573, 'learning_rate': 3.0236486486486488e-05, 'epoch': 0.4}


 40%|███▉      | 118/296 [1:58:47<2:58:34, 60.19s/it]

{'loss': 4.7473, 'learning_rate': 3.006756756756757e-05, 'epoch': 0.4}


 40%|████      | 119/296 [1:59:46<2:56:35, 59.86s/it]

{'loss': 4.6552, 'learning_rate': 2.9898648648648653e-05, 'epoch': 0.4}


 41%|████      | 120/296 [2:00:46<2:55:08, 59.71s/it]

{'loss': 4.5921, 'learning_rate': 2.9729729729729733e-05, 'epoch': 0.41}


 41%|████      | 121/296 [2:01:45<2:54:09, 59.71s/it]

{'loss': 4.6414, 'learning_rate': 2.9560810810810814e-05, 'epoch': 0.41}


 41%|████      | 122/296 [2:02:47<2:54:50, 60.29s/it]

{'loss': 4.5833, 'learning_rate': 2.9391891891891894e-05, 'epoch': 0.41}


 42%|████▏     | 123/296 [2:03:46<2:53:00, 60.00s/it]

{'loss': 4.7729, 'learning_rate': 2.9222972972972972e-05, 'epoch': 0.42}


 42%|████▏     | 124/296 [2:04:46<2:51:53, 59.96s/it]

{'loss': 4.3505, 'learning_rate': 2.9054054054054052e-05, 'epoch': 0.42}


 42%|████▏     | 125/296 [2:05:46<2:50:42, 59.90s/it]

{'loss': 4.6799, 'learning_rate': 2.888513513513514e-05, 'epoch': 0.42}


 43%|████▎     | 126/296 [2:06:46<2:50:00, 60.00s/it]

{'loss': 4.716, 'learning_rate': 2.8716216216216217e-05, 'epoch': 0.43}


 43%|████▎     | 127/296 [2:07:44<2:47:34, 59.49s/it]

{'loss': 4.8645, 'learning_rate': 2.8547297297297298e-05, 'epoch': 0.43}


 43%|████▎     | 128/296 [2:08:44<2:46:18, 59.39s/it]

{'loss': 4.7983, 'learning_rate': 2.8378378378378378e-05, 'epoch': 0.43}


 44%|████▎     | 129/296 [2:09:43<2:44:57, 59.26s/it]

{'loss': 4.6999, 'learning_rate': 2.820945945945946e-05, 'epoch': 0.44}


 44%|████▍     | 130/296 [2:10:41<2:43:34, 59.13s/it]

{'loss': 4.79, 'learning_rate': 2.8040540540540543e-05, 'epoch': 0.44}


 44%|████▍     | 131/296 [2:11:41<2:43:09, 59.33s/it]

{'loss': 4.6685, 'learning_rate': 2.7871621621621624e-05, 'epoch': 0.44}


 45%|████▍     | 132/296 [2:12:42<2:43:17, 59.74s/it]

{'loss': 4.6696, 'learning_rate': 2.7702702702702704e-05, 'epoch': 0.45}


 45%|████▍     | 133/296 [2:13:41<2:41:47, 59.56s/it]

{'loss': 4.5277, 'learning_rate': 2.7533783783783785e-05, 'epoch': 0.45}


 45%|████▌     | 134/296 [2:14:42<2:41:51, 59.95s/it]

{'loss': 4.5886, 'learning_rate': 2.7364864864864865e-05, 'epoch': 0.45}


 46%|████▌     | 135/296 [2:15:45<2:43:16, 60.84s/it]

{'loss': 4.5475, 'learning_rate': 2.719594594594595e-05, 'epoch': 0.46}


 46%|████▌     | 136/296 [2:16:46<2:42:18, 60.86s/it]

{'loss': 4.6823, 'learning_rate': 2.702702702702703e-05, 'epoch': 0.46}


 46%|████▋     | 137/296 [2:17:46<2:41:03, 60.78s/it]

{'loss': 4.7336, 'learning_rate': 2.685810810810811e-05, 'epoch': 0.46}


 47%|████▋     | 138/296 [2:18:46<2:39:01, 60.39s/it]

{'loss': 4.6651, 'learning_rate': 2.668918918918919e-05, 'epoch': 0.47}


 47%|████▋     | 139/296 [2:19:45<2:36:50, 59.94s/it]

{'loss': 4.8339, 'learning_rate': 2.652027027027027e-05, 'epoch': 0.47}


 47%|████▋     | 140/296 [2:20:44<2:35:13, 59.70s/it]

{'loss': 4.7401, 'learning_rate': 2.635135135135135e-05, 'epoch': 0.47}


 48%|████▊     | 141/296 [2:21:46<2:36:01, 60.39s/it]

{'loss': 4.6453, 'learning_rate': 2.6182432432432437e-05, 'epoch': 0.48}


 48%|████▊     | 142/296 [2:22:45<2:34:18, 60.12s/it]

{'loss': 4.8749, 'learning_rate': 2.6013513513513514e-05, 'epoch': 0.48}


 48%|████▊     | 143/296 [2:23:45<2:33:20, 60.13s/it]

{'loss': 4.6249, 'learning_rate': 2.5844594594594595e-05, 'epoch': 0.48}


 49%|████▊     | 144/296 [2:24:49<2:35:15, 61.29s/it]

{'loss': 4.588, 'learning_rate': 2.5675675675675675e-05, 'epoch': 0.49}


 49%|████▉     | 145/296 [2:25:49<2:32:48, 60.72s/it]

{'loss': 4.7219, 'learning_rate': 2.5506756756756756e-05, 'epoch': 0.49}


 49%|████▉     | 146/296 [2:26:49<2:31:40, 60.67s/it]

{'loss': 4.6939, 'learning_rate': 2.533783783783784e-05, 'epoch': 0.49}


 50%|████▉     | 147/296 [2:27:50<2:30:15, 60.50s/it]

{'loss': 4.7256, 'learning_rate': 2.516891891891892e-05, 'epoch': 0.5}


 50%|█████     | 148/296 [2:28:51<2:29:48, 60.73s/it]

{'loss': 4.6877, 'learning_rate': 2.5e-05, 'epoch': 0.5}


 50%|█████     | 149/296 [2:29:52<2:29:27, 61.00s/it]

{'loss': 4.7896, 'learning_rate': 2.4831081081081082e-05, 'epoch': 0.5}


 51%|█████     | 150/296 [2:30:54<2:28:46, 61.14s/it]

{'loss': 4.5887, 'learning_rate': 2.4662162162162162e-05, 'epoch': 0.51}


 51%|█████     | 151/296 [2:31:54<2:26:55, 60.80s/it]

{'loss': 4.5885, 'learning_rate': 2.4493243243243243e-05, 'epoch': 0.51}


 51%|█████▏    | 152/296 [2:32:53<2:24:21, 60.15s/it]

{'loss': 4.6244, 'learning_rate': 2.4324324324324327e-05, 'epoch': 0.51}


 52%|█████▏    | 153/296 [2:33:51<2:22:29, 59.79s/it]

{'loss': 4.7658, 'learning_rate': 2.4155405405405408e-05, 'epoch': 0.52}


 52%|█████▏    | 154/296 [2:34:52<2:21:56, 59.97s/it]

{'loss': 4.4738, 'learning_rate': 2.398648648648649e-05, 'epoch': 0.52}


 52%|█████▏    | 155/296 [2:35:52<2:21:08, 60.06s/it]

{'loss': 4.7132, 'learning_rate': 2.381756756756757e-05, 'epoch': 0.52}


 53%|█████▎    | 156/296 [2:36:52<2:19:45, 59.90s/it]

{'loss': 4.6669, 'learning_rate': 2.364864864864865e-05, 'epoch': 0.53}


 53%|█████▎    | 157/296 [2:37:51<2:18:10, 59.64s/it]

{'loss': 4.5901, 'learning_rate': 2.347972972972973e-05, 'epoch': 0.53}


 53%|█████▎    | 158/296 [2:38:52<2:18:05, 60.04s/it]

{'loss': 4.6364, 'learning_rate': 2.331081081081081e-05, 'epoch': 0.53}


 54%|█████▎    | 159/296 [2:39:52<2:17:10, 60.08s/it]

{'loss': 4.5433, 'learning_rate': 2.314189189189189e-05, 'epoch': 0.54}


 54%|█████▍    | 160/296 [2:40:50<2:14:58, 59.55s/it]

{'loss': 4.6215, 'learning_rate': 2.2972972972972976e-05, 'epoch': 0.54}


 54%|█████▍    | 161/296 [2:41:50<2:14:21, 59.71s/it]

{'loss': 4.4635, 'learning_rate': 2.2804054054054056e-05, 'epoch': 0.54}


 55%|█████▍    | 162/296 [2:42:48<2:12:13, 59.20s/it]

{'loss': 4.5189, 'learning_rate': 2.2635135135135137e-05, 'epoch': 0.55}


 55%|█████▌    | 163/296 [2:43:49<2:11:55, 59.51s/it]

{'loss': 4.6099, 'learning_rate': 2.2466216216216218e-05, 'epoch': 0.55}


 55%|█████▌    | 164/296 [2:44:47<2:10:16, 59.21s/it]

{'loss': 4.4848, 'learning_rate': 2.2297297297297298e-05, 'epoch': 0.55}


 56%|█████▌    | 165/296 [2:45:47<2:09:59, 59.54s/it]

{'loss': 4.6118, 'learning_rate': 2.212837837837838e-05, 'epoch': 0.56}


 56%|█████▌    | 166/296 [2:46:48<2:09:37, 59.83s/it]

{'loss': 4.6, 'learning_rate': 2.195945945945946e-05, 'epoch': 0.56}


 56%|█████▋    | 167/296 [2:47:46<2:07:44, 59.42s/it]

{'loss': 4.8151, 'learning_rate': 2.179054054054054e-05, 'epoch': 0.56}


 57%|█████▋    | 168/296 [2:48:45<2:06:18, 59.21s/it]

{'loss': 4.7038, 'learning_rate': 2.1621621621621624e-05, 'epoch': 0.57}


 57%|█████▋    | 169/296 [2:49:45<2:05:36, 59.35s/it]

{'loss': 4.4831, 'learning_rate': 2.1452702702702705e-05, 'epoch': 0.57}


 57%|█████▋    | 170/296 [2:50:44<2:04:30, 59.29s/it]

{'loss': 4.4062, 'learning_rate': 2.1283783783783785e-05, 'epoch': 0.57}


 58%|█████▊    | 171/296 [2:51:44<2:03:47, 59.42s/it]

{'loss': 4.5956, 'learning_rate': 2.1114864864864866e-05, 'epoch': 0.58}


 58%|█████▊    | 172/296 [2:52:44<2:03:37, 59.82s/it]

{'loss': 4.5505, 'learning_rate': 2.0945945945945947e-05, 'epoch': 0.58}


 58%|█████▊    | 173/296 [2:53:43<2:02:00, 59.52s/it]

{'loss': 4.6394, 'learning_rate': 2.0777027027027027e-05, 'epoch': 0.58}


 59%|█████▉    | 174/296 [2:54:42<2:00:55, 59.47s/it]

{'loss': 4.5986, 'learning_rate': 2.0608108108108108e-05, 'epoch': 0.59}


 59%|█████▉    | 175/296 [2:55:42<1:59:44, 59.37s/it]

{'loss': 4.5024, 'learning_rate': 2.043918918918919e-05, 'epoch': 0.59}


 59%|█████▉    | 176/296 [2:56:41<1:58:45, 59.38s/it]

{'loss': 4.4916, 'learning_rate': 2.0270270270270273e-05, 'epoch': 0.59}


 60%|█████▉    | 177/296 [2:57:41<1:58:16, 59.64s/it]

{'loss': 4.7249, 'learning_rate': 2.0101351351351353e-05, 'epoch': 0.6}


 60%|██████    | 178/296 [2:58:40<1:56:59, 59.48s/it]

{'loss': 4.7336, 'learning_rate': 1.9932432432432434e-05, 'epoch': 0.6}


 60%|██████    | 179/296 [2:59:40<1:56:13, 59.60s/it]

{'loss': 4.5074, 'learning_rate': 1.9763513513513515e-05, 'epoch': 0.6}


 61%|██████    | 180/296 [3:00:41<1:55:35, 59.79s/it]

{'loss': 4.7664, 'learning_rate': 1.9594594594594595e-05, 'epoch': 0.61}


 61%|██████    | 181/296 [3:01:40<1:54:28, 59.72s/it]

{'loss': 4.4693, 'learning_rate': 1.9425675675675676e-05, 'epoch': 0.61}


 61%|██████▏   | 182/296 [3:02:39<1:53:16, 59.62s/it]

{'loss': 4.9099, 'learning_rate': 1.9256756756756756e-05, 'epoch': 0.61}


 62%|██████▏   | 183/296 [3:03:38<1:51:55, 59.43s/it]

{'loss': 4.5718, 'learning_rate': 1.9087837837837837e-05, 'epoch': 0.62}


 62%|██████▏   | 184/296 [3:04:37<1:50:39, 59.28s/it]

{'loss': 4.3452, 'learning_rate': 1.891891891891892e-05, 'epoch': 0.62}


 62%|██████▎   | 185/296 [3:05:37<1:49:35, 59.24s/it]

{'loss': 4.657, 'learning_rate': 1.8750000000000002e-05, 'epoch': 0.62}


 63%|██████▎   | 186/296 [3:06:37<1:49:04, 59.50s/it]

{'loss': 4.7723, 'learning_rate': 1.8581081081081082e-05, 'epoch': 0.63}


 63%|██████▎   | 187/296 [3:07:36<1:48:01, 59.46s/it]

{'loss': 4.6777, 'learning_rate': 1.8412162162162163e-05, 'epoch': 0.63}


 64%|██████▎   | 188/296 [3:08:35<1:46:32, 59.19s/it]

{'loss': 4.5176, 'learning_rate': 1.8243243243243244e-05, 'epoch': 0.64}


 64%|██████▍   | 189/296 [3:09:33<1:45:21, 59.08s/it]

{'loss': 4.5442, 'learning_rate': 1.8074324324324328e-05, 'epoch': 0.64}


 64%|██████▍   | 190/296 [3:10:33<1:44:24, 59.10s/it]

{'loss': 4.56, 'learning_rate': 1.7905405405405405e-05, 'epoch': 0.64}


 65%|██████▍   | 191/296 [3:11:32<1:43:31, 59.15s/it]

{'loss': 4.7699, 'learning_rate': 1.7736486486486486e-05, 'epoch': 0.65}


 65%|██████▍   | 192/296 [3:12:30<1:42:06, 58.91s/it]

{'loss': 4.387, 'learning_rate': 1.756756756756757e-05, 'epoch': 0.65}


 65%|██████▌   | 193/296 [3:13:29<1:40:51, 58.75s/it]

{'loss': 4.5745, 'learning_rate': 1.739864864864865e-05, 'epoch': 0.65}


 66%|██████▌   | 194/296 [3:14:28<1:40:19, 59.01s/it]

{'loss': 4.6034, 'learning_rate': 1.722972972972973e-05, 'epoch': 0.66}


 66%|██████▌   | 195/296 [3:15:28<1:39:36, 59.18s/it]

{'loss': 4.5879, 'learning_rate': 1.706081081081081e-05, 'epoch': 0.66}


 66%|██████▌   | 196/296 [3:16:28<1:39:19, 59.60s/it]

{'loss': 4.5009, 'learning_rate': 1.6891891891891892e-05, 'epoch': 0.66}


 67%|██████▋   | 197/296 [3:17:29<1:38:49, 59.89s/it]

{'loss': 4.4571, 'learning_rate': 1.6722972972972976e-05, 'epoch': 0.67}


 67%|██████▋   | 198/296 [3:18:29<1:38:00, 60.01s/it]

{'loss': 4.751, 'learning_rate': 1.6554054054054053e-05, 'epoch': 0.67}


 67%|██████▋   | 199/296 [3:19:31<1:37:55, 60.58s/it]

{'loss': 4.6993, 'learning_rate': 1.6385135135135134e-05, 'epoch': 0.67}


 68%|██████▊   | 200/296 [3:20:32<1:37:09, 60.72s/it]

{'loss': 4.4667, 'learning_rate': 1.6216216216216218e-05, 'epoch': 0.68}


 68%|██████▊   | 201/296 [3:21:31<1:35:29, 60.31s/it]

{'loss': 4.5163, 'learning_rate': 1.60472972972973e-05, 'epoch': 0.68}


 68%|██████▊   | 202/296 [3:22:31<1:33:55, 59.95s/it]

{'loss': 4.4158, 'learning_rate': 1.587837837837838e-05, 'epoch': 0.68}


 69%|██████▊   | 203/296 [3:23:31<1:33:03, 60.04s/it]

{'loss': 4.6254, 'learning_rate': 1.570945945945946e-05, 'epoch': 0.69}


 69%|██████▉   | 204/296 [3:24:31<1:31:57, 59.97s/it]

{'loss': 4.8532, 'learning_rate': 1.554054054054054e-05, 'epoch': 0.69}


 69%|██████▉   | 205/296 [3:25:31<1:31:05, 60.06s/it]

{'loss': 4.5336, 'learning_rate': 1.5371621621621625e-05, 'epoch': 0.69}


 70%|██████▉   | 206/296 [3:26:31<1:30:17, 60.20s/it]

{'loss': 4.6593, 'learning_rate': 1.5202702702702704e-05, 'epoch': 0.7}


 70%|██████▉   | 207/296 [3:27:30<1:28:30, 59.67s/it]

{'loss': 4.5571, 'learning_rate': 1.5033783783783784e-05, 'epoch': 0.7}


 70%|███████   | 208/296 [3:28:29<1:27:27, 59.63s/it]

{'loss': 4.5535, 'learning_rate': 1.4864864864864867e-05, 'epoch': 0.7}


 71%|███████   | 209/296 [3:29:29<1:26:39, 59.76s/it]

{'loss': 4.7105, 'learning_rate': 1.4695945945945947e-05, 'epoch': 0.71}


 71%|███████   | 210/296 [3:30:28<1:25:11, 59.44s/it]

{'loss': 4.85, 'learning_rate': 1.4527027027027026e-05, 'epoch': 0.71}


 71%|███████▏  | 211/296 [3:31:27<1:24:08, 59.39s/it]

{'loss': 4.6175, 'learning_rate': 1.4358108108108108e-05, 'epoch': 0.71}


 72%|███████▏  | 212/296 [3:32:26<1:22:56, 59.24s/it]

{'loss': 4.7454, 'learning_rate': 1.4189189189189189e-05, 'epoch': 0.72}


 72%|███████▏  | 213/296 [3:33:25<1:21:46, 59.11s/it]

{'loss': 5.0292, 'learning_rate': 1.4020270270270271e-05, 'epoch': 0.72}


 72%|███████▏  | 214/296 [3:34:26<1:21:19, 59.51s/it]

{'loss': 4.6614, 'learning_rate': 1.3851351351351352e-05, 'epoch': 0.72}


 73%|███████▎  | 215/296 [3:35:24<1:20:02, 59.29s/it]

{'loss': 4.5824, 'learning_rate': 1.3682432432432433e-05, 'epoch': 0.73}


 73%|███████▎  | 216/296 [3:36:23<1:18:44, 59.05s/it]

{'loss': 4.5635, 'learning_rate': 1.3513513513513515e-05, 'epoch': 0.73}


 73%|███████▎  | 217/296 [3:37:22<1:17:53, 59.16s/it]

{'loss': 4.4279, 'learning_rate': 1.3344594594594596e-05, 'epoch': 0.73}


 74%|███████▎  | 218/296 [3:38:21<1:16:54, 59.16s/it]

{'loss': 4.5403, 'learning_rate': 1.3175675675675675e-05, 'epoch': 0.74}


 74%|███████▍  | 219/296 [3:39:22<1:16:18, 59.46s/it]

{'loss': 4.71, 'learning_rate': 1.3006756756756757e-05, 'epoch': 0.74}


 74%|███████▍  | 220/296 [3:40:20<1:14:59, 59.20s/it]

{'loss': 4.646, 'learning_rate': 1.2837837837837838e-05, 'epoch': 0.74}


 75%|███████▍  | 221/296 [3:41:19<1:13:57, 59.17s/it]

{'loss': 4.361, 'learning_rate': 1.266891891891892e-05, 'epoch': 0.75}


 75%|███████▌  | 222/296 [3:42:19<1:13:08, 59.31s/it]

{'loss': 4.7876, 'learning_rate': 1.25e-05, 'epoch': 0.75}


 75%|███████▌  | 223/296 [3:43:18<1:12:12, 59.34s/it]

{'loss': 4.7393, 'learning_rate': 1.2331081081081081e-05, 'epoch': 0.75}


 76%|███████▌  | 224/296 [3:44:18<1:11:28, 59.56s/it]

{'loss': 4.8301, 'learning_rate': 1.2162162162162164e-05, 'epoch': 0.76}


 76%|███████▌  | 225/296 [3:45:17<1:10:14, 59.36s/it]

{'loss': 4.5558, 'learning_rate': 1.1993243243243244e-05, 'epoch': 0.76}


 76%|███████▋  | 226/296 [3:46:16<1:09:03, 59.20s/it]

{'loss': 4.4951, 'learning_rate': 1.1824324324324325e-05, 'epoch': 0.76}


 77%|███████▋  | 227/296 [3:47:14<1:07:36, 58.78s/it]

{'loss': 4.1789, 'learning_rate': 1.1655405405405405e-05, 'epoch': 0.77}


 77%|███████▋  | 228/296 [3:48:14<1:07:01, 59.14s/it]

{'loss': 4.5553, 'learning_rate': 1.1486486486486488e-05, 'epoch': 0.77}


 77%|███████▋  | 229/296 [3:49:15<1:06:36, 59.64s/it]

{'loss': 4.5524, 'learning_rate': 1.1317567567567568e-05, 'epoch': 0.77}


 78%|███████▊  | 230/296 [3:50:14<1:05:35, 59.63s/it]

{'loss': 4.4903, 'learning_rate': 1.1148648648648649e-05, 'epoch': 0.78}


 78%|███████▊  | 231/296 [3:51:14<1:04:32, 59.57s/it]

{'loss': 4.6654, 'learning_rate': 1.097972972972973e-05, 'epoch': 0.78}


 78%|███████▊  | 232/296 [3:52:12<1:02:59, 59.06s/it]

{'loss': 4.7699, 'learning_rate': 1.0810810810810812e-05, 'epoch': 0.78}


 79%|███████▊  | 233/296 [3:53:12<1:02:24, 59.43s/it]

{'loss': 4.5288, 'learning_rate': 1.0641891891891893e-05, 'epoch': 0.79}


 79%|███████▉  | 234/296 [3:54:10<1:00:58, 59.00s/it]

{'loss': 4.731, 'learning_rate': 1.0472972972972973e-05, 'epoch': 0.79}


 79%|███████▉  | 235/296 [3:55:10<1:00:21, 59.37s/it]

{'loss': 4.5491, 'learning_rate': 1.0304054054054054e-05, 'epoch': 0.79}


 80%|███████▉  | 236/296 [3:56:09<59:20, 59.35s/it]  

{'loss': 4.6438, 'learning_rate': 1.0135135135135136e-05, 'epoch': 0.8}


 80%|████████  | 237/296 [3:57:08<58:01, 59.01s/it]

{'loss': 4.6042, 'learning_rate': 9.966216216216217e-06, 'epoch': 0.8}


 80%|████████  | 238/296 [3:58:06<56:47, 58.75s/it]

{'loss': 4.5049, 'learning_rate': 9.797297297297298e-06, 'epoch': 0.8}


 81%|████████  | 239/296 [3:59:05<56:00, 58.95s/it]

{'loss': 4.4251, 'learning_rate': 9.628378378378378e-06, 'epoch': 0.81}


 81%|████████  | 240/296 [4:00:04<55:03, 58.98s/it]

{'loss': 4.853, 'learning_rate': 9.45945945945946e-06, 'epoch': 0.81}


 81%|████████▏ | 241/296 [4:01:03<53:59, 58.90s/it]

{'loss': 4.7053, 'learning_rate': 9.290540540540541e-06, 'epoch': 0.81}


 82%|████████▏ | 242/296 [4:02:02<52:59, 58.88s/it]

{'loss': 4.796, 'learning_rate': 9.121621621621622e-06, 'epoch': 0.82}


 82%|████████▏ | 243/296 [4:03:02<52:20, 59.25s/it]

{'loss': 4.5606, 'learning_rate': 8.952702702702702e-06, 'epoch': 0.82}


 82%|████████▏ | 244/296 [4:04:02<51:28, 59.39s/it]

{'loss': 4.5634, 'learning_rate': 8.783783783783785e-06, 'epoch': 0.82}


 83%|████████▎ | 245/296 [4:05:01<50:34, 59.49s/it]

{'loss': 4.6019, 'learning_rate': 8.614864864864865e-06, 'epoch': 0.83}


 83%|████████▎ | 246/296 [4:06:01<49:39, 59.58s/it]

{'loss': 4.7149, 'learning_rate': 8.445945945945946e-06, 'epoch': 0.83}


 83%|████████▎ | 247/296 [4:07:01<48:43, 59.67s/it]

{'loss': 4.6199, 'learning_rate': 8.277027027027027e-06, 'epoch': 0.83}


 84%|████████▍ | 248/296 [4:08:02<47:55, 59.91s/it]

{'loss': 4.545, 'learning_rate': 8.108108108108109e-06, 'epoch': 0.84}


 84%|████████▍ | 249/296 [4:09:01<46:44, 59.68s/it]

{'loss': 4.4512, 'learning_rate': 7.93918918918919e-06, 'epoch': 0.84}


 84%|████████▍ | 250/296 [4:09:59<45:30, 59.36s/it]

{'loss': 4.7091, 'learning_rate': 7.77027027027027e-06, 'epoch': 0.84}


 85%|████████▍ | 251/296 [4:10:58<44:28, 59.30s/it]

{'loss': 4.5966, 'learning_rate': 7.601351351351352e-06, 'epoch': 0.85}


 85%|████████▌ | 252/296 [4:11:57<43:16, 59.02s/it]

{'loss': 4.6867, 'learning_rate': 7.432432432432433e-06, 'epoch': 0.85}


 85%|████████▌ | 253/296 [4:12:55<42:05, 58.73s/it]

{'loss': 4.3865, 'learning_rate': 7.263513513513513e-06, 'epoch': 0.85}


 86%|████████▌ | 254/296 [4:13:57<41:44, 59.63s/it]

{'loss': 4.5726, 'learning_rate': 7.0945945945945946e-06, 'epoch': 0.86}


 86%|████████▌ | 255/296 [4:14:54<40:21, 59.06s/it]

{'loss': 4.607, 'learning_rate': 6.925675675675676e-06, 'epoch': 0.86}


 86%|████████▋ | 256/296 [4:15:53<39:14, 58.86s/it]

{'loss': 4.7808, 'learning_rate': 6.7567567567567575e-06, 'epoch': 0.86}


 87%|████████▋ | 257/296 [4:16:51<38:09, 58.69s/it]

{'loss': 4.5474, 'learning_rate': 6.587837837837837e-06, 'epoch': 0.87}


 87%|████████▋ | 258/296 [4:17:49<37:04, 58.53s/it]

{'loss': 4.3355, 'learning_rate': 6.418918918918919e-06, 'epoch': 0.87}


 88%|████████▊ | 259/296 [4:18:49<36:17, 58.85s/it]

{'loss': 4.7206, 'learning_rate': 6.25e-06, 'epoch': 0.88}


 88%|████████▊ | 260/296 [4:19:48<35:20, 58.90s/it]

{'loss': 4.5714, 'learning_rate': 6.081081081081082e-06, 'epoch': 0.88}


 88%|████████▊ | 261/296 [4:20:47<34:28, 59.10s/it]

{'loss': 4.5359, 'learning_rate': 5.912162162162162e-06, 'epoch': 0.88}


 89%|████████▊ | 262/296 [4:21:46<33:21, 58.87s/it]

{'loss': 4.4623, 'learning_rate': 5.743243243243244e-06, 'epoch': 0.89}


 89%|████████▉ | 263/296 [4:22:45<32:28, 59.05s/it]

{'loss': 4.3573, 'learning_rate': 5.5743243243243245e-06, 'epoch': 0.89}


 89%|████████▉ | 264/296 [4:23:44<31:28, 59.01s/it]

{'loss': 4.6207, 'learning_rate': 5.405405405405406e-06, 'epoch': 0.89}


 90%|████████▉ | 265/296 [4:24:45<30:49, 59.65s/it]

{'loss': 4.4309, 'learning_rate': 5.236486486486487e-06, 'epoch': 0.9}


 90%|████████▉ | 266/296 [4:25:44<29:45, 59.52s/it]

{'loss': 4.5558, 'learning_rate': 5.067567567567568e-06, 'epoch': 0.9}


 90%|█████████ | 267/296 [4:26:44<28:48, 59.61s/it]

{'loss': 4.3861, 'learning_rate': 4.898648648648649e-06, 'epoch': 0.9}


 91%|█████████ | 268/296 [4:27:44<27:46, 59.52s/it]

{'loss': 4.5818, 'learning_rate': 4.72972972972973e-06, 'epoch': 0.91}


 91%|█████████ | 269/296 [4:28:44<26:54, 59.80s/it]

{'loss': 4.6059, 'learning_rate': 4.560810810810811e-06, 'epoch': 0.91}


 91%|█████████ | 270/296 [4:29:44<25:57, 59.91s/it]

{'loss': 4.7722, 'learning_rate': 4.391891891891892e-06, 'epoch': 0.91}


 92%|█████████▏| 271/296 [4:30:43<24:49, 59.57s/it]

{'loss': 4.7397, 'learning_rate': 4.222972972972973e-06, 'epoch': 0.92}


 92%|█████████▏| 272/296 [4:31:41<23:40, 59.20s/it]

{'loss': 4.666, 'learning_rate': 4.0540540540540545e-06, 'epoch': 0.92}


 92%|█████████▏| 273/296 [4:32:42<22:54, 59.75s/it]

{'loss': 4.5006, 'learning_rate': 3.885135135135135e-06, 'epoch': 0.92}


 93%|█████████▎| 274/296 [4:33:41<21:49, 59.50s/it]

{'loss': 4.5212, 'learning_rate': 3.7162162162162166e-06, 'epoch': 0.93}


 93%|█████████▎| 275/296 [4:34:41<20:52, 59.65s/it]

{'loss': 4.3359, 'learning_rate': 3.5472972972972973e-06, 'epoch': 0.93}


 93%|█████████▎| 276/296 [4:35:40<19:46, 59.35s/it]

{'loss': 4.5905, 'learning_rate': 3.3783783783783788e-06, 'epoch': 0.93}


 94%|█████████▎| 277/296 [4:36:39<18:45, 59.24s/it]

{'loss': 4.4498, 'learning_rate': 3.2094594594594594e-06, 'epoch': 0.94}


 94%|█████████▍| 278/296 [4:37:39<17:52, 59.57s/it]

{'loss': 4.7427, 'learning_rate': 3.040540540540541e-06, 'epoch': 0.94}


 94%|█████████▍| 279/296 [4:38:39<16:51, 59.52s/it]

{'loss': 4.5443, 'learning_rate': 2.871621621621622e-06, 'epoch': 0.94}


 95%|█████████▍| 280/296 [4:39:39<15:58, 59.90s/it]

{'loss': 4.8823, 'learning_rate': 2.702702702702703e-06, 'epoch': 0.95}


 95%|█████████▍| 281/296 [4:40:39<14:55, 59.69s/it]

{'loss': 4.5215, 'learning_rate': 2.533783783783784e-06, 'epoch': 0.95}


 95%|█████████▌| 282/296 [4:41:39<13:59, 59.95s/it]

{'loss': 4.765, 'learning_rate': 2.364864864864865e-06, 'epoch': 0.95}


 96%|█████████▌| 283/296 [4:42:37<12:52, 59.45s/it]

{'loss': 4.5865, 'learning_rate': 2.195945945945946e-06, 'epoch': 0.96}


 96%|█████████▌| 284/296 [4:43:37<11:53, 59.49s/it]

{'loss': 4.6707, 'learning_rate': 2.0270270270270273e-06, 'epoch': 0.96}


 96%|█████████▋| 285/296 [4:44:38<10:57, 59.81s/it]

{'loss': 4.7521, 'learning_rate': 1.8581081081081083e-06, 'epoch': 0.96}


 97%|█████████▋| 286/296 [4:45:37<09:57, 59.73s/it]

{'loss': 4.4125, 'learning_rate': 1.6891891891891894e-06, 'epoch': 0.97}


 97%|█████████▋| 287/296 [4:46:39<09:03, 60.34s/it]

{'loss': 4.5509, 'learning_rate': 1.5202702702702704e-06, 'epoch': 0.97}


 97%|█████████▋| 288/296 [4:47:38<07:58, 59.84s/it]

{'loss': 4.7045, 'learning_rate': 1.3513513513513515e-06, 'epoch': 0.97}


 98%|█████████▊| 289/296 [4:48:37<06:58, 59.79s/it]

{'loss': 4.8241, 'learning_rate': 1.1824324324324326e-06, 'epoch': 0.98}


 98%|█████████▊| 290/296 [4:49:39<06:01, 60.27s/it]

{'loss': 4.7342, 'learning_rate': 1.0135135135135136e-06, 'epoch': 0.98}


 98%|█████████▊| 291/296 [4:50:38<04:59, 59.92s/it]

{'loss': 4.4329, 'learning_rate': 8.445945945945947e-07, 'epoch': 0.98}


 99%|█████████▊| 292/296 [4:51:39<04:01, 60.29s/it]

{'loss': 4.6195, 'learning_rate': 6.756756756756758e-07, 'epoch': 0.99}


 99%|█████████▉| 293/296 [4:52:38<03:00, 60.07s/it]

{'loss': 4.484, 'learning_rate': 5.067567567567568e-07, 'epoch': 0.99}


 99%|█████████▉| 294/296 [4:53:38<01:59, 59.93s/it]

{'loss': 4.5378, 'learning_rate': 3.378378378378379e-07, 'epoch': 0.99}


100%|█████████▉| 295/296 [4:54:38<00:59, 59.84s/it]

{'loss': 4.8153, 'learning_rate': 1.6891891891891894e-07, 'epoch': 1.0}


100%|██████████| 296/296 [4:54:56<00:00, 47.27s/it]

{'loss': 4.6314, 'learning_rate': 0.0, 'epoch': 1.0}


                                                   
100%|██████████| 296/296 [5:24:47<00:00, 65.83s/it]


{'eval_loss': 4.582881450653076, 'eval_accuracy': 0.026023484608060934, 'eval_runtime': 1790.949, 'eval_samples_per_second': 5.278, 'eval_steps_per_second': 0.66, 'epoch': 1.0}
{'train_runtime': 19487.0879, 'train_samples_per_second': 0.485, 'train_steps_per_second': 0.015, 'train_loss': 4.678220758566985, 'epoch': 1.0}


In [None]:
# eval_accuracy after the 1-epoch run above: 0.026023484608060934

In [12]:
# Upload the trained model to the Hugging Face Hub repository 'hygpt-compress-class'.
model._model.push_to_hub('hygpt-compress-class')

Cloning https://huggingface.co/josetapia/hygpt-compress-class into local empty directory.
Upload file pytorch_model.bin: 100%|█████████▉| 340M/340M [10:14<00:00, 587kB/s] remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/josetapia/hygpt-compress-class
   5bd038c..a83bca3  main -> main

Upload file pytorch_model.bin: 100%|██████████| 340M/340M [10:17<00:00, 578kB/s]


'https://huggingface.co/josetapia/hygpt-compress-class/commit/a83bca39e13fdff3ad0ecd37ed786cec7e59997b'

In [None]:

# Fine-tuning hyper-parameters for this run
batch_size = 32
epochs = 8

# Drop cached GPU memory left over from earlier cells before a new run starts
torch.cuda.empty_cache()

# Checkpoints and logs both go to 'hybertheuristic-10'
training_args = TrainingArguments(
    output_dir='hybertheuristic-10',
    logging_dir='hybertheuristic-10',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    learning_rate=5e-5,
    weight_decay=0.01,
    # Evaluate once at the end of every epoch.
    # NOTE(review): eval_steps is documented as applying to the 'steps'
    # strategy, so it appears to be unused here - TODO confirm.
    evaluation_strategy='epoch',
    eval_steps=1,
    # Log the training loss after every optimisation step
    logging_steps=1,
    disable_tqdm=False,
)

# Pad every batch dynamically with the model's own tokenizer
data_collator = DataCollatorWithPadding(tokenizer=model._tokenizer, padding=True)

# NOTE(review): eval_dataset is the training set, so the reported eval metrics
# measure fit on the training data - confirm this is intended.
model._trainer = Trainer(
    model=model._model,
    tokenizer=model._tokenizer,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
)

# Fit model
model._trainer.train()

In [None]:
# Publish the model to the Hugging Face Hub under 'hybertheuristic-trainer-3'.
# NOTE(review): this name differs from the trainer's output_dir ('hybertheuristic-10') - confirm it is the intended target.
model._model.push_to_hub('hybertheuristic-trainer-3') 

In [None]:
def predict(seq, verbose=False):
  """Return the model's probability vector for a sequence of operator ids.

  The items of `seq` are stringified and joined with ', ' into one string,
  tokenized with `model._tokenizer`, wrapped in a one-row dataset and passed
  to `model._predict`. The first row of the returned `.predictions`
  (presumably the raw class logits) is normalised with a softmax.

  Args:
    seq: iterable of values forming the sequence; each is converted with `str`.
    verbose: when True, print the tokenizer output and the dataset built
      from it (debugging aid). Defaults to False so calls stay quiet.

  Returns:
    Array with the same shape as the first prediction row; its entries sum
    to 1 along axis 0.
  """
  seq_str = ', '.join(str(x) for x in seq)
  token = model._tokenizer([seq_str])
  if verbose:
    print(token)
  sequence_dataset = Dataset_hf.from_dict(token)
  if verbose:
    print(sequence_dataset)

  logits = np.asarray(model._predict(sequence_dataset).predictions[0])

  # Subtract the max before exponentiating: the softmax value is unchanged,
  # but large logits no longer overflow to inf and produce nan.
  exp_logits = np.exp(logits - logits.max(axis=0, keepdims=True))
  return exp_logits / exp_logits.sum(axis=0)


# Probability vector predicted for the one-element sequence [0]; later cells pass it to `choices` as sampling weights.
A = predict([0])
A

In [None]:
# Draw five operators from the default collection in one call, weighted by the
# predicted probabilities in A. `k=5` performs five independent draws with
# replacement, replacing five copy-pasted single-draw cells.
choices(encoded_heuristic_space['default.txt'], weights=A, k=5)