In [1]:
import configparser
import os 
from pathlib import Path
import pandas as pd
import json
import re
import numpy as np
from transformers import PreTrainedTokenizerFast

  from .autonotebook import tqdm as notebook_tqdm


## Descarga de archivos

In [3]:
base_dir = Path(os.getcwd()).resolve().parent
config = configparser.ConfigParser()
config.read(os.path.join(base_dir,'config.ini'))

['/home/lamedinaa/testing_rl/config.ini']

In [4]:
autor = 'nihal'
name_file_abstract_datasets = f"abstract_episodes_4_nihal_351905032025.csv"
path_abstract_datasets_dir = config.get(autor,'path_abstract_datasets')
path_file_abstract_datasets = os.path.join(base_dir,path_abstract_datasets_dir,name_file_abstract_datasets)
print(path_file_abstract_datasets)

name_file_abstract_states = f'abstract_states_4_nihal_351905032025.json'
path_abstract_states_dir = config.get(autor,'path_abstract_states_dir')
path_file_abstract_states = os.path.join(base_dir,path_abstract_states_dir,name_file_abstract_states)
print(path_file_abstract_states)

### abstract to abstract class to build vocab
f = open(path_file_abstract_states,'r')
dict_abstract_states = json.load(f)


/home/lamedinaa/testing_rl/data/4_random_forest_model/datasets/abstract_episodes_4_nihal_351905032025.csv
/home/lamedinaa/testing_rl/data/2_abstract_classes/abstract_states_4_nihal_351905032025.json


In [8]:
##### DF DASTASET
df_abstract_states = pd.read_csv(path_file_abstract_datasets,sep=';')

## Tokenize

In [52]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace

## definimos el vocabularion
especial_words = ["1","0","True",'[UNK]','[CLS]']
vocabulary_map = {f'w{i+1}':abstract_class for i,abstract_class in enumerate(dict_abstract_states.keys()) }
vocabulary = list(vocabulary_map.keys()) + especial_words 
## tokenizamos
vocab =  {word:idx for idx,word in enumerate(vocabulary)}

tokenizer = Tokenizer(WordLevel(vocab,unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

print(len(vocabulary))

50


In [53]:
#### get and prepare data
df_abstract_states = pd.read_csv(path_file_abstract_datasets,sep=';')

def clean_text(row):
    return '[CLS]'+ re.sub(r'\[\[.*?\]\]', '[UNK]', row['parse_abstract_states']) + ' True'

texts = df_abstract_states.apply(clean_text,axis=1)

###### CODIFICACIÓN DE TEXTOS 
from transformers import PreTrainedTokenizerFast

fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object = tokenizer)
fast_tokenizer.add_special_tokens({
    'unk_token': '[UNK]',
    'pad_token': '[PAD]'
})
encodings = fast_tokenizer(
    list(texts),
    padding=True, 
    truncation=True, 
    return_tensors='pt'
)
######## PRUEBA
ids = [int(t) for t in list(encodings['input_ids'][0])]
print(tokenizer.decode(ids))
print(texts[0])


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[UNK] [UNK] [UNK] w29 1 w29 1 w29 0 w29 0 w21 1 w21 0 w21 0 w21 1 w24 0 w24 1 w24 0 w24 1 w30 0 w30 1 w30 0 w30 1 w23 0 w23 1 w23 0 w23 1 w32 0 w32 1 w32 0 w32 0 w20 1 w20 0 w20 1 w20 0 w28 1 w28 0 w28 1 w28 0 w12 1 w12 0 w12 1 w12 0 w18 1 w18 0 w18 0 w18 1 w17 0 w17 1 w17 0 w17 1 w1 0 w1 1 w1 0 w1 1 w31 0 w31 1 w31 0 w31 1 w4 0 w4 1 w4 0 w4 0 w2 1 w2 0 w2 1 w2 0 w14 1 w14 0 w14 1 w14 0 w8 0 w8 1 w8 0 w8 1 w22 0 w22 1 w22 0 w22 0 w26 1 w26 0 w26 1 w26 0 w15 1 [UNK] 0 w15 1 w15 0 w25 0 w25 1 w25 0 w25 1 w19 0 w19 0 w19 1 w19 0 w5 1 w5 0 w5 1 w5 0 w11 0 w11 1 w11 0 w11 1 w13 0 w13 1 w13 0 w13 0 w10 1 w10 0 w10 1 w10 0 w9 0 w9 1 w9 0 w9 1 w27 0 w27 1 w27 0 w27 0 w16 1 w16 0 w16 1 w16 0 w3 1 w3 0 w3 1 w3 0 w6 0 w6 1 w6 0 w6 1 w7 0 w7 1 w7 0 True
[CLS]w29 1 w29 1 w29 0 w29 0 w21 1 w21 0 w21 0 w21 1 w24 0 w24 1 w24 0 w24 1 w30 0 w30 1 w30 0 w30 1 w23 0 w23 1 w23 0 w23 1 w32 0 w32 1 w32 0 w32 0 w20 1 w20 0 w20 1 w20 0 w28 1 w28 0 w28 1 w28 0 w12 1 w12 0 w12 1 w12 0 w18 1 w18 0 w18 0 w18 1 w17

## Model Transformer txt generation

### OPTION 1

In [None]:
import torch
from transformers import GPT2Config, GPT2LMHeadModel
from torch.utils.data import Dataset,DataLoader


device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")

# Paso 1: Crear el modelo
config = GPT2Config(
    vocab_size=len(fast_tokenizer),
    n_embd=512,
    n_layer=6,
    n_head=8,
    bos_token_id=fast_tokenizer.pad_token_id,
    eos_token_id=fast_tokenizer.pad_token_id,
    pad_token_id=fast_tokenizer.pad_token_id
)
model = GPT2LMHeadModel(config)
model = model.to(device)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)
dataloader = DataLoader(encodings["input_ids"], batch_size=2, shuffle=True)

# Tu dataloader que da batches de input_ids
for epoch in range(5):  # 5 épocas
    model.train()
    for batch in dataloader:
        input_ids = batch.to(device)

        outputs = model(input_ids, labels=input_ids)  # Language Modeling: entrada=salida
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(f"Epoch {epoch+1} completada, pérdida: {loss.item()}")


save_directory = '/home/lamedinaa/testing_rl/data/5_models_LP/transformers/gpt2'
model.save_pretrained(save_directory)
fast_tokenizer.save_pretrained(save_directory)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch 1 completada, pérdida: 0.11655746400356293
Epoch 2 completada, pérdida: 0.05890839174389839
Epoch 3 completada, pérdida: 0.07286675274372101
Epoch 4 completada, pérdida: 0.054355982691049576
Epoch 5 completada, pérdida: 0.07866627722978592


#### OPTION 2: train model

In [37]:
encodings

{'input_ids': tensor([[28, 45, 28,  ..., 49, 49, 49],
        [34, 45, 34,  ..., 49, 49, 49],
        [29, 45, 29,  ..., 49, 49, 49],
        ...,
        [23, 45, 23,  ..., 49, 49, 49],
        [20, 45, 20,  ..., 49, 49, 49],
        [23, 45, 23,  ..., 49, 49, 49]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [None]:
from torch.utils.data import Dataset

class EncodingsDataset(Dataset):

    """
     Volver la clase encodings a la clase dataset necesaria para que Trainer lo acepte!!
    """
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return self.encodings["input_ids"].size(0)

    def __getitem__(self, idx):
        item = {key: tensor[idx] for key, tensor in self.encodings.items()}
        item["labels"] = item["input_ids"].clone()
        return item

dataset = EncodingsDataset(encodings)

print(type(dataset))

<class '__main__.EncodingsDataset'>


In [18]:
from transformers import Trainer,TrainingArguments
from transformers import GPT2Config, GPT2LMHeadModel
import torch
from torch.utils.data import Dataset,DataLoader,TensorDataset


device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")
save_directory = '/home/lamedinaa/testing_rl/data/5_models_LP/transformers/gpt2_model3'

config = GPT2Config(
    vocab_size=len(fast_tokenizer),
    n_embd=512,
    n_layer=6,
    n_head=8,
    bos_token_id=fast_tokenizer.pad_token_id,
    eos_token_id=fast_tokenizer.pad_token_id,
    pad_token_id=fast_tokenizer.pad_token_id
)
model = GPT2LMHeadModel(config)
model = model.to(device)


training_args = TrainingArguments(
    output_dir = save_directory,
    num_train_epochs = 5,
    per_device_train_batch_size =8, 
    save_steps=500, 
    save_total_limit=2, 
)

trainer = Trainer(
    model = model,
    args= training_args, 
    train_dataset=dataset,
)

trainer.train()


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,0.1012
1000,0.0332
1500,0.0269


TrainOutput(global_step=1565, training_loss=0.05261854447495823, metrics={'train_runtime': 544.5562, 'train_samples_per_second': 91.744, 'train_steps_per_second': 2.874, 'total_flos': 2035551080939520.0, 'train_loss': 0.05261854447495823, 'epoch': 5.0})

### Prompting

##### loading model

In [22]:
prompt = "w29 1 w29 1 w29"
inputs = fast_tokenizer(prompt, return_tensors="pt")
inputs = inputs.to(device)

In [None]:


# Paso 3: Generar texto
outputs = model.generate(
    inputs['input_ids'],
    max_length=300,
    num_return_sequences=2,
    do_sample=True,
    top_k=50,
    top_p=0.95
)

# Paso 4: Decodificar
generated_text = fast_tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)
print(texts[0])

w29 1 w29 1 w29 1 w21 0 w21 0 w21 1 w21 0 w24 0 w24 1 w24 0 w24 1 w30 0 w30 1 w30 0 w30 0 w23 1 w23 0 w23 1 w23 0 w32 1 w32 0 w32 1 w32 0 w20 1 w20 0 w20 1 w20 0 w28 0 w28 1 w28 0 w28 1 w12 0 w12 1 w12 0 w12 1 w18 0 w18 1 w18 0 w18 1 w17 0 w17 1 w17 0 w17 1 w1 0 w1 0 w1 1 w1 0 w31 1 w31 0 w31 1 w31 0 w4 1 w4 0 w4 1 w4 0 w2 0 w2 1 w2 0 w2 1 w14 0 w14 1 w14 0 w14 1 w8 0 w8 1 w8 0 w8 0 w22 1 w22 0 w22 1 w22 0 w26 1 w26 0 w26 0 w26 1 w15 0 w15 1 w15 0 w15 1 w25 0 w25 0 w25 1 w25 0 w19 1 w19 0 w19 1 w19 0 w5 0 w5 1 w5 0 w5 1 w11 0 w11 0 w11 1 w11 0 w13 1 w13 0 w13 1 w13 0 w10 0 w10 1 w10 0 w10 1 w9 0 w9 0 w9 1 w9 0 w27 1 w27 0 w27 1 w27 0 w16 0 w16 1 w16 0 w16 1 w3 0 w3 1 w3 0 w3 1 w6 0 w6 0 w6 1 w6 0 w7 1 w7 0 w7 1 True
w29 1 w29 1 w29 0 w29 0 w21 1 w21 0 w21 0 w21 1 w24 0 w24 1 w24 0 w24 1 w30 0 w30 1 w30 0 w30 1 w23 0 w23 1 w23 0 w23 1 w32 0 w32 1 w32 0 w32 0 w20 1 w20 0 w20 1 w20 0 w28 1 w28 0 w28 1 w28 0 w12 1 w12 0 w12 1 w12 0 w18 1 w18 0 w18 0 w18 1 w17 0 w17 1 w17 0 w17 1 w1 0 w1 1 

In [21]:
device

device(type='cuda', index=3)

In [39]:
from transformers import AutoTokenizer

model.eval()
device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")
model.to(device)

prompt = "w3"
inputs = fast_tokenizer(prompt, return_tensors="pt")
inputs = inputs.to(device)

outputs = model.generate(
    **inputs,
    max_length=300,
    do_sample=True,
    temperature=1.0,         # controla “creatividad” (1.0 = sin cambio)
    num_return_sequences=2,
    top_k=50,               # máximo 50
    top_p=0.95              # elige los tokens que sumen una probabilidad >= 0.95
)

###
generated_text = fast_tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)
print(texts[0])

w3 1 True 1 True 1 True 0 w34 0 w29 1 True 0 w34 1 w29 0 True 0 True 1 True
w29 1 w29 1 w29 0 w29 0 w21 1 w21 0 w21 0 w21 1 w24 0 w24 1 w24 0 w24 1 w30 0 w30 1 w30 0 w30 1 w23 0 w23 1 w23 0 w23 1 w32 0 w32 1 w32 0 w32 0 w20 1 w20 0 w20 1 w20 0 w28 1 w28 0 w28 1 w28 0 w12 1 w12 0 w12 1 w12 0 w18 1 w18 0 w18 0 w18 1 w17 0 w17 1 w17 0 w17 1 w1 0 w1 1 w1 0 w1 1 w31 0 w31 1 w31 0 w31 1 w4 0 w4 1 w4 0 w4 0 w2 1 w2 0 w2 1 w2 0 w14 1 w14 0 w14 1 w14 0 w8 0 w8 1 w8 0 w8 1 w22 0 w22 1 w22 0 w22 0 w26 1 w26 0 w26 1 w26 0 w15 1 [UNK] 0 w15 1 w15 0 w25 0 w25 1 w25 0 w25 1 w19 0 w19 0 w19 1 w19 0 w5 1 w5 0 w5 1 w5 0 w11 0 w11 1 w11 0 w11 1 w13 0 w13 1 w13 0 w13 0 w10 1 w10 0 w10 1 w10 0 w9 0 w9 1 w9 0 w9 1 w27 0 w27 1 w27 0 w27 0 w16 1 w16 0 w16 1 w16 0 w3 1 w3 0 w3 1 w3 0 w6 0 w6 1 w6 0 w6 1 w7 0 w7 1 w7 0 True


In [26]:
generated_text = fast_tokenizer.decode(outputs[1], skip_special_tokens=True)
print(generated_text)
print(texts[0])

w29 1 w29 1 w29 0 w29 0 w21 1 w21 0 w21 1 w21 0 w24 0 w24 1 w24 0 w24 1 w30 0 w30 1 w30 0 w30 1 w23 0 w23 1 w23 0 w23 1 w32 0 w32 1 w32 0 w32 1 w20 0 w20 0 w20 1 w20 0 w28 1 w28 0 w28 1 w28 0 w12 1 w12 0 w12 1 w12 0 w18 1 w18 0 w18 0 w18 1 w17 0 w17 1 w17 0 w1 1 w1 0 w1 1 w31 0 w31 1 w31 0 w31 1 w4 0 w4 1 w4 0 w4 0 w2 1 w2 0 w2 1 w2 0 w14 1 w14 0 w14 1 w14 0 w8 0 w8 1 w8 0 w8 1 w22 0 w22 1 w22 0 w26 0 w26 1 w26 0 w26 1 w15 0 w15 1 w15 0 w25 0 w25 1 w25 0 w25 1 w19 0 w19 0 w19 1 w5 0 w5 1 w5 0 w11 1 w11 0 w11 0 w11 1 w13 0 w13 1 w13 0 w10 0 w10 1 w10 0 w9 1 w9 0 w9 0 w9 1 w27 0 w27 1 w27 0 w16 1 w16 0 w16 0 w3 1 w3 0 w3 1 w3 0 w6 1 w6 0 w6 0 w7 1 w7 0 True
w29 1 w29 1 w29 0 w29 0 w21 1 w21 0 w21 0 w21 1 w24 0 w24 1 w24 0 w24 1 w30 0 w30 1 w30 0 w30 1 w23 0 w23 1 w23 0 w23 1 w32 0 w32 1 w32 0 w32 0 w20 1 w20 0 w20 1 w20 0 w28 1 w28 0 w28 1 w28 0 w12 1 w12 0 w12 1 w12 0 w18 1 w18 0 w18 0 w18 1 w17 0 w17 1 w17 0 w17 1 w1 0 w1 1 w1 0 w1 1 w31 0 w31 1 w31 0 w31 1 w4 0 w4 1 w4 0 w4 0 w2 1 w2 

In [36]:
dir(model.transformer.wte.weight)

Parameter containing:
tensor([[ 0.0135,  0.0093, -0.0225,  ...,  0.0290, -0.0041, -0.0025],
        [-0.0215,  0.0151,  0.0091,  ...,  0.0241, -0.0054,  0.0267],
        [ 0.0066, -0.0003, -0.0704,  ...,  0.0288,  0.0062,  0.0195],
        ...,
        [ 0.0125, -0.0450, -0.0536,  ...,  0.0143,  0.0027, -0.0330],
        [-0.0076,  0.0061, -0.0181,  ...,  0.0318, -0.0044,  0.0029],
        [ 0.0103,  0.0113,  0.0043,  ..., -0.0055,  0.0324,  0.0190]],
       device='cuda:3', requires_grad=True)

In [50]:
GPT2LMHeadModel.__dir__

<function torch.nn.modules.module.Module.__dir__(self)>

In [49]:
dir(GPT2LMHeadModel)

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_assisted_decoding',
 '_auto_class',
 '_autoset_attn_implementation',
 '_backward_compatibility_gradient_checkpointing',
 '_beam_search',
 '_beam_search_has_unfinished_sequences',
 '_cache_dependant_input_preparation',
 '_cache_dependant_input_preparation_exporting',
 '_call_impl',
 '_check_and_enable_flash_attn_2',
 '_check_and_enable_flex_attn',
 '_check_and_enable_sdpa',
 '_compiled_call_impl',
 '_constrained_beam_search',
 '_contrastive_search',
 '_convert_head_mask_to_5d',
 '_copy_lm_head_original_to_resiz

In [51]:
dir(fast_tokenizer)

['SPECIAL_TOKENS_ATTRIBUTES',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_tokens',
 '_auto_class',
 '_batch_encode_plus',
 '_call_one',
 '_convert_encoding',
 '_convert_id_to_token',
 '_convert_token_to_id_with_added_voc',
 '_create_repo',
 '_decode',
 '_decode_use_source_tokenizer',
 '_encode_plus',
 '_eventual_warn_about_too_long_sequence',
 '_eventually_correct_t5_max_length',
 '_from_pretrained',
 '_get_files_timestamps',
 '_get_padding_truncation_strategies',
 '_in_target_context_manager',
 '_pad',
 '_pad_token_type_id',
 '_processor_class',
 '_save_pretrained',
 '_set_model_specific_special_to