In [1]:
! pip -q install transformers

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from peft import LoraConfig, get_peft_model

#carregando modelo base
device = "cuda"
model_name = "microsoft/DialoGPT-medium"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)



tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [3]:
import json

#treino
f_train = open("/kaggle/input/dadoclusterizado/treino_cluster.json")
train_data = json.load(f_train)
f_train.close()

#validacao
f_validate = open("/kaggle/input/dadoclusterizado/val_cluster.json")
validate_data = json.load(f_validate)
f_validate.close()


In [4]:

#criando dicionarios com o par pergunta e resposta
train_data = [
    {
        "input": f"{pair[0]}",
        "label": f"{pair[1]}"
    }
    for pair in train_data
]


validate_data = [
    {
        "input": f"{pair[0]}",
        "label": f"{pair[1]}"
    }
    for pair in validate_data
]


In [5]:
import pandas as pd
from transformers import AutoTokenizer
from datasets import Dataset

tokenizer


GPT2TokenizerFast(name_or_path='microsoft/DialoGPT-medium', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)

In [6]:
print(tokenizer.eos_token)  #mostrando qual o token de fim de sequencia


<|endoftext|>


In [7]:
import pandas as pd
from transformers import AutoTokenizer
import torch

#função para criacao de contexto acumulado
def create_contextual_data(data, history_size=3):
    contextual_data = []
    
    for i in range(history_size, len(data)):
        context = []
        prev = i - history_size
        for j in range(i, prev, -1):
            #adciona o token de fim de sequencia após a pergunta (input) e resposta (label)
            context.append(f"{data[j]['input']} <|endoftext|> {data[j]['label']} <|endoftext|> ")

        
        contextual_entry = {
            "context": "\n".join(context),  #historizy_size=3 indica q o contexto é formado por 3 pares pergunta-resposta
            "text": data[i]["input"],       #pergunta atual
            "labels": data[i]["label"]      #resposta atual
        }
        
        contextual_data.append(contextual_entry)
    
    return contextual_data

#a função aos dados
train_contexted = create_contextual_data(train_data, history_size=1)
validate_contexted = create_contextual_data(validate_data, history_size=1)

In [8]:
columns = ['context', 'text', 'labels']
trn_df = pd.DataFrame.from_records(train_contexted, columns=columns)
vld_df = pd.DataFrame.from_records(validate_contexted, columns=columns)

trn_df['context'].head()

0    How can I highlight boots in my outfit? <|endo...
1    About summer clothes, what outfit can I wear? ...
2    In summer, what kind of colors should I use mo...
3    In autumn, what kind of colors should I use mo...
4    In winter, what kind of colors should I use mo...
Name: context, dtype: object

In [9]:


#função para construir a sequencia completa: contexto + pergunta + resposta
def construct_conv(row, tokenizer):
    #concatena contexto, pergunta e resposta, adicionando token de fim de sequencia (EOS) ao final de cada parte
    conversation = (
        row["context"]  #contexto já tem EOS
        + row["text"] + "<|endoftext|> "  #EOS após a pergunta atual
        + row["labels"] + "<|endoftext|> "  #EOS após a resposta atual
    )
    
    #tokenizar a sequência concatenada
    tokenized_conv = tokenizer(
        conversation,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )
    
    return tokenized_conv

#tokenizar os dados de treinamento e validação
tokenized_train = [construct_conv(row, tokenizer) for _, row in trn_df.iterrows()]
tokenized_test = [construct_conv(row, tokenizer) for _, row in vld_df.iterrows()]

#como o modelo é de autoregressao pode-se passar os dados diretamente sem a marcacao entrada e resposta (input e label)

In [10]:
from transformers import Trainer, TrainingArguments
from transformers import AdamW
import torch



In [11]:
# import torch
# torch.cuda.empty_cache()

In [None]:
from transformers import Trainer, TrainingArguments,DataCollatorForLanguageModeling,EarlyStoppingCallback
from transformers import AdamW


#token de padding
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

#preparando dados para o treinamento , mlm false pois o dialo é gpt like e preve com base na palavra anterior
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)


training_args = TrainingArguments(
    output_dir="./moda_modelo",
    num_train_epochs=4,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    evaluation_strategy="epoch",
    logging_strategy="steps",
    logging_steps=1,
    fp16=True,
    weight_decay=0.01,
    adam_epsilon=1e-8,
    max_grad_norm=1.0,
    lr_scheduler_type="linear",
    warmup_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_strategy="epoch",
    save_total_limit=2,
    logging_dir="./logs",
    report_to="none",
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
)


optimizer = AdamW(model.parameters(), lr=training_args.learning_rate, eps=training_args.adam_epsilon, weight_decay=training_args.weight_decay)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,  # 
    eval_dataset=tokenized_test,    #
    optimizers=(optimizer, None),  #
    data_collator=data_collator,
)

result= trainer.train()




Epoch,Training Loss,Validation Loss
1,1.0203,1.13472
2,0.8053,0.923463
3,0.6874,0.875609
4,0.7028,0.866737


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


In [13]:
eval_results = trainer.evaluate()

#calculo de perplexidade com base na loss média retornada pelo Trainer
if "eval_loss" in eval_results:
    eval_loss = eval_results["eval_loss"]
    perplexity = torch.exp(torch.tensor(eval_loss))
    eval_results["perplexity"] = perplexity.item()

print(eval_results)



{'eval_loss': 0.8667369484901428, 'eval_runtime': 11.0541, 'eval_samples_per_second': 10.675, 'eval_steps_per_second': 0.724, 'epoch': 4.0, 'perplexity': 2.3791348934173584}


In [14]:
trainer.save_model("./moda_modelo/final")
tokenizer.save_pretrained("./moda_modelo/final")


('./moda_modelo/final/tokenizer_config.json',
 './moda_modelo/final/special_tokens_map.json',
 './moda_modelo/final/vocab.json',
 './moda_modelo/final/merges.txt',
 './moda_modelo/final/added_tokens.json',
 './moda_modelo/final/tokenizer.json')

In [15]:
result.metrics

{'train_runtime': 1594.7334,
 'train_samples_per_second': 3.208,
 'train_steps_per_second': 0.201,
 'total_flos': 4751232753205248.0,
 'train_loss': 1.2060132037848235,
 'epoch': 4.0}

In [16]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_path = "/kaggle/working/moda_modelo/final"

#carregando modelo e o tokenizador treinado
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)


In [17]:
from transformers import AutoTokenizer, AutoModelForCausalLM


def generate_response(prompt,model):
    
    bot_input_ids = tokenizer.encode(prompt + tokenizer.eos_token, return_tensors='pt')

    #gerando uma resposta
    chat_history_ids = model.generate(
        bot_input_ids, max_length=256,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=50,
        top_p=0.9,
        temperature=0.7
    )

    #decodificar e retornar a resposta
    response = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
    return response

#dataset era em ingles portanto prompt em ingles
prompt = "i am a woman. Can you reccomend a outfit that make use of a hoodie, what is the best choice of cloths to make a outstanding outfit?"
response = generate_response(prompt,model)
print("Response: ", response)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Response:   A hoodie can be paired with jeans or a jumpsuit for a trendy look. If you want to stay trendy, wear it over a dress or skirt. Accessories like a belt or a scarf can also elevate the look.
