# NEW CODE

Verificar que los modelos caben en memoria.

Verificar clip

In [None]:
import torch
from transformers import CLIPModel

# Smoke test: confirm CLIP ViT-L/14 can be loaded and dynamically quantized.
# Dynamic int8 quantization only provides CPU kernels, so the quantized model is
# kept on the CPU (the embedding step further down also runs it on the CPU).
# Calling .cuda() on it would move only the non-quantized parameters and leave
# the model split across devices.
try:
    clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
    clip_model.eval()
    clip_model = torch.quantization.quantize_dynamic(clip_model, {torch.nn.Linear}, dtype=torch.qint8)
    print("CLIP model loaded successfully.")
except RuntimeError as e:
    # Best-effort check: report the failure instead of aborting the notebook.
    print("Error loading CLIP model:", e)


CLIP model loaded successfully.


Verificar mistral

In [None]:
import torch
from transformers import AutoModelForCausalLM

import os

# Smoke test: confirm Mistral-7B-Instruct fits on the GPU.
# - The Hugging Face token is read from the HF_TOKEN environment variable rather
#   than being written in the notebook (None falls back to the cached login).
# - Weights are loaded in float16: the default float32 load needs roughly twice
#   the memory and is what produced the CUDA out-of-memory error recorded below.
try:
    mistral_model = AutoModelForCausalLM.from_pretrained(
        "mistralai/Mistral-7B-Instruct-v0.2",
        token=os.environ.get("HF_TOKEN"),
        torch_dtype=torch.float16,
    )
    mistral_model.cuda()
    print("Mistral model loaded successfully.")
except RuntimeError as e:
    # CUDA out-of-memory is reported as a RuntimeError; print it and carry on.
    print("Error loading Mistral model:", e)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Error loading Mistral model: CUDA out of memory. Tried to allocate 500.00 MiB. GPU 


## ENFOQUE ENCODER DECODER LLAMA

1. Generar y almacenar embeddings de imagenes

In [None]:
from transformers import CLIPModel, CLIPProcessor
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
import json
import os
import pickle

# Definición del dataset para generar embeddings
class ImageDataset(Dataset):
    """Dataset yielding ``(image_tensor, tweet_id)`` pairs for embedding extraction.

    Args:
        data_dict: Mapping whose keys are tweet ids; only the keys are used here.
        img_dir: Directory containing one ``<tweet_id>.jpg`` file per tweet.
        processor: Stored on the instance but not used by this class; images are
            prepared by the torchvision pipeline built in ``__init__``.
    """

    def __init__(self, data_dict, img_dir, processor):
        self.data_dict = data_dict
        self.img_dir = img_dir
        self.processor = processor
        # Snapshot the ids once so __getitem__ is an O(1) list lookup instead of
        # rebuilding list(data_dict.keys()) for every single sample.
        self._tweet_ids = list(data_dict)
        self.transforms = Compose([
            Resize((224, 224)),
            ToTensor(),
            Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])
        ])

    def __len__(self):
        """Return the number of tweet ids captured at construction time."""
        return len(self._tweet_ids)

    def __getitem__(self, idx):
        """Load, convert to RGB, and transform the image of the ``idx``-th tweet.

        Returns:
            tuple: ``(transformed_image, tweet_id)``.
        """
        tweet_id = self._tweet_ids[idx]
        img_path = os.path.join(self.img_dir, tweet_id + ".jpg")
        # Close the file handle deterministically; convert() returns a new,
        # fully loaded image that stays valid after the file is closed.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')
        return self.transforms(image), tweet_id

def generate_and_save_embeddings(data_dict, img_dir, processor, model, batch_size=8, output_file='image_embeddings.pkl'):
    """Compute an image embedding per tweet and pickle them as ``{tweet_id: array}``.

    Args:
        data_dict: Mapping keyed by tweet id (forwarded to ``ImageDataset``).
        img_dir: Directory holding the ``<tweet_id>.jpg`` images.
        processor: Forwarded to ``ImageDataset``.
        model: Object exposing ``eval()``, ``parameters()`` and
            ``get_image_features(pixel_values=...)``.
        batch_size: Number of images per forward pass.
        output_file: Path of the pickle file to write.
    """
    dataset = ImageDataset(data_dict, img_dir, processor)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    model.eval()

    # Run each batch on whatever device the model lives on, so the function
    # works for CPU and GPU models alike (falls back to CPU if the model
    # exposes no parameters).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    embeddings = {}
    with torch.no_grad():
        for images, tweet_ids in dataloader:
            features = model.get_image_features(pixel_values=images.to(device))
            # .cpu() is required before .numpy() when the model runs on a GPU.
            features = features.cpu().numpy()
            for tweet_id, embedding in zip(tweet_ids, features):
                embeddings[tweet_id] = embedding

    with open(output_file, 'wb') as f:
        pickle.dump(embeddings, f)

def main_generate_embeddings():
    """Embed every image referenced by the train/val/test splits with CLIP.

    Reads ``MMHS150K_GT.json`` and ``splits/{train,val,test}_ids.txt`` relative
    to the working directory, keeps the ids present in the ground-truth file,
    and writes the embeddings to ``image_embeddings.pkl``.
    """
    base_path = "./"
    with open(os.path.join(base_path, 'MMHS150K_GT.json'), 'r') as f:
        data = json.load(f)

    # Concatenate the ids in train -> val -> test order.
    split_ids = []
    for split in ('train', 'val', 'test'):
        with open(os.path.join(base_path, 'splits', f'{split}_ids.txt'), 'r') as f:
            split_ids.extend(f.read().split())

    dict_data = {x: data[x] for x in split_ids if x in data}

    processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
    clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
    clip_model.eval()
    # Only nn.Linear is listed: dynamic quantization has no Conv2d
    # implementation, so naming Conv2d here had no effect on the model.
    clip_model = torch.quantization.quantize_dynamic(clip_model, {torch.nn.Linear}, dtype=torch.qint8)

    generate_and_save_embeddings(
        dict_data,
        os.path.join(base_path, 'img_resized'),
        processor,
        clip_model,
        batch_size=32,
        output_file='image_embeddings.pkl',
    )

if __name__ == "__main__":
    main_generate_embeddings()


2024-06-10 22:13:27.519366: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-10 22:13:27.614689: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


2. Entrenar decoder

In [None]:
import re
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import (
    LoraConfig,
    get_peft_model,
)
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from datasets import DatasetDict
import wandb
from trl import SFTTrainer
import pickle
import json
import gc
import torch.cuda.amp as amp
from accelerate import infer_auto_device_map, dispatch_model
from accelerate.utils import set_module_tensor_to_device

# Preprocessing function
def preprocess_text(text):
    """Strip URLs from a tweet and mask every @mention with the literal [USR]."""
    # Links are removed first, then mentions are masked.
    without_links = re.sub(r"http\S+", "", text)
    return re.sub(r"@\w+", "[USR]", without_links)

# Dataset class
class TextDataset(Dataset):
    """Dataset pairing each tweet's tokenized text with its stored image embedding.

    Args:
        data_dict: Mapping ``tweet_id -> info`` where ``info['tweet_text']`` is
            the raw tweet.
        embeddings: Mapping ``tweet_id -> embedding`` (anything accepted by
            ``torch.tensor``).
        tokenizer: Callable tokenizer exposing ``pad_token_id``.
        max_length: Length every tokenized tweet is padded/truncated to.
    """

    def __init__(self, data_dict, embeddings, tokenizer, max_length=128):
        self.data_dict = data_dict
        self.embeddings = embeddings
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Snapshot the ids once so __getitem__ is an O(1) list lookup instead of
        # rebuilding list(data_dict.keys()) for every single sample.
        self._tweet_ids = list(data_dict)

    def __len__(self):
        """Return the number of tweet ids captured at construction time."""
        return len(self._tweet_ids)

    def __getitem__(self, idx):
        """Build the training example for the ``idx``-th tweet.

        Returns:
            dict with keys ``embedding`` (float16 tensor), ``input_ids`` (token
            ids shifted right by one with a pad token in front),
            ``attention_mask``, ``labels`` (unshifted token ids),
            ``tweet_text`` (preprocessed string) and ``inputs`` (unshifted ids).
        """
        tweet_id = self._tweet_ids[idx]
        tweet_info = self.data_dict[tweet_id]
        embedding = self.embeddings[tweet_id]

        tweet_text = preprocess_text(tweet_info['tweet_text'])
        inputs = self.tokenizer(tweet_text, return_tensors="pt", max_length=self.max_length, padding='max_length', truncation=True)

        token_ids = inputs.input_ids.squeeze(0)
        labels = token_ids.clone()
        # NOTE(review): input_ids is shifted right by one here while labels and
        # attention_mask are not. Hugging Face causal-LM heads normally shift
        # labels internally, so this looks like a double shift - confirm intent.
        input_ids = torch.cat([torch.tensor([self.tokenizer.pad_token_id]), token_ids[:-1]])

        return {
            "embedding": torch.tensor(embedding, dtype=torch.float16),
            "input_ids": input_ids,
            "attention_mask": inputs.attention_mask.squeeze(0),
            "labels": labels,
            "tweet_text": tweet_text,
            "inputs": token_ids
        }

# Function to train the model
def main_train_decoder():
    """Fine-tune Llama 3 8B Instruct (4-bit NF4 + LoRA) on the MMHS150K tweets.

    Reads ``MMHS150K_GT.json`` and the train/val id lists under ``splits/``
    (relative to the working directory), loads ``image_embeddings.pkl``, builds
    the train/validation datasets and runs one epoch with ``SFTTrainer`` while
    logging to Weights & Biases.

    Credentials come from the ``HF_TOKEN`` and ``WANDB_API_KEY`` environment
    variables instead of being written in the notebook. When a variable is
    unset, ``None`` is passed and the library is expected to fall back to its
    own stored login.
    """
    import os  # local import: only needed here, to read credentials from the environment

    base_path = "./"
    with open(f'{base_path}/MMHS150K_GT.json', 'r') as f:
        data = json.load(f)

    with open(f'{base_path}/splits/train_ids.txt', 'r') as f:
        id_train = f.read().split()
    with open(f'{base_path}/splits/val_ids.txt', 'r') as f:
        id_val = f.read().split()

    # Keep only ids that exist in the ground-truth file.
    dict_train = {x: data[x] for x in id_train if x in data}
    dict_val = {x: data[x] for x in id_val if x in data}

    # NOTE: pickle can execute arbitrary code on load; only open a file that
    # was produced locally by the embedding step.
    with open('image_embeddings.pkl', 'rb') as f:
        embeddings = pickle.load(f)

    token = os.environ.get("HF_TOKEN")
    api_key = os.environ.get("WANDB_API_KEY")

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    # The token is passed for the model as well as the tokenizer: both come
    # from the same gated repository.
    llama_model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Meta-Llama-3-8B-Instruct",
        quantization_config=bnb_config,
        device_map="auto",
        token=token,
    )

    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token=token)

    # Reuse EOS as the padding token when the tokenizer defines none.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
    )
    # NOTE(review): the model is wrapped here and peft_config is also handed to
    # SFTTrainer below - confirm the trainer does not need to do the wrapping
    # itself (e.g. to prepare the 4-bit model for training).
    llama_model = get_peft_model(llama_model, peft_config)

    # NOTE(review): the image embeddings are only stored inside the dataset
    # items; nothing in this function feeds them to the model.
    train_dataset = TextDataset(dict_train, embeddings, tokenizer)
    val_dataset = TextDataset(dict_val, embeddings, tokenizer)

    # Start the W&B run before training.
    wandb.login(key=api_key)
    wandb.init(
        project='Fine-tune Llama 3 8B on Image Embeddings', 
        job_type="training", 
        anonymous="allow"
    )

    training_arguments = TrainingArguments(
        output_dir="llama-3-8b-meme-poster",
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=2,
        optim="paged_adamw_32bit",
        num_train_epochs=1,
        evaluation_strategy="steps",
        eval_steps=0.2,
        logging_steps=1,
        warmup_steps=10,
        logging_strategy="steps",
        learning_rate=2e-4,
        fp16=False,
        bf16=False,
        group_by_length=True,
        report_to="wandb"
    )

    trainer = SFTTrainer(
        model=llama_model,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        peft_config=peft_config,
        max_seq_length=512,
        dataset_text_field="tweet_text",
        tokenizer=tokenizer,
        args=training_arguments,
        packing=False,
    )

    try:
        trainer.train()
    finally:
        # Always close the W&B run, even when training raises or is interrupted.
        wandb.finish()

if __name__ == "__main__":
    main_train_decoder()


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Error while calling W&B API: run jtorwjgz was previously created and deleted; try a new run name (<Response [409]>)
wandb: ERROR Error while calling W&B API: run jtorwjgz was previously created and deleted; try a new run name (<Response [409]>)
wandb: ERROR Error while calling W&B API: run jtorwjgz was previously created and deleted; try a new run name (<Response [409]>)
wandb: ERROR Error while calling W&B API: run jtorwjgz was previously created and deleted; try a new run name (<Response [409]>)
wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)
wandb: ERROR Error while calling W&B API: run jtorwjgz was previously created and deleted; try a new run name (<Response [409]>)
wandb: ERROR Error while calling W&B API: run jtorwjgz was previously created and deleted; try a new run name (<Response [409]>)
wandb: ERROR Error while calling W&B API: run jtorwjgz was previously created an

BrokenPipeError: [Errno 32] Broken pipe

Eso ya ejecuta, pero 15 horas para 1 epoch. Intentaré acelerar aumentando batch size

In [None]:
import re
import time
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import (
    LoraConfig,
    get_peft_model,
)
import torch
from torch.utils.data import Dataset
from datasets import DatasetDict
import wandb
from trl import SFTTrainer
import pickle
import json
import gc

# Preprocessing function
def preprocess_text(text):
    """Strip URLs from a tweet and mask every @mention with the literal [USR]."""
    # Links are removed first, then mentions are masked.
    without_links = re.sub(r"http\S+", "", text)
    return re.sub(r"@\w+", "[USR]", without_links)

# Dataset class
class TextDataset(Dataset):
    """Dataset pairing each tweet's tokenized text with its stored image embedding.

    Args:
        data_dict: Mapping ``tweet_id -> info`` where ``info['tweet_text']`` is
            the raw tweet.
        embeddings: Mapping ``tweet_id -> embedding`` (anything accepted by
            ``torch.tensor``).
        tokenizer: Callable tokenizer exposing ``pad_token_id``.
        max_length: Length every tokenized tweet is padded/truncated to.
    """

    def __init__(self, data_dict, embeddings, tokenizer, max_length=128):
        self.data_dict = data_dict
        self.embeddings = embeddings
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Snapshot the ids once so __getitem__ is an O(1) list lookup instead of
        # rebuilding list(data_dict.keys()) for every single sample.
        self._tweet_ids = list(data_dict)

    def __len__(self):
        """Return the number of tweet ids captured at construction time."""
        return len(self._tweet_ids)

    def __getitem__(self, idx):
        """Build the training example for the ``idx``-th tweet.

        Returns:
            dict with keys ``embedding`` (float16 tensor), ``input_ids`` (token
            ids shifted right by one with a pad token in front),
            ``attention_mask``, ``labels`` (unshifted token ids),
            ``tweet_text`` (preprocessed string) and ``inputs`` (unshifted ids).
        """
        tweet_id = self._tweet_ids[idx]
        tweet_info = self.data_dict[tweet_id]
        embedding = self.embeddings[tweet_id]

        tweet_text = preprocess_text(tweet_info['tweet_text'])
        inputs = self.tokenizer(tweet_text, return_tensors="pt", max_length=self.max_length, padding='max_length', truncation=True)

        token_ids = inputs.input_ids.squeeze(0)
        labels = token_ids.clone()
        # NOTE(review): input_ids is shifted right by one here while labels and
        # attention_mask are not. Hugging Face causal-LM heads normally shift
        # labels internally, so this looks like a double shift - confirm intent.
        input_ids = torch.cat([torch.tensor([self.tokenizer.pad_token_id]), token_ids[:-1]])

        return {
            "embedding": torch.tensor(embedding, dtype=torch.float16),
            "input_ids": input_ids,
            "attention_mask": inputs.attention_mask.squeeze(0),
            "labels": labels,
            "tweet_text": tweet_text,
            "inputs": token_ids
        }

# Function to train the model
def main_train_decoder():
    """Fine-tune Llama 3 8B Instruct (4-bit NF4 + LoRA) with a larger batch size.

    Same pipeline as the previous training cell, but with a per-device batch
    size of 8 and 8 gradient-accumulation steps, and a uniquely named W&B run.
    Reads ``MMHS150K_GT.json``, ``splits/train_ids.txt``, ``splits/val_ids.txt``
    and ``image_embeddings.pkl`` relative to the working directory.

    Credentials come from the ``HF_TOKEN`` and ``WANDB_API_KEY`` environment
    variables instead of being written in the notebook. When a variable is
    unset, ``None`` is passed and the library is expected to fall back to its
    own stored login.
    """
    import os  # local imports: only needed inside this function
    import traceback

    base_path = "./"
    with open(f'{base_path}/MMHS150K_GT.json', 'r') as f:
        data = json.load(f)

    with open(f'{base_path}/splits/train_ids.txt', 'r') as f:
        id_train = f.read().split()
    with open(f'{base_path}/splits/val_ids.txt', 'r') as f:
        id_val = f.read().split()

    # Keep only ids that exist in the ground-truth file.
    dict_train = {x: data[x] for x in id_train if x in data}
    dict_val = {x: data[x] for x in id_val if x in data}

    # NOTE: pickle can execute arbitrary code on load; only open a file that
    # was produced locally by the embedding step.
    with open('image_embeddings.pkl', 'rb') as f:
        embeddings = pickle.load(f)

    token = os.environ.get("HF_TOKEN")
    api_key = os.environ.get("WANDB_API_KEY")

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    # The token is passed for the model as well as the tokenizer: both come
    # from the same gated repository.
    llama_model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Meta-Llama-3-8B-Instruct",
        quantization_config=bnb_config,
        device_map="auto",
        token=token,
    )

    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token=token)

    # Reuse EOS as the padding token when the tokenizer defines none.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
    )
    # NOTE(review): the model is wrapped here and peft_config is also handed to
    # SFTTrainer below - confirm the trainer does not need to do the wrapping
    # itself (e.g. to prepare the 4-bit model for training).
    llama_model = get_peft_model(llama_model, peft_config)

    # NOTE(review): the image embeddings are only stored inside the dataset
    # items; nothing in this function feeds them to the model.
    train_dataset = TextDataset(dict_train, embeddings, tokenizer)
    val_dataset = TextDataset(dict_val, embeddings, tokenizer)

    # Start the W&B run before training; give up early if that is impossible.
    try:
        wandb.login(key=api_key)
        wandb.init(
            project='Fine-tune Llama 3 8B on Memes Embeddings', 
            job_type="training", 
            anonymous="allow",
            name=f"run-{int(time.time())}"  # unique run name based on the current timestamp
        )
    except Exception as e:
        print(f"Error initializing wandb: {e}")
        return

    training_arguments = TrainingArguments(
        output_dir="llama-3-8b-meme-poster",
        per_device_train_batch_size=8,  # raised from 1 to speed up the epoch
        per_device_eval_batch_size=8,  # raised from 1 to speed up evaluation
        gradient_accumulation_steps=8,  # raised from 2
        optim="paged_adamw_32bit",
        num_train_epochs=1,
        evaluation_strategy="steps",
        eval_steps=0.2,
        logging_steps=1,
        warmup_steps=10,
        logging_strategy="steps",
        learning_rate=2e-4,
        fp16=False,
        bf16=False,
        group_by_length=True,
        report_to="wandb"
    )

    trainer = SFTTrainer(
        model=llama_model,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        peft_config=peft_config,
        max_seq_length=512,
        dataset_text_field="tweet_text",
        tokenizer=tokenizer,
        args=training_arguments,
        packing=False,
    )

    try:
        trainer.train()
    except Exception as e:
        # Best-effort: report the failure (with its traceback, so the cause is
        # not lost) without aborting the notebook.
        print(f"Error during training: {e}")
        traceback.print_exc()
    finally:
        wandb.finish()

if __name__ == "__main__":

    main_train_decoder()


2024-06-11 20:09:50.017037: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-11 20:09:50.077641: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjsantamariag[0m ([33mj-santamariag[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/javiermo/.netrc



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Step,Training Loss,Validation Loss
422,2.908,3.021783
844,2.9799,2.948987
1266,2.6541,2.903344
1688,2.632,2.876436


VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▄▂▁
eval/runtime,▁▁█▆
eval/samples_per_second,██▁▃
eval/steps_per_second,██▁▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/grad_norm,█▁▃▂▃▃▂▃▂▃▃▃▅▄▂▃▃▂▃▁▁▃▂▃▃▂▃▃▃▂▃▃▃▂▂▂▃▂▂▂
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,█▇▇▇▅▆▆▅▅▆▅▆▅▅▆▇▅▅▄▃▁▇▅▆▄▄▄▄▄▅▆▅▃▄▄▃▂▄▄▃

0,1
eval/loss,2.87644
eval/runtime,270.5164
eval/samples_per_second,18.483
eval/steps_per_second,2.31
total_flos,7.812073095891517e+17
train/epoch,0.9997
train/global_step,2106.0
train/grad_norm,1.26374
train/learning_rate,0.0
train/loss,2.8261


In [None]:
import re
import time
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import (
    LoraConfig,
    get_peft_model,
)
import torch
from torch.utils.data import Dataset
from datasets import DatasetDict
import wandb
from trl import SFTTrainer
import pickle
import json
import gc

# Preprocessing function
def preprocess_text(text):
    """Strip URLs from a tweet and mask every @mention with the literal [USR]."""
    # Links are removed first, then mentions are masked.
    without_links = re.sub(r"http\S+", "", text)
    return re.sub(r"@\w+", "[USR]", without_links)

# Dataset class
class TextDataset(Dataset):
    """Dataset pairing each tweet's tokenized text with its stored image embedding.

    Args:
        data_dict: Mapping ``tweet_id -> info`` where ``info['tweet_text']`` is
            the raw tweet.
        embeddings: Mapping ``tweet_id -> embedding`` (anything accepted by
            ``torch.tensor``).
        tokenizer: Callable tokenizer exposing ``pad_token_id``.
        max_length: Length every tokenized tweet is padded/truncated to.
    """

    def __init__(self, data_dict, embeddings, tokenizer, max_length=128):
        self.data_dict = data_dict
        self.embeddings = embeddings
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Snapshot the ids once so __getitem__ is an O(1) list lookup instead of
        # rebuilding list(data_dict.keys()) for every single sample.
        self._tweet_ids = list(data_dict)

    def __len__(self):
        """Return the number of tweet ids captured at construction time."""
        return len(self._tweet_ids)

    def __getitem__(self, idx):
        """Build the training example for the ``idx``-th tweet.

        Returns:
            dict with keys ``embedding`` (float16 tensor), ``input_ids`` (token
            ids shifted right by one with a pad token in front),
            ``attention_mask``, ``labels`` (unshifted token ids),
            ``tweet_text`` (preprocessed string) and ``inputs`` (unshifted ids).
        """
        tweet_id = self._tweet_ids[idx]
        tweet_info = self.data_dict[tweet_id]
        embedding = self.embeddings[tweet_id]

        tweet_text = preprocess_text(tweet_info['tweet_text'])
        inputs = self.tokenizer(tweet_text, return_tensors="pt", max_length=self.max_length, padding='max_length', truncation=True)

        token_ids = inputs.input_ids.squeeze(0)
        labels = token_ids.clone()
        # NOTE(review): input_ids is shifted right by one here while labels and
        # attention_mask are not. Hugging Face causal-LM heads normally shift
        # labels internally, so this looks like a double shift - confirm intent.
        input_ids = torch.cat([torch.tensor([self.tokenizer.pad_token_id]), token_ids[:-1]])

        return {
            "embedding": torch.tensor(embedding, dtype=torch.float16),
            "input_ids": input_ids,
            "attention_mask": inputs.attention_mask.squeeze(0),
            "labels": labels,
            "tweet_text": tweet_text,
            "inputs": token_ids
        }

# Function to train the model
def main_train_decoder():
    """Fine-tune Llama 3 8B Instruct (4-bit NF4 + LoRA) for three epochs in fp16.

    Reads ``MMHS150K_GT.json``, ``splits/train_ids.txt``, ``splits/val_ids.txt``
    and ``image_embeddings.pkl`` relative to the working directory, trains with
    ``SFTTrainer`` and saves the result to ``llama-3-8b-meme-poster``.

    Credentials come from the ``HF_TOKEN`` and ``WANDB_API_KEY`` environment
    variables instead of being written in the notebook. When a variable is
    unset, ``None`` is passed and the library is expected to fall back to its
    own stored login.
    """
    import os  # local imports: only needed inside this function
    import traceback

    base_path = "./"
    with open(f'{base_path}/MMHS150K_GT.json', 'r') as f:
        data = json.load(f)

    with open(f'{base_path}/splits/train_ids.txt', 'r') as f:
        id_train = f.read().split()
    with open(f'{base_path}/splits/val_ids.txt', 'r') as f:
        id_val = f.read().split()

    # Keep only ids that exist in the ground-truth file.
    dict_train = {x: data[x] for x in id_train if x in data}
    dict_val = {x: data[x] for x in id_val if x in data}

    # NOTE: pickle can execute arbitrary code on load; only open a file that
    # was produced locally by the embedding step.
    with open('image_embeddings.pkl', 'rb') as f:
        embeddings = pickle.load(f)

    token = os.environ.get("HF_TOKEN")
    api_key = os.environ.get("WANDB_API_KEY")

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    # The token is passed for the model as well as the tokenizer: both come
    # from the same gated repository.
    llama_model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Meta-Llama-3-8B-Instruct",
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.float16,
        token=token,
    )

    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token=token)

    # Reuse EOS as the padding token when the tokenizer defines none.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
    )
    # NOTE(review): the model is wrapped here and peft_config is also handed to
    # SFTTrainer below - confirm the trainer does not need to do the wrapping
    # itself (e.g. to prepare the 4-bit model for training).
    llama_model = get_peft_model(llama_model, peft_config)

    # NOTE(review): the image embeddings are only stored inside the dataset
    # items; nothing in this function feeds them to the model.
    train_dataset = TextDataset(dict_train, embeddings, tokenizer)
    val_dataset = TextDataset(dict_val, embeddings, tokenizer)

    # Start the W&B run before training; give up early if that is impossible.
    try:
        wandb.login(key=api_key)
        wandb.init(
            project='Fine-tune Llama 3 8B on Memes Embeddings', 
            job_type="training", 
            anonymous="allow",
            name=f"run-{int(time.time())}"  # unique run name based on the current timestamp
        )
    except Exception as e:
        print(f"Error initializing wandb: {e}")
        return

    training_arguments = TrainingArguments(
        output_dir="llama-3-8b-meme-poster",
        per_device_train_batch_size=16,  # 16 samples per device and step
        per_device_eval_batch_size=16,  # 16 samples per device and eval step
        gradient_accumulation_steps=16,  # i.e. 16 * 16 = 256 samples per optimizer step per device
        optim="adamw_hf",
        num_train_epochs=3,  # train for three epochs
        evaluation_strategy="steps",
        eval_steps=100,
        logging_steps=10,
        warmup_steps=50,
        learning_rate=5e-5,
        fp16=True,
        report_to="wandb"
    )

    trainer = SFTTrainer(
        model=llama_model,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        peft_config=peft_config,
        max_seq_length=512,
        dataset_text_field="tweet_text",
        tokenizer=tokenizer,
        args=training_arguments,
        packing=False,
    )

    try:
        try:
            trainer.train()
        except Exception as e:
            # Best-effort: report the failure (with its traceback, so the cause
            # is not lost) without aborting the notebook.
            print(f"Error during training: {e}")
            traceback.print_exc()
        finally:
            # Also reached on interruption, so partial progress is kept. Skip the
            # save when no optimizer step completed, so a run that failed at
            # start-up does not overwrite an earlier saved model with untrained
            # weights.
            if trainer.state.global_step > 0:
                trainer.save_model(output_dir="llama-3-8b-meme-poster")  # save the model locally
    finally:
        wandb.finish()

if __name__ == "__main__":
    # Select GPU 1 only when a second GPU exists; an unconditional
    # set_device(1) raises on single-GPU machines.
    if torch.cuda.device_count() > 1:
        torch.cuda.set_device(1)
    main_train_decoder()


2024-06-13 00:26:26.696865: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-13 00:26:26.758960: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjsantamariag[0m ([33mj-santamariag[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/javiermo/.netrc



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Step,Training Loss,Validation Loss


Sospecho que hay algo mal en evaluación. Voy a agregar unos cuantos prints y a revisar la función de eval.

In [None]:
import re
import time
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import (
    LoraConfig,
    get_peft_model,
)
import torch
from torch.utils.data import Dataset
from datasets import DatasetDict
import wandb
from trl import SFTTrainer
import pickle
import json
import gc

# Preprocessing function
def preprocess_text(text):
    """Strip URLs from a tweet and mask every @mention with the literal [USR]."""
    # Links are removed first, then mentions are masked.
    without_links = re.sub(r"http\S+", "", text)
    return re.sub(r"@\w+", "[USR]", without_links)

# Dataset class
class TextDataset(Dataset):
    """Dataset pairing each tweet's tokenized text with its stored image embedding.

    Args:
        data_dict: Mapping ``tweet_id -> info`` where ``info['tweet_text']`` is
            the raw tweet.
        embeddings: Mapping ``tweet_id -> embedding`` (anything accepted by
            ``torch.tensor``).
        tokenizer: Callable tokenizer exposing ``pad_token_id``.
        max_length: Length every tokenized tweet is padded/truncated to.
    """

    def __init__(self, data_dict, embeddings, tokenizer, max_length=128):
        self.data_dict = data_dict
        self.embeddings = embeddings
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Snapshot the ids once so __getitem__ is an O(1) list lookup instead of
        # rebuilding list(data_dict.keys()) for every single sample.
        self._tweet_ids = list(data_dict)

    def __len__(self):
        """Return the number of tweet ids captured at construction time."""
        return len(self._tweet_ids)

    def __getitem__(self, idx):
        """Build the training example for the ``idx``-th tweet.

        Returns:
            dict with keys ``embedding`` (float16 tensor), ``input_ids`` (token
            ids shifted right by one with a pad token in front),
            ``attention_mask``, ``labels`` (unshifted token ids),
            ``tweet_text`` (preprocessed string) and ``inputs`` (unshifted ids).
        """
        tweet_id = self._tweet_ids[idx]
        tweet_info = self.data_dict[tweet_id]
        embedding = self.embeddings[tweet_id]

        tweet_text = preprocess_text(tweet_info['tweet_text'])
        inputs = self.tokenizer(tweet_text, return_tensors="pt", max_length=self.max_length, padding='max_length', truncation=True)

        token_ids = inputs.input_ids.squeeze(0)
        labels = token_ids.clone()
        # NOTE(review), from the author's debugging notes: labels and input_ids
        # appear to be the wrong way round; the labels show the padding is on
        # the right. Open questions: should the prepended token be EOS rather
        # than pad, and should the concatenation go the other way? Hugging Face
        # causal-LM heads normally shift labels internally, so this manual
        # right-shift looks like a double shift - confirm before relying on it.
        input_ids = torch.cat([torch.tensor([self.tokenizer.pad_token_id]), token_ids[:-1]])

        return {
            "embedding": torch.tensor(embedding, dtype=torch.float16),
            "input_ids": input_ids,
            "attention_mask": inputs.attention_mask.squeeze(0),
            "labels": labels,
            "tweet_text": tweet_text,
            "inputs": token_ids
        }

# Function to train the model
def main_train_decoder():
    """Fine-tune Llama 3 8B Instruct (4-bit NF4 + LoRA) and print sample generations.

    Reads ``MMHS150K_GT.json``, ``splits/train_ids.txt``, ``splits/val_ids.txt``
    and ``image_embeddings.pkl`` relative to the working directory, trains with
    ``SFTTrainer``, saves the result to ``llama-3-8b-meme-poster`` and then
    prints a few generated texts next to the original tweets.

    Credentials come from the ``HF_TOKEN`` and ``WANDB_API_KEY`` environment
    variables instead of being written in the notebook. When a variable is
    unset, ``None`` is passed and the library is expected to fall back to its
    own stored login.
    """
    import os  # local imports: only needed inside this function
    import traceback

    base_path = "./"
    with open(f'{base_path}/MMHS150K_GT.json', 'r') as f:
        data = json.load(f)

    with open(f'{base_path}/splits/train_ids.txt', 'r') as f:
        id_train = f.read().split()
    with open(f'{base_path}/splits/val_ids.txt', 'r') as f:
        id_val = f.read().split()

    # Keep only ids that exist in the ground-truth file.
    dict_train = {x: data[x] for x in id_train if x in data}
    dict_val = {x: data[x] for x in id_val if x in data}

    # NOTE: pickle can execute arbitrary code on load; only open a file that
    # was produced locally by the embedding step.
    with open('image_embeddings.pkl', 'rb') as f:
        embeddings = pickle.load(f)

    token = os.environ.get("HF_TOKEN")
    api_key = os.environ.get("WANDB_API_KEY")

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    # The token is passed for the model as well as the tokenizer: both come
    # from the same gated repository.
    llama_model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Meta-Llama-3-8B-Instruct",
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.float16,
        token=token,
    )

    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token=token)

    # Reuse EOS as the padding token when the tokenizer defines none.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
    )
    # NOTE(review): the model is wrapped here and peft_config is also handed to
    # SFTTrainer below - confirm the trainer does not need to do the wrapping
    # itself (e.g. to prepare the 4-bit model for training).
    llama_model = get_peft_model(llama_model, peft_config)

    # NOTE(review): the image embeddings are only stored inside the dataset
    # items; nothing in this function feeds them to the model.
    train_dataset = TextDataset(dict_train, embeddings, tokenizer)
    val_dataset = TextDataset(dict_val, embeddings, tokenizer)

    # Start the W&B run before training; give up early if that is impossible.
    try:
        wandb.login(key=api_key)
        wandb.init(
            project='Fine-tune Llama 3 8B on Memes Embeddings', 
            job_type="training", 
            anonymous="allow",
            name=f"run-{int(time.time())}"  # unique run name based on the current timestamp
        )
    except Exception as e:
        print(f"Error initializing wandb: {e}")
        return

    training_arguments = TrainingArguments(
        output_dir="llama-3-8b-meme-poster",
        per_device_train_batch_size=8,  # 8 samples per device and step
        per_device_eval_batch_size=8,  # 8 samples per device and eval step
        gradient_accumulation_steps=4,  # i.e. 8 * 4 = 32 samples per optimizer step per device
        optim="adamw_hf",
        num_train_epochs=3,  # train for three epochs
        evaluation_strategy="steps",
        eval_steps=100,
        logging_steps=10,
        warmup_steps=50,
        learning_rate=5e-5,
        fp16=True,
        report_to="wandb"
    )

    trainer = SFTTrainer(
        model=llama_model,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        peft_config=peft_config,
        max_seq_length=512,
        dataset_text_field="tweet_text",
        tokenizer=tokenizer,
        args=training_arguments,
        packing=False,
    )

    def print_sample_predictions(trainer, dataset, tokenizer, num_samples=5):
        """Print the first ``num_samples`` tweets next to the text generated from them."""
        model = trainer.model
        model.eval()
        device = trainer.args.device
        for i in range(min(num_samples, len(dataset))):
            sample = dataset[i]
            # Prompt with the real tweet tokens only: "inputs" holds the
            # unshifted ids and lines up with attention_mask, so the mask
            # strips the padding.
            keep = sample['attention_mask'].bool()
            prompt_ids = sample['inputs'][keep].unsqueeze(0).to(device)

            with torch.no_grad():
                # The former `embeddings=` keyword is not a generate() argument
                # and made the call fail, so it was dropped.
                # NOTE(review): the image embedding is therefore NOT used here.
                # Conditioning on it would need `inputs_embeds` with vectors of
                # the language model's hidden size - confirm the intended design.
                generated_ids = model.generate(
                    input_ids=prompt_ids,
                    attention_mask=torch.ones_like(prompt_ids),
                    max_length=128,
                    pad_token_id=tokenizer.pad_token_id,
                )
                generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

            print(f"Original text: {sample['tweet_text']}")
            print(f"Generated text: {generated_text}\n")

    training_ok = False
    try:
        try:
            trainer.train()
            training_ok = True
        except Exception as e:
            # Best-effort: report the failure (with its traceback, so the cause
            # is not lost) without aborting the notebook.
            print(f"Error during training: {e}")
            traceback.print_exc()
        finally:
            # Also reached on interruption, so partial progress is kept. Skip the
            # save when no optimizer step completed, so a run that failed at
            # start-up does not overwrite an earlier saved model with untrained
            # weights.
            if trainer.state.global_step > 0:
                trainer.save_model(output_dir="llama-3-8b-meme-poster")  # save the model locally

        # Sampling runs after the save and has its own handler, so a generation
        # problem is neither reported as a training error nor able to prevent
        # the model from being saved.
        if training_ok:
            try:
                print_sample_predictions(trainer, train_dataset, tokenizer)
            except Exception as e:
                print(f"Error generating sample predictions: {e}")
                traceback.print_exc()
    finally:
        wandb.finish()

if __name__ == "__main__":
    # torch.cuda.set_device(1)  # select GPU 1 (disabled)
    main_train_decoder()


2024-06-13 18:37:57.125271: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-13 18:37:57.194657: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjsantamariag[0m ([33mj-santamariag[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/javiermo/.netrc



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Step,Training Loss,Validation Loss


# 🔴Nuevo código corrigiendo generate

In [None]:
import wandb
# Authenticate this session with Weights & Biases before any run is created
# (the recorded output of this cell is True).
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjsantamariag[0m ([33mj-santamariag[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
import re
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import (
    LoraConfig,
    get_peft_model,
)
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import DatasetDict
import wandb
from trl import SFTTrainer
import pickle
import json

# Preprocessing function
def preprocess_text(text):
    """Strip URLs from ``text`` and anonymise every @mention as ``[USR]``.

    Args:
        text: Raw tweet text.

    Returns:
        The text with every ``http...`` token removed and every ``@name``
        replaced by the literal ``[USR]``. Surrounding whitespace is left as is.
    """
    without_links = re.sub(r"http\S+", "", text)
    return re.sub(r"@\w+", "[USR]", without_links)

# Dataset class for training
class TextDataset(Dataset):
    """Training dataset pairing each tweet's image embedding with its tokenised text.

    Args:
        data_dict: Mapping ``tweet_id -> info``; ``info['tweet_text']`` is read.
        embeddings: Mapping ``tweet_id -> embedding`` (anything ``torch.tensor`` accepts).
        tokenizer: Callable tokenizer exposing ``pad_token_id``.
        max_length: Fixed token length every example is padded/truncated to.

    Each item is a dict with the keys ``embedding``, ``input_ids``,
    ``attention_mask``, ``labels``, ``tweet_text`` and ``inputs``.
    """

    def __init__(self, data_dict, embeddings, tokenizer, max_length=128):
        self.data_dict = data_dict
        self.embeddings = embeddings
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Snapshot the key order once. Rebuilding list(data_dict.keys()) inside
        # __getitem__ made every lookup O(n), i.e. O(n^2) per pass over the data.
        self._tweet_ids = list(data_dict)

    def __len__(self):
        return len(self._tweet_ids)

    def __getitem__(self, idx):
        tweet_id = self._tweet_ids[idx]
        tweet_info = self.data_dict[tweet_id]
        embedding = self.embeddings[tweet_id]

        tweet_text = preprocess_text(tweet_info['tweet_text'])
        encoded = self.tokenizer(tweet_text, return_tensors="pt", max_length=self.max_length, padding='max_length', truncation=True)

        # Targets are the token ids as produced by the tokenizer.
        labels = encoded.input_ids.squeeze(0).clone()
        # Model inputs are the targets shifted right by one, with a pad token in
        # front; the last token is dropped so the length stays max_length.
        input_ids = encoded.input_ids.squeeze(0).clone()
        input_ids = torch.cat([torch.tensor([self.tokenizer.pad_token_id]), input_ids[:-1]])

        # NOTE(review): attention_mask is returned exactly as the tokenizer
        # produced it (not shifted like input_ids), and labels keep the padding
        # token ids (no -100 masking) - confirm this is what the consumer expects.
        return {
            "embedding": torch.tensor(embedding, dtype=torch.float16),
            "input_ids": input_ids,
            "attention_mask": encoded.attention_mask.squeeze(0),
            "labels": labels,
            "tweet_text": tweet_text,
            "inputs": encoded.input_ids.squeeze(0)
        }

# Dataset class for validation
class ValidationDataset(Dataset):
    """Validation dataset yielding an image embedding plus the reference tweet text.

    Args:
        data_dict: Mapping ``tweet_id -> info``; ``info['tweet_text']`` is read.
        embeddings: Mapping ``tweet_id -> embedding`` (anything ``torch.tensor`` accepts).
        tokenizer: Stored on the instance; not used by ``__getitem__``.
        max_length: Stored on the instance; not used by ``__getitem__``.

    Each item is a dict with the keys ``embedding`` (float16 tensor),
    ``tweet_id`` and ``tweet_text`` (preprocessed).
    """

    def __init__(self, data_dict, embeddings, tokenizer, max_length=128):
        self.data_dict = data_dict
        self.embeddings = embeddings
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Snapshot the key order once instead of rebuilding the full key list
        # on every __getitem__ call (which made each lookup O(n)).
        self._tweet_ids = list(data_dict)

    def __len__(self):
        return len(self._tweet_ids)

    def __getitem__(self, idx):
        tweet_id = self._tweet_ids[idx]
        tweet_info = self.data_dict[tweet_id]
        embedding = self.embeddings[tweet_id]

        tweet_text = preprocess_text(tweet_info['tweet_text'])

        return {
            "embedding": torch.tensor(embedding, dtype=torch.float16),
            "tweet_id": tweet_id,
            "tweet_text": tweet_text
        }

# Custom DataLoader for validation to bypass DataCollator
def custom_validation_loop(model, dataloader, device, tokenizer, print_every_n_steps=10, max_generation_length=256):
    """Generate text for every validation batch and pair it with the reference text.

    Args:
        model: Object exposing ``eval()`` and ``generate(inputs_embeds=..., max_length=...)``.
        dataloader: Iterable of batches with the keys ``"embedding"`` (tensor)
            and ``"tweet_text"`` (sequence of str).
        device: Device the embeddings are moved to before generation.
        tokenizer: Exposes ``model_max_length`` and ``batch_decode``.
        print_every_n_steps: Print the batch's samples whenever
            ``step % print_every_n_steps == 0``; ``0``/``None`` disables printing.
        max_generation_length: Upper bound applied to ``tokenizer.model_max_length``.

    Returns:
        List of ``(original_text, generated_text)`` tuples in dataloader order.
    """
    model.eval()
    # tokenizer.model_max_length can be a very large sentinel when the tokenizer
    # config sets no real limit; cap it so generation cannot run unbounded.
    max_length = min(tokenizer.model_max_length, max_generation_length)

    predictions = []
    for step, batch in enumerate(dataloader):
        embeddings = batch["embedding"].to(device)
        tweet_texts = batch["tweet_text"]
        with torch.no_grad():
            # Pass the embeddings by keyword: the first positional argument of
            # generate() is treated as token ids, which float embeddings are not.
            outputs = model.generate(inputs_embeds=embeddings, max_length=max_length)
            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        show_batch = bool(print_every_n_steps) and step % print_every_n_steps == 0
        for tweet_text, pred_text in zip(tweet_texts, decoded_outputs):
            predictions.append((tweet_text, pred_text))

            if show_batch:
                print(f"Step {step} - Original: {tweet_text}")
                print(f"Step {step} - Generated: {pred_text}")
                print()

    return predictions

# Function to train the model
def main_train_decoder():
    """Fine-tune Llama-3-8B-Instruct with LoRA on MMHS150K tweet text, then sample generations.

    Steps, in order:
      1. Load the ground-truth JSON, the train/val id splits and the pickled
         image embeddings from the working directory.
      2. Load the 4-bit (NF4) quantised base model and its tokenizer, and wrap
         the model with LoRA adapters.
      3. Train for one epoch with ``SFTTrainer``, logging to Weights & Biases.
      4. Run ``custom_validation_loop`` on the validation split and print the
         resulting ``(original, generated)`` pairs.

    Credentials are read from the ``HF_TOKEN`` and ``WANDB_API_KEY`` environment
    variables; the original placeholder strings are kept only as a fallback.
    """
    import os  # local import: this cell's import list does not include os

    base_path = "./"
    with open(os.path.join(base_path, 'MMHS150K_GT.json'), 'r') as f:
        data = json.load(f)

    with open(os.path.join(base_path, 'splits', 'train_ids.txt'), 'r') as f:
        id_train = f.read().split()
    with open(os.path.join(base_path, 'splits', 'val_ids.txt'), 'r') as f:
        id_val = f.read().split()

    # Keep only ids that are present in the ground-truth file.
    dict_train = {x: data[x] for x in id_train if x in data}
    dict_val = {x: data[x] for x in id_val if x in data}

    # SECURITY: pickle.load can execute arbitrary code; only load a file you produced yourself.
    with open('image_embeddings.pkl', 'rb') as f:
        embeddings = pickle.load(f)

    # Prefer environment variables over secrets written into the notebook.
    token = os.environ.get("HF_TOKEN", 'tu_token_hf')
    api_key = os.environ.get("WANDB_API_KEY", 'tu_api_key_wandb')

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    # Use the same token for the model as for the tokenizer below, so both
    # downloads authenticate the same way.
    llama_model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Meta-Llama-3-8B-Instruct",
        quantization_config=bnb_config,
        device_map="auto",
        token=token,
    )

    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token=token)

    # Reuse EOS as the padding token when the tokenizer defines none.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
    )
    llama_model = get_peft_model(llama_model, peft_config)

    train_dataset = TextDataset(dict_train, embeddings, tokenizer)
    val_dataset = ValidationDataset(dict_val, embeddings, tokenizer)

    # Only validation needs an explicit DataLoader; the trainer builds its own for training.
    val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

    # Initialise wandb before training.
    wandb.login(key=api_key)
    wandb.init(
        project='Fine-tune Llama 3 8B on Image Embeddings',
        job_type="training",
        anonymous="allow"
    )

    training_arguments = TrainingArguments(
        output_dir="llama-3-8b-meme-poster",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=8,
        optim="paged_adamw_32bit",
        num_train_epochs=1,
        evaluation_strategy="no",  # Disable automatic evaluation during training
        logging_steps=1,
        warmup_steps=10,
        logging_strategy="steps",
        learning_rate=5e-4,
        fp16=True,
        bf16=False,
        group_by_length=True,
        report_to="wandb"
    )

    trainer = SFTTrainer(
        model=llama_model,
        train_dataset=train_dataset,
        eval_dataset=None,  # Disable automatic evaluation
        peft_config=peft_config,
        max_seq_length=512,
        dataset_text_field="tweet_text",
        tokenizer=tokenizer,
        args=training_arguments,
        packing=False,
    )

    try:
        trainer.train()
    finally:
        # Close the W&B run even when training raises, so it is not left open.
        wandb.finish()

    # Custom validation loop
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    llama_model.to(device)
    predictions = custom_validation_loop(llama_model, val_dataloader, device, tokenizer, print_every_n_steps=10)
    print(predictions)

# Entry point for the training cell.
if __name__ == "__main__":
    # Make CUDA device index 2 the current device before the model is loaded.
    # NOTE(review): hard-coded index; assumes at least three visible GPUs - confirm.
    torch.cuda.set_device(2)
    main_train_decoder()

2024-06-19 13:55:54.700346: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-19 13:55:54.773521: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjsantamariag[0m ([33mj-santamariag[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/javiermo/.netrc



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Guardar modelo

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

def save_model_locally(output_dir, source_dir="llama-3-8b-meme-poster"):
    """Reload a trained checkpoint and write model and tokenizer to ``output_dir``.

    Args:
        output_dir: Directory that receives the saved model and tokenizer files.
        source_dir: Checkpoint directory (or model id) to load from. Defaults to
            the training output directory used elsewhere in this notebook.
    """
    # Load the trained model and tokenizer
    llama_model = AutoModelForCausalLM.from_pretrained(source_dir)
    tokenizer = AutoTokenizer.from_pretrained(source_dir)

    # Save the model locally
    llama_model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model saved locally at {output_dir}")

if __name__ == "__main__":
    # Destination folder for the exported model and tokenizer.
    output_dir = "finetuned-llamas"

    # Export the fine-tuned checkpoint into that folder.
    save_model_locally(output_dir=output_dir)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model saved locally at finetuned-llamas


Probar

In [None]:
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import pickle
import json
from torch.utils.data import Dataset, DataLoader
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from peft import get_peft_model, LoraConfig
from safetensors import safe_open

# Dataset class for validation
class ValidationDataset(Dataset):
    """Validation dataset yielding an image embedding plus the reference tweet text.

    Args:
        data_dict: Mapping ``tweet_id -> info``; ``info['tweet_text']`` is read.
        embeddings: Mapping ``tweet_id -> embedding`` (anything ``torch.tensor`` accepts).
        tokenizer: Stored on the instance; not used by ``__getitem__``.
        max_length: Stored on the instance; not used by ``__getitem__``.

    Each item is a dict with the keys ``embedding`` (float16 tensor),
    ``tweet_id`` and ``tweet_text`` (preprocessed).
    """

    def __init__(self, data_dict, embeddings, tokenizer, max_length=128):
        self.data_dict = data_dict
        self.embeddings = embeddings
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Materialise the key order once; the previous per-item
        # list(data_dict.keys()) made every lookup O(n).
        self._tweet_ids = list(data_dict)

    def __len__(self):
        return len(self._tweet_ids)

    def __getitem__(self, idx):
        tweet_id = self._tweet_ids[idx]
        tweet_info = self.data_dict[tweet_id]
        embedding = self.embeddings[tweet_id]

        tweet_text = preprocess_text(tweet_info['tweet_text'])

        return {
            "embedding": torch.tensor(embedding, dtype=torch.float16),
            "tweet_id": tweet_id,
            "tweet_text": tweet_text
        }

# Custom DataLoader for validation to bypass DataCollator
def custom_validation_loop(model, dataloader, device, tokenizer, print_every_n_steps=10, max_generation_length=256):
    """Generate text from image embeddings and collect (reference, generated) pairs.

    Args:
        model: Object exposing ``eval()`` and ``generate(inputs_embeds=..., max_length=...)``.
        dataloader: Iterable of batches with the keys ``"embedding"`` (tensor)
            and ``"tweet_text"`` (sequence of str).
        device: Device the embeddings are moved to before generation.
        tokenizer: Exposes ``model_max_length`` and ``batch_decode``.
        print_every_n_steps: Print the batch's samples whenever
            ``step % print_every_n_steps == 0``; ``0``/``None`` disables printing.
        max_generation_length: Upper bound applied to ``tokenizer.model_max_length``.

    Returns:
        List of ``(original_text, generated_text)`` tuples in dataloader order.
    """
    model.eval()
    # model_max_length can be a very large sentinel when the tokenizer config
    # sets no real limit; bound it so generation cannot run unbounded.
    generation_limit = min(tokenizer.model_max_length, max_generation_length)

    predictions = []
    for step, batch in enumerate(dataloader):
        embeddings = batch["embedding"].to(device)
        tweet_texts = batch["tweet_text"]
        with torch.no_grad():
            outputs = model.generate(inputs_embeds=embeddings, max_length=generation_limit)
            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        # A zero/None interval means "never print" instead of dividing by zero.
        show_batch = bool(print_every_n_steps) and step % print_every_n_steps == 0
        for tweet_text, pred_text in zip(tweet_texts, decoded_outputs):
            predictions.append((tweet_text, pred_text))

            if show_batch:
                print(f"Step {step} - Original: {tweet_text}")
                print(f"Step {step} - Generated: {pred_text}")
                print()

    return predictions

def preprocess_text(text):
    """Clean a tweet: delete URLs, then mask @mentions with ``[USR]``.

    The rules run in the listed order, each on the output of the previous one.
    """
    substitutions = (
        (r"http\S+", ""),       # links are dropped entirely
        (r"@\w+", "[USR]"),     # mentions become a fixed placeholder
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

# Script body: reload the exported model and run generation over the validation split.
# NOTE(review): the recorded output of this cell ends in a CUDA OutOfMemoryError
# raised while the checkpoint shards were loading.
if __name__ == "__main__":
    
    # Make CUDA device index 1 the current device.
    torch.cuda.set_device(1)
    
    # Paths
    data_file = './MMHS150K_GT.json'
    val_ids_file = './splits/val_ids.txt'
    embeddings_file = 'image_embeddings.pkl'
    model_dir = 'finetuned-llamas'

    # Load data and embeddings
    with open(data_file, 'r') as f:
        data_dict = json.load(f)
    with open(val_ids_file, 'r') as f:
        val_ids = f.read().split()
    # Keep only the validation ids that exist in the ground-truth file.
    data_dict = {x: data_dict[x] for x in val_ids if x in data_dict}
    # SECURITY: pickle.load can execute arbitrary code; only load a trusted file.
    with open(embeddings_file, 'rb') as f:
        embeddings = pickle.load(f)

    # Load tokenizer and model configuration
    tokenizer = AutoTokenizer.from_pretrained(model_dir)

    # Load the model with empty weights initially
    # NOTE(review): from_pretrained with device_map="auto" is called inside
    # init_empty_weights(); confirm that combining the two is intended.
    with init_empty_weights():
        llama_model = AutoModelForCausalLM.from_pretrained(model_dir, low_cpu_mem_usage=True, device_map="auto")

    # Apply LoRA configuration
    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
    )
    
    llama_model = get_peft_model(llama_model, peft_config)

    # Load LoRA weights manually
    # NOTE(review): model_dir is a directory name here; safe_open presumably
    # expects the path of a single .safetensors file - confirm.
    with safe_open(model_dir, framework="pt", device="cpu") as f:
        for k in f.keys():
            weight = f.get_tensor(k)
            # state_dict() is rebuilt on every iteration; the copy is in-place
            # into the tensor stored under the same key.
            llama_model.state_dict()[k].copy_(weight)

    # Dispatch the model to device with CPU offload
    llama_model = load_checkpoint_and_dispatch(
        llama_model, 
        model_dir,
        device_map={"": "cpu"},
        offload_folder="offload",
        offload_state_dict=True,
        dtype=torch.float16
    )

    # Prepare validation dataset and dataloader
    val_dataset = ValidationDataset(data_dict, embeddings, tokenizer)
    val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

    # Custom validation loop
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # The model was dispatched to CPU above and is moved to `device` here.
    llama_model.to(device)
    predictions = custom_validation_loop(llama_model, val_dataloader, device, tokenizer, print_every_n_steps=10)
    print(predictions)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU  has a total capacity of 23.69 GiB of which 70.00 MiB is free. Process 398686 has 6.18 GiB memory in use. Process 778695 has 776.00 MiB memory in use. Including non-PyTorch memory, this process has 13.57 GiB memory in use. Process 1335198 has 3.10 GiB memory in use. Of the allocated memory 13.18 GiB is allocated by PyTorch, and 106.70 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

Añadir cálculo de métricas (BLEU, Perplexity, loss)

In [None]:
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pickle
import json
from torch.utils.data import Dataset, DataLoader
from accelerate import load_checkpoint_and_dispatch
from peft import get_peft_model, LoraConfig
from safetensors import safe_open
from nltk.translate.bleu_score import sentence_bleu
from torch.nn import CrossEntropyLoss
from torch.optim import Adam

# Dataset class for validation
class ValidationDataset(Dataset):
    """Dataset yielding an image embedding plus the preprocessed tweet text.

    Args:
        data_dict: Mapping ``tweet_id -> info``; ``info['tweet_text']`` is read.
        embeddings: Mapping ``tweet_id -> embedding`` (anything ``torch.tensor`` accepts).
        tokenizer: Stored on the instance; not used by ``__getitem__``.
        max_length: Stored on the instance; not used by ``__getitem__``.

    Each item is a dict with the keys ``embedding`` (float16 tensor),
    ``tweet_id`` and ``tweet_text``.
    """

    def __init__(self, data_dict, embeddings, tokenizer, max_length=128):
        self.data_dict = data_dict
        self.embeddings = embeddings
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Cache the id order once so item access is O(1) instead of rebuilding
        # the whole key list for every sample.
        self._tweet_ids = list(data_dict)

    def __len__(self):
        return len(self._tweet_ids)

    def __getitem__(self, idx):
        tweet_id = self._tweet_ids[idx]
        tweet_info = self.data_dict[tweet_id]
        embedding = self.embeddings[tweet_id]

        tweet_text = preprocess_text(tweet_info['tweet_text'])

        return {
            "embedding": torch.tensor(embedding, dtype=torch.float16),
            "tweet_id": tweet_id,
            "tweet_text": tweet_text
        }

# Custom DataLoader for validation to bypass DataCollator
def custom_validation_loop(model, dataloader, device, tokenizer, criterion, print_every_n_steps=10, max_generation_length=256):
    """Generate text per batch and compute token-level loss and perplexity on the references.

    Args:
        model: Object exposing ``eval()``, ``generate(inputs_embeds=..., max_length=...)``
            and a forward call returning an object with ``.logits``.
        dataloader: Iterable of batches with the keys ``"embedding"`` (tensor)
            and ``"tweet_text"`` (sequence of str).
        device: Device tensors are moved to.
        tokenizer: Callable tokenizer exposing ``model_max_length`` and ``batch_decode``.
        criterion: Loss taking ``(logits[N, V], targets[N])``. Its ``ignore_index``
            (default ``-100``) marks padding targets; ``reduction`` is honoured
            for ``"mean"`` (default) and ``"sum"``.
        print_every_n_steps: Print the batch's samples whenever
            ``step % print_every_n_steps == 0``; ``0``/``None`` disables printing.
        max_generation_length: Upper bound applied to ``tokenizer.model_max_length``
            for both tokenisation and generation.

    Returns:
        ``(predictions, average_loss, perplexity)`` where ``predictions`` is a
        list of ``(original_text, generated_text)`` tuples and ``average_loss``
        is the mean loss per non-padding target token (``nan`` if there were none).
    """
    model.eval()
    # model_max_length can be a very large sentinel when the tokenizer config
    # sets no real limit; bound it for truncation and generation alike.
    max_length = min(tokenizer.model_max_length, max_generation_length)
    ignore_index = getattr(criterion, "ignore_index", -100)
    loss_is_summed = getattr(criterion, "reduction", "mean") == "sum"

    predictions = []
    total_loss = 0.0
    total_tokens = 0
    for step, batch in enumerate(dataloader):
        embeddings = batch["embedding"].to(device)
        tweet_texts = batch["tweet_text"]
        inputs = tokenizer(tweet_texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(device)
        # Padding positions must not contribute to the loss or the token count.
        labels = inputs.input_ids.masked_fill(inputs.attention_mask == 0, ignore_index)
        with torch.no_grad():
            outputs = model.generate(inputs_embeds=embeddings, max_length=max_length)
            decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            logits = model(**inputs).logits
            # Position t predicts token t+1.
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            target_count = int((shift_labels != ignore_index).sum().item())
            if target_count > 0:
                loss = criterion(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
                # Accumulate a per-token sum so batches are weighted by their
                # number of real target tokens, not by batch size.
                total_loss += loss.item() if loss_is_summed else loss.item() * target_count
                total_tokens += target_count

        show_batch = bool(print_every_n_steps) and step % print_every_n_steps == 0
        for tweet_text, pred_text in zip(tweet_texts, decoded_outputs):
            predictions.append((tweet_text, pred_text))

            if show_batch:
                print(f"Step {step} - Original: {tweet_text}")
                print(f"Step {step} - Generated: {pred_text}")
                print()

    # An empty dataloader (or one with no target tokens) yields nan, not ZeroDivisionError.
    average_loss = total_loss / total_tokens if total_tokens else float("nan")
    perplexity = torch.exp(torch.tensor(average_loss)).item()
    return predictions, average_loss, perplexity

def preprocess_text(text):
    """Return ``text`` with links removed and each @mention replaced by ``[USR]``.

    Links are removed first; mentions are masked on the result.
    """
    return re.sub(r"@\w+", "[USR]", re.sub(r"http\S+", "", text))

def calculate_bleu(predictions):
    """Average sentence-level BLEU over ``(original, generated)`` text pairs.

    Both texts are split on whitespace; the original is the single reference
    and the generated text is the candidate. Returns ``0`` when there are no pairs.
    """
    scores = [
        sentence_bleu([original.split()], generated.split())
        for original, generated in predictions
    ]
    if not scores:
        return 0
    return sum(scores) / len(scores)

# Script body: reload the exported model, then alternate a training pass and a
# validation pass for num_epochs epochs, printing loss, perplexity and BLEU.
# NOTE(review): the recorded output of this cell ends in
# "NotImplementedError: Cannot copy out of meta tensor".
if __name__ == "__main__":
    
    # Make CUDA device index 1 the current device.
    torch.cuda.set_device(1)
    
    # Paths
    data_file = './MMHS150K_GT.json'
    val_ids_file = './splits/val_ids.txt'
    embeddings_file = 'image_embeddings.pkl'
    model_dir = 'finetuned-llamas'
    
    num_epochs = 3  # Number of epochs

    # Load data and embeddings
    with open(data_file, 'r') as f:
        data_dict = json.load(f)
    with open(val_ids_file, 'r') as f:
        val_ids = f.read().split()
    # Keep only the validation ids that exist in the ground-truth file.
    data_dict = {x: data_dict[x] for x in val_ids if x in data_dict}
    # SECURITY: pickle.load can execute arbitrary code; only load a trusted file.
    with open(embeddings_file, 'rb') as f:
        embeddings = pickle.load(f)

    # Load tokenizer and model configuration
    tokenizer = AutoTokenizer.from_pretrained(model_dir)

    # Load the model directly
    llama_model = AutoModelForCausalLM.from_pretrained(model_dir, low_cpu_mem_usage=True)

    # Apply LoRA configuration
    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
    )
    
    llama_model = get_peft_model(llama_model, peft_config)

    # Load LoRA weights manually
    # NOTE(review): model_dir is a directory name here; safe_open presumably
    # expects the path of a single .safetensors file - confirm.
    with safe_open(model_dir, framework="pt", device="cpu") as f:
        for k in f.keys():
            weight = f.get_tensor(k)
            # state_dict() is rebuilt on every iteration; the copy is in-place
            # into the tensor stored under the same key.
            llama_model.state_dict()[k].copy_(weight)

    # Dispatch the model to device with CPU offload
    llama_model = load_checkpoint_and_dispatch(
        llama_model, 
        model_dir,
        device_map={"": "cpu"},
        offload_folder="offload",
        offload_state_dict=True,
        dtype=torch.float16
    )

    # Prepare training and validation datasets and dataloaders
    # NOTE(review): both datasets are built from the same data_dict, which was
    # filtered to the validation ids above - the training pass therefore runs
    # on validation data. Confirm whether a train split was intended.
    train_dataset = ValidationDataset(data_dict, embeddings, tokenizer)
    val_dataset = ValidationDataset(data_dict, embeddings, tokenizer)
    train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

    # Training and validation loop
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    llama_model.to(device)

    # The optimizer receives every parameter returned by llama_model.parameters().
    optimizer = Adam(llama_model.parameters(), lr=5e-5)
    criterion = CrossEntropyLoss()

    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs}")

        # Training loop
        llama_model.train()
        total_train_loss = 0
        total_train_tokens = 0
        for step, batch in enumerate(train_dataloader):
            # This rebinds the name `embeddings` (previously the loaded mapping)
            # to the batch tensor; the tensor is not passed to the model below.
            embeddings = batch["embedding"].to(device)
            tweet_texts = batch["tweet_text"]
            inputs = tokenizer(tweet_texts, return_tensors='pt', padding=True, truncation=True, max_length=tokenizer.model_max_length).to(device)
            labels = inputs.input_ids
            optimizer.zero_grad()
            outputs = llama_model(**inputs)
            logits = outputs.logits
            # Next-token objective: logits at position t are scored against token t+1.
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss = criterion(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            loss.backward()
            optimizer.step()
            # NOTE(review): labels.size(0) is the first dimension of input_ids
            # (presumably the batch size), so the "tokens" counter weights each
            # batch by that size rather than by token count - confirm intent.
            total_train_loss += loss.item() * labels.size(0)
            total_train_tokens += labels.size(0)
        
        average_train_loss = total_train_loss / total_train_tokens
        train_perplexity = torch.exp(torch.tensor(average_train_loss)).item()
        print(f"Training Loss: {average_train_loss}")
        print(f"Training Perplexity: {train_perplexity}")

        # Validation loop
        predictions, val_loss, val_perplexity = custom_validation_loop(llama_model, val_dataloader, device, tokenizer, criterion, print_every_n_steps=10)
        
        # Calculate BLEU score
        bleu_score = calculate_bleu(predictions)
        print(f"Validation BLEU score: {bleu_score}")
        print(f"Validation Loss: {val_loss}")
        print(f"Validation Perplexity: {val_perplexity}")

        print(predictions)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



NotImplementedError: Cannot copy out of meta tensor; no data! Please use torch.nn.Module.to_empty() instead of torch.nn.Module.to() when moving module from meta to a different device.

# 🟢Primer entrenamiento satisfactorio

In [1]:
import os

# Expose only physical GPUs 1 and 2 to this process. This is set before torch
# is imported so the restriction is in place when CUDA is initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"

import torch

# From here on PyTorch only sees GPUs 1 and 2, renumbered as devices 0 and 1.
print(torch.cuda.device_count())  # Should print 2
print(torch.cuda.get_device_name(0))  # Name of the first visible GPU (formerly GPU 1)
print(torch.cuda.get_device_name(1))  # Name of the second visible GPU (formerly GPU 2)


2
NVIDIA GeForce RTX 3090
NVIDIA GeForce RTX 3090


In [2]:
import re
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import (
    LoraConfig,
    get_peft_model,
)
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import wandb
from trl import SFTTrainer
import pickle
import json
import psutil
import signal

# Preprocessing function
def preprocess_text(text):
    """Normalise a tweet by dropping URLs and masking @mentions as ``[USR]``.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text; whitespace around removed links is not collapsed.
    """
    link_pattern = re.compile(r"http\S+")
    mention_pattern = re.compile(r"@\w+")
    return mention_pattern.sub("[USR]", link_pattern.sub("", text))

# Dataset class for training
class TextDataset(Dataset):
    """Training dataset pairing each tweet's image embedding with its tokenised text.

    Args:
        data_dict: Mapping ``tweet_id -> info``; ``info['tweet_text']`` is read.
        embeddings: Mapping ``tweet_id -> embedding`` (anything ``torch.tensor`` accepts).
        tokenizer: Callable tokenizer exposing ``pad_token_id``.
        max_length: Fixed token length every example is padded/truncated to.

    The dataset size is printed once at construction. Each item is a dict with
    the keys ``embedding``, ``input_ids``, ``attention_mask``, ``labels``,
    ``tweet_text`` and ``inputs``.
    """

    def __init__(self, data_dict, embeddings, tokenizer, max_length=128):
        self.data_dict = data_dict
        print(f"Training data size: {len(self.data_dict)}")
        self.embeddings = embeddings
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Snapshot the key order once. Rebuilding list(data_dict.keys()) inside
        # __getitem__ made every lookup O(n), i.e. O(n^2) per pass over the data.
        self._tweet_ids = list(data_dict)

    def __len__(self):
        return len(self._tweet_ids)

    def __getitem__(self, idx):
        tweet_id = self._tweet_ids[idx]
        tweet_info = self.data_dict[tweet_id]
        embedding = self.embeddings[tweet_id]

        tweet_text = preprocess_text(tweet_info['tweet_text'])
        encoded = self.tokenizer(tweet_text, return_tensors="pt", max_length=self.max_length, padding='max_length', truncation=True)

        # Targets are the token ids as produced by the tokenizer.
        labels = encoded.input_ids.squeeze(0).clone()
        # Model inputs are the targets shifted right by one, with a pad token in
        # front; the last token is dropped so the length stays max_length.
        input_ids = encoded.input_ids.squeeze(0).clone()
        input_ids = torch.cat([torch.tensor([self.tokenizer.pad_token_id]), input_ids[:-1]])

        # NOTE(review): attention_mask is returned exactly as the tokenizer
        # produced it (not shifted like input_ids), and labels keep the padding
        # token ids (no -100 masking) - confirm this is what the consumer expects.
        return {
            "embedding": torch.tensor(embedding, dtype=torch.float16),
            "input_ids": input_ids,
            "attention_mask": encoded.attention_mask.squeeze(0),
            "labels": labels,
            "tweet_text": tweet_text,
            "inputs": encoded.input_ids.squeeze(0)
        }

# Dataset class for validation
class ValidationDataset(Dataset):
    """Validation dataset yielding an image embedding plus the reference tweet text.

    Args:
        data_dict: Mapping ``tweet_id -> info``; ``info['tweet_text']`` is read.
        embeddings: Mapping ``tweet_id -> embedding`` (anything ``torch.tensor`` accepts).
        tokenizer: Stored on the instance; not used by ``__getitem__``.
        max_length: Stored on the instance; not used by ``__getitem__``.

    The dataset size is printed once at construction. Each item is a dict with
    the keys ``embedding`` (float16 tensor), ``tweet_id`` and ``tweet_text``.
    """

    def __init__(self, data_dict, embeddings, tokenizer, max_length=128):
        self.data_dict = data_dict
        print(f"Validation data size: {len(self.data_dict)}")
        self.embeddings = embeddings
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Snapshot the key order once instead of rebuilding the full key list
        # on every __getitem__ call (which made each lookup O(n)).
        self._tweet_ids = list(data_dict)

    def __len__(self):
        return len(self._tweet_ids)

    def __getitem__(self, idx):
        tweet_id = self._tweet_ids[idx]
        tweet_info = self.data_dict[tweet_id]
        embedding = self.embeddings[tweet_id]

        tweet_text = preprocess_text(tweet_info['tweet_text'])

        return {
            "embedding": torch.tensor(embedding, dtype=torch.float16),
            "tweet_id": tweet_id,
            "tweet_text": tweet_text
        }

# Timeout support for the generation step of the validation loop
class TimeoutException(Exception):
    """Raised by ``timeout_handler`` when the SIGALRM alarm goes off."""

def timeout_handler(signum, frame):
    """Signal handler that aborts the interrupted code with ``TimeoutException``.

    ``signum`` and ``frame`` are the standard handler arguments; neither is used.
    """
    raise TimeoutException()

# Install timeout_handler for SIGALRM when this cell runs, so that a deadline
# armed with signal.alarm(...) raises TimeoutException in the running code.
# NOTE(review): SIGALRM exists only on Unix and signal.signal must be called
# from the main thread - confirm the notebook always runs under those conditions.
signal.signal(signal.SIGALRM, timeout_handler)

def custom_validation_loop(model, dataloader, device, tokenizer, linear_layer, print_every_n_steps=10, generation_timeout=30):
    """Project image embeddings, generate text under a time limit, and collect the pairs.

    Args:
        model: Object exposing ``eval()`` and ``generate(...)``.
        dataloader: Iterable of batches with the keys ``"embedding"`` (tensor)
            and ``"tweet_text"`` (sequence of str).
        device: Device tensors are moved to.
        tokenizer: Exposes ``pad_token_id``, ``eos_token_id`` and ``batch_decode``.
        linear_layer: Callable applied to the embeddings before generation.
        print_every_n_steps: Print the batch's samples whenever
            ``step % print_every_n_steps == 0``; ``0``/``None`` disables that printing.
        generation_timeout: Seconds passed to ``signal.alarm`` before each
            ``generate`` call.

    Returns:
        List of ``(original_text, generated_text)`` tuples. Batches whose
        generation times out or raises are reported on stdout and skipped.
    """
    model.eval()
    predictions = []
    for step, batch in enumerate(dataloader):
        print(f"Validation step {step}")
        print_memory_usage()
        embeddings = batch["embedding"].to(device)
        tweet_texts = batch["tweet_text"]

        print(f"Embeddings shape before linear layer: {embeddings.shape}")

        with torch.no_grad():
            # Apply the linear layer to project the embeddings.
            embeddings = linear_layer(embeddings).to(device)
            print(f"Embeddings shape after linear layer: {embeddings.shape}")
            # One pad token per sample is used as the starting token ids.
            input_ids = torch.full((embeddings.size(0), 1), tokenizer.pad_token_id, dtype=torch.long).to(device)

            print(f"Input IDs shape: {input_ids.shape}")

            # A 2-D result gets an extra axis at position 1 before generation.
            if len(embeddings.shape) == 2:
                embeddings = embeddings.unsqueeze(1)
                print(f"Reshaped Embeddings shape: {embeddings.shape}")

            print("Checking memory before generation")
            print_memory_usage()

            try:
                # Arm the alarm; timeout_handler raises TimeoutException when it fires.
                signal.alarm(generation_timeout)

                outputs = model.generate(
                    input_ids=input_ids,
                    inputs_embeds=embeddings,
                    max_length=256,  # Limit length to prevent excessively long texts
                    num_beams=2,
                    do_sample=True,
                    top_k=50,
                    top_p=0.95,
                    temperature=1.0,
                    repetition_penalty=2.0,  # Increase repetition penalty to avoid word repetitions
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id
                )
                print(f"Outputs shape: {outputs.shape}")
                decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            except TimeoutException:
                print("Generation timed out")
                continue
            except Exception as e:
                print(f"Error during generation: {e}")
                continue
            finally:
                # Always disarm the alarm. Previously an error inside generate()
                # skipped this, leaving a pending SIGALRM that could fire later
                # in unrelated code.
                signal.alarm(0)

        show_batch = bool(print_every_n_steps) and step % print_every_n_steps == 0
        for tweet_text, pred_text in zip(tweet_texts, decoded_outputs):
            predictions.append((tweet_text, pred_text))

            if show_batch:
                print(f"Step {step} - Original: {tweet_text}")
                print(f"Step {step} - Generated: {pred_text}")
                print()

    return predictions

def print_memory_usage():
    """Print host RAM usage and, when CUDA is available, per-GPU memory statistics.

    For each visible GPU the reserved, peak-allocated and currently-allocated
    amounts reported by ``torch.cuda`` are printed in megabytes.
    """
    bytes_per_mb = 1024 ** 2

    host = psutil.virtual_memory()
    print(f"Memory Usage: {host.percent}% used. {host.available / bytes_per_mb:.2f}MB available.")

    if not torch.cuda.is_available():
        return

    for gpu_index in range(torch.cuda.device_count()):
        reserved_mb = torch.cuda.memory_reserved(gpu_index) / bytes_per_mb
        peak_mb = torch.cuda.max_memory_allocated(gpu_index) / bytes_per_mb
        allocated_mb = torch.cuda.memory_allocated(gpu_index) / bytes_per_mb
        print(f"GPU {gpu_index} Memory Usage: {reserved_mb:.2f}MB reserved. {peak_mb:.2f}MB max allocated. {allocated_mb:.2f}MB currently allocated.")

# Function to train the model
def main_train_decoder():
    """Fine-tune Llama 3 8B Instruct with LoRA on tweet texts, then generate validation predictions.

    Steps, in order:
      1. Load the MMHS150K ground-truth JSON and the train/val id splits.
      2. Load the pickled image embeddings.
      3. Load the 4-bit (NF4) quantised base model and wrap it with LoRA adapters.
      4. Train with ``SFTTrainer`` on the ``tweet_text`` field, logging to Weights & Biases.
      5. Run ``custom_validation_loop`` on the validation split and print the predictions.

    Credentials are read from the ``HF_TOKEN`` and ``WANDB_API_KEY`` environment
    variables when set; otherwise the placeholder literals below are used.

    Returns:
        None. Results are reported through prints and the W&B run.
    """
    # Function-scope imports so this cell does not depend on which cells ran before it.
    import os
    import traceback

    base_path = "./"
    with open(f'{base_path}/MMHS150K_GT.json', 'r') as f:
        data = json.load(f)

    with open(f'{base_path}/splits/train_ids.txt', 'r') as f:
        id_train = f.read().split()
    with open(f'{base_path}/splits/val_ids.txt', 'r') as f:
        id_val = f.read().split()

    # Keep only the split ids that actually have an entry in the ground-truth file.
    dict_train = {x: data[x] for x in id_train if x in data}
    dict_val = {x: data[x] for x in id_val if x in data}

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load a pickle that was produced locally or by a trusted source.
    with open('image_embeddings.pkl', 'rb') as f:
        embeddings = pickle.load(f)

    # Prefer credentials from the environment so real secrets never need to be
    # written into the notebook. The literals are placeholders kept as a fallback.
    token = os.environ.get("HF_TOKEN", 'tu_token_hf')
    api_key = os.environ.get("WANDB_API_KEY", 'tu_api_key_wandb')

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    print_memory_usage()
    llama_model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Meta-Llama-3-8B-Instruct",
        quantization_config=bnb_config,
        device_map="auto",  # keep automatic device placement
    )
    print("Model loaded.")
    print_memory_usage()

    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token=token)

    # Reuse EOS as the padding token when the tokenizer does not define one.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
    )
    llama_model = get_peft_model(llama_model, peft_config)

    train_dataset = TextDataset(dict_train, embeddings, tokenizer)
    val_dataset = ValidationDataset(dict_val, embeddings, tokenizer)

    # Only validation needs a hand-built DataLoader; SFTTrainer batches the
    # training dataset itself. Small batch size to limit memory during generation.
    val_dataloader = DataLoader(val_dataset, batch_size=2, shuffle=False)

    # Projects each 768-dim image embedding to a 4096-dim vector for generation.
    # NOTE(review): this layer is only handed to custom_validation_loop, never to
    # the trainer, so validation uses its initial (untrained) weights.
    linear_layer = nn.Linear(768, 4096).to('cuda').to(torch.float16)

    # Initialise wandb before training starts.
    wandb.login(key=api_key)
    wandb.init(
        project='Fine-tune Llama 3 8B on Image Embeddings', 
        job_type="training", 
        anonymous="allow"
    )

    training_arguments = TrainingArguments(
        output_dir="llama-3-8b-meme-poster",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        gradient_accumulation_steps=8,
        optim="paged_adamw_32bit",
        num_train_epochs=1,
        evaluation_strategy="no",  # Disable automatic evaluation during training
        logging_steps=1,
        warmup_steps=10,
        logging_strategy="steps",
        learning_rate=5e-4,
        fp16=True,
        bf16=False,
        group_by_length=True,
        report_to="wandb"    
    )

    trainer = SFTTrainer(
        model=llama_model,
        train_dataset=train_dataset,
        eval_dataset=None,  # Disable automatic evaluation
        peft_config=peft_config,
        max_seq_length=256,
        dataset_text_field="tweet_text",
        tokenizer=tokenizer,
        args=training_arguments,
        packing=False,
    )

    print("Starting training.")
    print_memory_usage()
    trainer.train()
    print("Training finished.")
    print_memory_usage()

    wandb.finish()

    # Custom validation loop  
    print("Custom validation loop") 
    print("device = cuda 0")
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 
    print("Move linear layer to device")
    linear_layer.to(device)  # Move the linear layer to the same device
    
    # Release cached GPU memory before validation.
    torch.cuda.empty_cache()
    print("Memory cache cleared before validation.")
    print_memory_usage() 
     
    print("Now generate predictions")
    
    try:
        predictions = custom_validation_loop(llama_model, val_dataloader, device, tokenizer, linear_layer, print_every_n_steps=1)
        print(predictions)
    except Exception:
        # Catch only real errors: a bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit and hide why validation failed.
        print("Error al generar predicciones")
        traceback.print_exc()
    

# Entry point: run the full training + validation pipeline when executed directly.
if __name__ == "__main__":
    main_train_decoder()

2024-07-03 14:50:00.869258: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-03 14:50:00.945046: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[2024-07-03 14:50:03,605] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Memory Usage: 25.2% used. 192741.15MB available.
GPU 0 Memory Usage: 0.00MB reserved. 0.00MB max allocated. 0.00MB currently allocated.
GPU 1 Memory Usage: 0.00MB reserved. 0.00MB max allocated. 0.00MB currently allocated.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded.
Memory Usage: 25.4% used. 192168.53MB available.
GPU 0 Memory Usage: 1862.00MB reserved. 1955.44MB max allocated. 1860.59MB currently allocated.
GPU 1 Memory Usage: 3668.00MB reserved. 3694.41MB max allocated. 3578.44MB currently allocated.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Training data size: 134823
Validation data size: 5000


[34m[1mwandb[0m: Currently logged in as: [33mjsantamariag[0m ([33mj-santamariag[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/javiermo/.netrc



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Starting training.
Memory Usage: 25.5% used. 191997.89MB available.
GPU 0 Memory Usage: 1902.00MB reserved. 1955.44MB max allocated. 1900.59MB currently allocated.
GPU 1 Memory Usage: 3788.00MB reserved. 3716.46MB max allocated. 3704.45MB currently allocated.


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Step,Training Loss
1,5.7684
2,5.6615
3,5.5184
4,5.4043
5,4.9693
6,4.5203
7,4.6159
8,4.0214
9,3.9778
10,3.7144


Training finished.
Memory Usage: 28.4% used. 184461.53MB available.
GPU 0 Memory Usage: 8066.00MB reserved. 8039.13MB max allocated. 1954.84MB currently allocated.
GPU 1 Memory Usage: 22588.00MB reserved. 21963.49MB max allocated. 3834.70MB currently allocated.


VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/grad_norm,█▄▂▁▁▁▁▃▃▂▂▃▁▂▁▁▂▂▁▂▁▂▁▂▁▁▂▁▂▂▁▂▁▁▁▁▁▂▂▁
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,█▆▅▅▅▄▄▄▄▄▅▅▃▄▄▄▄▄▄▃▂▅▄▄▄▃▄▃▃▃▄▃▂▃▃▁▂▃▃▃

0,1
total_flos,7.812073095891517e+17
train/epoch,0.99964
train/global_step,1053.0
train/grad_norm,0.7594
train/learning_rate,0.0
train/loss,2.7642
train_loss,2.92072
train_runtime,13246.2151
train_samples_per_second,10.178
train_steps_per_second,0.079


Custom validation loop
device = cuda 0
Move linear layer to device


The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Memory cache cleared before validation.
Memory Usage: 28.4% used. 184458.85MB available.
GPU 0 Memory Usage: 1990.00MB reserved. 8039.13MB max allocated. 1960.85MB currently allocated.
GPU 1 Memory Usage: 4086.00MB reserved. 21963.49MB max allocated. 3828.69MB currently allocated.
Now generate predictions
Validation step 0
Memory Usage: 28.4% used. 184458.85MB available.
GPU 0 Memory Usage: 1990.00MB reserved. 8039.13MB max allocated. 1960.85MB currently allocated.
GPU 1 Memory Usage: 4086.00MB reserved. 21963.49MB max allocated. 3828.69MB currently allocated.
Embeddings shape before linear layer: torch.Size([2, 768])
Embeddings shape after linear layer: torch.Size([2, 4096])
Input IDs shape: torch.Size([2, 1])
Reshaped Embeddings shape: torch.Size([2, 1, 4096])
Checking memory before generation
Memory Usage: 28.4% used. 184458.85MB available.
GPU 0 Memory Usage: 1990.00MB reserved. 8039.13MB max allocated. 1960.87MB currently allocated.
GPU 1 Memory Usage: 4086.00MB reserved. 21963.49

: 

Me di cuenta de que había dejado el batch size de validación a 2. Quise interrumpir la ejecución para guardar el modelo (porque, si no, tardaría DEMASIADO y la sesión se me desconectaría igualmente), pero al hacerlo el kernel ha crasheado.

Sadge, pero tenemos al menos los prints. Guardaremos tras repetir con parámetros.

# 🔴 Ajuste de hiperparámetros

Voy a bajar topk y parámetros similares

outputs = model.generate(
    input_ids=input_ids,
    inputs_embeds=embeddings,
    max_length=128,  # Limit length to prevent excessively long texts
    num_beams=2,  # Consider fewer sequences
    do_sample=True,
    top_k=3,  # Consider fewer top words
    top_p=0.9,  # Consider words with a cumulative probability of 90%
    temperature=0.8,  # Make the model more conservative
    repetition_penalty=3,  # Increase repetition penalty to avoid word repetitions
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id
)



**Beneficios potenciales de ajustar estos parámetros:**
- **Mejora de coherencia:** reducir `top_k`, `top_p` y `temperature` puede ayudar a que las respuestas sean más coherentes y menos propensas a incluir palabras irrelevantes o inesperadas.
- **Reducción de repeticiones:** aumentar `repetition_penalty` puede ayudar a evitar la repetición excesiva de palabras o frases, mejorando la fluidez del texto.
- **Menor coste computacional:** un `num_beams` bajo reduce el tiempo y los recursos computacionales necesarios para la generación de texto (aquí se mantiene en 2, igual que en el experimento anterior).

Función para guardar el modelo

In [1]:
import os
from datetime import datetime

def save_model_and_tokenizer(trainer, tokenizer):
    """Save the trained model and its tokenizer under a timestamped directory.

    The target is ``saved-finetuned-llamas/<YYYYmmdd-HHMMSS>`` relative to the
    current working directory; it is created if it does not exist.

    Args:
        trainer: Object exposing ``model.save_pretrained(path)``.
        tokenizer: Object exposing ``save_pretrained(path)``.

    Returns:
        str: The directory both artefacts were written to. The name depends on
        the wall-clock time, so it is printed and returned to let the caller
        locate the checkpoint afterwards.
    """
    # Build the timestamped folder name.
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    save_dir = os.path.join("saved-finetuned-llamas", timestamp)
    os.makedirs(save_dir, exist_ok=True)

    # Write model first, then tokenizer, into the same folder.
    trainer.model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

    print(f"Model and tokenizer saved to {save_dir}")
    return save_dir



Nos aseguramos de no acaparar todas las gráficas

In [2]:
import os

# Expose only physical GPUs 1 and 2 to this process. The variable is set
# before torch is imported so that CUDA initialisation sees it.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"

import torch

# Visible devices are renumbered from 0: index 0 is physical GPU 1 and
# index 1 is physical GPU 2.
print(torch.cuda.device_count())  # expected to print 2
for visible_idx in (0, 1):
    print(torch.cuda.get_device_name(visible_idx))

2
NVIDIA GeForce RTX 3090
NVIDIA GeForce RTX 3090


Recuperamos el cálculo de métricas

In [3]:
%pip install rouge

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge

Ahora sí, entrenamiento, ajustando hiperparámetros y modificando el bucle de validación para calcular métricas

In [5]:
import re
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import (
    LoraConfig,
    get_peft_model,
)
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import wandb
from trl import SFTTrainer
import pickle
import json
import psutil
import signal

# Preprocessing function
def preprocess_text(text):
    """Strip links from a tweet and anonymise @mentions as ``[USR]``.

    Links are removed first (``http`` plus every following non-whitespace
    character), then each ``@`` followed by word characters is replaced.
    Surrounding whitespace is left untouched.
    """
    without_links = re.sub(r"http\S+", "", text)
    return re.sub(r"@\w+", "[USR]", without_links)

# Dataset class for training
class TextDataset(Dataset):
    """Training dataset pairing each tweet's tokenised text with its image embedding.

    Args:
        data_dict: Mapping ``tweet_id -> info`` where ``info['tweet_text']`` is the raw text.
        embeddings: Mapping ``tweet_id -> embedding`` (anything ``torch.tensor`` accepts).
        tokenizer: Callable tokenizer exposing ``pad_token_id``.
        max_length: Length every text is padded/truncated to.

    The id order is captured once at construction time; mutating ``data_dict``
    afterwards is not reflected by the dataset.
    """

    def __init__(self, data_dict, embeddings, tokenizer, max_length=128):
        self.data_dict = data_dict
        # Snapshot the ids once. Rebuilding list(dict.keys()) inside __getitem__
        # made every single lookup O(n) in the number of tweets.
        self._tweet_ids = list(data_dict.keys())
        print(f"Training data size: {len(self.data_dict)}")
        self.embeddings = embeddings
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self._tweet_ids)

    def __getitem__(self, idx):
        """Return the training example at position ``idx`` as a dict of tensors plus the cleaned text."""
        tweet_id = self._tweet_ids[idx]
        tweet_info = self.data_dict[tweet_id]
        embedding = self.embeddings[tweet_id]

        tweet_text = preprocess_text(tweet_info['tweet_text'])
        inputs = self.tokenizer(tweet_text, return_tensors="pt", max_length=self.max_length, padding='max_length', truncation=True)

        token_ids = inputs.input_ids.squeeze(0)
        labels = token_ids.clone()
        # Decoder input: the token sequence shifted right by one, with the pad
        # token as the first position and the last token dropped.
        # NOTE(review): attention_mask is NOT shifted along with input_ids, and
        # padding positions in labels keep their token id (no -100 masking).
        # Confirm this is what the consumer of these fields expects.
        input_ids = torch.cat([torch.tensor([self.tokenizer.pad_token_id]), token_ids[:-1]])

        return {
            "embedding": torch.tensor(embedding, dtype=torch.float16),
            "input_ids": input_ids,
            "attention_mask": inputs.attention_mask.squeeze(0),
            "labels": labels,
            "tweet_text": tweet_text,
            "inputs": token_ids
        }

# Dataset class for validation
class ValidationDataset(Dataset):
    """Validation dataset yielding each tweet's image embedding, id and cleaned text.

    Args:
        data_dict: Mapping ``tweet_id -> info`` where ``info['tweet_text']`` is the raw text.
        embeddings: Mapping ``tweet_id -> embedding`` (anything ``torch.tensor`` accepts).
        tokenizer: Stored for parity with ``TextDataset``; not used by ``__getitem__``.
        max_length: Stored for parity with ``TextDataset``; not used by ``__getitem__``.

    The id order is captured once at construction time; mutating ``data_dict``
    afterwards is not reflected by the dataset.
    """

    def __init__(self, data_dict, embeddings, tokenizer, max_length=128):
        self.data_dict = data_dict
        # Snapshot the ids once. Rebuilding list(dict.keys()) inside __getitem__
        # made every single lookup O(n) in the number of tweets.
        self._tweet_ids = list(data_dict.keys())
        print(f"Validation data size: {len(self.data_dict)}")
        self.embeddings = embeddings
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self._tweet_ids)

    def __getitem__(self, idx):
        """Return the float16 embedding, tweet id and preprocessed text at position ``idx``."""
        tweet_id = self._tweet_ids[idx]
        tweet_info = self.data_dict[tweet_id]
        embedding = self.embeddings[tweet_id]

        tweet_text = preprocess_text(tweet_info['tweet_text'])

        return {
            "embedding": torch.tensor(embedding, dtype=torch.float16),
            "tweet_id": tweet_id,
            "tweet_text": tweet_text
        }

# Exception type raised by the SIGALRM handler below when an alarm expires
class TimeoutException(Exception):
    """Signals that a time-limited operation ran past its alarm."""

def timeout_handler(signum, frame):
    """Signal handler that aborts the running code by raising ``TimeoutException``.

    ``signum`` and ``frame`` are the standard handler arguments; both are ignored.
    """
    raise TimeoutException()

# Register the signal function handler: when an alarm set with signal.alarm(...)
# expires, SIGALRM is delivered and timeout_handler runs. SIGALRM is only
# available on Unix platforms.
signal.signal(signal.SIGALRM, timeout_handler)

def custom_validation_loop(model, dataloader, device, tokenizer, linear_layer, print_every_n_steps=10):
    """Generate text for every validation batch and score it against the original tweet.

    For each batch the image embeddings are projected by ``linear_layer`` and
    passed to ``model.generate`` as a single ``inputs_embeds`` position. Each
    generation call is guarded by a 30-second ``SIGALRM`` alarm; batches that
    time out or raise are reported and skipped. Every decoded prediction is
    compared with its tweet using sentence-level BLEU and ROUGE-1/2/L, and the
    averages over all scored predictions are printed at the end.

    Args:
        model: Generative model exposing ``eval()`` and ``generate(...)``.
        dataloader: Iterable of batches with keys ``"embedding"`` and ``"tweet_text"``.
        device: Device the embeddings and prompt ids are moved to.
        tokenizer: Tokenizer exposing ``pad_token_id``, ``eos_token_id`` and ``batch_decode``.
        linear_layer: Module applied to the embeddings before generation.
        print_every_n_steps: Print per-sample details on steps divisible by this
            value; ``0``/``None`` disables the per-sample printing.

    Returns:
        list[tuple[str, str]]: ``(original_text, generated_text)`` pairs for
        every batch that was generated successfully.
    """
    model.eval()
    predictions = []
    total_bleu_score = 0
    rouge_keys = ('rouge-1', 'rouge-2', 'rouge-l')
    total_rouge_score = {key: {'f': 0, 'p': 0, 'r': 0} for key in rouge_keys}
    num_predictions = 0    
    rouge = Rouge()
    
    for step, batch in enumerate(dataloader):
        print("-----------------------------------")
        print(f"Validation step {step}")
        embeddings = batch["embedding"].to(device)
        tweet_texts = batch["tweet_text"]
        
        with torch.no_grad():
            # Project the embeddings with the linear layer.
            embeddings = linear_layer(embeddings).to(device)
            # One pad token per sample, passed alongside the embedding prompt.
            input_ids = torch.full((embeddings.size(0), 1), tokenizer.pad_token_id, dtype=torch.long).to(device)
            
            # A 2-D result gets an extra axis at position 1: the embedding
            # becomes a one-position sequence for inputs_embeds.
            if len(embeddings.shape) == 2:
                embeddings = embeddings.unsqueeze(1)
            
            try:
                # Set the alarm for 30 seconds
                signal.alarm(30)
                
                outputs = model.generate(
                    input_ids=input_ids,
                    inputs_embeds=embeddings,
                    max_length=256,  # Limit length to prevent excessively long texts
                    num_beams=2,
                    do_sample=True,
                    top_k=5,
                    top_p=0.9,
                    temperature=0.8,
                    repetition_penalty=3.0,  # Increase repetition penalty to avoid word repetitions
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id
                )
                decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            except TimeoutException:
                print("Generation timed out")
                continue
            except Exception as e:
                print(f"Error during generation: {e}")
                continue
            finally:
                # Always cancel the alarm. Previously a non-timeout error left it
                # armed, so SIGALRM could fire later outside this try block.
                signal.alarm(0)

        for tweet_text, pred_text in zip(tweet_texts, decoded_outputs):
            predictions.append((tweet_text, pred_text))
            
            # Calculate BLEU score (whitespace tokenisation, single reference)
            bleu_score = sentence_bleu([tweet_text.split()], pred_text.split())
            total_bleu_score += bleu_score
            
            # Calculate ROUGE score. The rouge package raises ValueError when the
            # hypothesis (or reference) has no usable content; score such a pair
            # as zero instead of aborting the whole validation run.
            try:
                rouge_scores = rouge.get_scores(pred_text, tweet_text, avg=True)
            except ValueError:
                rouge_scores = {key: {'f': 0.0, 'p': 0.0, 'r': 0.0} for key in rouge_keys}
            for key in rouge_keys:
                for metric in ('f', 'p', 'r'):
                    total_rouge_score[key][metric] += rouge_scores[key][metric]

            num_predictions += 1

            # Guard against print_every_n_steps == 0 (would divide by zero).
            if print_every_n_steps and step % print_every_n_steps == 0:
                print(f"Step {step} - Original: {tweet_text}")
                print(f"Step {step} - Generated: {pred_text}")
                print(f"BLEU Score: {bleu_score:.4f}")
                print(f"ROUGE Scores: {rouge_scores}")
                print()


    # Print average BLEU and ROUGE over all scored predictions
    avg_bleu_score = total_bleu_score / num_predictions if num_predictions > 0 else 0
    avg_rouge_score = {key: {metric: score / num_predictions for metric, score in scores.items()} for key, scores in total_rouge_score.items()} if num_predictions > 0 else total_rouge_score

    print(f"Average BLEU Score: {avg_bleu_score:.4f}")
    print(f"Average ROUGE Score: {avg_rouge_score}")
    
    return predictions

def print_memory_usage():
    """Print host RAM usage followed by one memory line per visible CUDA device.

    The host line reports the percentage of RAM in use and the amount still
    available. Each GPU line reports reserved, peak-allocated and currently
    allocated memory. All sizes are printed in MB (bytes / 1024**2).
    """
    bytes_per_mb = 1024 ** 2

    host = psutil.virtual_memory()
    print(f"Memory Usage: {host.percent}% used. {host.available / bytes_per_mb:.2f}MB available.")

    # Nothing GPU-related to report when CUDA is unavailable.
    if not torch.cuda.is_available():
        return

    for device_idx in range(torch.cuda.device_count()):
        currently_allocated = torch.cuda.memory_allocated(device_idx) / bytes_per_mb
        peak_allocated = torch.cuda.max_memory_allocated(device_idx) / bytes_per_mb
        reserved = torch.cuda.memory_reserved(device_idx) / bytes_per_mb
        print(f"GPU {device_idx} Memory Usage: {reserved:.2f}MB reserved. {peak_allocated:.2f}MB max allocated. {currently_allocated:.2f}MB currently allocated.")

# Function to train the model
def main_train_decoder():
    """Fine-tune Llama 3 8B Instruct with LoRA on tweet texts, save it, then generate validation predictions.

    Steps, in order:
      1. Load the MMHS150K ground-truth JSON and the train/val id splits.
      2. Load the pickled image embeddings.
      3. Load the 4-bit (NF4) quantised base model and wrap it with LoRA adapters.
      4. Train with ``SFTTrainer`` on the ``tweet_text`` field, logging to Weights & Biases.
      5. Save the model and tokenizer with ``save_model_and_tokenizer``.
      6. Run ``custom_validation_loop`` on the validation split and print the predictions.

    Credentials are read from the ``HF_TOKEN`` and ``WANDB_API_KEY`` environment
    variables when set; otherwise the placeholder literals below are used.

    Returns:
        None. Results are reported through prints and the W&B run.
    """
    # Function-scope imports so this cell does not depend on which cells ran before it.
    import os
    import traceback

    base_path = "./"
    with open(f'{base_path}/MMHS150K_GT.json', 'r') as f:
        data = json.load(f)

    with open(f'{base_path}/splits/train_ids.txt', 'r') as f:
        id_train = f.read().split()
    with open(f'{base_path}/splits/val_ids.txt', 'r') as f:
        id_val = f.read().split()

    # Keep only the split ids that actually have an entry in the ground-truth file.
    dict_train = {x: data[x] for x in id_train if x in data}
    dict_val = {x: data[x] for x in id_val if x in data}

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load a pickle that was produced locally or by a trusted source.
    with open('image_embeddings.pkl', 'rb') as f:
        embeddings = pickle.load(f)

    # Prefer credentials from the environment so real secrets never need to be
    # written into the notebook. The literals are placeholders kept as a fallback.
    token = os.environ.get("HF_TOKEN", 'tu_token_hf')
    api_key = os.environ.get("WANDB_API_KEY", 'tu_api_key_wandb')

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    print_memory_usage()
    llama_model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Meta-Llama-3-8B-Instruct",
        quantization_config=bnb_config,
        device_map="auto",  # keep automatic device placement
    )
    print("Model loaded.")
    print_memory_usage()

    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token=token)

    # Reuse EOS as the padding token when the tokenizer does not define one.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    peft_config = LoraConfig(
        r=16,
        lora_alpha=48, # 32
        lora_dropout=0.08, # 0.05
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
    )
    llama_model = get_peft_model(llama_model, peft_config)

    train_dataset = TextDataset(dict_train, embeddings, tokenizer)
    val_dataset = ValidationDataset(dict_val, embeddings, tokenizer)

    # Only validation needs a hand-built DataLoader; SFTTrainer batches the
    # training dataset itself.
    val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False) 

    # Projects each 768-dim image embedding to a 4096-dim vector for generation.
    # NOTE(review): this layer is only handed to custom_validation_loop, never to
    # the trainer, so validation uses its initial (untrained) weights.
    linear_layer = nn.Linear(768, 4096).to('cuda').to(torch.float16)

    # Initialise wandb before training starts.
    wandb.login(key=api_key)
    wandb.init(
        project='Fine-tune Llama 3 8B on Image Embeddings', 
        job_type="training", 
        anonymous="allow"
    )

    training_arguments = TrainingArguments(
        output_dir="llama-3-8b-meme-poster",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        gradient_accumulation_steps=8,
        optim="paged_adamw_32bit",
        num_train_epochs=2,
        evaluation_strategy="no",  # Disable automatic evaluation during training
        logging_steps=1,
        warmup_steps=10,
        logging_strategy="steps",
        learning_rate=5e-4,
        fp16=True,
        bf16=False,
        group_by_length=True,
        report_to="wandb"    
    )

    trainer = SFTTrainer(
        model=llama_model,
        train_dataset=train_dataset,
        eval_dataset=None,  # Disable automatic evaluation
        peft_config=peft_config,
        max_seq_length=256,
        dataset_text_field="tweet_text",
        tokenizer=tokenizer,
        args=training_arguments,
        packing=False,
    )

    print("Starting training.")
    print_memory_usage()
    trainer.train()
    print("Training finished.")
    print_memory_usage()

    # Persist the model and tokenizer right after training, before the slow
    # validation step, so a later failure cannot lose the training result.
    save_model_and_tokenizer(trainer, tokenizer)

    wandb.finish()

    # Custom validation loop  
    print("Custom validation loop") 
    print("device = cuda 0")
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 
    print("Move linear layer to device")
    linear_layer.to(device)  # Move the linear layer to the same device
    
    # Release cached GPU memory before validation.
    torch.cuda.empty_cache()
    print("Memory cache cleared before validation.")
    print_memory_usage() 
     
    print("Now generate predictions")
    
    try:
        predictions = custom_validation_loop(llama_model, val_dataloader, device, tokenizer, linear_layer, print_every_n_steps=1)
        print(predictions)
    except Exception:
        # Catch only real errors: a bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit and hide why validation failed.
        print("Error al generar predicciones")
        traceback.print_exc()
    

# Entry point: run the full training + save + validation pipeline when executed directly.
if __name__ == "__main__":
    main_train_decoder()

2024-07-05 03:45:28.764769: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-05 03:45:28.816024: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[2024-07-05 03:45:30,761] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Memory Usage: 40.2% used. 153930.11MB available.
GPU 0 Memory Usage: 0.00MB reserved. 0.00MB max allocated. 0.00MB currently allocated.
GPU 1 Memory Usage: 0.00MB reserved. 0.00MB max allocated. 0.00MB currently allocated.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded.
Memory Usage: 40.4% used. 153470.68MB available.
GPU 0 Memory Usage: 1862.00MB reserved. 1955.44MB max allocated. 1860.59MB currently allocated.
GPU 1 Memory Usage: 3668.00MB reserved. 3694.41MB max allocated. 3578.44MB currently allocated.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Training data size: 134823
Validation data size: 5000


[34m[1mwandb[0m: Currently logged in as: [33mjsantamariag[0m ([33mj-santamariag[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/javiermo/.netrc



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Starting training.
Memory Usage: 40.5% used. 153355.98MB available.
GPU 0 Memory Usage: 1902.00MB reserved. 1955.44MB max allocated. 1900.59MB currently allocated.
GPU 1 Memory Usage: 3788.00MB reserved. 3716.46MB max allocated. 3704.45MB currently allocated.


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Step,Training Loss
1,5.7684
2,5.6615
3,5.4615
4,5.2612
5,4.7302
6,4.4635
7,4.5567
8,3.8559
9,3.9573
10,3.8221


Training finished.
Memory Usage: 41.8% used. 149820.30MB available.
GPU 0 Memory Usage: 8066.00MB reserved. 8039.13MB max allocated. 1954.84MB currently allocated.
GPU 1 Memory Usage: 22588.00MB reserved. 21963.49MB max allocated. 3834.70MB currently allocated.


VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/grad_norm,█▁▂▁▄▂▁▅▃▂▄▂▁▃▂▂▁▂▂▂▁▃▄▃▃▅▃▃▄▃▄▃▃▄▄▄▄▃▄▃
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,█▇▆▆▇▆▅▆▇▇█▇▆▅▆▇▇▆▄▇▃▁▂▂▂▃▁▃▂▂▃▂▄▃▁▃▂▂▂▁

0,1
total_flos,1.562362455228678e+18
train/epoch,1.99929
train/global_step,2106.0
train/grad_norm,1.40721
train/learning_rate,0.0
train/loss,2.5072
train_loss,2.72234
train_runtime,26979.3527
train_samples_per_second,9.995
train_steps_per_second,0.078


Custom validation loop
device = cuda 0
Move linear layer to device


The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Memory cache cleared before validation.
Memory Usage: 41.8% used. 149804.32MB available.
GPU 0 Memory Usage: 1990.00MB reserved. 8039.13MB max allocated. 1960.85MB currently allocated.
GPU 1 Memory Usage: 4086.00MB reserved. 21963.49MB max allocated. 3828.69MB currently allocated.
Now generate predictions
-----------------------------------
Validation step 0
Generation timed out
-----------------------------------
Validation step 1
Generation timed out
-----------------------------------
Validation step 2
Generation timed out
-----------------------------------
Validation step 3
Generation timed out
-----------------------------------
Validation step 4
Generation timed out
-----------------------------------
Validation step 5
Generation timed out
-----------------------------------
Validation step 6
Generation timed out
-----------------------------------
Validation step 7


(El output se ha truncado, pero llegó como al 300)


Con estos ajustes tiende a generar textos infinitos (por eso los timeout) y, cuando no, genera textos sin sentido (símbolos y caracteres random). Experimento descartado.

# 🔴Nuevo intento con los parámetros originales pero con ROUGE y más val batch size

In [1]:
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge


import os
from datetime import datetime

def save_model_and_tokenizer(trainer, tokenizer):
    """Save the trained model and its tokenizer under a timestamped directory.

    The target is ``saved-finetuned-llamas/<YYYYmmdd-HHMMSS>`` relative to the
    current working directory; it is created if it does not exist.

    Args:
        trainer: Object exposing ``model.save_pretrained(path)``.
        tokenizer: Object exposing ``save_pretrained(path)``.

    Returns:
        str: The directory both artefacts were written to. The name depends on
        the wall-clock time, so it is printed and returned to let the caller
        locate the checkpoint afterwards.
    """
    # Build the timestamped folder name.
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    save_dir = os.path.join("saved-finetuned-llamas", timestamp)
    os.makedirs(save_dir, exist_ok=True)

    # Write model first, then tokenizer, into the same folder.
    trainer.model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

    print(f"Model and tokenizer saved to {save_dir}")
    return save_dir
    

# Expose only physical GPUs 1 and 2 to this process. The variable is set
# before torch is imported so that CUDA initialisation sees it.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"

import torch

# Visible devices are renumbered from 0: index 0 is physical GPU 1 and
# index 1 is physical GPU 2.
print(torch.cuda.device_count())  # expected to print 2
for visible_idx in (0, 1):
    print(torch.cuda.get_device_name(visible_idx))


2
NVIDIA GeForce RTX 3090
NVIDIA GeForce RTX 3090


In [2]:
import re
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import (
    LoraConfig,
    get_peft_model,
)
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import wandb
from trl import SFTTrainer
import pickle
import json
import psutil
import signal

# Preprocessing function
def preprocess_text(text):
    """Strip links from a tweet and anonymise @mentions as ``[USR]``.

    Links are removed first (``http`` plus every following non-whitespace
    character), then each ``@`` followed by word characters is replaced.
    Surrounding whitespace is left untouched.
    """
    without_links = re.sub(r"http\S+", "", text)
    return re.sub(r"@\w+", "[USR]", without_links)

# Dataset class for training
class TextDataset(Dataset):
    """Training dataset pairing each tweet's tokenised text with its image embedding.

    Args:
        data_dict: Mapping ``tweet_id -> info`` where ``info['tweet_text']`` is the raw text.
        embeddings: Mapping ``tweet_id -> embedding`` (anything ``torch.tensor`` accepts).
        tokenizer: Callable tokenizer exposing ``pad_token_id``.
        max_length: Length every text is padded/truncated to.

    The id order is captured once at construction time; mutating ``data_dict``
    afterwards is not reflected by the dataset.
    """

    def __init__(self, data_dict, embeddings, tokenizer, max_length=128):
        self.data_dict = data_dict
        # Snapshot the ids once. Rebuilding list(dict.keys()) inside __getitem__
        # made every single lookup O(n) in the number of tweets.
        self._tweet_ids = list(data_dict.keys())
        print(f"Training data size: {len(self.data_dict)}")
        self.embeddings = embeddings
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self._tweet_ids)

    def __getitem__(self, idx):
        """Return the training example at position ``idx`` as a dict of tensors plus the cleaned text."""
        tweet_id = self._tweet_ids[idx]
        tweet_info = self.data_dict[tweet_id]
        embedding = self.embeddings[tweet_id]

        tweet_text = preprocess_text(tweet_info['tweet_text'])
        inputs = self.tokenizer(tweet_text, return_tensors="pt", max_length=self.max_length, padding='max_length', truncation=True)

        token_ids = inputs.input_ids.squeeze(0)
        labels = token_ids.clone()
        # Decoder input: the token sequence shifted right by one, with the pad
        # token as the first position and the last token dropped.
        # NOTE(review): attention_mask is NOT shifted along with input_ids, and
        # padding positions in labels keep their token id (no -100 masking).
        # Confirm this is what the consumer of these fields expects.
        input_ids = torch.cat([torch.tensor([self.tokenizer.pad_token_id]), token_ids[:-1]])

        return {
            "embedding": torch.tensor(embedding, dtype=torch.float16),
            "input_ids": input_ids,
            "attention_mask": inputs.attention_mask.squeeze(0),
            "labels": labels,
            "tweet_text": tweet_text,
            "inputs": token_ids
        }

# Dataset class for validation
class ValidationDataset(Dataset):
    """Validation dataset yielding the image embedding and cleaned reference text per tweet.

    Args:
        data_dict: Mapping ``tweet_id -> info dict``; each info dict must hold
            the raw text under ``'tweet_text'``.
        embeddings: Mapping ``tweet_id -> embedding`` (anything accepted by
            ``torch.tensor``).
        tokenizer: Stored on the instance; not used by ``__getitem__``.
        max_length: Stored on the instance; not used by ``__getitem__``.
    """

    def __init__(self, data_dict, embeddings, tokenizer, max_length=128):
        self.data_dict = data_dict
        print(f"Validation data size: {len(self.data_dict)}")
        # Snapshot the ids once: rebuilding list(dict.keys()) on every
        # __getitem__ call costs O(n) per sample.
        self.tweet_ids = list(self.data_dict.keys())
        self.embeddings = embeddings
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.tweet_ids)

    def __getitem__(self, idx):
        """Return embedding (float16 tensor), id and cleaned text of the ``idx``-th tweet."""
        tweet_id = self.tweet_ids[idx]
        tweet_info = self.data_dict[tweet_id]
        embedding = self.embeddings[tweet_id]

        tweet_text = preprocess_text(tweet_info['tweet_text'])

        return {
            "embedding": torch.tensor(embedding, dtype=torch.float16),
            "tweet_id": tweet_id,
            "tweet_text": tweet_text
        }

# Timeout support for the validation loop: SIGALRM is turned into a TimeoutException
class TimeoutException(Exception):
    """Raised by ``timeout_handler`` when the SIGALRM alarm goes off."""

def timeout_handler(signum, frame):
    """Signal handler that converts a delivered signal into a ``TimeoutException``.

    Both arguments are the standard handler parameters and are ignored.
    """
    raise TimeoutException()

# Install timeout_handler for SIGALRM so that an alarm armed with
# signal.alarm() surfaces as a TimeoutException in the running code.
signal.signal(signal.SIGALRM, timeout_handler)

def custom_validation_loop(model, dataloader, device, tokenizer, linear_layer, print_every_n_steps=10, generation_timeout=30):
    """Generate text from image embeddings and score it against the reference tweets.

    For every batch the embeddings are projected with ``linear_layer``, fed to
    ``model.generate`` as ``inputs_embeds`` (one position per sample), and the
    decoded outputs are compared with the reference texts using sentence-level
    BLEU and ROUGE on whitespace tokens. Average scores are printed at the end.

    Args:
        model: Model exposing ``eval()`` and ``generate(...)``.
        dataloader: Iterable of batches with keys ``"embedding"`` (tensor) and
            ``"tweet_text"`` (sequence of reference strings).
        device: Device the tensors are moved to before generation.
        tokenizer: Tokenizer providing ``pad_token_id``, ``eos_token_id`` and
            ``batch_decode``.
        linear_layer: Module projecting the embeddings to the model's hidden size.
        print_every_n_steps: Samples are printed for steps divisible by this value.
        generation_timeout: Whole seconds a single ``generate`` call may take
            before it is aborted through SIGALRM. Batches that time out or
            fail are skipped.

    Returns:
        List of ``(reference_text, generated_text)`` tuples for every batch
        that was generated successfully.
    """
    model.eval()
    predictions = []
    total_bleu_score = 0
    total_rouge_score = {'rouge-1': {'f': 0, 'p': 0, 'r': 0},
                         'rouge-2': {'f': 0, 'p': 0, 'r': 0},
                         'rouge-l': {'f': 0, 'p': 0, 'r': 0}}
    num_predictions = 0
    rouge = Rouge()

    for step, batch in enumerate(dataloader):
        print("----------------------------------")
        print(f"Validation step {step}")
        embeddings = batch["embedding"].to(device)
        tweet_texts = batch["tweet_text"]

        with torch.no_grad():
            # Project the image embeddings into the model's embedding space.
            embeddings = linear_layer(embeddings).to(device)

            # generate() expects (batch, seq, hidden); add the sequence axis.
            if embeddings.dim() == 2:
                embeddings = embeddings.unsqueeze(1)

            input_ids = torch.full((embeddings.size(0), 1), tokenizer.pad_token_id, dtype=torch.long, device=device)
            # Explicit all-ones mask: it cannot be inferred when the pad token
            # equals the EOS token, which otherwise triggers a warning.
            attention_mask = torch.ones(embeddings.shape[:2], dtype=torch.long, device=device)

            try:
                # Arm the alarm; timeout_handler raises TimeoutException on expiry.
                signal.alarm(generation_timeout)

                outputs = model.generate(
                    input_ids=input_ids,
                    inputs_embeds=embeddings,
                    attention_mask=attention_mask,
                    max_length=256,  # Limit length to prevent excessively long texts
                    num_beams=2,
                    do_sample=True,
                    top_k=30,  # Lower than 50, but not by much
                    top_p=0.95,
                    temperature=1.0,
                    repetition_penalty=2.0,  # Increase repetition penalty to avoid word repetitions
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id
                )
                decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            except TimeoutException:
                print("Generation timed out")
                continue
            except Exception as e:
                print(f"Error during generation: {e}")
                continue
            finally:
                # Always disarm the alarm, also on failure; otherwise a pending
                # SIGALRM could fire later outside of this try block.
                signal.alarm(0)

        for tweet_text, pred_text in zip(tweet_texts, decoded_outputs):
            predictions.append((tweet_text, pred_text))

            # Calculate BLEU score
            reference = [tweet_text.split()]
            candidate = pred_text.split()
            bleu_score = sentence_bleu(reference, candidate)
            total_bleu_score += bleu_score

            # Calculate ROUGE score; an empty hypothesis/reference makes the
            # rouge package raise ValueError, which counts as a zero score
            # instead of aborting the whole validation run.
            try:
                rouge_scores = rouge.get_scores(pred_text, tweet_text, avg=True)
            except ValueError:
                rouge_scores = {key: {'f': 0.0, 'p': 0.0, 'r': 0.0} for key in total_rouge_score}
            for key in total_rouge_score.keys():
                total_rouge_score[key]['f'] += rouge_scores[key]['f']
                total_rouge_score[key]['p'] += rouge_scores[key]['p']
                total_rouge_score[key]['r'] += rouge_scores[key]['r']

            num_predictions += 1

            if step % print_every_n_steps == 0:
                print(f"Step {step} - Original: {tweet_text}")
                print(f"Step {step} - Generated: {pred_text}")
                print(f"BLEU Score: {bleu_score:.4f}")
                print(f"ROUGE Scores: {rouge_scores}")
                print()

    # Print average BLEU and ROUGE
    avg_bleu_score = total_bleu_score / num_predictions if num_predictions > 0 else 0
    avg_rouge_score = {key: {metric: score / num_predictions for metric, score in scores.items()} for key, scores in total_rouge_score.items()} if num_predictions > 0 else total_rouge_score

    print(f"Average BLEU Score: {avg_bleu_score:.4f}")
    print(f"Average ROUGE Score: {avg_rouge_score}")
    return predictions

def print_memory_usage():
    """Print host RAM usage and, when CUDA is available, per-GPU memory statistics.

    Byte counts are divided by 1024**2 and labelled "MB" in the output.
    """
    bytes_per_mb = 1024 ** 2

    host = psutil.virtual_memory()
    print(f"Memory Usage: {host.percent}% used. {host.available / bytes_per_mb:.2f}MB available.")

    if not torch.cuda.is_available():
        return

    for gpu_index in range(torch.cuda.device_count()):
        reserved_mb = torch.cuda.memory_reserved(gpu_index) / bytes_per_mb
        peak_mb = torch.cuda.max_memory_allocated(gpu_index) / bytes_per_mb
        allocated_mb = torch.cuda.memory_allocated(gpu_index) / bytes_per_mb
        print(f"GPU {gpu_index} Memory Usage: {reserved_mb:.2f}MB reserved. {peak_mb:.2f}MB max allocated. {allocated_mb:.2f}MB currently allocated.")

# Function to train the model
def main_train_decoder():
    """Fine-tune Llama 3 8B Instruct with LoRA on the MMHS150K tweets, then validate it.

    Steps, in order:
      1. Load the tweet annotations, the train/val id splits and the pickled
         image embeddings from the working directory.
      2. Load the 4-bit (NF4) quantized base model and its tokenizer, and wrap
         the model with LoRA adapters.
      3. Train with ``SFTTrainer`` (logging to Weights & Biases) and save the
         model and tokenizer through ``save_model_and_tokenizer``.
      4. Run ``custom_validation_loop`` on the validation split and print the
         resulting predictions.

    Credentials are read from the ``HF_TOKEN`` and ``WANDB_API_KEY``
    environment variables; the original placeholder strings remain as fallbacks.

    Validation is best-effort: a failure there is reported together with its
    traceback but is not re-raised.
    """
    import os
    import traceback

    base_path = "./"
    with open(f'{base_path}/MMHS150K_GT.json', 'r') as f:
        data = json.load(f)

    with open(f'{base_path}/splits/train_ids.txt', 'r') as f:
        id_train = f.read().split()
    with open(f'{base_path}/splits/val_ids.txt', 'r') as f:
        id_val = f.read().split()

    # Keep only the ids that actually have an annotation entry.
    dict_train = {x: data[x] for x in id_train if x in data}
    dict_val = {x: data[x] for x in id_val if x in data}

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open('image_embeddings.pkl', 'rb') as f:
        embeddings = pickle.load(f)

    # Never hard-code real credentials; prefer the environment variables.
    token = os.environ.get("HF_TOKEN", 'tu_token_hf')
    api_key = os.environ.get("WANDB_API_KEY", 'tu_api_key_wandb')

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    print_memory_usage()
    llama_model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Meta-Llama-3-8B-Instruct",
        quantization_config=bnb_config,
        device_map="auto",  # Keep automatic device placement
    )
    print("Model loaded.")
    print_memory_usage()

    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token=token)

    # Reuse the EOS token for padding when the tokenizer defines no pad token.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
    )
    llama_model = get_peft_model(llama_model, peft_config)

    train_dataset = TextDataset(dict_train, embeddings, tokenizer)
    val_dataset = ValidationDataset(dict_val, embeddings, tokenizer)

    # Only validation needs a hand-made DataLoader; the trainer receives the
    # training dataset directly.
    val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    # Add linear layer for projecting embeddings.
    # NOTE(review): this layer is never handed to the trainer, so it still has
    # its random initial weights when used for validation - confirm intended.
    linear_layer = nn.Linear(768, 4096).to('cuda').to(torch.float16)

    # Initialize wandb before training
    wandb.login(key=api_key)
    wandb.init(
        project='Fine-tune Llama 3 8B on Image Embeddings', 
        job_type="training", 
        anonymous="allow"
    )

    training_arguments = TrainingArguments(
        output_dir="llama-3-8b-meme-poster",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        gradient_accumulation_steps=8,
        optim="paged_adamw_32bit",
        num_train_epochs=2,
        evaluation_strategy="no",  # Disable automatic evaluation during training
        logging_steps=1,
        warmup_steps=10,
        logging_strategy="steps",
        learning_rate=5e-4,
        fp16=True,
        bf16=False,
        group_by_length=True,
        report_to="wandb"    
    )

    trainer = SFTTrainer(
        model=llama_model,
        train_dataset=train_dataset,
        eval_dataset=None,  # Disable automatic evaluation
        peft_config=peft_config,
        max_seq_length=256,
        dataset_text_field="tweet_text",
        tokenizer=tokenizer,
        args=training_arguments,
        packing=False,
    )

    print("Starting training.")
    print_memory_usage()
    trainer.train()
    print("Training finished.")
    print_memory_usage()

    # Persist the fine-tuned model and tokenizer
    save_model_and_tokenizer(trainer, tokenizer)

    wandb.finish()

    # Custom validation loop  
    print("Custom validation loop") 
    print("device = cuda 0")
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 
    print("Move linear layer to device")
    linear_layer.to(device)  # Move the linear layer to the same device

    # Free cached GPU memory before validation
    torch.cuda.empty_cache()
    print("Memory cache cleared before validation.")
    print_memory_usage() 

    print("Now generate predictions")

    try:
        predictions = custom_validation_loop(llama_model, val_dataloader, device, tokenizer, linear_layer, print_every_n_steps=1)
        print(predictions)
    except Exception as exc:
        # Stay best-effort, but surface the actual cause instead of hiding it
        # (a bare except would also swallow KeyboardInterrupt).
        print("Error al generar predicciones")
        print(f"{type(exc).__name__}: {exc}")
        traceback.print_exc()
    

# Entry point: run training followed by validation when this code is executed
# as the main module.
if __name__ == "__main__":
    main_train_decoder()

2024-07-05 17:34:42.052622: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-05 17:34:42.122260: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[2024-07-05 17:34:44,517] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Memory Usage: 40.6% used. 153015.15MB available.
GPU 0 Memory Usage: 0.00MB reserved. 0.00MB max allocated. 0.00MB currently allocated.
GPU 1 Memory Usage: 0.00MB reserved. 0.00MB max allocated. 0.00MB currently allocated.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded.
Memory Usage: 40.9% used. 152329.49MB available.
GPU 0 Memory Usage: 1862.00MB reserved. 1955.44MB max allocated. 1860.59MB currently allocated.
GPU 1 Memory Usage: 3668.00MB reserved. 3694.41MB max allocated. 3578.44MB currently allocated.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Training data size: 134823
Validation data size: 5000


[34m[1mwandb[0m: Currently logged in as: [33mjsantamariag[0m ([33mj-santamariag[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/javiermo/.netrc



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Starting training.
Memory Usage: 40.9% used. 152195.85MB available.
GPU 0 Memory Usage: 1902.00MB reserved. 1955.44MB max allocated. 1900.59MB currently allocated.
GPU 1 Memory Usage: 3788.00MB reserved. 3716.46MB max allocated. 3704.45MB currently allocated.


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Step,Training Loss
1,5.7684
2,5.6615
3,5.5187
4,5.4054
5,4.9691
6,4.5286
7,4.6213
8,4.0455
9,4.0281
10,3.7232


Training finished.
Memory Usage: 39.8% used. 154987.49MB available.
GPU 0 Memory Usage: 8066.00MB reserved. 8039.13MB max allocated. 1954.84MB currently allocated.
GPU 1 Memory Usage: 22588.00MB reserved. 21963.49MB max allocated. 3834.70MB currently allocated.


VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/grad_norm,▃▄▂▁▅▂▂▆▂▂▄▃▂▄▃▄▃▃▄▃▂▄▆▄▆█▆▅▅▅▆▅▆▆▆▅▆▆▆▆
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,█▇▆▆▆▆▅▆▇▇█▇▆▅▆▇▇▆▄▇▄▁▃▂▂▃▁▃▂▂▃▂▄▃▁▃▂▂▂▂

0,1
total_flos,1.562362455228678e+18
train/epoch,1.99929
train/global_step,2106.0
train/grad_norm,1.08491
train/learning_rate,0.0
train/loss,2.5335
train_loss,2.70449
train_runtime,27449.1186
train_samples_per_second,9.823
train_steps_per_second,0.077


Custom validation loop
device = cuda 0
Move linear layer to device


The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Memory cache cleared before validation.
Memory Usage: 39.8% used. 154972.09MB available.
GPU 0 Memory Usage: 1990.00MB reserved. 8039.13MB max allocated. 1960.85MB currently allocated.
GPU 1 Memory Usage: 4086.00MB reserved. 21963.49MB max allocated. 3828.69MB currently allocated.
Now generate predictions
----------------------------------
Validation step 0
Generation timed out
----------------------------------
Validation step 1
Generation timed out
----------------------------------
Validation step 2
Generation timed out
----------------------------------
Validation step 3
Generation timed out
----------------------------------
Validation step 4
Generation timed out
----------------------------------
Validation step 5


Dos épocas es demasiado, sobreentrena. Dejémoslo en una.

# 🔴 Una sola época de nuevo y mejorada la validación

In [1]:
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge


import os
from datetime import datetime

def save_model_and_tokenizer(trainer, tokenizer):
    """Save the trainer's model and the tokenizer into a timestamped folder.

    The folder is ``saved-finetuned-llamas/<YYYYmmdd-HHMMSS>`` relative to the
    current working directory and is created if it does not exist yet.
    """
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    target_dir = os.path.join("saved-finetuned-llamas", timestamp)
    os.makedirs(target_dir, exist_ok=True)

    # Model first, tokenizer second; both go into the same folder.
    for artifact in (trainer.model, tokenizer):
        artifact.save_pretrained(target_dir)
    

# Expose only physical GPUs 1 and 2 to this process. The variable is set
# before torch is imported below.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"

import torch

# From here on PyTorch only sees GPUs 1 and 2, re-indexed as devices 0 and 1.
print(torch.cuda.device_count())  # Should print 2
print(torch.cuda.get_device_name(0))  # Name of the first visible GPU (formerly GPU 1)
print(torch.cuda.get_device_name(1))  # Name of the second visible GPU (formerly GPU 2)


2
NVIDIA GeForce RTX 3090
NVIDIA GeForce RTX 3090


In [2]:
import re
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import (
    LoraConfig,
    get_peft_model,
)
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import wandb
from trl import SFTTrainer
import pickle
import json
import psutil
import signal

# Preprocessing function
def preprocess_text(text):
    """Normalize a tweet: strip URLs first, then mask user mentions.

    Every run that starts with ``http`` and extends to the next whitespace is
    deleted, and every ``@handle`` is replaced by the literal placeholder
    ``[USR]``. Surrounding whitespace is left untouched.
    """
    without_links = re.sub(r"http\S+", "", text)
    return re.sub(r"@\w+", "[USR]", without_links)

# Dataset class for training
class TextDataset(Dataset):
    """Training dataset that pairs each tweet's tokenized text with its image embedding.

    Args:
        data_dict: Mapping ``tweet_id -> info dict``; each info dict must hold
            the raw text under ``'tweet_text'``.
        embeddings: Mapping ``tweet_id -> embedding`` (anything accepted by
            ``torch.tensor``).
        tokenizer: Callable tokenizer exposing ``pad_token_id``.
        max_length: Fixed length every text is padded/truncated to.
    """

    def __init__(self, data_dict, embeddings, tokenizer, max_length=128):
        self.data_dict = data_dict
        print(f"Training data size: {len(self.data_dict)}")
        # Snapshot the ids once: rebuilding list(dict.keys()) on every
        # __getitem__ call costs O(n) per sample, i.e. O(n^2) per epoch.
        self.tweet_ids = list(self.data_dict.keys())
        self.embeddings = embeddings
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.tweet_ids)

    def __getitem__(self, idx):
        """Return the tensors and cleaned text for the ``idx``-th tweet."""
        tweet_id = self.tweet_ids[idx]
        tweet_info = self.data_dict[tweet_id]
        embedding = self.embeddings[tweet_id]

        tweet_text = preprocess_text(tweet_info['tweet_text'])
        inputs = self.tokenizer(tweet_text, return_tensors="pt", max_length=self.max_length, padding='max_length', truncation=True)

        # Labels are the un-shifted token ids (padding positions included).
        labels = inputs.input_ids.squeeze(0).clone()
        # Decoder input is the same sequence shifted right by one position,
        # with the pad token acting as the start symbol.
        # NOTE(review): the attention mask below is NOT shifted along with
        # input_ids - confirm this is intended.
        input_ids = inputs.input_ids.squeeze(0).clone()
        input_ids = torch.cat([torch.tensor([self.tokenizer.pad_token_id]), input_ids[:-1]])

        return {
            "embedding": torch.tensor(embedding, dtype=torch.float16),
            "input_ids": input_ids,
            "attention_mask": inputs.attention_mask.squeeze(0),
            "labels": labels,
            "tweet_text": tweet_text,
            "inputs": inputs.input_ids.squeeze(0)
        }

# Dataset class for validation
class ValidationDataset(Dataset):
    """Validation dataset yielding the image embedding and cleaned reference text per tweet.

    Args:
        data_dict: Mapping ``tweet_id -> info dict``; each info dict must hold
            the raw text under ``'tweet_text'``.
        embeddings: Mapping ``tweet_id -> embedding`` (anything accepted by
            ``torch.tensor``).
        tokenizer: Stored on the instance; not used by ``__getitem__``.
        max_length: Stored on the instance; not used by ``__getitem__``.
    """

    def __init__(self, data_dict, embeddings, tokenizer, max_length=128):
        self.data_dict = data_dict
        print(f"Validation data size: {len(self.data_dict)}")
        # Snapshot the ids once: rebuilding list(dict.keys()) on every
        # __getitem__ call costs O(n) per sample.
        self.tweet_ids = list(self.data_dict.keys())
        self.embeddings = embeddings
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.tweet_ids)

    def __getitem__(self, idx):
        """Return embedding (float16 tensor), id and cleaned text of the ``idx``-th tweet."""
        tweet_id = self.tweet_ids[idx]
        tweet_info = self.data_dict[tweet_id]
        embedding = self.embeddings[tweet_id]

        tweet_text = preprocess_text(tweet_info['tweet_text'])

        return {
            "embedding": torch.tensor(embedding, dtype=torch.float16),
            "tweet_id": tweet_id,
            "tweet_text": tweet_text
        }

# Timeout support for the validation loop: SIGALRM is turned into a TimeoutException
class TimeoutException(Exception):
    """Raised by ``timeout_handler`` when the SIGALRM alarm goes off."""

def timeout_handler(signum, frame):
    """Signal handler that converts a delivered signal into a ``TimeoutException``.

    Both arguments are the standard handler parameters and are ignored.
    """
    raise TimeoutException()

# Install timeout_handler for SIGALRM so that an alarm armed with
# signal.alarm() surfaces as a TimeoutException in the running code.
signal.signal(signal.SIGALRM, timeout_handler)

def custom_validation_loop(model, dataloader, device, tokenizer, linear_layer, print_every_n_steps=10, generation_timeout=30):
    """Generate text from image embeddings for every validation batch.

    For every batch the embeddings are projected with ``linear_layer`` and fed
    to ``model.generate`` as ``inputs_embeds`` (one position per sample).
    Shapes and memory usage are printed along the way.

    Args:
        model: Model exposing ``eval()`` and ``generate(...)``.
        dataloader: Iterable of batches with keys ``"embedding"`` (tensor) and
            ``"tweet_text"`` (sequence of reference strings).
        device: Device the tensors are moved to before generation.
        tokenizer: Tokenizer providing ``pad_token_id``, ``eos_token_id`` and
            ``batch_decode``.
        linear_layer: Module projecting the embeddings to the model's hidden size.
        print_every_n_steps: Samples are printed for steps divisible by this value.
        generation_timeout: Whole seconds a single ``generate`` call may take
            before it is aborted through SIGALRM. Larger batches need a
            larger value. Batches that time out or fail are skipped.

    Returns:
        List of ``(reference_text, generated_text)`` tuples for every batch
        that was generated successfully.
    """
    model.eval()
    predictions = []
    for step, batch in enumerate(dataloader):
        print(f"Validation step {step}")
        print_memory_usage()
        embeddings = batch["embedding"].to(device)
        tweet_texts = batch["tweet_text"]

        print(f"Embeddings shape before linear layer: {embeddings.shape}")

        with torch.no_grad():
            # Project the image embeddings into the model's embedding space.
            embeddings = linear_layer(embeddings).to(device)
            print(f"Embeddings shape after linear layer: {embeddings.shape}")
            input_ids = torch.full((embeddings.size(0), 1), tokenizer.pad_token_id, dtype=torch.long, device=device)

            print(f"Input IDs shape: {input_ids.shape}")

            # generate() expects (batch, seq, hidden); add the sequence axis.
            if embeddings.dim() == 2:
                embeddings = embeddings.unsqueeze(1)
                print(f"Reshaped Embeddings shape: {embeddings.shape}")

            # Explicit all-ones mask: it cannot be inferred when the pad token
            # equals the EOS token, which otherwise triggers a warning.
            attention_mask = torch.ones(embeddings.shape[:2], dtype=torch.long, device=device)

            print("Checking memory before generation")
            print_memory_usage()

            try:
                # Arm the alarm; timeout_handler raises TimeoutException on expiry.
                signal.alarm(generation_timeout)

                outputs = model.generate(
                    input_ids=input_ids,
                    inputs_embeds=embeddings,
                    attention_mask=attention_mask,
                    max_length=256,  # Limit length to prevent excessively long texts
                    num_beams=2,
                    do_sample=True,
                    top_k=30,
                    top_p=0.95,
                    temperature=1.0,
                    repetition_penalty=2.0,  # Increase repetition penalty to avoid word repetitions
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id
                )
                print(f"Outputs shape: {outputs.shape}")
                decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            except TimeoutException:
                print("Generation timed out")
                continue
            except Exception as e:
                print(f"Error during generation: {e}")
                continue
            finally:
                # Always disarm the alarm, also on failure; otherwise a pending
                # SIGALRM could fire later outside of this try block.
                signal.alarm(0)

        for tweet_text, pred_text in zip(tweet_texts, decoded_outputs):
            predictions.append((tweet_text, pred_text))

            if step % print_every_n_steps == 0:
                print(f"Step {step} - Original: {tweet_text}")
                print(f"Step {step} - Generated: {pred_text}")
                print()

    return predictions

def print_memory_usage():
    """Print host RAM usage and, when CUDA is available, per-GPU memory statistics.

    Byte counts are divided by 1024**2 and labelled "MB" in the output.
    """
    bytes_per_mb = 1024 ** 2

    host = psutil.virtual_memory()
    print(f"Memory Usage: {host.percent}% used. {host.available / bytes_per_mb:.2f}MB available.")

    if not torch.cuda.is_available():
        return

    for gpu_index in range(torch.cuda.device_count()):
        reserved_mb = torch.cuda.memory_reserved(gpu_index) / bytes_per_mb
        peak_mb = torch.cuda.max_memory_allocated(gpu_index) / bytes_per_mb
        allocated_mb = torch.cuda.memory_allocated(gpu_index) / bytes_per_mb
        print(f"GPU {gpu_index} Memory Usage: {reserved_mb:.2f}MB reserved. {peak_mb:.2f}MB max allocated. {allocated_mb:.2f}MB currently allocated.")

# Function to train the model
def main_train_decoder():
    """Fine-tune Llama 3 8B Instruct with LoRA on the MMHS150K tweets (one epoch), then validate it.

    Steps, in order:
      1. Load the tweet annotations, the train/val id splits and the pickled
         image embeddings from the working directory.
      2. Load the 4-bit (NF4) quantized base model and its tokenizer, and wrap
         the model with LoRA adapters.
      3. Train with ``SFTTrainer`` (logging to Weights & Biases) and save the
         model and tokenizer through ``save_model_and_tokenizer``, so the
         result survives a crash during validation.
      4. Run ``custom_validation_loop`` on the validation split and print the
         resulting predictions.

    Credentials are read from the ``HF_TOKEN`` and ``WANDB_API_KEY``
    environment variables; the original placeholder strings remain as fallbacks.

    Validation is best-effort: a failure there is reported together with its
    traceback but is not re-raised.
    """
    import os
    import traceback

    base_path = "./"
    with open(f'{base_path}/MMHS150K_GT.json', 'r') as f:
        data = json.load(f)

    with open(f'{base_path}/splits/train_ids.txt', 'r') as f:
        id_train = f.read().split()
    with open(f'{base_path}/splits/val_ids.txt', 'r') as f:
        id_val = f.read().split()

    # Keep only the ids that actually have an annotation entry.
    dict_train = {x: data[x] for x in id_train if x in data}
    dict_val = {x: data[x] for x in id_val if x in data}

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open('image_embeddings.pkl', 'rb') as f:
        embeddings = pickle.load(f)

    # Never hard-code real credentials; prefer the environment variables.
    token = os.environ.get("HF_TOKEN", 'tu_token_hf')
    api_key = os.environ.get("WANDB_API_KEY", 'tu_api_key_wandb')

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    print_memory_usage()
    llama_model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Meta-Llama-3-8B-Instruct",
        quantization_config=bnb_config,
        device_map="auto",  # Keep automatic device placement
    )
    print("Model loaded.")
    print_memory_usage()

    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token=token)

    # Reuse the EOS token for padding when the tokenizer defines no pad token.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
    )
    llama_model = get_peft_model(llama_model, peft_config)

    train_dataset = TextDataset(dict_train, embeddings, tokenizer)
    val_dataset = ValidationDataset(dict_val, embeddings, tokenizer)

    # Only validation needs a hand-made DataLoader; the trainer receives the
    # training dataset directly.
    val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    # Add linear layer for projecting embeddings.
    # NOTE(review): this layer is never handed to the trainer, so it still has
    # its random initial weights when used for validation - confirm intended.
    linear_layer = nn.Linear(768, 4096).to('cuda').to(torch.float16)

    # Initialize wandb before training
    wandb.login(key=api_key)
    wandb.init(
        project='Fine-tune Llama 3 8B on Image Embeddings', 
        job_type="training", 
        anonymous="allow"
    )

    training_arguments = TrainingArguments(
        output_dir="llama-3-8b-meme-poster",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        gradient_accumulation_steps=8,
        optim="paged_adamw_32bit",
        num_train_epochs=1,
        evaluation_strategy="no",  # Disable automatic evaluation during training
        logging_steps=1,
        warmup_steps=10,
        logging_strategy="steps",
        learning_rate=5e-4,
        fp16=True,
        bf16=False,
        group_by_length=True,
        report_to="wandb"    
    )

    trainer = SFTTrainer(
        model=llama_model,
        train_dataset=train_dataset,
        eval_dataset=None,  # Disable automatic evaluation
        peft_config=peft_config,
        max_seq_length=256,
        dataset_text_field="tweet_text",
        tokenizer=tokenizer,
        args=training_arguments,
        packing=False,
    )

    print("Starting training.")
    print_memory_usage()
    trainer.train()
    print("Training finished.")
    print_memory_usage()

    # Persist the fine-tuned model and tokenizer before the (fragile)
    # validation step, matching the two-epoch version of this function.
    save_model_and_tokenizer(trainer, tokenizer)

    wandb.finish()

    # Custom validation loop  
    print("Custom validation loop") 
    print("device = cuda 0")
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 
    print("Move linear layer to device")
    linear_layer.to(device)  # Move the linear layer to the same device

    # Free cached GPU memory before validation
    torch.cuda.empty_cache()
    print("Memory cache cleared before validation.")
    print_memory_usage() 

    print("Now generate predictions")

    try:
        predictions = custom_validation_loop(llama_model, val_dataloader, device, tokenizer, linear_layer, print_every_n_steps=1)
        print(predictions)
    except Exception as exc:
        # Stay best-effort, but surface the actual cause instead of hiding it
        # (a bare except would also swallow KeyboardInterrupt).
        print("Error al generar predicciones")
        print(f"{type(exc).__name__}: {exc}")
        traceback.print_exc()
    

# Entry point: run training followed by validation when this code is executed
# as the main module.
if __name__ == "__main__":
    main_train_decoder()

2024-07-06 10:08:31.404453: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-06 10:08:31.497146: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[2024-07-06 10:08:33,464] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Memory Usage: 49.8% used. 129327.26MB available.
GPU 0 Memory Usage: 0.00MB reserved. 0.00MB max allocated. 0.00MB currently allocated.
GPU 1 Memory Usage: 0.00MB reserved. 0.00MB max allocated. 0.00MB currently allocated.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded.
Memory Usage: 50.1% used. 128417.68MB available.
GPU 0 Memory Usage: 1862.00MB reserved. 1955.44MB max allocated. 1860.59MB currently allocated.
GPU 1 Memory Usage: 3668.00MB reserved. 3694.41MB max allocated. 3578.44MB currently allocated.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Training data size: 134823
Validation data size: 5000


[34m[1mwandb[0m: Currently logged in as: [33mjsantamariag[0m ([33mj-santamariag[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/javiermo/.netrc



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Starting training.
Memory Usage: 50.2% used. 128274.27MB available.
GPU 0 Memory Usage: 1902.00MB reserved. 1955.44MB max allocated. 1900.59MB currently allocated.
GPU 1 Memory Usage: 3788.00MB reserved. 3716.46MB max allocated. 3704.45MB currently allocated.


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Step,Training Loss
1,5.7684
2,5.6615
3,5.5218
4,5.4209
5,4.9991
6,4.5548
7,4.6395
8,4.0991
9,4.0901
10,3.7612


Training finished.
Memory Usage: 54.0% used. 118516.11MB available.
GPU 0 Memory Usage: 8066.00MB reserved. 8039.13MB max allocated. 1954.84MB currently allocated.
GPU 1 Memory Usage: 22588.00MB reserved. 21963.49MB max allocated. 3834.70MB currently allocated.


VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/grad_norm,█▂▂▁▁▂▁▂▂▂▃▃▂▂▂▂▂▃▁▂▁▂▁▁▂▁▂▁▂▂▂▂▂▂▂▂▁▁▂▁
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,█▆▅▅▅▄▄▄▄▄▅▅▃▄▄▄▄▄▄▃▂▅▄▄▄▃▄▃▃▃▄▃▂▃▃▁▂▃▃▃

0,1
total_flos,7.812073095891517e+17
train/epoch,0.99964
train/global_step,1053.0
train/grad_norm,0.76946
train/learning_rate,0.0
train/loss,2.7698
train_loss,2.92178
train_runtime,13611.7741
train_samples_per_second,9.905
train_steps_per_second,0.077


Custom validation loop
device = cuda 0
Move linear layer to device


The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Memory cache cleared before validation.
Memory Usage: 54.0% used. 118471.71MB available.
GPU 0 Memory Usage: 1990.00MB reserved. 8039.13MB max allocated. 1960.85MB currently allocated.
GPU 1 Memory Usage: 4086.00MB reserved. 21963.49MB max allocated. 3828.69MB currently allocated.
Now generate predictions
Validation step 0
Memory Usage: 54.0% used. 118471.71MB available.
GPU 0 Memory Usage: 1990.00MB reserved. 8039.13MB max allocated. 1960.85MB currently allocated.
GPU 1 Memory Usage: 4086.00MB reserved. 21963.49MB max allocated. 3828.69MB currently allocated.
Embeddings shape before linear layer: torch.Size([32, 768])
Embeddings shape after linear layer: torch.Size([32, 4096])
Input IDs shape: torch.Size([32, 1])
Reshaped Embeddings shape: torch.Size([32, 1, 4096])
Checking memory before generation
Memory Usage: 54.0% used. 118471.71MB available.
GPU 0 Memory Usage: 1990.00MB reserved. 8039.13MB max allocated. 1961.10MB currently allocated.
GPU 1 Memory Usage: 4086.00MB reserved. 2196

: 

Ha habido un pequeño error: se duplicó el batch size sin incrementar el tiempo máximo de generación, por lo que no le da tiempo a generar los textos.


Además, el kernel ha crasheado en el proceso, pero se guardó el checkpoint del modelo.

Vamos a ajustar el tiempo máximo de generación y ejecutar la validación de nuevo:

# 🔴 Último intento de mejorar resultados

In [1]:
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge


import os
from datetime import datetime

def save_model_and_tokenizer(trainer, tokenizer):
    """Persist the trained model and its tokenizer into a fresh timestamped folder.

    The target folder is ``saved-finetuned-llamas/<YYYYmmdd-HHMMSS>`` (relative
    to the current working directory) and is created when missing.

    Args:
        trainer: Object exposing ``model.save_pretrained(path)``.
        tokenizer: Object exposing ``save_pretrained(path)``.
    """
    # One folder per call, named after the moment the save was requested.
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    target_dir = os.path.join("saved-finetuned-llamas", timestamp)
    os.makedirs(target_dir, exist_ok=True)

    # Model first, then tokenizer, both written to the same folder.
    for artifact in (trainer.model, tokenizer):
        artifact.save_pretrained(target_dir)
    

# Expose only physical GPUs 1 and 2 to this process.
# NOTE(review): this is set before `import torch` below; the variable is
# expected to be in place before CUDA is initialised — keep this order.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"

import torch

# PyTorch renumbers the visible devices from 0, so physical GPU 1 becomes
# index 0 and physical GPU 2 becomes index 1.
print(torch.cuda.device_count())  # Expected to print 2
# Iterate over whatever is visible instead of hard-coding indices 0 and 1, so
# the cell does not raise when fewer than two devices are available.
for visible_index in range(torch.cuda.device_count()):
    print(torch.cuda.get_device_name(visible_index))

2
NVIDIA GeForce RTX 3090
NVIDIA GeForce RTX 3090


In [2]:
import re
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import (
    LoraConfig,
    get_peft_model,
)
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import wandb
from trl import SFTTrainer
import pickle
import json
import psutil
import signal
from rouge import Rouge  # NEW: Import ROUGE metric

# Preprocessing function
def preprocess_text(text):
    """Strip links from a tweet and anonymise user mentions.

    Every run of non-whitespace characters starting with ``http`` is deleted,
    then every ``@handle`` is replaced by the literal ``[USR]``. Whitespace left
    behind by a removed link is kept untouched.
    """
    substitutions = (
        (r"http\S+", ""),     # drop links
        (r"@\w+", "[USR]"),   # mask mentions
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Dataset class for training
class TextDataset(Dataset):
    """Training dataset pairing each tweet's image embedding with its tokenized text.

    Args:
        data_dict: Mapping ``tweet_id -> info``; each ``info`` must contain a
            ``'tweet_text'`` entry.
        embeddings: Mapping ``tweet_id -> embedding`` (anything accepted by
            ``torch.tensor``).
        tokenizer: Callable tokenizer that also exposes ``pad_token_id``.
        max_length: Length every text is padded/truncated to.
    """

    def __init__(self, data_dict, embeddings, tokenizer, max_length=128):
        self.data_dict = data_dict
        print(f"Training data size: {len(self.data_dict)}")
        self.embeddings = embeddings
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Snapshot the ids once. Rebuilding list(keys()) inside __getitem__
        # costs O(n) per sample, i.e. O(n^2) per epoch.
        self.tweet_ids = list(self.data_dict.keys())

    def __len__(self):
        return len(self.tweet_ids)

    def __getitem__(self, idx):
        """Return the training sample at position ``idx`` as a dict of tensors/strings."""
        tweet_id = self.tweet_ids[idx]
        tweet_info = self.data_dict[tweet_id]
        embedding = self.embeddings[tweet_id]

        tweet_text = preprocess_text(tweet_info['tweet_text'])
        inputs = self.tokenizer(tweet_text, return_tensors="pt", max_length=self.max_length, padding='max_length', truncation=True)

        # Labels are the tokenized text; the model inputs are the same ids
        # shifted right by one, with a pad token taking the first position.
        labels = inputs.input_ids.squeeze(0).clone()
        input_ids = inputs.input_ids.squeeze(0).clone()
        input_ids = torch.cat([torch.tensor([self.tokenizer.pad_token_id]), input_ids[:-1]])

        return {
            "embedding": torch.tensor(embedding, dtype=torch.float16),
            "input_ids": input_ids,
            # NOTE(review): this mask describes the *unshifted* ids, while
            # "input_ids" above is shifted by one — confirm that is intended.
            "attention_mask": inputs.attention_mask.squeeze(0),
            "labels": labels,
            "tweet_text": tweet_text,
            "inputs": inputs.input_ids.squeeze(0)
        }

# Dataset class for validation
class ValidationDataset(Dataset):
    """Validation dataset yielding each tweet's image embedding, id and reference text.

    Args:
        data_dict: Mapping ``tweet_id -> info``; each ``info`` must contain a
            ``'tweet_text'`` entry.
        embeddings: Mapping ``tweet_id -> embedding`` (anything accepted by
            ``torch.tensor``).
        tokenizer: Stored on the instance; not used when building samples.
        max_length: Stored on the instance; not used when building samples.
    """

    def __init__(self, data_dict, embeddings, tokenizer, max_length=128):
        self.data_dict = data_dict
        print(f"Validation data size: {len(self.data_dict)}")
        self.embeddings = embeddings
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Snapshot the ids once. Rebuilding list(keys()) inside __getitem__
        # costs O(n) per sample.
        self.tweet_ids = list(self.data_dict.keys())

    def __len__(self):
        return len(self.tweet_ids)

    def __getitem__(self, idx):
        """Return the validation sample at position ``idx``."""
        tweet_id = self.tweet_ids[idx]
        tweet_info = self.data_dict[tweet_id]
        embedding = self.embeddings[tweet_id]

        # The reference text gets the same cleaning as the training text.
        tweet_text = preprocess_text(tweet_info['tweet_text'])

        return {
            "embedding": torch.tensor(embedding, dtype=torch.float16),
            "tweet_id": tweet_id,
            "tweet_text": tweet_text
        }

# Custom validation utilities (bypassing the Trainer's DataCollator): per-text generation timeout helpers
class TimeoutException(Exception):
    """Raised by the SIGALRM handler when a text generation exceeds its time budget."""

def timeout_handler(signum, frame):
    """Signal handler that converts the received signal into a ``TimeoutException``.

    Both arguments are supplied by the ``signal`` machinery and are ignored.
    """
    raise TimeoutException()

# Install timeout_handler as the SIGALRM handler, so that an expired
# signal.alarm() raises TimeoutException in the code that is running.
# NOTE(review): SIGALRM is a POSIX signal and signal.signal() is expected to be
# called from the main thread — confirm this cell only runs in that setting.
signal.signal(signal.SIGALRM, timeout_handler)

def custom_validation_loop(model, dataloader, device, tokenizer, linear_layer, print_every_n_steps=10, generation_timeout=30):
    """Generate one text per validation sample from its projected image embedding.

    For every batch the embeddings are projected with ``linear_layer`` and each
    sample is decoded on its own with ``model.generate`` under a SIGALRM-based
    time limit. A sample that times out or fails yields an empty string. After
    the last batch, ROUGE is computed over the pairs whose prediction and
    reference are both non-empty, and the result is printed.

    Relies on a SIGALRM handler that raises ``TimeoutException`` being installed.

    Args:
        model: Model exposing ``eval()`` and ``generate(...)``.
        dataloader: Iterable of batches with keys ``"embedding"`` and ``"tweet_text"``.
        device: Device the embeddings and prompt ids are moved to.
        tokenizer: Provides ``pad_token_id``, ``eos_token_id`` and ``decode``.
        linear_layer: Module applied to the embeddings before generation.
        print_every_n_steps: Original/generated pairs are printed on steps
            divisible by this value; a falsy value disables the printing.
        generation_timeout: Whole seconds allowed per generated text
            (30 keeps the previously hard-coded limit).

    Returns:
        list[tuple[str, str]]: ``(original_text, generated_text)`` for every sample.
    """
    model.eval()
    predictions = []
    for step, batch in enumerate(dataloader):
        print(f"Validation step {step}")
        print_memory_usage()
        embeddings = batch["embedding"].to(device)
        tweet_texts = batch["tweet_text"]

        print(f"Embeddings shape before linear layer: {embeddings.shape}")

        decoded_outputs = []
        with torch.no_grad():
            embeddings = linear_layer(embeddings).to(device)  # Project the embeddings
            print(f"Embeddings shape after linear layer: {embeddings.shape}")
            input_ids = torch.full((embeddings.size(0), 1), tokenizer.pad_token_id, dtype=torch.long).to(device)

            print(f"Input IDs shape: {input_ids.shape}")

            # Add a sequence dimension of length 1 when the projection returned (batch, dim).
            if len(embeddings.shape) == 2:
                embeddings = embeddings.unsqueeze(1)
                print(f"Reshaped Embeddings shape: {embeddings.shape}")

            print("Checking memory before generation")
            print_memory_usage()

            for i in range(embeddings.size(0)):
                sample_embeds = embeddings[i].unsqueeze(0)
                # Explicit all-ones mask: pad and eos may be the same token, in
                # which case the mask cannot be inferred by generate().
                sample_mask = torch.ones(sample_embeds.shape[:2], dtype=torch.long, device=sample_embeds.device)
                try:
                    # Arm the per-text time limit.
                    signal.alarm(generation_timeout)

                    output = model.generate(
                        input_ids=input_ids[i].unsqueeze(0),
                        inputs_embeds=sample_embeds,
                        attention_mask=sample_mask,
                        max_length=256,  # Limit length to prevent excessively long texts
                        num_beams=2,
                        do_sample=True,
                        top_k=30,
                        top_p=0.95,
                        temperature=1.0,
                        repetition_penalty=2.0,  # Increase repetition penalty to avoid word repetitions
                        pad_token_id=tokenizer.pad_token_id,
                        eos_token_id=tokenizer.eos_token_id
                    )
                    print(f"Output shape: {output.shape}")
                    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
                except TimeoutException:
                    print("Generation timed out for one text")
                    decoded_output = ""
                except Exception as e:
                    print(f"Error during generation for one text: {e}")
                    decoded_output = ""
                finally:
                    # Always disarm the alarm; otherwise a failed generation
                    # leaves it pending and it fires later, outside this try.
                    signal.alarm(0)

                decoded_outputs.append(decoded_output)

        for tweet_text, pred_text in zip(tweet_texts, decoded_outputs):
            predictions.append((tweet_text, pred_text))

            if print_every_n_steps and step % print_every_n_steps == 0:
                print(f"Step {step} - Original: {tweet_text}")
                print(f"Step {step} - Generated: {pred_text}")
                print()

    # Score only pairs where both sides have content: timeouts/errors produce
    # empty predictions, and the ROUGE scorer rejects empty input.
    scorable = [(pred, ref) for ref, pred in predictions if pred.strip() and ref.strip()]
    if scorable:
        hypotheses = [pred for pred, _ in scorable]
        references = [ref for _, ref in scorable]
        try:
            rouge_result = Rouge().get_scores(hypotheses, references, avg=True)
            print(f"ROUGE scores: {rouge_result}")
        except Exception as e:
            # A metric failure must not discard the generated predictions.
            print(f"Could not compute ROUGE scores: {e}")
    else:
        print("No non-empty predictions to score; skipping ROUGE.")

    return predictions

def print_memory_usage():
    """Print host RAM usage followed by one line of CUDA memory stats per visible GPU.

    All sizes are reported in MB (bytes / 1024**2). The GPU lines are skipped
    when CUDA is not available.
    """
    bytes_per_mb = 1024 ** 2

    ram = psutil.virtual_memory()
    print(f"Memory Usage: {ram.percent}% used. {ram.available / bytes_per_mb:.2f}MB available.")

    if not torch.cuda.is_available():
        return

    for gpu_index in range(torch.cuda.device_count()):
        reserved_mb = torch.cuda.memory_reserved(gpu_index) / bytes_per_mb
        peak_allocated_mb = torch.cuda.max_memory_allocated(gpu_index) / bytes_per_mb
        allocated_mb = torch.cuda.memory_allocated(gpu_index) / bytes_per_mb
        print(f"GPU {gpu_index} Memory Usage: {reserved_mb:.2f}MB reserved. {peak_allocated_mb:.2f}MB max allocated. {allocated_mb:.2f}MB currently allocated.")

# Function to train the model
def main_train_decoder():
    """Fine-tune Llama 3 8B with LoRA on the tweet texts, save it, then run the custom validation loop.

    Steps, in order: load the MMHS150K annotations, the train/val id splits and
    the pickled image embeddings; load the 4-bit quantised base model and its
    tokenizer; wrap the model with LoRA adapters; train for one epoch with
    ``SFTTrainer`` (logging to Weights & Biases); save model and tokenizer; and
    finally generate validation predictions with ``custom_validation_loop``.

    Credentials are read from the ``HF_TOKEN`` and ``WANDB_API_KEY`` environment
    variables, falling back to the placeholder literals when they are unset.
    """
    import os
    import traceback

    base_path = "./"
    with open(f'{base_path}/MMHS150K_GT.json', 'r') as f:
        data = json.load(f)

    with open(f'{base_path}/splits/train_ids.txt', 'r') as f:
        id_train = f.read().split()
    with open(f'{base_path}/splits/val_ids.txt', 'r') as f:
        id_val = f.read().split()

    # Keep only the ids that actually have an annotation entry.
    dict_train = {x: data[x] for x in id_train if x in data}
    dict_val = {x: data[x] for x in id_val if x in data}

    # NOTE(review): pickle.load executes arbitrary code from the file — only
    # load an embeddings file that was produced locally / is trusted.
    with open('image_embeddings.pkl', 'rb') as f:
        embeddings = pickle.load(f)

    # Prefer credentials from the environment over literals in the notebook;
    # the original placeholders remain as fallback.
    token = os.environ.get("HF_TOKEN", 'tu_token_hf')
    api_key = os.environ.get("WANDB_API_KEY", 'tu_key_wandb')

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    print_memory_usage()
    llama_model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Meta-Llama-3-8B-Instruct",
        quantization_config=bnb_config,
        device_map="auto",  # Keep the automatic device mapping
    )
    print("Model loaded.")
    print_memory_usage()

    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token=token)

    # Reuse the EOS token for padding when the tokenizer defines no pad token.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
    )
    llama_model = get_peft_model(llama_model, peft_config)

    train_dataset = TextDataset(dict_train, embeddings, tokenizer)
    val_dataset = ValidationDataset(dict_val, embeddings, tokenizer)

    # NOTE(review): train_dataloader is never used below; the trainer receives
    # train_dataset directly.
    train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)  # Batch size used for validation

    # Linear layer projecting the image embeddings (768 -> 4096).
    # NOTE(review): this layer is never handed to the trainer in this function,
    # so validation appears to use its initial weights — confirm this is intended.
    linear_layer = nn.Linear(768, 4096).to('cuda').to(torch.float16)

    # Initialise wandb before training
    wandb.login(key=api_key)
    run = wandb.init(
        project='Fine-tune Llama 3 8B on Image Embeddings', 
        job_type="training", 
        anonymous="allow"
    )

    training_arguments = TrainingArguments(
        output_dir="llama-3-8b-meme-poster",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        gradient_accumulation_steps=8,
        optim="paged_adamw_32bit",
        num_train_epochs=1,
        evaluation_strategy="no",  # Disable automatic evaluation during training
        logging_steps=1,
        warmup_steps=10,
        logging_strategy="steps",
        learning_rate=5e-4,
        fp16=True,
        bf16=False,
        group_by_length=True,
        report_to="wandb"    
    )

    trainer = SFTTrainer(
        model=llama_model,
        train_dataset=train_dataset,
        eval_dataset=None,  # Disable automatic evaluation
        peft_config=peft_config,
        max_seq_length=256,
        dataset_text_field="tweet_text",
        tokenizer=tokenizer,
        args=training_arguments,
        packing=False,
    )

    print("Starting training.")
    print_memory_usage()
    trainer.train()
    print("Training finished.")
    print_memory_usage()

    # Save the model and the tokenizer before validation starts, so that a
    # failure during generation cannot lose the trained weights.
    save_model_and_tokenizer(trainer, tokenizer)

    wandb.finish()

    # Custom validation loop
    print("Custom validation loop") 
    print("device = cuda 0")
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 
    print("Move linear layer to device")
    linear_layer.to(device)  # Move the linear layer to the same device

    # Release cached GPU memory before validation
    torch.cuda.empty_cache()
    print("Memory cache cleared before validation.")
    print_memory_usage() 

    print("Now generate predictions")

    try:
        predictions = custom_validation_loop(llama_model, val_dataloader, device, tokenizer, linear_layer, print_every_n_steps=1)
        print(predictions)
    except Exception:
        # Report what failed instead of silently swallowing it; unlike a bare
        # `except:`, KeyboardInterrupt/SystemExit are allowed to propagate.
        print("Error al generar predicciones")
        traceback.print_exc()
    

# Run the whole pipeline (training, saving, custom validation) when this
# cell/script is executed directly.
if __name__ == "__main__":
    main_train_decoder()


2024-07-06 15:28:09.949316: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-06 15:28:10.026958: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[2024-07-06 15:28:11,457] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Memory Usage: 53.0% used. 121087.41MB available.
GPU 0 Memory Usage: 0.00MB reserved. 0.00MB max allocated. 0.00MB currently allocated.
GPU 1 Memory Usage: 0.00MB reserved. 0.00MB max allocated. 0.00MB currently allocated.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded.
Memory Usage: 53.3% used. 120395.90MB available.
GPU 0 Memory Usage: 1862.00MB reserved. 1955.44MB max allocated. 1860.59MB currently allocated.
GPU 1 Memory Usage: 3668.00MB reserved. 3694.41MB max allocated. 3578.44MB currently allocated.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Training data size: 134823
Validation data size: 5000


[34m[1mwandb[0m: Currently logged in as: [33mjsantamariag[0m ([33mj-santamariag[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/javiermo/.netrc



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Starting training.
Memory Usage: 53.3% used. 120283.44MB available.
GPU 0 Memory Usage: 1902.00MB reserved. 1955.44MB max allocated. 1900.59MB currently allocated.
GPU 1 Memory Usage: 3788.00MB reserved. 3716.46MB max allocated. 3704.45MB currently allocated.


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Step,Training Loss
1,5.7684
2,5.6615
3,5.5194
4,5.4073
5,4.9717
6,4.527
7,4.6162
8,4.2385
9,4.4413
10,3.8714


Training finished.
Memory Usage: 18.6% used. 209565.66MB available.
GPU 0 Memory Usage: 8066.00MB reserved. 8039.13MB max allocated. 1954.84MB currently allocated.
GPU 1 Memory Usage: 22588.00MB reserved. 21963.49MB max allocated. 3834.70MB currently allocated.


VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/grad_norm,█▂▂▁▁▁▁▁▂▁▂▂▁▂▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,█▅▅▅▅▄▄▄▄▄▅▅▃▄▄▄▄▄▄▃▁▅▃▄▃▃▄▃▃▃▄▃▂▃▃▁▁▃▃▃

0,1
total_flos,7.812073095891517e+17
train/epoch,0.99964
train/global_step,1053.0
train/grad_norm,0.76773
train/learning_rate,0.0
train/loss,2.7693
train_loss,2.92365
train_runtime,13570.9575
train_samples_per_second,9.935
train_steps_per_second,0.078


Custom validation loop
device = cuda 0
Move linear layer to device


The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Memory cache cleared before validation.
Memory Usage: 18.6% used. 209614.32MB available.
GPU 0 Memory Usage: 1990.00MB reserved. 8039.13MB max allocated. 1960.85MB currently allocated.
GPU 1 Memory Usage: 4086.00MB reserved. 21963.49MB max allocated. 3828.69MB currently allocated.
Now generate predictions
Validation step 0
Memory Usage: 18.6% used. 209614.32MB available.
GPU 0 Memory Usage: 1990.00MB reserved. 8039.13MB max allocated. 1960.85MB currently allocated.
GPU 1 Memory Usage: 4086.00MB reserved. 21963.49MB max allocated. 3828.69MB currently allocated.
Embeddings shape before linear layer: torch.Size([32, 768])
Embeddings shape after linear layer: torch.Size([32, 4096])
Input IDs shape: torch.Size([32, 1])
Reshaped Embeddings shape: torch.Size([32, 1, 4096])
Checking memory before generation
Memory Usage: 18.6% used. 209614.32MB available.
GPU 0 Memory Usage: 1990.00MB reserved. 8039.13MB max allocated. 1961.10MB currently allocated.
GPU 1 Memory Usage: 4086.00MB reserved. 2196