In [1]:
%pip install transformers==4.37.0 -q
%pip install -U accelerate bitsandbytes -q

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# check if there are gpu available
!nvidia-smi

Tue Apr 30 12:25:54 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.171.04             Driver Version: 535.171.04   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        Off | 00000000:18:00.0 Off |                  N/A |
| 65%   65C    P2             342W / 350W |   4587MiB / 24576MiB |     96%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 3090        Off | 00000000:1C:00.0 Off |  

In [3]:
import transformers
import torch

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
HF_TOKEN = "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"
CACHE_DIR = "/home/inaki/.cache/huggingface/hub"

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=HF_TOKEN,
    cache_dir=CACHE_DIR
)

    # initialize the model
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    cache_dir=CACHE_DIR,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=HF_TOKEN
)

tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=HF_TOKEN,
    cache_dir=CACHE_DIR
)

model.eval()

pipeline = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    #return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    temperature=0.1,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=512,  # mex number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


#### Probamos a traducir una frase

In [19]:
%time
messages = [
    {"role": "system", "content": "you are a professional translator who translates phrases from Spanish to English on demand. You only answere the translation without any greetings and farewells."},
    {"role": "user", "content": "me llamo iñaki y mi comida favorita son los arroces de marisco"},
]

prompt = pipeline.tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True
)

terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

outputs = pipeline(
    prompt,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.2,
    top_p=0.9,
)
print(outputs[0]["generated_text"][len(prompt):])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


CPU times: user 11 µs, sys: 3 µs, total: 14 µs
Wall time: 31 µs
My name is Iñaki and my favorite food is seafood paella.


#### probamos a pedirle una variación de la frase

In [24]:
messages = [
    {"role": "system", "content": "You are a professional writer who provides a modification of the given sentence on demand. It is very important to keep the general meaning of the sentence, so in general only words or expressions should be replaced by synonyms, the more changes the better. Additionally, it is also advisable to change the style of writing. It is very important that You only respond the modified sentence, without any greetings and farewells."},
    {"role": "user", "content": "My name is Iñaki and my favorite food is seafood paella."},
]

prompt = pipeline.tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True
)

terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

outputs = pipeline(
    prompt,
    max_new_tokens=512,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.2,
    top_p=0.9,
)
print(outputs[0]["generated_text"][len(prompt):])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


I'm Ignatius, but my friends call me Iñaki, and my gastronomic weakness is undoubtedly a succulent serving of seafood paella.


#### Hacemos data augmentation con el dataset original en ingles

In [28]:
import tqdm
import time

# load dataset train_es_data.pth
datadir = "/home/inaki/host_data/dataset_oppositional/"
train_en_dataset_path = datadir + "train_en_data.pth"
train_en_dataset = torch.load(train_en_dataset_path)
system_prompt = "You are a professional writer who provides a modification of the given sentence on demand. It is very important to keep the general meaning of the sentence, so in general only words or expressions should be replaced by synonyms, the more changes the better. Additionally, it is also advisable to change the style of writing. It is very important that You only respond the modified sentence, without any greetings and farewells."
variations_number = 2

# iterate over the dataset and translate the spanish sentences to english
augmented = []
labels = []

start_time = time.time()
pbar = tqdm.tqdm(range(len(train_en_dataset[0])))
for i in pbar:
    for j in range(variations_number):
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": train_en_dataset[0][i]},
        ]

        prompt = pipeline.tokenizer.apply_chat_template(
                messages, 
                tokenize=False, 
                add_generation_prompt=True
        )

        terminators = [
            pipeline.tokenizer.eos_token_id,
            pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
        ]

        outputs = pipeline(
            prompt,
            max_new_tokens=512,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.2,
            top_p=0.9,
        )
        # print(train_en_dataset[0][i])
        # print(outputs[0]["generated_text"][len(prompt):])
        # print("--------------------")
        augmented += [outputs[0]["generated_text"][len(prompt):]]
        labels += [train_en_dataset[1][i]]
    elapsed_time = time.time() - start_time
    remaining_time = elapsed_time / (i+1) * (len(train_en_dataset[0]) - (i+1))
    pbar.set_postfix({'estimated time remaining': f'{remaining_time:.2f} seconds'})

# save new dataset
new_dataset_path = datadir + f"train_en_data_AUG_1-{variations_number}.pth"
torch.save([augmented, labels], new_dataset_path)

  0%|          | 0/3600 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 1/3600 [00:06<6:55:07,  6.92s/it, estimated time remaining=24914.18 seconds]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 2/3600 [00:36<20:04:42, 20.09s/it, estimated time remaining=65178.15 seconds]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 3/3600 [00:44<14:27:12, 14.47s/it, estimated time remaining=52760.21 seconds]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 4/3600 [00:52<12:13:12, 12.23s/it, estimated time remaining=47481.51 seconds]S

#### Traducimos el dataset español a ingles y lo guardamos

In [17]:
import tqdm

# load dataset train_es_data.pth
datadir = "/home/iñaki/host_data/dataset_oppositional/"
train_es_dataset_path = datadir + "train_es_data.pth"
train_es_dataset = torch.load(train_es_dataset_path)

# iterate over the dataset and translate the spanish sentences to english
translated = []
for i in tqdm.tqdm(range(len(train_es_dataset[0]))):
    messages = [
        {"role": "system", "content": "you are a professional translator who translates phrases from Spanish to English on demand. You only answere the translation without any greetings and farewells."},
        {"role": "user", "content": train_es_dataset[0][i]},
    ]

    prompt = pipeline.tokenizer.apply_chat_template(
            messages, 
            tokenize=False, 
            add_generation_prompt=True
    )

    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = pipeline(
        prompt,
        max_new_tokens=512,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.1,
        top_p=0.9,
    )
    print(train_es_dataset[0][i])
    print(outputs[0]["generated_text"][len(prompt):])
    print("--------------------")
    translated += [outputs[0]["generated_text"][len(prompt):]]
    break

  0%|          | 0/3600 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 0/3600 [08:54<?, ?it/s]

El G7 realiza una simulación de “ Pandemia de Viruela de leopardo ” : los ministros de salud se reúnen “ contra la próxima crisis ” Los ministros de salud del G7 ya están practicando “ la próxima pandemia ” . En Berlín , realizan una simulación del curso de una pandemia de viruela en 2023 , según informa el periódico alemán Bild . Cabe recordar que el mismo escenario futuro , en base esta vez a una modificación de la viruela , viene siendo anunciado por el magnate Bill Gates .   | https :// trikooba . blog / 45408 . html   Síguenos en :   Nuevo TELEGRAM : https :// t . me / trikooba2022   Nuevo FACEBOOK : https :// bit . ly / 38b1CEr INSTAGRAM : instagram . com / trikooba TWITTER : twitter . com / 3Kooba _ com MEWE : bit . ly / 3dxxenE VK : vk . com / trikoobanews 
The G7 is conducting a simulation of a "Leopard Virus Pandemic": health ministers meet "against the next crisis". The G7 health ministers are already practicing "the next pandemic". In Berlin, they are conducting a simulatio




In [20]:
# save new dataset
train_es_translated_data_path = datadir + "train_es-translated_data.pth"
torch.save([translated, train_es_dataset[1]], train_es_translated_data_path)

In [None]:
from IPython.display import display_html
display_html("<script>Jupyter.notebook.kernel.restart()</script>", raw=True)