<a href="https://colab.research.google.com/github/jfcasasp/ProyectoNPL/blob/main/FineTuningSarcasmo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the libraries used below: bitsandbytes, datasets, accelerate and loralib from PyPI.
!pip install bitsandbytes datasets accelerate loralib
# transformers is installed from the `main` branch of its GitHub repository, peft from its repository's default branch.
!pip install git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git
# NOTE(review): accelerate is already requested by the first command in this cell.
!pip install accelerate
# NOTE(review): bitsandbytes is requested a second time, here from the TestPyPI index - confirm this is still needed.
!pip install -i https://test.pypi.org/simple/ bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.41.2.post2-py3-none-any.whl (92.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting loralib
  Downloading loralib-0.1.2-py3-none-any.whl (10 kB)
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.6 MB/s

In [None]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget so the push_to_hub cells further down can authenticate.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Hub id of the base causal language model that is loaded and adapted below.
model_id = 'bertin-project/bertin-gpt-j-6B'

In [None]:
import os
# Expose only GPU 0 to CUDA; this is set before torch is imported.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

# Load the base checkpoint; device_map='auto' leaves weight placement to the loader.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map='auto',
)

# Tokenizer published with the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Marker tokens: the formatting step later wraps every utterance as "<SC>" + text + "<EC>".
new_tokens = ["<SC>","<EC>"]

num_added_toks = tokenizer.add_tokens(new_tokens)
print("We have added", num_added_toks, "tokens")
# Note: resize_token_embeddings expects the full size of the new vocabulary, i.e. the length of the tokenizer.
model.resize_token_embeddings(len(tokenizer))

We have added 2 tokens


Embedding(50259, 4096)

In [None]:
# Freeze every weight of the base model; only adapters attached later are trained.
for base_param in model.parameters():
  base_param.requires_grad = False
  is_one_dimensional = base_param.ndim == 1
  if is_one_dimensional:
    # Small 1-D parameters (e.g. layernorm) are kept in fp32 for stability.
    base_param.data = base_param.data.to(torch.float32)

# Store fewer activations during the forward pass.
model.gradient_checkpointing_enable()
model.enable_input_require_grads()


class CastOutputToFloat(nn.Sequential):
  """Sequential container whose forward result is cast to float32."""

  def forward(self, x):
    wrapped_output = super().forward(x)
    return wrapped_output.to(torch.float32)


# Wrap the output head so its result is returned as float32.
model.lm_head = CastOutputToFloat(model.lm_head)

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs; each parameter must provide
            ``numel()`` and a ``requires_grad`` attribute.

    Prints one line with the trainable count, the total count and the
    trainable percentage. A model without any parameters reports 0.0 as
    the percentage instead of raising ZeroDivisionError.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio: an empty model would otherwise divide by zero.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA settings: r=16, alpha=32, dropout 0.05, applied to the modules named
# "q_proj" and "v_proj", with bias="none" and the causal-LM task type.
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# NOTE(review): every base parameter was frozen earlier and only q_proj/v_proj are
# targeted here, so the embedding rows of the added <SC>/<EC> tokens do not appear
# to be trained - confirm this is intended.
model = get_peft_model(model, config)
# Report how many parameters remain trainable after wrapping the model.
print_trainable_parameters(model)

trainable params: 7340032 || all params: 6057067603 || trainable%: 0.12118127914511903


In [None]:
import transformers

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the corpus file is read from there.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from datasets import load_dataset
from datasets import Dataset

In [None]:
# Read the tab-separated corpus from the mounted Drive folder.
data = pd.read_csv('/content/drive/MyDrive/sarcasmo.tsv', sep='\t')

In [None]:
# Keep only the rows whose 'Sarcasmo' column equals True.
df_filtered = data[data['Sarcasmo'] == True]

In [None]:
# Convert the filtered frame into a Hugging Face Dataset.
# NOTE(review): the recorded dataset summary lists an extra '__index_level_0__'
# feature, presumably the pandas index of the filtered frame - confirm it is wanted.
dataset = Dataset.from_pandas(df_filtered)

In [None]:
# Display the dataset summary (features and row count).
dataset

Dataset({
    features: ['Locutor', 'Locución', 'Sarcasmo', 'Utsumi', 'hipérbole', 'Haverkate', 'Grice', 'Nakamura', 'Acto de habla', '__index_level_0__'],
    num_rows: 90
})

In [None]:
def format_ds(example):
  """Surround the example's 'Locución' text with the <SC> and <EC> markers.

  The mapping passed in is updated in place and that same object is returned.
  """
  with_start_marker = "<SC>" + example['Locución']
  example["Locución"] = with_start_marker + "<EC>"
  return example

In [None]:
# Wrap every utterance with the marker tokens and drop the listed annotation columns.
dataset = dataset.map(format_ds, remove_columns=['Locutor','Sarcasmo', 'Utsumi', 'hipérbole', 'Haverkate', 'Grice', 'Nakamura', 'Acto de habla'])

Map:   0%|          | 0/959 [00:00<?, ? examples/s]

In [None]:
# Import kept so any other cell that relies on it keeps working; the split
# below uses the Dataset's own splitter instead.
from sklearn.model_selection import train_test_split
# Tokenize BEFORE splitting: the Trainer consumes X_train/X_test, so both halves
# must already carry the tokenizer's output columns (input_ids, attention_mask).
tokenized_dataset = dataset.map(lambda samples: tokenizer(samples['Locución']), batched=True)
# Dataset.train_test_split keeps both halves as datasets.Dataset objects, which is
# what transformers.Trainer expects for train_dataset / eval_dataset.
split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
X_train, X_test = split["train"], split["test"]
# Show the first formatted utterance as a sanity check.
dataset['Locución'][0]

'Claro, él que fuma junto a la niña'

In [None]:
# Tokenize the 'Locución' text in batches; the tokenizer's outputs are added to the dataset.
dataset = dataset.map(lambda samples: tokenizer(samples['Locución']), batched=True)

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

In [None]:
# Training hyper-parameters: 8 samples per device with 8 accumulation steps,
# 100 warm-up steps, 1000 optimisation steps in total, logging every 5 steps.
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,
    warmup_steps=100,
    max_steps=1000,
    learning_rate=2e-4,
    fp16=False,
    logging_steps=5,
    output_dir='outputs'
)

# mlm=False selects the causal-LM collator rather than masked-LM.
trainer = transformers.Trainer(
    model=model,
    train_dataset=X_train,
    eval_dataset=X_test,
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

# Disable the generation cache while training (it is not used with gradient
# checkpointing); re-enable it before running inference with this model object.
model.config.use_cache = False
# Actually run the fine-tuning: without this call the adapters uploaded by the
# push_to_hub cell below would still hold their initial values.
trainer.train()

In [None]:
# NOTE(review): peft has already been imported by an earlier cell; a version
# installed here is presumably not picked up until the runtime restarts - confirm this cell is still needed.
!pip install --upgrade peft



In [None]:
# Upload the adapter weights to the Hub. `token=True` reuses the credentials stored
# by the login step; it replaces the deprecated `use_auth_token` argument.
model.push_to_hub("patrejoss/bertin-gpt-j-6B-es-finetuned-sarcasm-spanish-1000", token=True)



adapter_model.safetensors:   0%|          | 0.00/29.4M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/patrejoss/bertin-gpt-j-6B-es-finetuned-sarcasm-spanish-1000/commit/5a6690d03f8bb3a5027d2a2a9f2e1e2f8566ab9a', commit_message='Upload model', commit_description='', oid='5a6690d03f8bb3a5027d2a2a9f2e1e2f8566ab9a', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Upload the tokenizer (including the added marker tokens) to the same repo.
# `token=True` reuses the stored login credentials; it replaces the deprecated `use_auth_token`.
tokenizer.push_to_hub("patrejoss/bertin-gpt-j-6B-es-finetuned-sarcasm-spanish-1000", token=True)

CommitInfo(commit_url='https://huggingface.co/patrejoss/bertin-gpt-j-6B-es-finetuned-sarcasm-spanish-1000/commit/b9cc9ea7c9fa37de06551ed2dd5e5111aa872830', commit_message='Upload tokenizer', commit_description='', oid='b9cc9ea7c9fa37de06551ed2dd5e5111aa872830', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

peft_model_id = "patrejoss/bertin-gpt-j-6B-es-finetuned-sarcasm-spanish-1000"

# Read the adapter configuration stored in the adapter repo. It records the LoRA
# hyper-parameters and the base checkpoint the adapter was trained on, so neither
# has to be re-typed (and possibly mistyped) here.
config = PeftConfig.from_pretrained(peft_model_id)

# Load the pretrained base language model named in the adapter configuration.
base_model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=False, device_map='auto')

# Load the tokenizer pushed alongside the adapter (it contains the added <SC>/<EC> tokens).
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)

# Mirror the training setup, where the embeddings were resized to the tokenizer length.
base_model.resize_token_embeddings(len(tokenizer))

# Attach the TRAINED adapter weights from the Hub; constructing PeftModel directly
# from a config would only create freshly initialised adapters.
model = PeftModel.from_pretrained(base_model, peft_model_id)

# Usage example (adjust to your needs).
input_text = "Ejemplo de texto para generar continuación."
# Keep the attention mask and place the inputs on the model's device.
inputs = tokenizer(input_text, return_tensors="pt").to(base_model.device)
# Pass the inputs as keyword arguments.
output = model.generate(**inputs)

# Print the generated continuation.
print(tokenizer.decode(output[0], skip_special_tokens=True))



In [None]:
def gen_sarcasm(text):
  """Generate a continuation of ``text`` and print it.

  The prompt is prefixed with the "<SC>" marker and generation stops at the
  "<EC>" marker or after 256 new tokens, whichever comes first. Uses the
  notebook-level ``tokenizer`` and ``model``.

  Args:
    text: prompt string (without the "<SC>" marker).

  Returns:
    None; the decoded sequence, special tokens included, is printed.
  """
  text = "<SC>" + text
  # Resolve the stop token from the tokenizer instead of hard-coding its id (50258),
  # so the function keeps working if the vocabulary layout changes.
  end_token_id = tokenizer.convert_tokens_to_ids("<EC>")
  # Put the encoded prompt on the same device as the model.
  batch = tokenizer(text, return_tensors='pt').to(model.device)
  with torch.cuda.amp.autocast():
    output_tokens = model.generate(**batch, max_new_tokens=256, eos_token_id=end_token_id)

  print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=False))

In [None]:
# Example prompt; gen_sarcasm prints the generated text.
text = "Te quiero como amigo"
gen_sarcasm(text)