In [2]:
from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets
from transformers import (
    AutoModelForCausalLM,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)

from peft import get_peft_model, LoraConfig, TaskType
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset_old = load_dataset("KomeijiForce/Text2Emoji")

ROW_NUMBER = dataset_old.num_rows["train"]
VALIDATION_SIZE = (ROW_NUMBER * 20) // 100

In [3]:
sliced_train_dataset = dataset_old["train"].select(range(ROW_NUMBER))

dataset = dataset_old.copy()
dataset["train"] = sliced_train_dataset
dataset

{'train': Dataset({
     features: ['text', 'emoji', 'topic'],
     num_rows: 503687
 })}

In [4]:
dct = {"text": [], "emoji": [], "topic": []}
with open("./data/gpt_translate_2.txt", "r", encoding="utf-8") as file:
    for line in file:
        inx = line.rfind(",")
        text, emoji = line[:inx], line[inx + 1 :]
        dct["text"].append(text.strip())
        dct["emoji"].append(emoji.strip())
        dct["topic"].append("None")

In [5]:
extension_data = Dataset.from_dict(dct)
extended_dataset = concatenate_datasets([extension_data, dataset['train']])
dataset['train'] = extended_dataset
dataset

{'train': Dataset({
     features: ['text', 'emoji', 'topic'],
     num_rows: 555314
 })}

In [6]:
NEW_ROW = dataset["train"].num_rows
VALIDATION = (NEW_ROW * 20) // 100

In [3]:
tokenizer_input = GPT2Tokenizer.from_pretrained("gpt2")



In [8]:
def transform_features(example):
    if not all(value is not None for value in example.values()):
        return {
            "full_text": None,
            "prompt_len": 0,
        }
    prompt = example["text"].strip() + " ->"
    target = example["emoji"].strip()
    full_text = prompt + " " + target
    return {
        "full_text": full_text,
        "prompt_len": len(tokenizer_input(full_text)["input_ids"]),
    }


transformed_dataset = dataset["train"].map(
    transform_features, remove_columns=["text", "emoji", "topic"]
)


In [9]:
train_test_split = transformed_dataset.shuffle(seed=42).train_test_split(
    test_size=VALIDATION
)

final_data = DatasetDict(
    {
        "train": train_test_split["train"].select(range(NEW_ROW - VALIDATION)),
        "validation": train_test_split["test"],
    }
)

print(final_data)

DatasetDict({
    train: Dataset({
        features: ['full_text', 'prompt_len'],
        num_rows: 444252
    })
    validation: Dataset({
        features: ['full_text', 'prompt_len'],
        num_rows: 111062
    })
})


In [10]:
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [11]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [12]:
tokenizer_input.pad_token = tokenizer_input.eos_token
model.config.pad_token_id = tokenizer_input.pad_token_id

In [13]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

model = get_peft_model(model, lora_config)



In [14]:
model.print_trainable_parameters()

trainable params: 811,008 || all params: 125,250,816 || trainable%: 0.6475


In [15]:
final_data = final_data.filter(
    lambda example: all(value is not None for value in example.values())
)

Filter: 100%|██████████| 444252/444252 [00:03<00:00, 121651.16 examples/s]
Filter: 100%|██████████| 111062/111062 [00:01<00:00, 109039.61 examples/s]


In [16]:
def tokenize_dataset(sample):
    max_length = 64
    tokenized = tokenizer_input(
        sample["full_text"],
        padding="max_length",
        max_length=max_length,
        truncation=True,
        return_attention_mask=False,
    )["input_ids"]

    return {"tokenized": tokenized}


In [17]:
shuffled_dataset = final_data.shuffle(seed=42)
tokenized_dataset = shuffled_dataset.map(tokenize_dataset, batched=True)

Map: 100%|██████████| 444249/444249 [01:39<00:00, 4452.67 examples/s]
Map: 100%|██████████| 111060/111060 [00:22<00:00, 4976.34 examples/s]


In [18]:
training_args = TrainingArguments(
    output_dir="./gpt_finetuned",
    eval_strategy="steps",
    per_device_train_batch_size=128,
    per_device_eval_batch_size=32,
    logging_steps=1000,
    save_steps=500,
    eval_steps=8000,
    warmup_steps=2000,
    overwrite_output_dir=True,
    save_total_limit=3,
    fp16=True,
    logging_first_step=True,
    num_train_epochs=3,
)


data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer_input,
    mlm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"]["tokenized"],
    eval_dataset=tokenized_dataset["validation"]["tokenized"],
    tokenizer=tokenizer_input,
    data_collator=data_collator,
)


trainer.train()

  0%|          | 0/10413 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
  0%|          | 1/10413 [00:01<3:01:22,  1.05s/it]

{'loss': 4.4222, 'grad_norm': 0.43945491313934326, 'learning_rate': 2.5000000000000002e-08, 'epoch': 0.0}


 10%|▉         | 1000/10413 [09:39<1:37:54,  1.60it/s]

{'loss': 3.7197, 'grad_norm': 0.3509596586227417, 'learning_rate': 2.5e-05, 'epoch': 0.29}


 19%|█▉        | 2000/10413 [20:10<1:28:32,  1.58it/s]

{'loss': 3.046, 'grad_norm': 0.4644668400287628, 'learning_rate': 5e-05, 'epoch': 0.58}


 29%|██▉       | 3000/10413 [29:37<1:05:27,  1.89it/s]

{'loss': 2.8834, 'grad_norm': 0.5349088907241821, 'learning_rate': 4.405681683109474e-05, 'epoch': 0.86}


 38%|███▊      | 4000/10413 [38:28<56:27,  1.89it/s]  

{'loss': 2.8059, 'grad_norm': 0.6350641250610352, 'learning_rate': 3.811363366218947e-05, 'epoch': 1.15}


 48%|████▊     | 5000/10413 [47:18<47:48,  1.89it/s]  

{'loss': 2.7574, 'grad_norm': 0.654656708240509, 'learning_rate': 3.21704504932842e-05, 'epoch': 1.44}


 58%|█████▊    | 6000/10413 [56:08<38:51,  1.89it/s]  

{'loss': 2.7234, 'grad_norm': 0.7149441242218018, 'learning_rate': 2.622726732437894e-05, 'epoch': 1.73}


 67%|██████▋   | 7000/10413 [1:04:58<30:05,  1.89it/s]

{'loss': 2.6997, 'grad_norm': 0.763678252696991, 'learning_rate': 2.0284084155473674e-05, 'epoch': 2.02}


 77%|███████▋  | 8000/10413 [1:13:48<21:15,  1.89it/s]

{'loss': 2.683, 'grad_norm': 0.702614426612854, 'learning_rate': 1.4340900986568407e-05, 'epoch': 2.3}


                                                      
 77%|███████▋  | 8000/10413 [1:17:21<21:15,  1.89it/s]

{'eval_loss': 2.5277597904205322, 'eval_runtime': 212.7242, 'eval_samples_per_second': 522.084, 'eval_steps_per_second': 16.317, 'epoch': 2.3}


 86%|████████▋ | 9000/10413 [1:26:13<12:26,  1.89it/s]   

{'loss': 2.6714, 'grad_norm': 0.6536557674407959, 'learning_rate': 8.403661000832046e-06, 'epoch': 2.59}


 96%|█████████▌| 10000/10413 [1:34:59<03:36,  1.90it/s]

{'loss': 2.664, 'grad_norm': 0.70386803150177, 'learning_rate': 2.4664210150956854e-06, 'epoch': 2.88}


100%|██████████| 10413/10413 [1:38:37<00:00,  1.76it/s]

{'train_runtime': 5917.4768, 'train_samples_per_second': 225.222, 'train_steps_per_second': 1.76, 'train_loss': 2.8574056418384175, 'epoch': 3.0}





TrainOutput(global_step=10413, training_loss=2.8574056418384175, metrics={'train_runtime': 5917.4768, 'train_samples_per_second': 225.222, 'train_steps_per_second': 1.76, 'total_flos': 4.394457496741478e+16, 'train_loss': 2.8574056418384175, 'epoch': 3.0})

In [5]:
save_directory = f"./model_gpt_{503687}_emoji"

In [None]:

if not os.path.exists(save_directory):
    os.mkdir(save_directory)
model.save_pretrained(save_directory)

In [6]:
device = "cuda"
model = AutoModelForCausalLM.from_pretrained(save_directory).to(device)

In [17]:
input_text = "Travelling across the globe."
inputs = tokenizer_input(input_text, return_tensors="pt").to(device)

outputs = model.generate(
    inputs.input_ids,
    max_length=40,
    temperature=0.7,
    do_sample=True,
    pad_token_id=tokenizer_input.pad_token_id,
)

output_ids = outputs[0].cpu().tolist()
generated_text = tokenizer_input.decode(output_ids, skip_special_tokens=True).strip()
print("Generated Output:")
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Output:
Travelling across the globe. The adrenaline rush is pure bliss! -> 🚶‍♀️🌤️🏃‍♀️💪�
