In [1]:
! pip install accelerate -U
! pip install tokenizers
! pip install transformers datasets evaluate
!CUDA_LAUNCH_BLOCKING=1

Collecting accelerate
  Downloading accelerate-1.0.1-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.0.1-py3-none-any.whl (330 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m330.9/330.9 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.34.2
    Uninstalling accelerate-0.34.2:
      Successfully uninstalled accelerate-0.34.2
Successfully installed accelerate-1.0.1
Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multipr

In [2]:
from datasets import load_dataset, DatasetDict

In [3]:
# Download the poem-tweets corpus and carve out a 20% hold-out split.
poetry = load_dataset("jakartaresearch/poem-tweets", split="train")

# A fixed seed keeps the train/test partition (and therefore every row count
# and loss value further down) identical across Restart & Run All.
raw_datasets = poetry.train_test_split(test_size=0.2, seed=42)

raw_datasets

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/2.93k [00:00<?, ?B/s]

poem-tweets.py:   0%|          | 0.00/2.75k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16427 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'screen_name', 'text'],
        num_rows: 13141
    })
    test: Dataset({
        features: ['id', 'screen_name', 'text'],
        num_rows: 3286
    })
})

In [4]:
# Show every field of the first training example, one "FIELD: value" per line.
first_example = raw_datasets["train"][0]
for field, value in first_example.items():
    print(f"{field.upper()}: {value}")

ID: 1355307493501485058
SCREEN_NAME: Bait_Puisi
TEXT: Puisi itu kamu. https://t.co/ysNeXMAFeF


In [5]:
from transformers import AutoTokenizer

# Maximum number of tokens per chunk handed to the tokenizer below.
context_length = 32
tokenizer = AutoTokenizer.from_pretrained("flax-community/gpt2-small-indonesian")

# Tokenize the first two tweets to inspect how texts are cut into chunks of at
# most `context_length` tokens, with the overflow returned as extra chunks.
sample_texts = raw_datasets["train"][:2]["text"]
outputs = tokenizer(
    sample_texts,
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)

# Number of chunks, their token counts, and the source text each came from.
print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/207 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/467k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Input IDs length: 3
Input chunk lengths: [18, 32, 1]
Chunk mapping: [0, 1, 1]




In [6]:
def tokenize(element):
    """Tokenize a batch of texts and keep only full-length chunks.

    The texts in ``element["text"]`` are passed to the module-level
    ``tokenizer`` with truncation to ``context_length`` tokens and overflowing
    tokens returned as additional chunks. Chunks whose reported length differs
    from ``context_length`` are discarded.

    Args:
        element: Mapping with a ``"text"`` entry holding the texts to tokenize.

    Returns:
        A dict ``{"input_ids": [...]}`` containing the token-id lists of every
        chunk that is exactly ``context_length`` tokens long.
    """
    encoded = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    # Pair every chunk with its token count and drop the ones that are not full.
    full_chunks = [
        chunk_ids
        for chunk_len, chunk_ids in zip(encoded["length"], encoded["input_ids"])
        if chunk_len == context_length
    ]
    return {"input_ids": full_chunks}


# Apply `tokenize` batch-wise to both splits. The original columns are removed
# because the number of returned chunks differs from the number of input rows.
original_columns = raw_datasets["train"].column_names
tokenized_datasets = raw_datasets.map(
    tokenize,
    batched=True,
    remove_columns=original_columns,
)
tokenized_datasets

Map:   0%|          | 0/13141 [00:00<?, ? examples/s]

Map:   0%|          | 0/3286 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 9683
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 2428
    })
})

In [9]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

# Start from the published checkpoint's configuration and override the fields
# that depend on the tokenizer and chunk length used in this notebook.
# NOTE(review): recent GPT-2 configs appear to size the position embeddings
# from `n_positions`, not `n_ctx` - confirm that `n_ctx` has the intended effect.
config_overrides = dict(
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
config = AutoConfig.from_pretrained(
    "flax-community/gpt2-small-indonesian",
    **config_overrides,
)

In [10]:
# Build the model from the configuration object only (no `from_pretrained`
# call here), i.e. the architecture is taken from `config`.
# NOTE(review): presumably this means the weights start randomly initialised
# and the model is trained from scratch - confirm that is intended.
model = GPT2LMHeadModel(config)

In [11]:
from transformers import DataCollatorForLanguageModeling

# Reuse the end-of-sequence token for padding (presumably the tokenizer has no
# dedicated pad token - TODO confirm), then build a collator for causal
# language modelling (`mlm=False`, i.e. no masked-LM objective).
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [12]:
# Smoke-test the collator on the first five training chunks and report the
# shape of every tensor it returns.
sample_features = [tokenized_datasets["train"][idx] for idx in range(5)]
out = data_collator(sample_features)
for name, tensor in out.items():
    print(f"{name} shape: {tensor.shape}")

input_ids shape: torch.Size([5, 32])
attention_mask shape: torch.Size([5, 32])
labels shape: torch.Size([5, 32])


In [13]:
from transformers import Trainer, TrainingArguments

# Training hyper-parameters.
# Effective batch size per device is 32 * 8 = 256 sequences per optimizer step.
# NOTE(review): with the ~9.7k training chunks produced above and 4 epochs this
# is only about 150 optimizer steps, so a 100-step eval/log interval yields a
# single row and 100 warmup steps cover most of the run - consider lowering.
args = TrainingArguments(
    output_dir="puisi-ds",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`
    # argument (available in transformers >= 4.41).
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    gradient_accumulation_steps=8,
    num_train_epochs=4,
    weight_decay=0.1,
    warmup_steps=100,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    # Disable external experiment trackers here so this cell does not depend on
    # a separate, out-of-order cell to silence Weights & Biases.
    report_to="none",
)

# Wire the model, data and collator together; evaluation runs on the held-out
# "test" split.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)



In [8]:
# Initialise Weights & Biases in "disabled" mode.
# NOTE(review): this cell carries execution count 8 although it sits after the
# cells numbered 9-13, so it was run out of order; on a fresh Restart & Run All
# it executes only after the Trainer has been created - confirm that is fine
# or move it up next to the imports.
import wandb
wandb.init(mode="disabled")

In [14]:
%%time
trainer.train()
trainer.save_model('/content/puisi-ds')



Step,Training Loss,Validation Loss
100,6.504,5.064033


CPU times: user 5min 8s, sys: 4 s, total: 5min 12s
Wall time: 5min 32s


In [15]:
import torch
from transformers import pipeline

# Run generation on the GPU when one is available, otherwise on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Load the model saved under ./puisi-ds into a text-generation pipeline.
pipe = pipeline(
    "text-generation",
    model="./puisi-ds",
    device=device,
)

In [16]:
# Prompt the trained model and print the text of the first returned generation.
txt = "apa itu kecewa?"
generations = pipe(txt)
print(generations[0]["generated_text"])

apa itu kecewa?
dita-debu yang 
yang kau akan tumbuh, 
berjauh dan luka dalam.

#sajakmalam ada luka di sini kau datang dan aku, 
Sampaipuisi: Tuhan tak perlu
