In [46]:
from transformers import AutoTokenizer, AutoConfig, AutoModel, DistilBertTokenizer, GPT2LMHeadModel, GPT2Config, GPT2Model
from transformers import TrainingArguments, Trainer

from datasets import load_dataset
from datasets import Dataset
import pandas as pd
import re
import torch
import numpy as np
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from transformers_tutorial.networks.attention_head import MultiHeadAttention, FeedForward
import datetime

# Load data and preprocess

In [2]:
df_poem_raw = pd.json_normalize(pd.read_json("../data/verse_202412132333.json").iloc[:,0])

def preprocess(df):
    df_ = df.copy()
    diacritics_pattern = r'[\u064E\u064F\u0650\u0651\u0652\u0640]'
    
    df_['text'] = df_['text'].apply(lambda x: re.sub(diacritics_pattern, '', x))
    df_['verse_index'] = (df_['vorder']-1) // 2

    df_output = (
        df_.sort_values("position", ascending=True)
        .groupby(["poem_id", "verse_index"])["text"]
        .agg(lambda x: " - ".join(x.tolist())
            ).reset_index()
    )
    
    # df_output['text_reverse'] = df_output['text'].apply(lambda x: " ".join(reversed(x.strip().split(" "))))

    return df_output

In [3]:
df_prep = preprocess(df_poem_raw)

In [4]:
df_poem_raw.iloc[2:4]

Unnamed: 0,poem_id,vorder,position,text
2,700000,3,0,همچو شاهین به هوا جلوه کنان می گذرم
3,700000,4,1,تیزرو بالی و تازنده پری داده مرا


In [5]:
df_prep[['text']].iloc[1].values

array(['همچو شاهین به هوا جلوه کنان می گذرم - تیزرو  بالی و تازنده پری داده مرا'],
      dtype=object)

# Load tokenizer

In [6]:
tokenizer = AutoTokenizer.from_pretrained("mitra-mir/BERT-Persian-Poetry")

In [7]:
def encode_inputs(df_, max_length=64):
    return tokenizer(df_['text'].values.tolist(), 
                     padding='max_length', 
                     truncation=True, 
                     max_length=max_length, 
                     return_tensors="pt")

def decode_tokens(tokens_, skip_special_tokens=False):
    decoded = tokenizer.batch_decode(tokens_, skip_special_tokens=skip_special_tokens)
    return decoded

In [8]:
tokens = encode_inputs(df_prep)

Make sure that tokens orders are correct.

In [9]:
tokens['input_ids'][1], decode_tokens(tokens['input_ids'][1][:20]) #, decode_tokens(tokens['input_ids'][1:2])

(tensor([    2,  2164,  1112, 10880,  1923,  2595,  6618, 23051,  1924,  4479,
          1113,   120, 12679,  1112, 11976,   623,  4685,  2097,  6037,  2218,
          2426,     3,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]),
 ['[CLS]',
  'همچ',
  '##و',
  'شاهین',
  'به',
  'هوا',
  'جلوه',
  'کنان',
  'می',
  'گذر',
  '##م',
  '-',
  'تیزر',
  '##و',
  'بالی',
  'و',
  'تاز',
  '##نده',
  'پری',
  'داده'])

In [10]:
targets = torch.zeros_like(tokens['input_ids'])
targets[:, :-1] = tokens['input_ids'][:, 1:]

tokens['label'] = targets

## Generate training dataset

In [11]:
full_tokens = Dataset.from_dict(tokens)
full_tokens.set_format("pt")

In [12]:
full_tokens['input_ids'], full_tokens['label']

(tensor([[   2, 3656, 6916,  ...,    0,    0,    0],
         [   2, 2164, 1112,  ...,    0,    0,    0],
         [   2, 2063, 6079,  ...,    0,    0,    0],
         ...,
         [   2, 9910, 2441,  ...,    0,    0,    0],
         [   2, 5613, 8071,  ...,    0,    0,    0],
         [   2, 2143,  607,  ...,    0,    0,    0]]),
 tensor([[ 3656,  6916,  1932,  ...,     0,     0,     0],
         [ 2164,  1112, 10880,  ...,     0,     0,     0],
         [ 2063,  6079,  3156,  ...,     0,     0,     0],
         ...,
         [ 9910,  2441,  1932,  ...,     0,     0,     0],
         [ 5613,  8071,  1921,  ...,     0,     0,     0],
         [ 2143,   607, 34351,  ...,     0,     0,     0]]))

## Train / validation split

In [13]:
N_FULL_DATASET = targets.shape[0]
TRAIN_FRAC = 0.9
TRAIN_SIZE = int(N_FULL_DATASET * TRAIN_FRAC)

SEQ_LEN = full_tokens['input_ids'].shape[1]

In [34]:
train_data = Dataset.from_dict(full_tokens[:10])
validation_data = Dataset.from_dict(full_tokens[TRAIN_SIZE:])

In [35]:
validation_data.set_format("pt"), train_data.set_format("pt")

(None, None)

In [36]:
validation_data.shape, train_data.shape

((414, 4), (10, 4))

In [37]:
train_data

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    num_rows: 10
})

# Decoder Transformers

In [38]:
model = GPT2LMHeadModel(GPT2Config())
model.resize_token_embeddings(tokenizer.vocab_size)  # Adjust model vocab size

Embedding(42000, 768)

In [39]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(42000, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=42000, bias=False)
)

In [40]:
with torch.no_grad():
    _ = model(**{k: v for k,v in train_data[:2].items() if k in {"attention_mask", "input_ids"}})

In [41]:
_.logits.shape

torch.Size([2, 64, 42000])

# Training

In [42]:
device = "gpu" if torch.cuda.is_available() else "cpu"

In [43]:
training_args = TrainingArguments(
    output_dir="./results",
    logging_steps=10, 
    evaluation_strategy="epoch",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    learning_rate=1e-4,
    weight_decay=0.01,
)

In [44]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=train_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [45]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,3.240806


TrainOutput(global_step=3, training_loss=5.688831965128581, metrics={'train_runtime': 47.1551, 'train_samples_per_second': 0.212, 'train_steps_per_second': 0.064, 'total_flos': 326615040000.0, 'train_loss': 5.688831965128581, 'epoch': 1.0})

# Generate

In [None]:
def generate_text(model_, tokenizer_, seq_len_, initial_text, max_length = 100):
    output = initial_text.split(" ")
    print(initial_text, end=" ")

    for i in range(0, min(seq_len_, max_length)):
        current_text = " ".join(output)
        
        with torch.no_grad():
            tokens_ = tokenizer_(current_text, padding='max_length', truncation=True, max_length=seq_len_, return_tensors="pt")

            chosen_token = torch.softmax(model_(**tokens_).logits, dim=-1)[:, i, :].argmax()
            next_word = tokenizer_.decode(chosen_token, skip_special_tokens=False)
            print(next_word, end=" ")
            output += next_word
            
    return " ".join(output)

In [None]:
_ = generate_text(model, tokenizer, SEQ_LEN, "همچو")

# Save model

In [56]:
# trainer.save_model(f'./trained_model_{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}')