In [1]:
from transformers import AutoTokenizer, AutoConfig, AutoModel, DistilBertTokenizer
from datasets import load_dataset
from datasets import Dataset
import pandas as pd
import re
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from transformers_tutorial.networks.attention_head import MultiHeadAttention, FeedForward

# Load data and preprocess

In [2]:
df_poem_raw = pd.json_normalize(pd.read_json("../data/verse_202412132333.json").iloc[:,0])

def preprocess(df):
    df_ = df.copy()
    diacritics_pattern = r'[\u064E\u064F\u0650\u0651\u0652\u0640]'
    
    df_['text'] = df_['text'].apply(lambda x: re.sub(diacritics_pattern, '', x))
    df_['verse_index'] = (df_['vorder']-1) // 2

    df_output = (
        df_.sort_values("position", ascending=True)
        .groupby(["poem_id", "verse_index"])["text"]
        .agg(lambda x: " - ".join(x.tolist())
            ).reset_index()
    )
    
    # df_output['text_reverse'] = df_output['text'].apply(lambda x: " ".join(reversed(x.strip().split(" "))))

    return df_output

In [3]:
df_prep = preprocess(df_poem_raw)

In [4]:
df_poem_raw.iloc[2:4]

Unnamed: 0,poem_id,vorder,position,text
2,700000,3,0,همچو شاهین به هوا جلوه کنان می گذرم
3,700000,4,1,تیزرو بالی و تازنده پری داده مرا


In [5]:
df_prep[['text']].iloc[1].values

array(['همچو شاهین به هوا جلوه کنان می گذرم - تیزرو  بالی و تازنده پری داده مرا'],
      dtype=object)

# Load tokenizer

In [6]:
tokenizer = AutoTokenizer.from_pretrained("mitra-mir/BERT-Persian-Poetry")

In [7]:
def encode_inputs(df_):
    return tokenizer(df_['text'].values.tolist(), padding=False)

def decode_tokens(tokens_, skip_special_tokens=False):
    decoded = tokenizer.batch_decode(tokens_, skip_special_tokens=skip_special_tokens)
    return decoded

In [8]:
tokens = encode_inputs(df_prep)

Make sure that tokens orders are correct.

In [9]:
tokens['input_ids'][1][:5], decode_tokens(tokens['input_ids'][1][:20]), decode_tokens(tokens['input_ids'][1:2])

([2, 2164, 1112, 10880, 1923],
 ['[CLS]',
  'همچ',
  '##و',
  'شاهین',
  'به',
  'هوا',
  'جلوه',
  'کنان',
  'می',
  'گذر',
  '##م',
  '-',
  'تیزر',
  '##و',
  'بالی',
  'و',
  'تاز',
  '##نده',
  'پری',
  'داده'],
 ['[CLS] همچو شاهین به هوا جلوه کنان می گذرم - تیزرو بالی و تازنده پری داده مرا [SEP]'])

## Generate training dataset

In [10]:
def generate_sequences(tokens_):
    output = []
    target = []
    for seq in tokens_['input_ids']:
        for ix in range(1, len(seq)):
            output += [torch.tensor(seq[:ix])]
            target.append(seq[ix])

    padded_tensor = pad_sequence(output, batch_first=True, padding_value=0)
    attention_mask_tensor = (padded_tensor != 0).int()
          
    return {"input_ids": padded_tensor, "attention_mask": attention_mask_tensor}, torch.tensor(target)

In [11]:
full_tokens, targets = generate_sequences(tokens)

In [12]:
full_tokens['input_ids'].shape

torch.Size([71097, 39])

Check if target is correct

In [13]:
decode_tokens(full_tokens['input_ids'][:10], skip_special_tokens=True), decode_tokens(targets[:20], skip_special_tokens=True)

(['',
  'خواب',
  'خواب دیدم',
  'خواب دیدم که',
  'خواب دیدم که خدا',
  'خواب دیدم که خدا بال',
  'خواب دیدم که خدا بال و',
  'خواب دیدم که خدا بال و پری',
  'خواب دیدم که خدا بال و پری داده',
  'خواب دیدم که خدا بال و پری داده مرا'],
 ['خواب',
  'دیدم',
  'که',
  'خدا',
  'بال',
  'و',
  'پری',
  'داده',
  'مرا',
  '-',
  'در',
  'هوا',
  'قوت',
  'سیر',
  'و',
  'سفری',
  'داده',
  'مرا',
  '',
  'همچ'])

In [14]:
full_tokens['attention_mask'][:10,:10]

tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int32)

## Train / validation split

In [15]:
N_FULL_DATASET = targets.shape[0]
TRAIN_FRAC = 0.9
TRAIN_SIZE = int(N_FULL_DATASET * TRAIN_FRAC)

SEQ_LEN = full_tokens['input_ids'].shape[1]

In [21]:
train_data = Dataset.from_dict({key: val[:TRAIN_SIZE] for key, val in full_tokens.items()})

#.add_column("target", targets[:TRAIN_SIZE])
train_target = targets[:TRAIN_SIZE]

validation_data = Dataset.from_dict({key: val[TRAIN_SIZE:] for key, val in full_tokens.items()})
validation_target = targets[TRAIN_SIZE:]

In [35]:
validation_data.set_format("pt"), train_data.set_format("pt")

(None, None)

In [23]:
validation_targets.shape, train_targets.shape

(torch.Size([7110]), torch.Size([63987]))

# Decoder Transformers

In [27]:
class TransformerDecoderLayer(nn.Module):
    def __init__(self, vocab_size, hidden_dim, n_heads, intermediate_dim, device, p_dropout=0.2, seq_len=None):
        super().__init__()

        self.device = device

        config = AutoConfig.from_pretrained("bert-base-uncased")
        config.vocab_size = vocab_size
        if seq_len:
            config.max_position_embeddings = seq_len
            
        self.embeddings = AutoModel.from_config(config).embeddings

        
        self.multi_head_attention = MultiHeadAttention(
            emb_dim=config.hidden_size, hidden_dim=hidden_dim, n_heads=n_heads, is_decoder=True,
        )
        self.ff = FeedForward(
            hidden_dim=hidden_dim,
            intermediate_dim=intermediate_dim,
            p_dropout=p_dropout,
        )
        self.layer_norm_1 = nn.LayerNorm(hidden_dim)
        self.layer_norm_2 = nn.LayerNorm(hidden_dim)

        self.linear = nn.Linear(hidden_dim, vocab_size)


    def forward(self, input_):
        data = {
            k: input_[k].to(self.device)
            for k in input_.keys()
            if k in ["attention_mask", "input_ids"]
        }

        x = self.embeddings(data['input_ids'])
        
        residual = x
        
        x = residual + self.multi_head_attention(x, data["attention_mask"])
        x = self.layer_norm_1(x)

        residual = x
        x = residual + self.ff(x)
        x = self.layer_norm_2(x)
        
        return self.linear(x)

In [28]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [29]:
VOCAB_SIZE = tokenizer.vocab_size
HIDDEN_DIM = 1024
INTERMEDIATE_DIM = HIDDEN_DIM * 4
N_HEADS = 12

trasnformer_decoder = TransformerDecoderLayer(
    vocab_size=VOCAB_SIZE, 
    hidden_dim=HIDDEN_DIM, 
    n_heads=N_HEADS, 
    intermediate_dim=INTERMEDIATE_DIM,
    seq_len=SEQ_LEN,
    device=device,
).to(device)

In [30]:
trasnformer_decoder

TransformerDecoderLayer(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(42000, 768, padding_idx=0)
    (position_embeddings): Embedding(39, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (multi_head_attention): MultiHeadAttention(
    (heads): ModuleList(
      (0-11): 12 x AttentionHead(
        (q): Linear(in_features=768, out_features=85, bias=True)
        (k): Linear(in_features=768, out_features=85, bias=True)
        (v): Linear(in_features=768, out_features=85, bias=True)
      )
    )
    (dense): Linear(in_features=1024, out_features=1024, bias=True)
  )
  (ff): FeedForward(
    (layers): Sequential(
      (layer_1): Linear(in_features=1024, out_features=4096, bias=True)
      (gelu): GELU(approximate='none')
      (layer_2): Linear(in_features=4096, out_features=1024, bias=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
 

In [36]:
validation_data[:]

{'input_ids': tensor([[   2, 9669,    0,  ...,    0,    0,    0],
         [   2, 9669, 1110,  ...,    0,    0,    0],
         [   2, 9669, 1110,  ...,    0,    0,    0],
         ...,
         [   2, 2143,  607,  ...,    0,    0,    0],
         [   2, 2143,  607,  ...,    0,    0,    0],
         [   2, 2143,  607,  ...,    0,    0,    0]]),
 'attention_mask': tensor([[1, 1, 0,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [38]:
with torch.no_grad():
    trasnformer_decoder(validation_data[:2])

RuntimeError: mat1 and mat2 shapes cannot be multiplied (78x1020 and 1024x1024)

# Training

In [None]:
for i in trasnformer_decoder.named_parameters():
    if i[1].requires_grad:
        print(i[0])

In [None]:
optimizer = torch.optim.AdamW(
    params={p for p in trasnformer_decoder.parameters() if p.requires_grad}, 
    lr=1e-5, weight_decay=0.01
)
loss_fn = nn.CrossEntropyLoss()


trainer = Trainer(optimizer=optimizer, loss=loss_fn, model=transf_clf)