In [None]:
!pip install transformers[sentencepiece]
!pip install datasets evaluate transformers[sentencepiece]

In [None]:
import tensorflow as tf
import transformers
import numpy as np
from transformers import AutoTokenizer,AutoModel,TFGPT2LMHeadModel,AutoConfig,DataCollatorForLanguageModeling
from datasets import load_dataset
from transformers import create_optimizer,pipeline
from tensorflow.keras.preprocessing.sequence import pad_sequences
from datasets import Dataset, DatasetDict
from transformers import AdamWeightDecay, get_scheduler
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import ExponentialDecay

In [None]:
# Read the local CSV file through the `datasets` CSV loader; the rows are
# accessed later through the 'train' split.
dataset = load_dataset(
    'csv',
    data_files='Pushkin.csv',
    encoding='latin-1',
)

In [None]:
# GPT-2 tokenizer; its end-of-sequence token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Number of tokens in every training chunk.
context_length = 128

In [None]:
def sequense_process(x,step,length,sequence,pad_value=50256):
    """Return the ``step``-long window of ``sequence`` that starts at index ``x``.

    Args:
        x: Start index of the window.
        step: Desired window length.
        length: Length of ``sequence`` as supplied by the caller; used to
            decide whether a full window is available.
        sequence: Sliceable sequence of token ids.
        pad_value: Filler used when the window runs past ``length``.
            Defaults to 50256, the id the notebook uses as eos/pad token.

    Returns:
        ``sequence[x:x + step]`` when ``x + step <= length``. Otherwise a list
        of exactly ``step`` items: the remaining tail of ``sequence``,
        left-padded with ``pad_value``.
    """
    if x + step <= length:
        return sequence[x:x + step]

    tail = list(sequence[x:])
    if len(tail) > step:
        # Keep only the last `step` items, i.e. drop from the front.
        tail = tail[len(tail) - step:]
    # Left-pad so the real tokens end the window.
    return [pad_value] * (step - len(tail)) + tail

In [None]:
# The tokenize function from the Hugging Face instructions only produced one
# output per text here, so every text is cut into fixed-length chunks by hand.
pad_id = tokenizer.eos_token_id  # eos token doubles as pad token (50256 for gpt2)
data_list = []
for element in dataset['train']:
    token = tokenizer.encode(element['Content'], add_special_tokens=False)
    length = len(token)
    for i in range(0, length, context_length):
        tmp = token[i:i + context_length]
        # Only the final chunk of a text can be short; right-pad it so that
        # every chunk holds exactly `context_length` ids.
        tmp = tmp + [pad_id] * (context_length - len(tmp))
        data_list.append(tmp)
# Column-oriented layout: one 'input_ids' column holding all chunks.
data_list = {'input_ids': data_list}

In [None]:
# Turn the chunk dict into a Dataset and split it 90/10 with a fixed seed.
# The result exposes "train" and "test" splits.
# NOTE: `data_list` is rebound from a plain dict to the split result, so this
# cell expects the chunking cell to have been run immediately before it.
data_list = Dataset.from_dict(data_list).train_test_split(
    train_size=0.9,
    seed=23,
)

In [None]:
def map_funtion(element):
    """Build a new dict holding only the ``input_ids`` entry of ``element``."""
    ids = element['input_ids']
    return dict(input_ids=ids)

In [None]:
# Run `map_funtion` over every split of the train/test dataset.
data_map = data_list.map(function=map_funtion)

In [None]:
# Start from the stock "gpt2" configuration and override a few fields so that
# they agree with the tokenizer and the chunk length used in this notebook.
# NOTE(review): `n_ctx` may not be a field that recent GPT-2 configs act on
# (the position-table size is `n_positions`) - confirm against the installed
# transformers version whether this override has any effect.
_config_overrides = dict(
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
config = AutoConfig.from_pretrained('gpt2', **_config_overrides)

In [None]:
# The model is constructed directly from `config` rather than through
# `from_pretrained`, so presumably it starts from freshly initialised weights.
model=TFGPT2LMHeadModel(config)
# One call on the model's own dummy inputs before asking for the summary;
# presumably this builds the variables so that summary() can report them.
model(model.dummy_inputs)
model.summary()

In [None]:
# Collator for causal language modelling (masked-LM mode switched off) that
# hands back TensorFlow tensors.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    return_tensors="tf",
)

In [None]:
# Arguments common to both splits; only the shuffling differs.
_shared_loader_args = dict(collate_fn=data_collator, batch_size=16)

# Training batches are shuffled.
tf_train_dataset = model.prepare_tf_dataset(
    data_map["train"],
    shuffle=True,
    **_shared_loader_args,
)
# Evaluation batches keep their original order.
tf_eval_dataset = model.prepare_tf_dataset(
    data_map["test"],
    shuffle=False,
    **_shared_loader_args,
)

In [None]:
def masked_loss(label, pred, pad_id=50256):
    """Mean sparse categorical cross-entropy over the non-ignored positions.

    A position is ignored when its label equals ``pad_id`` or is negative.
    Negative labels are skipped because the language-modelling collator used
    in this notebook is documented to mark padded label positions with -100,
    which is not a valid class index.

    Args:
        label: Integer class ids.
        pred: Unnormalised logits; the last axis is the class axis.
        pad_id: Label value that marks padding. Defaults to 50256, the id the
            notebook uses as eos/pad token.

    Returns:
        Scalar tensor: the summed loss of the kept positions divided by their
        count, or 0 when every position is ignored (instead of NaN from 0/0).

    NOTE(review): labels and logits are compared position by position; no
    next-token shift is applied here - confirm the caller aligns them.
    """
    label = tf.convert_to_tensor(label)
    keep = tf.logical_and(tf.not_equal(label, pad_id), tf.greater_equal(label, 0))

    # Ignored positions get a harmless in-range label so the cross-entropy
    # stays finite there; their contribution is zeroed out below.
    safe_label = tf.where(keep, label, tf.zeros_like(label))

    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    loss = loss_object(safe_label, pred)

    mask = tf.cast(keep, dtype=loss.dtype)
    loss *= mask

    # divide_no_nan yields 0 when no position is kept.
    return tf.math.divide_no_nan(tf.reduce_sum(loss), tf.reduce_sum(mask))

In [None]:
def masked_accuracy(label, pred, pad_id=50256):
    """Fraction of non-ignored positions whose arg-max prediction is correct.

    A position is ignored when its label equals ``pad_id`` or is negative.
    Negative labels are skipped because the language-modelling collator used
    in this notebook is documented to mark padded label positions with -100.

    Args:
        label: Integer class ids.
        pred: Scores with the class axis at index 2 (arg-max is taken there).
        pad_id: Label value that marks padding. Defaults to 50256, the id the
            notebook uses as eos/pad token.

    Returns:
        Scalar float32 tensor: matches / kept positions, or 0 when every
        position is ignored (instead of NaN from 0/0).

    NOTE(review): labels and predictions are compared position by position;
    no next-token shift is applied here - confirm the caller aligns them.
    """
    pred = tf.argmax(pred, axis=2)
    label = tf.cast(label, pred.dtype)

    mask = tf.logical_and(tf.not_equal(label, pad_id), tf.greater_equal(label, 0))
    # A hit only counts where the position is kept.
    match = tf.logical_and(tf.equal(label, pred), mask)

    match = tf.cast(match, dtype=tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    # divide_no_nan yields 0 when no position is kept.
    return tf.math.divide_no_nan(tf.reduce_sum(match), tf.reduce_sum(mask))

In [None]:
# Learning-rate schedule: multiply the rate by `decay_rate` every
# `decay_steps` optimiser steps, in discrete jumps (staircase).
initial_learning_rate = 5e-5
decay_steps = 2000
decay_rate = 0.96

lr_schedule = ExponentialDecay(
    initial_learning_rate=initial_learning_rate,
    decay_steps=decay_steps,
    decay_rate=decay_rate,
    staircase=True,
)
optimizer = Adam(learning_rate=lr_schedule)

# No `loss` argument is given to compile(); presumably the model falls back
# to its built-in language-modelling loss - confirm.
model.compile(optimizer=optimizer, metrics=['accuracy'])

# NOTE(review): the global precision policy is switched only after the model
# has been built and compiled. Keras documents that the policy should be set
# before layers are created - confirm whether this call has any effect on
# `model`, or move it ahead of the model-construction cell.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login, run before training.
# NOTE(review): nothing in the visible cells uploads to or downloads private
# data from the Hub - confirm this login is still required.
notebook_login()

In [None]:
# Train for 100 epochs, evaluating on the held-out split after each epoch.
model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    epochs=100,
)

In [None]:
# Text-generation pipeline around the trained model.
# Use the first GPU when TensorFlow can see one; otherwise fall back to the
# CPU (device index -1) instead of unconditionally requesting GPU 0.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if tf.config.list_physical_devices("GPU") else -1,
)

In [None]:
# Generate one continuation of a short prompt and print its text.
txt = 'I love you'
generations = pipe(txt, num_return_sequences=1)
print(generations[0]["generated_text"])