In [None]:
!pip install transformers[sentencepiece]
!pip install datasets evaluate transformers[sentencepiece]

In [None]:
import tensorflow as tf
import transformers
import numpy as np
from transformers import AutoTokenizer,AutoModel,TFGPT2LMHeadModel,AutoConfig,DataCollatorForLanguageModeling
from datasets import load_dataset
from transformers import create_optimizer,pipeline
from tensorflow.keras.preprocessing.sequence import pad_sequences
from datasets import Dataset, DatasetDict

In [None]:
# Read the local `Pushkin.csv` file through the generic `csv` loader, decoding it as Latin-1.
dataset = load_dataset(
    'csv',
    data_files='Pushkin.csv',
    encoding='latin-1',
)

In [None]:
# Number of tokens in every training chunk.
context_length = 128
# Load the GPT-2 tokenizer and reuse its end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def sequense_process(x,step,length,sequence,pad_value=50256):
    """Return the `step`-token window of `sequence` that starts at index `x`.

    A window that fits inside `length` is returned as a plain slice. Otherwise
    the remaining tokens from `x` onwards are left-padded with `pad_value`
    (or, if more than `step` tokens remain, cut down to their last `step`
    tokens) so the result always holds exactly `step` items.

    Args:
        x: Start index of the window.
        step: Window size in tokens.
        length: Number of tokens to treat as available; decides whether a
            full window fits.
        sequence: Token ids to slice.
        pad_value: Id used to fill short windows. Defaults to 50256, the
            eos token id that this notebook also uses as its pad token.

    Returns:
        `sequence[x:x+step]` for a full window, otherwise a list of `step` ids.
    """
    if x+step<=length:
        return sequence[x:x+step]

    # Short (final) window: keep at most the last `step` tokens and pad on the left.
    tail=list(sequence[x:])
    if len(tail)>step:
        tail=tail[len(tail)-step:]
    return [pad_value]*(step-len(tail))+tail

In [None]:
# The tokenize function from the Hugging Face instructions produced only one output here,
# so every document is tokenized and cut into fixed-size chunks manually instead.
chunks=[]
for element in dataset['train']:
    token=tokenizer.encode(element['Content'],add_special_tokens=False)
    length=len(token)
    for i in range(0,length,context_length):
        # Full windows are sliced as-is; a short final window is left-padded
        # with 50256 (eos_token = pad_token) by the helper.
        chunks.append(sequense_process(i,context_length,length,token))
data_list={'input_ids': chunks}

In [None]:
# Turn the chunk dictionary into a Dataset, wrap it as the "train" split, and carve a
# seeded 90/10 train/test split out of it.
data_list = (
    DatasetDict({"train": Dataset.from_dict(data_list)})['train']
    .train_test_split(train_size=0.9, seed=23)
)

In [None]:
def map_funtion(element):
    """Return a new dict holding only the example's `input_ids` entry."""
    token_ids = element['input_ids']
    return dict(input_ids=token_ids)

In [None]:
# Apply `map_funtion` to every example of both splits.
data_map = data_list.map(map_funtion)

Map:   0%|          | 0/196 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

In [None]:
# Start from the stock `gpt2` configuration, overriding the vocabulary size with the
# tokenizer's size and passing the chunk length as `n_ctx`.
# `bos_token_id` / `eos_token_id` are not overridden here.
config = AutoConfig.from_pretrained(
    'gpt2',
    n_ctx=context_length,
    vocab_size=len(tokenizer),
)

In [None]:
# Instantiate the LM-head model directly from the configuration (no `from_pretrained` call),
# run its dummy inputs through it once -- presumably so the weights exist before
# `summary()` is called -- and print the layer/parameter summary.
model = TFGPT2LMHeadModel(config)
model(model.dummy_inputs)
model.summary()

Model: "tfgpt2lm_head_model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer (TFGPT2MainLaye  multiple                 124439808 
 r)                                                              
                                                                 
Total params: 124,439,808
Trainable params: 124,439,808
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Collator for causal language modelling (`mlm=False`) that returns TensorFlow tensors.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="tf")

In [None]:
# Smoke test: collate the first five training examples and show the shape of each tensor.
first_examples = [data_map["train"][idx] for idx in range(5)]
out = data_collator(first_examples)
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: (5, 128)
attention_mask shape: (5, 128)
labels shape: (5, 128)


In [None]:
# Display whether the loaded tokenizer reports itself as a "fast" tokenizer.
tokenizer.is_fast

True

In [None]:
# Both splits share the collator and batch size; only the training split is shuffled.
shared_dataset_kwargs = dict(collate_fn=data_collator, batch_size=32)

tf_train_dataset = model.prepare_tf_dataset(
    data_map["train"], shuffle=True, **shared_dataset_kwargs
)
tf_eval_dataset = model.prepare_tf_dataset(
    data_map["test"], shuffle=False, **shared_dataset_kwargs
)

In [None]:
# Total number of optimizer steps for the whole run. `len(tf_train_dataset)` is only the
# number of batches in ONE epoch, so it has to be multiplied by the epoch count; otherwise
# the schedule's decay span (`num_train_steps - num_warmup_steps`) becomes negative.
num_epochs=100  # keep in sync with the `epochs` argument passed to `model.fit`
num_train_steps=len(tf_train_dataset)*num_epochs
# Warm up for at most 10% of the run (capped at 1000 steps) so the learning rate actually
# reaches `init_lr` and then decays, instead of spending the entire run in warmup.
num_warmup_steps=min(1000,num_train_steps//10)
optimizer,schedule=create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=num_warmup_steps,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01
)
# No loss is passed: the model is compiled with the optimizer only.
model.compile(optimizer=optimizer)
# NOTE(review): the mixed-precision policy is set after `model` has already been built and
# compiled; Keras applies the global policy when layers are created, so this call likely
# has no effect on `model`. To really train in float16 it would have to run before the
# model is instantiated -- confirm and move if intended.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

In [None]:
# Train for 100 epochs and evaluate on the held-out split after every epoch, so the
# evaluation dataset built earlier is actually used.
model.fit(tf_train_dataset,validation_data=tf_eval_dataset,epochs=100)

In [None]:
# Wrap the trained model and its tokenizer in a text-generation pipeline on device 0.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0,
)

In [None]:
# Generate a single continuation for a short prompt and print its text.
txt = 'I love you'
generations = pipe(txt, num_return_sequences=1)
print(generations[0]["generated_text"])